In [235]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
import matplotlib.pyplot as plt
%matplotlib inline

In [236]:
df = pd.read_csv('Files/dss_Alb h 2017-09.csv')
key = pd.read_excel('Files/Ethicon Alb Points List.xlsx')

In [237]:
# collects DataPointNames from the points list spreadsheet that are BAS point types
key_bas = key.loc[key['PointType'].str.contains("BAS")==True,'DataPointName']

#converts pandas series to a list for future use
val = key_bas.values.T.tolist()

#removes DataPointNames that containt the prefix CHWV
vals = [x for x in val if not x.startswith('CHWV')]

#tests whether all values from the point list spreadsheet are column headings of the dataset
for x in vals:
    if x not in df.columns:
        #prints and removes any string not found in the data
        print(x)
        vals.remove(x)
        
#expresses data using columns specified by the vals list
bas = df[vals+['OptimumControl', 'kW/Ton']]
print('Original data contains '+str(df.shape[0])+' points and '+str(df.shape[1])+ ' dimensions.')
print('Filtered data contains '+str(bas.dropna().shape[0])+' points and '+str(bas.dropna().shape[1])+ ' dimensions.')

CommunicationFailure_COV
Original data contains 1465 points and 414 dimensions.
Filtered data contains 1432 points and 66 dimensions.


In [238]:
bas = df[vals].dropna()
bas.shape

(1432, 64)

In [239]:
n = len(bas.columns)

In [240]:
bas.columns

Index(['CDWVLV', 'CDWVLV2', 'CHkW', 'CHWDP2', 'CHWFLO', 'CHWRT', 'CHWRT2',
       'CHWRT3', 'CHWST', 'CHWST2', 'CHWST3', 'CLGMODE',
       'CommunicationFailure', 'CTTR_ALARM', 'HX1CDWRT', 'HX1CDWST',
       'HX1CDWVLV', 'HX1CHWRT', 'HX1CHWST', 'HX1CHWVLV', 'LOOPREQ',
       'MaxCHDPLift', 'MaxCHDTLift', 'MinCHDPLift', 'MinCHDTLift', 'OAH',
       'OAT', 'OAWB', 'PCHWP3Failed', 'PCHWP3HZ', 'PCHWP3kW', 'PCHWP3S',
       'PCHWP3SPD', 'PCHWP3SS', 'PCHWP4Failed', 'PCHWP4HZ', 'PCHWP4kW',
       'PCHWP4S', 'PCHWP4SPD', 'PCHWP4SS', 'PCHWP5Failed', 'PCHWP5HZ',
       'PCHWP5kW', 'PCHWP5S', 'PCHWP5SPD', 'PCHWP5SS', 'SCHWP3Failed',
       'SCHWP3HZ', 'SCHWP3kW', 'SCHWP3S', 'SCHWP3SPD', 'SCHWP3SS',
       'SCHWP4Failed', 'SCHWP4HZ', 'SCHWP4kW', 'SCHWP4S', 'SCHWP4SPD',
       'SCHWP4SS', 'SCHWP5Failed', 'SCHWP5HZ', 'SCHWP5kW', 'SCHWP5S',
       'SCHWP5SPD', 'SCHWP5SS'],
      dtype='object')

In [241]:
xtrain, xtest, ytrain, ytest = train_test_split(bas.values[:, 0:(n-2)], bas.values[:, (n-1)], test_size = 0.25)
#xtrain, xtest, ytrain, ytest = train_test_split(bas.values, bas.values, test_size = 0.25)

#### LASSO

In [245]:
reg_lasso = linear_model.Lasso()
reg_lasso.fit(xtrain, ytrain)
coef = reg_lasso.coef_
lasso_coef = pd.DataFrame(coef, bas.columns[:-2])
lasso_coef = lasso_coef[coef!=0]
lasso_coef.columns = ['lasso_coef']
lasso_coef

Unnamed: 0,lasso_coef
CHWFLO,0.000264
SCHWP5HZ,0.008586


#### LINEAR REGRESSION

In [248]:
reg_linear = linear_model.LinearRegression()
reg_linear.fit(xtrain, ytrain)
coef = reg_linear.coef_
linear_coef = pd.DataFrame(coef, bas.columns[:-2])
linear_coef = linear_coef[abs(coef) > 1e-05]
linear_coef.columns = ['linear_coef']
linear_coef

Unnamed: 0,linear_coef
CDWVLV2,-0.001242
CLGMODE,2.6e-05
CommunicationFailure,-3.7e-05
HX1CDWVLV,-1.2e-05
SCHWP5S,1.0
