In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv('PredictEnergy.csv')

In [3]:
data.describe()

Unnamed: 0,AT,V,AP,RH,PE
count,9568.0,9568.0,9568.0,9568.0,9568.0
mean,19.651231,54.305804,1013.259078,73.308978,454.365009
std,7.452473,12.707893,5.938784,14.600269,17.066995
min,1.81,25.36,992.89,25.56,420.26
25%,13.51,41.74,1009.1,63.3275,439.75
50%,20.345,52.08,1012.94,74.975,451.55
75%,25.72,66.54,1017.26,84.83,468.43
max,37.11,81.56,1033.3,100.16,495.76


In [4]:
data.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [5]:
data.tail()

Unnamed: 0,AT,V,AP,RH,PE
9563,16.65,49.69,1014.01,91.0,460.03
9564,13.19,39.18,1023.67,66.78,469.62
9565,31.32,74.33,1012.92,36.48,429.57
9566,24.48,69.45,1013.86,62.39,435.74
9567,21.6,62.52,1017.23,67.87,453.28


In [6]:
data.dtypes

AT    float64
V     float64
AP    float64
RH    float64
PE    float64
dtype: object

In [7]:
data.shape

(9568, 5)

## Data Pre-processing

In [8]:
# Overall NA values
data.isna().sum(axis=1).sum()

0

In [9]:
# Independent attributes
X = data.iloc[:,0:4]
X.head()

Unnamed: 0,AT,V,AP,RH
0,14.96,41.76,1024.07,73.17
1,25.18,62.96,1020.04,59.08
2,5.11,39.4,1012.16,92.14
3,20.86,57.32,1010.24,76.64
4,10.82,37.5,1009.23,96.62


In [10]:
# Target attribute
y = data.iloc[:, -1]
y.head()

0    463.26
1    444.37
2    488.56
3    446.48
4    473.90
Name: PE, dtype: float64

In [11]:
# Split the data in TRAIN and TEST data-set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 4)

## Linear Regression

In [12]:
# Build a linear regression model
linReg = LinearRegression().fit(X_train, y_train)

In [13]:
# Accuracy on Train and Test data-set
print('Train Accuracy : {:.2f}'.format(linReg.score(X_train, y_train)*100))
print('Test Accuracy  : {:.2f}'.format(linReg.score(X_test, y_test)*100))

Train Accuracy : 92.85
Test Accuracy  : 92.91


In [14]:
# Co-efficient and Intercept values
print('Co-efficient (w) : {}'.format(linReg.coef_))
print('Intercept (b) : {:.2f}'.format(linReg.intercept_))

# PE = -1.98468862*AT - 0.22963276*V + 0.06694701*AP - -0.15805687*RH + 449.58

Co-efficient (w) : [-1.98468862 -0.22963276  0.06694701 -0.15805687]
Intercept (b) : 449.58


## Ridge and Lasso Regression

In [15]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [16]:
print('Train data: \n', X_train_scaled)
print('---------------------------------------------------------')
print('Test data: \n', X_test_scaled)

Train data: 
 [[0.68186969 0.68274021 0.41054195 0.69182306]
 [0.60764873 0.6797153  0.51348676 0.61313673]
 [0.17592068 0.26459075 0.60702796 0.86729223]
 ...
 [0.68583569 0.82882562 0.40806731 0.51689008]
 [0.60226629 0.80266904 0.38975501 0.77493298]
 [0.23371105 0.31779359 0.52487008 0.80616622]]
---------------------------------------------------------
Test data: 
 [[0.16671765 0.32415177 0.66433217 0.80070024]
 [0.27347813 0.27708865 0.32816408 0.96808511]
 [0.56347507 0.79697191 0.2196098  0.69983841]
 ...
 [0.71275619 0.82743524 0.42646323 0.78252087]
 [0.55429795 0.61747537 0.63306653 0.80797199]
 [0.18721322 0.2608537  0.55102551 0.80110423]]


### a) Ridge Regression

In [17]:
# With some random alpha value = 12
linRidge = Ridge(alpha = 12).fit(X_train_scaled, y_train)

In [18]:
# Accuracy on Train and Test data
print('Train Accuracy : {:.3f}'.format(linRidge.score(X_train_scaled, y_train)*100))
print('Test Accuracy  : {:.3f}'.format(linRidge.score(X_test_scaled, y_test)*100))

Train Accuracy : 92.501
Test Accuracy  : 91.543


In [19]:
# Co-efficient and Intercept values
print('Co-efficient (w) :', linRidge.coef_)
print('Intercept (b) : {:.2f}'.format(linRidge.intercept_))

Co-efficient (w) : [-58.41279798 -19.26657117   6.05124241  -7.18169929]
Intercept (b) : 495.35


### b) Lasso Regression

In [20]:
# With some random alpha value = 20

linLasso = Ridge(alpha = 8).fit(X_train_scaled, y_train)

In [21]:
# Accuracy on Train and Test data
print('Train Accuracy : {:.3f}'.format(linLasso.score(X_train_scaled, y_train)*100))
print('Test Accuracy  : {:.3f}'.format(linLasso.score(X_test_scaled, y_test)*100))

Train Accuracy : 92.664
Test Accuracy  : 91.510


In [22]:
# Co-efficient and Intercept values
print('Co-efficient (w) :', linLasso.coef_)
print('Intercept (b) : {:.2f}'.format(linLasso.intercept_))

Co-efficient (w) : [-61.51038234 -17.6531085    5.1951727   -8.42302509]
Intercept (b) : 497.31


### Best Alpha value for Ridge Regularization

In [23]:
for this_alpha in np.arange(0, 15, 0.25):
    linRidge1 = Ridge(alpha = this_alpha).fit(X_train_scaled, y_train)
    train_acc = linRidge1.score(X_train_scaled, y_train)
    test_acc  = linRidge1.score(X_test_scaled, y_test)
    
    print('Alpha = {:.2f} \n\
           Accuracy - Train: {:.5f}, \
           Accuracy - Train: {:.5f} \n'
          .format(this_alpha, train_acc, test_acc))

Alpha = 0.00 
           Accuracy - Train: 0.92851,            Accuracy - Train: 0.91120 

Alpha = 0.25 
           Accuracy - Train: 0.92851,            Accuracy - Train: 0.91143 

Alpha = 0.50 
           Accuracy - Train: 0.92850,            Accuracy - Train: 0.91166 

Alpha = 0.75 
           Accuracy - Train: 0.92849,            Accuracy - Train: 0.91187 

Alpha = 1.00 
           Accuracy - Train: 0.92847,            Accuracy - Train: 0.91208 

Alpha = 1.25 
           Accuracy - Train: 0.92845,            Accuracy - Train: 0.91227 

Alpha = 1.50 
           Accuracy - Train: 0.92842,            Accuracy - Train: 0.91246 

Alpha = 1.75 
           Accuracy - Train: 0.92839,            Accuracy - Train: 0.91264 

Alpha = 2.00 
           Accuracy - Train: 0.92835,            Accuracy - Train: 0.91281 

Alpha = 2.25 
           Accuracy - Train: 0.92831,            Accuracy - Train: 0.91297 

Alpha = 2.50 
           Accuracy - Train: 0.92827,            Accuracy - Train: 0.91313 


### Best Alpha value for Lasso Regularization 

In [24]:
for this_alpha in np.arange(0, 0.1, 0.001):
    linLasso1 = Lasso(alpha = this_alpha).fit(X_train_scaled, y_train)
    train_acc = linLasso1.score(X_train_scaled, y_train)
    test_acc  = linLasso1.score(X_test_scaled, y_test)
    
    print('Alpha = {:.3f} \n\
           Accuracy - Train: {:.5f}, \
           Accuracy - Train: {:.5f} \n'
          .format(this_alpha, train_acc, test_acc))

  
  positive)


Alpha = 0.000 
           Accuracy - Train: 0.92851,            Accuracy - Train: 0.91120 

Alpha = 0.001 
           Accuracy - Train: 0.92851,            Accuracy - Train: 0.91125 

Alpha = 0.002 
           Accuracy - Train: 0.92851,            Accuracy - Train: 0.91133 

Alpha = 0.003 
           Accuracy - Train: 0.92851,            Accuracy - Train: 0.91140 

Alpha = 0.004 
           Accuracy - Train: 0.92851,            Accuracy - Train: 0.91147 

Alpha = 0.005 
           Accuracy - Train: 0.92850,            Accuracy - Train: 0.91154 

Alpha = 0.006 
           Accuracy - Train: 0.92850,            Accuracy - Train: 0.91161 

Alpha = 0.007 
           Accuracy - Train: 0.92849,            Accuracy - Train: 0.91168 

Alpha = 0.008 
           Accuracy - Train: 0.92849,            Accuracy - Train: 0.91175 

Alpha = 0.009 
           Accuracy - Train: 0.92848,            Accuracy - Train: 0.91181 

Alpha = 0.010 
           Accuracy - Train: 0.92847,            Accuracy - Train

## PCA

In [42]:
pca_obj = PCA(n_components=3).fit(X_train_scaled)
pca_train = pca_obj.transform(X_train_scaled)

print('Explained Variance: ', pca_obj.explained_variance_ratio_)

principalDf1 = pd.DataFrame(data = pca_train, columns = ['comp1', 'comp2','comp3'])
principalDf1.head()

Explained Variance:  [0.67093955 0.19829349 0.10210391]


Unnamed: 0,comp1,comp2,comp3
0,0.219314,-0.140995,0.012479
1,0.174304,-0.039465,-0.073098
2,-0.479469,-0.071964,-0.019581
3,-0.230714,-0.200442,-0.117765
4,-0.276081,-0.039161,0.235227


In [45]:
################# OR ##################
pca = PCA(n_components=3)
prin_comp = pca.fit_transform(X_train_scaled)

print('Explained Variance: ', pca.explained_variance_ratio_)

principalDf2 = pd.DataFrame(data = prin_comp, columns = ['comp1', 'comp2','comp3'])
principalDf2.head()

Explained Variance:  [0.67093955 0.19829349 0.10210391]


Unnamed: 0,comp1,comp2,comp3
0,0.219314,-0.140995,0.012479
1,0.174304,-0.039465,-0.073098
2,-0.479469,-0.071964,-0.019581
3,-0.230714,-0.200442,-0.117765
4,-0.276081,-0.039161,0.235227


In [40]:
finalDf = pd.concat([principalDf2, data[['PE']]], axis = 1)
finalDf.head()

Unnamed: 0,comp1,comp2,comp3,PE
0,0.219314,-0.140995,0.012479,463.26
1,0.174304,-0.039465,-0.073098,444.37
2,-0.479469,-0.071964,-0.019581,488.56
3,-0.230714,-0.200442,-0.117765,446.48
4,-0.276081,-0.039161,0.235227,473.9
