In [66]:
import pandas as pd
import sklearn as sk
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [4]:
data = pd.read_csv('energydata_complete.csv')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [7]:
from sklearn.linear_model import LinearRegression

In [20]:
pd.DataFrame(data['T2'])

Unnamed: 0,T2
0,19.200000
1,19.200000
2,19.200000
3,19.200000
4,19.200000
...,...
19730,25.890000
19731,25.754000
19732,25.628571
19733,25.414000


In [21]:
model = LinearRegression()
model.fit(pd.DataFrame(data['T2']),data['T6'])

LinearRegression()

In [88]:
a = np.array([1,2,3,4,5,6])
b = np.array([0,1,2,3,4,5])
(b - np.mean(b))**2

array([6.25, 2.25, 0.25, 0.25, 2.25, 6.25])

In [105]:
def get_r_squared(pred,y):
    assert len(y) == len(pred)
    _pred = np.array(pred)
    _y = np.array(y)
    res_sum_of_squares = np.sum((_y - _pred)**2)
    tot_sum_of_squares = np.sum((_y - np.mean(_y))**2)
    r_sqr = 1 - (res_sum_of_squares/tot_sum_of_squares)
    return r_sqr

def get_MAE(pred, y):
    assert len(y) == len(pred)
    _pred = np.array(pred)
    _y = np.array(y)
    mae = np.sum(np.abs(_y - _pred)) / len(y)
    return mae

def get_res_sum_squares(pred, y):
    assert len(y) == len(pred)
    _pred = np.array(pred)
    _y = np.array(y)
    rss = np.sum((_y - _pred)**2)
    return rss
def get_r2_score(pred, y):   # copied from a stackoverflow solution to confirm mismatch in self implemented r2score and scikitlearns
    assert len(y) == len(pred)  # als to confirm if there was an error in my own implemented rsuared function
    _pred = np.array(pred)
    _y = np.array(y)
    res_sum_squares = sum(map(lambda x: (x[0]-x[1]) **2, zip(_y,_pred)))
    total_sum_squares = sum([(x - np.mean(_y))**2 for x in _y])
    r_sqr = 1 - (res_sum_squares/total_sum_squares)
    return r_sqr

In [104]:
preds = model.predict(pd.DataFrame(data['T2']))
print(get_r_squared(preds,data['T6']))
print(r2_score(preds, data['T6']))

#Disparity noticed in self implemented r2 score and scikitlearns; r2 score

0.6418990830855492
0.4421227162483323


In [35]:
df = data.drop(['date', 'lights'], axis = 1)

In [37]:
scaler = MinMaxScaler()
x_train, x_test, y_train, y_test = train_test_split(df.drop(['Appliances'], axis = 1), df['Appliances'], test_size = 0.3, random_state = 42)

In [38]:
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
model2 = LinearRegression()
model2.fit(x_train_scaled, y_train)
preds2 = model2.predict(x_test_scaled)

In [106]:
# Using bothe my implemented metrics and already implemented metrics in the sklearn library
print(get_MAE(preds2,y_test))
print(mean_absolute_error(preds2,y_test))

print(get_res_sum_squares(preds2, y_test))

print(np.sqrt(mean_squared_error(preds2,y_test)))

print(get_r_squared(preds2, y_test))
print(r2_score(preds2, y_test))

53.64297765584959
53.64297765584959
51918501.21223274
93.64046093998029
0.14890246319303535
-4.411893747294911


In [57]:
weights = model2.coef_
weights

array([  -3.51072478,  495.52601962, -252.710373  , -469.52136238,
        310.97103712,  102.77165244,   31.00966845,   27.31182902,
        -16.75282177,   17.03563561,  252.71692835,   40.7120577 ,
         11.00293428,  -47.56355153,  109.13469841, -168.62716102,
       -203.23662712,  -42.58633878, -344.3898449 ,    7.31524276,
        -83.1075989 ,   31.22594824,   13.16807161,  125.43063708,
          0.82386956,    0.82386956])

In [60]:
from sklearn.linear_model import Ridge
ridge_model = Ridge(alpha = 0.4)
ridge_model.fit(x_train_scaled, y_train)
ridge_pred = ridge_model.predict(x_test_scaled)
print(np.sqrt(mean_squared_error(ridge_pred, y_test)))

93.66003653920396


In [109]:
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha = 0.001)
lasso_model.fit(x_train_scaled, y_train)
lasso_pred = lasso_model.predict(x_test_scaled)
print(np.sqrt(mean_squared_error(lasso_pred, y_test)))

93.6406727479042


In [110]:
lasso_w = lasso_model.coef_
lasso_w.

array([  -4.05585856,  494.32619407, -249.63865276, -465.82104986,
        310.20015553,  102.17247227,   30.34420541,   26.56058062,
        -17.1130125 ,   17.01355581,  249.83361984,   40.09096313,
         10.30491756,  -47.20961004,  108.879691  , -168.69793662,
       -202.48080861,  -42.44605292, -332.57356496,    7.28122335,
        -77.98877912,   31.38802504,   13.07813126,  118.06259364,
          1.63099462,    0.        ])