In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
# NOTE(review): nothing in the visible notebook draws on this figure, and an
# 18x6 inch canvas at dpi=1600 is 28800x9600 pixels — confirm it is still needed.
fig = plt.figure(figsize=(18,6), dpi=1600) 

from sklearn.preprocessing import OneHotEncoder


<matplotlib.figure.Figure at 0x7f9bf3c02450>

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
data_raw = pd.read_csv("train.csv")

In [3]:
data_raw.head()

Unnamed: 0,ID,datetime,temperature,var1,pressure,windspeed,var2,electricity_consumption
0,0,2013-07-01 00:00:00,-11.4,-17.1,1003.0,571.91,A,216.0
1,1,2013-07-01 01:00:00,-12.1,-19.3,996.0,575.04,A,210.0
2,2,2013-07-01 02:00:00,-12.9,-20.0,1000.0,578.435,A,225.0
3,3,2013-07-01 03:00:00,-11.4,-17.1,995.0,582.58,A,216.0
4,4,2013-07-01 04:00:00,-11.4,-19.3,1005.0,586.6,A,222.0


### Let's one hot encode var2

In [4]:
np.unique(data_raw.var2)

array(['A', 'B', 'C'], dtype=object)

In [5]:
# One indicator column per distinct value of the categorical var2 column.
var2_one_hot = pd.get_dummies(data_raw["var2"])
print("Var2 shape:", var2_one_hot.shape)

('Var2 shape:', (26496, 3))


In [6]:
var2_one_hot.sum()

A    25239
B      217
C     1040
dtype: int64

### Let's add the month as a one-hot variable

In [7]:
# Parse the timestamp strings and keep only the calendar month (1-12) as a new column.
data_raw["month"] = pd.to_datetime(data_raw["datetime"]).dt.month
# Expand the month number into indicator columns named month_<n>.
month_dummies = pd.get_dummies(data_raw["month"], prefix="month")

### Combine the features with the original data 

In [8]:
# Swap the raw categorical var2 column for its indicator columns and append the
# month indicators; both joins align on the shared row index.
data_edit = (
    data_raw
    .drop(["var2"], axis=1)
    .join(var2_one_hot)
    .join(month_dummies)
)

In [13]:
data_edit.columns.tolist()

['ID',
 'datetime',
 'temperature',
 'var1',
 'pressure',
 'windspeed',
 'electricity_consumption',
 'month',
 'A',
 'B',
 'C',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12']

## Let's begin the ML Process

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Model inputs, in the order the model will see them: the four numeric readings,
# the three var2 indicators, then one indicator per calendar month (month_1..month_12).
keep_vars = (
    ["temperature", "var1", "pressure", "windspeed"]
    + ["A", "B", "C"]
    + list(map("month_{}".format, range(1, 13)))
)

In [16]:
# Feature matrix (columns listed in keep_vars) and the regression target.
X, Y = data_edit[keep_vars], data_edit["electricity_consumption"]

In [17]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): the rows are hourly observations, so a shuffled split interleaves
# earlier and later hours between train and test — decide whether shuffle=False
# (a chronological hold-out) is the evaluation that is actually wanted.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=1234,
    shuffle=True,
)

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
# Ordinary least-squares regression with an intercept term, fitted on the training split.
# NOTE(review): X contains every var2 level (A, B, C) and all twelve month indicators,
# so each indicator group sums to 1 on every row and is collinear with the intercept;
# the individual dummy coefficients are therefore not uniquely determined. Consider
# dropping one level per group if the coefficients are meant to be interpreted.
lin_fit = LinearRegression(fit_intercept=True)
lin_fit.fit(X_train, Y_train)

In [20]:
# Coefficients and their column names, printed in matching order.
print(list(lin_fit.coef_))
print(list(X_train.columns))

[-1.2376323731850032, 12.659679524434544, -0.53996574082890492, -0.22888507846880657, 27.709023060079062, -15.498032726007978, -12.210990334071141, -174.90942425541101, -179.70152113267824, -129.97369542442203, -15.335905565735949, 59.639049373589572, 147.02021837317713, 183.38933591912013, 166.99082373842398, 108.3731691443884, 20.117559345953353, -57.720718946209992, -127.88889057019639]
['temperature', 'var1', 'pressure', 'windspeed', 'A', 'B', 'C', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']


In [21]:
# Sanity check: there should be exactly one coefficient per feature column.
print("Number of Coefficients: ", len(lin_fit.coef_))
print("Number of columns in X: ", len(X_train.columns))


('Number of Coefficients: ', 19)
('Number of columns in X: ', 19)


In [22]:
# Map each feature name to its fitted coefficient.
coefs = dict(zip(X_train.columns, lin_fit.coef_))

In [23]:
# Store the intercept alongside the per-feature coefficients.
coefs.update(Intercept=lin_fit.intercept_)

In [24]:
coefs

{'A': 27.709023060079062,
 'B': -15.498032726007978,
 'C': -12.210990334071141,
 'Intercept': 841.18238247784711,
 'month_1': -174.90942425541101,
 'month_10': 20.117559345953353,
 'month_11': -57.720718946209992,
 'month_12': -127.88889057019639,
 'month_2': -179.70152113267824,
 'month_3': -129.97369542442203,
 'month_4': -15.335905565735949,
 'month_5': 59.639049373589572,
 'month_6': 147.02021837317713,
 'month_7': 183.38933591912013,
 'month_8': 166.99082373842398,
 'month_9': 108.3731691443884,
 'pressure': -0.53996574082890492,
 'temperature': -1.2376323731850032,
 'var1': 12.659679524434544,
 'windspeed': -0.22888507846880657}

### Model Evaluation 

In [25]:
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [26]:
# In-sample predictions: the model scores the same rows it was fitted on.
y_train_pred = lin_fit.predict(X_train)

In [27]:
# R-squared is multiplied by 100 so it is reported on a 0-100 scale;
# RMSE is the square root of the mean squared error.
train_r2 = r2_score(Y_train, y_train_pred) * 100
train_rmse =  np.sqrt(mean_squared_error(Y_train, y_train_pred))

In [28]:
# train_r2 was already scaled by 100 when it was computed, so it prints as a percentage.
print("Train R- squared ", train_r2)  
print("Train RMSE: ", train_rmse)

('Train R- squared ', 36.03604218928249)
('Train RMSE: ', 86.58623094990223)


### Let's evaluate the model on the test set

In [29]:
# Predictions for the held-out 25% split, which the model did not see during fitting.
y_test_preds = lin_fit.predict(X_test)

In [30]:
# Same metrics as for the training split: R-squared scaled to 0-100, and RMSE
# as the square root of the mean squared error.
test_r2 =  r2_score(Y_test, y_test_preds) * 100
test_rmse = np.sqrt(mean_squared_error(Y_test, y_test_preds))

In [31]:
# test_r2 was already scaled by 100 when it was computed, so it prints as a percentage.
print("Test R- squared ", test_r2)  
print("Test RMSE: ", test_rmse)

('Test R- squared ', 35.898901579098428)
('Test RMSE: ', 85.891995548457558)


In [32]:
# Drop in R-squared from train to test, as a percentage of the train value.
# A positive result means the model scored higher on the training split.
(train_r2 - test_r2)/train_r2 * 100

0.38056512827829103

In [33]:
# Difference in RMSE between train and test, as a percentage of the train value.
# A positive result means the test RMSE is lower than the train RMSE.
(train_rmse - test_rmse)/train_rmse * 100

0.80178498801541376

### Prep the Test Set 

In [48]:
# Load the test file to be scored; the path is relative to the notebook's working directory.
test_data = pd.read_csv("test.csv")

In [49]:
test_data.head()

Unnamed: 0,ID,datetime,temperature,var1,pressure,windspeed,var2
0,552,2013-07-24 00:00:00,-10.0,-16.4,1011.0,263.28,A
1,553,2013-07-24 01:00:00,-10.0,-20.7,1011.0,267.175,A
2,554,2013-07-24 02:00:00,-10.7,-17.1,1003.0,269.555,A
3,555,2013-07-24 03:00:00,-13.6,-20.7,1008.0,273.06,A
4,556,2013-07-24 04:00:00,-10.7,-17.1,1006.0,1.765,A


In [50]:
# One-hot encode var2 and the month for the test file, then align each set of
# indicator columns to the ones produced for the training data.
#
# get_dummies only creates columns for values that occur in the frame it is given,
# so a var2 level or a month that is absent from test.csv would leave a column
# missing here, and the later selection by keep_vars would raise a KeyError.
# Reindexing against the training dummy columns adds any absent column as all
# zeros and drops levels that were never seen in training. When the test file
# contains every level, the result is the same as plain get_dummies.
test_var2_onehot = (
    pd.get_dummies(test_data.var2)
    .reindex(columns=var2_one_hot.columns, fill_value=0)
)
test_month_onehot = (
    pd.get_dummies(pd.DatetimeIndex(test_data.datetime).month, prefix="month")
    .reindex(columns=month_dummies.columns, fill_value=0)
)

In [51]:
# Same assembly as for the training frame: replace var2 with its indicator columns
# and append the month indicators, joining on the row index.
test_data_edit = (
    test_data
    .drop(["var2"], axis=1)
    .join(test_var2_onehot)
    .join(test_month_onehot)
)
# Keep only the model inputs, in the column order the model was fitted with.
test_data_edit = test_data_edit[keep_vars]

In [52]:
test_data_edit.columns

Index([u'temperature', u'var1', u'pressure', u'windspeed', u'A', u'B', u'C',
       u'month_1', u'month_2', u'month_3', u'month_4', u'month_5', u'month_6',
       u'month_7', u'month_8', u'month_9', u'month_10', u'month_11',
       u'month_12'],
      dtype='object')

In [53]:
# Predicted electricity_consumption for every row of the test file.
scores = lin_fit.predict(test_data_edit)


### Preparing to deliver scores 

In [55]:
# Load the provided sample submission; it is only displayed in the next cell and is not modified.
submission_file = pd.read_csv("sample_submission_q0Q3I1Z.csv")

In [56]:
submission_file.head()

Unnamed: 0,ID,electricity_consumption


In [57]:
# Pair each test-row ID with its predicted consumption.
submission = pd.DataFrame({
    "ID": test_data["ID"],
    "electricity_consumption": scores,
})

In [58]:
submission.head()

Unnamed: 0,ID,electricity_consumption
0,552,250.872094
1,553,195.543964
2,554,245.760133
3,555,200.272349
4,556,305.433371


In [59]:
# Write the predictions to disk; index=False leaves the DataFrame row index out of the file.
submission.to_csv("LinearReg_predictions.csv", index = False)