In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
# Create the 18x6 inch figure used for plotting.
# dpi was 1600, which makes an 18x6 inch figure a 28800 x 9600 pixel canvas;
# 100 keeps the same proportions at a size that is practical to render.
fig = plt.figure(figsize=(18, 6), dpi=100)


<matplotlib.figure.Figure at 0x7c7ff0474310>

In [2]:
# Load the labelled training data from the working directory.
data_raw = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Preview the first five rows of the raw training frame.
data_raw.head(n=5)

Unnamed: 0,ID,datetime,temperature,var1,pressure,windspeed,var2,electricity_consumption
0,0,2013-07-01 00:00:00,-11.4,-17.1,1003.0,571.91,A,216.0
1,1,2013-07-01 01:00:00,-12.1,-19.3,996.0,575.04,A,210.0
2,2,2013-07-01 02:00:00,-12.9,-20.0,1000.0,578.435,A,225.0
3,3,2013-07-01 03:00:00,-11.4,-17.1,995.0,582.58,A,216.0
4,4,2013-07-01 04:00:00,-11.4,-19.3,1005.0,586.6,A,222.0


In [4]:
# List the distinct categories that var2 takes.
np.unique(data_raw["var2"])

array(['A', 'B', 'C'], dtype=object)

In [8]:
# One-hot encode var2: one indicator column per category.
var2_one_hot = pd.get_dummies(data_raw.var2)
# Build the message as a single string: a multi-argument print() is a tuple
# print under Python 2, whereas this form prints the same text on 2 and 3.
print("{} {}".format("Var2 shape:", var2_one_hot.shape))

('Var2 shape:', (26496, 3))


In [10]:
# Count how many rows fall into each var2 category (column-wise sum).
var2_one_hot.sum(axis=0)

A    25239
B      217
C     1040
dtype: int64

In [17]:
# Swap the raw var2 column for its indicator columns (joined on the row index).
data_edit = data_raw.drop("var2", axis=1).join(var2_one_hot)

In [18]:
# Check the resulting columns: var2 should be gone and A/B/C present.
data_edit.columns

Index([u'ID', u'datetime', u'temperature', u'var1', u'pressure', u'windspeed',
       u'electricity_consumption', u'A', u'B', u'C'],
      dtype='object')

### Let's begin the ML Process

In [20]:
from sklearn.model_selection import train_test_split

In [105]:
# Model inputs: the four numeric columns followed by the var2 indicator columns.
# NOTE(review): every row has exactly one of A/B/C set, so the three indicators
# are collinear with the intercept; their individual coefficients are not
# uniquely determined. Consider dropping one indicator as the baseline.
numeric_vars = ["temperature", "var1", "pressure", "windspeed"]
var2_dummy_vars = ["A", "B", "C"]
keep_vars = numeric_vars + var2_dummy_vars

In [106]:
# Feature matrix (model inputs) and regression target.
X = data_edit.loc[:, keep_vars]
Y = data_edit.loc[:, "electricity_consumption"]

In [22]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the rows look like time-ordered hourly readings, so a shuffled
# split mixes earlier and later rows. Open question: should this be shuffle=False?
split_parts = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=1234,
    shuffle=True
)
X_train, X_test, Y_train, Y_test = split_parts

In [23]:
from sklearn.linear_model import LinearRegression

In [31]:
# Linear regression with an intercept term, fitted on the training split.
# fit() returns the estimator itself, so lin_fit is the fitted model.
lin_model = LinearRegression(fit_intercept=True)
lin_fit = lin_model.fit(X_train, Y_train)

In [37]:
# Show the fitted coefficients and the feature names in matching order.
print(list(lin_fit.coef_))
print(list(X_train.columns))

[-8.3420472731319801, 6.4204749407632322, -0.38398267882209591, -0.36581150898446024, 12.559297888732541, 21.266327101720304, -33.825624990452823]
['temperature', 'var1', 'pressure', 'windspeed', 'A', 'B', 'C']


In [33]:
# Sanity check: there should be exactly one coefficient per feature column.
# Each message is built as a single string: a multi-argument print() is a tuple
# print under Python 2, whereas this form prints the same text on 2 and 3.
print("{} {}".format("Number of Coefficients: ", len(lin_fit.coef_)))
print("{} {}".format("Number of columns in X: ", len(X_train.columns)))


('Number of Coefficients: ', 7)
('Number of columns in X: ', 7)


In [56]:
# Map each feature name to its fitted coefficient.
coefs = dict(zip(X_train.columns, lin_fit.coef_))

In [59]:
# Store the fitted intercept alongside the per-feature coefficients.
coefs.update({"Intercept": lin_fit.intercept_})

In [60]:
# Display the full name -> value mapping (feature coefficients plus "Intercept").
coefs

{'A': 12.559297888732541,
 'B': 21.266327101720304,
 'C': -33.825624990452823,
 'Intercept': 729.925044256094,
 'pressure': -0.38398267882209591,
 'temperature': -8.3420472731319801,
 'var1': 6.4204749407632322,
 'windspeed': -0.36581150898446024}

### Model Evaluation 

In [68]:
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [69]:
# In-sample predictions: score the same rows the model was fitted on.
y_train_pred = lin_fit.predict(X=X_train)

In [90]:
# Training-split fit quality: RMSE, and R-squared scaled to a percentage.
train_mse = mean_squared_error(Y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = 100 * r2_score(Y_train, y_train_pred)

In [91]:
# Report the training-split metrics (R-squared is already scaled to percent).
# Each message is built as a single string: a multi-argument print() is a tuple
# print under Python 2, whereas this form prints the same text on 2 and 3.
print("{} {}".format("Train R- squared ", train_r2))
print("{} {}".format("Train RMSE: ", train_rmse))

('Train R- squared ', 20.42365425155317)
('Train RMSE: ', 96.576879176384921)


### Let's evaluate the model on the test set

In [86]:
# Out-of-sample predictions on the held-out split.
y_test_preds = lin_fit.predict(X=X_test)

In [92]:
# Held-out fit quality: RMSE, and R-squared scaled to a percentage.
test_mse = mean_squared_error(Y_test, y_test_preds)
test_rmse = np.sqrt(test_mse)
test_r2 = 100 * r2_score(Y_test, y_test_preds)

In [93]:
# Report the held-out metrics (R-squared is already scaled to percent).
# Each message is built as a single string: a multi-argument print() is a tuple
# print under Python 2, whereas this form prints the same text on 2 and 3.
print("{} {}".format("Test R- squared ", test_r2))
print("{} {}".format("Test RMSE: ", test_rmse))

('Test R- squared ', 18.634259658831077)
('Test RMSE: ', 96.770001404240702)


In [95]:
# Relative drop in R-squared from the training split to the held-out split,
# expressed as a percentage of the training value.
r2_drop = train_r2 - test_r2
r2_drop / train_r2 * 100

8.7613831035453131

In [98]:
# Relative change in RMSE from the training split to the held-out split, as a
# percentage of the training value (negative means the held-out RMSE is higher).
rmse_drop = train_rmse - test_rmse
rmse_drop / train_rmse * 100

-0.19996735191977877

### Prep the Test Set 

In [101]:
# Load the test file to be scored from the working directory.
test_data = pd.read_csv(filepath_or_buffer="test.csv")

In [102]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0,ID,datetime,temperature,var1,pressure,windspeed,var2
0,552,2013-07-24 00:00:00,-10.0,-16.4,1011.0,263.28,A
1,553,2013-07-24 01:00:00,-10.0,-20.7,1011.0,267.175,A
2,554,2013-07-24 02:00:00,-10.7,-17.1,1003.0,269.555,A
3,555,2013-07-24 03:00:00,-13.6,-20.7,1008.0,273.06,A
4,556,2013-07-24 04:00:00,-10.7,-17.1,1006.0,1.765,A


In [103]:
# One-hot encode var2 for the test rows, then align the result with the
# indicator columns produced for the training data (var2_one_hot).
# Encoding the test file on its own only yields columns for categories that
# occur in it, so a category missing from test.csv would make the later
# [keep_vars] selection raise KeyError. Reindexing adds any missing indicator
# as an all-zero column, drops categories the model never saw, and fixes the
# column order.
test_var2_onehot = pd.get_dummies(test_data.var2).reindex(
    columns=var2_one_hot.columns, fill_value=0
)

In [110]:
# Build the test feature matrix the same way as for training: replace var2 with
# its indicator columns, then keep only the model inputs in keep_vars order.
test_with_dummies = test_data.drop("var2", axis=1).join(test_var2_onehot)
test_data_edit = test_with_dummies[keep_vars]

In [111]:
# Check that only the model's input columns remain, in keep_vars order.
test_data_edit.columns

Index([u'temperature', u'var1', u'pressure', u'windspeed', u'A', u'B', u'C'], dtype='object')

In [112]:
# Predict electricity consumption for every row of the test file.
scores = lin_fit.predict(X=test_data_edit)


In [120]:
# Inspect what kind of container predict() returned.
type(scores)

numpy.ndarray

### Preparing to deliver scores 

In [116]:
# Load the sample submission to see the expected output layout.
submission_file = pd.read_csv(filepath_or_buffer="sample_submission_q0Q3I1Z.csv")

In [118]:
# Preview the sample submission's columns.
submission_file.head(n=5)

Unnamed: 0,ID,electricity_consumption


In [129]:
# Pair each test row's ID with its predicted consumption.
# The column order is given explicitly so the output layout (ID first, then
# electricity_consumption) does not depend on how the dict keys are ordered.
submission = pd.DataFrame(
    {"ID": test_data.ID, "electricity_consumption": scores},
    columns=["ID", "electricity_consumption"]
)

In [130]:
submission.head()

Unnamed: 0,ID,electricity_consumption
0,552,236.091683
1,553,207.058805
2,554,238.213178
3,555,236.089323
4,556,335.021894


In [134]:
# Write the predictions to disk without the row index.
submission.to_csv(path_or_buf="LinearReg_predictions.csv", index=False)