In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
fig = plt.figure(figsize=(18,6), dpi=1600) 

<matplotlib.figure.Figure at 0x163f30b5128>

In [2]:
data_raw = pd.read_csv("train.csv")

In [3]:
data_raw.head()

Unnamed: 0,ID,datetime,temperature,var1,pressure,windspeed,var2,electricity_consumption
0,0,2013-07-01 00:00:00,-11.4,-17.1,1003.0,571.91,A,216.0
1,1,2013-07-01 01:00:00,-12.1,-19.3,996.0,575.04,A,210.0
2,2,2013-07-01 02:00:00,-12.9,-20.0,1000.0,578.435,A,225.0
3,3,2013-07-01 03:00:00,-11.4,-17.1,995.0,582.58,A,216.0
4,4,2013-07-01 04:00:00,-11.4,-19.3,1005.0,586.6,A,222.0


### Let's one hot encode var2

In [4]:
np.unique(data_raw.var2)

array(['A', 'B', 'C'], dtype=object)

In [5]:
var2_one_hot = pd.get_dummies(data_raw.var2)
print("Var2 shape:", var2_one_hot.shape)

Var2 shape: (26496, 3)


In [6]:
var2_one_hot.sum()

A    25239
B      217
C     1040
dtype: int64

### Let's add the month as a 1 hot variable 

In [7]:
data_raw["month"] = pd.DatetimeIndex(data_raw.datetime).month
month_dummies = pd.get_dummies(data_raw["month"], prefix = "month")

### Combine the features with the original data 

In [8]:
data_edit = data_raw.drop(["var2"], axis = 1)
data_edit = data_edit.join(var2_one_hot)
data_edit = data_edit.join(month_dummies)

In [9]:
[item for item in data_edit.columns.tolist()]

['ID',
 'datetime',
 'temperature',
 'var1',
 'pressure',
 'windspeed',
 'electricity_consumption',
 'month',
 'A',
 'B',
 'C',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12']

## Let's begin the ML Process

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
keep_vars = ["temperature", "var1", "pressure", "windspeed", "A", "B", "C",
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12']

In [12]:
X = data_edit[keep_vars]
Y = data_edit["electricity_consumption"]

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 1234) # , shuffle = True)
# Do you want to have shuffle = False?

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
lin_fit = LinearRegression(fit_intercept=True).fit(X_train, Y_train)

In [16]:
print([coef for coef in lin_fit.coef_])
print([col_name for col_name in X_train.columns])

[-1.237632373184975, 12.659679524434534, -0.53996574082891147, -0.22888507846880479, 27.709023060079062, -15.49803272600775, -12.210990334071155, -174.90942425541093, -179.70152113267787, -129.97369542442181, -15.335905565735761, 59.639049373589614, 147.02021837317719, 183.38933591912016, 166.9908237384239, 108.37316914438844, 20.117559345953477, -57.720718946209772, -127.88889057019624]
['temperature', 'var1', 'pressure', 'windspeed', 'A', 'B', 'C', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']


In [17]:
print("Number of Coefficients: " ,len(lin_fit.coef_))
print("Number of columns in X: ", len(X_train.columns))


Number of Coefficients:  19
Number of columns in X:  19


In [18]:
coefs = {col_name: coef for (col_name, coef) in zip(X_train.columns, lin_fit.coef_)}

In [19]:
coefs["Intercept"] = lin_fit.intercept_

In [20]:
coefs

{'A': 27.709023060079062,
 'B': -15.49803272600775,
 'C': -12.210990334071155,
 'Intercept': 841.18238247785325,
 'month_1': -174.90942425541093,
 'month_10': 20.117559345953477,
 'month_11': -57.720718946209772,
 'month_12': -127.88889057019624,
 'month_2': -179.70152113267787,
 'month_3': -129.97369542442181,
 'month_4': -15.335905565735761,
 'month_5': 59.639049373589614,
 'month_6': 147.02021837317719,
 'month_7': 183.38933591912016,
 'month_8': 166.9908237384239,
 'month_9': 108.37316914438844,
 'pressure': -0.53996574082891147,
 'temperature': -1.237632373184975,
 'var1': 12.659679524434534,
 'windspeed': -0.22888507846880479}

### Model Evaluation 

In [21]:
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [22]:
y_train_pred = lin_fit.predict(X)

In [23]:
# Export RF scores to csv 
y_train_pred_pd = pd.DataFrame({"ID": X.index, "electricity_consumption":y_train_pred})
y_train_pred_pd.to_csv("Linear Reg Train Sample Scores.csv", index = False)

In [24]:
train_r2 = r2_score(Y, y_train_pred) * 100
train_rmse =  np.sqrt(mean_squared_error(Y, y_train_pred))

In [25]:
print("Train R- squared ", train_r2)  
print("Train RMSE: ", train_rmse)

Train R- squared  36.0023973755
Train RMSE:  86.4131949844


### Let's evaluate the model on the train set

In [26]:
y_test_preds = lin_fit.predict(X_test)

In [27]:
test_r2 =  r2_score(Y_test, y_test_preds) * 100
test_rmse = np.sqrt(mean_squared_error(Y_test, y_test_preds))

In [28]:
print("Test R- squared ", test_r2)  
print("Test RMSE: ", test_rmse)

Test R- squared  35.8989015791
Test RMSE:  85.8919955485


In [29]:
# Percentage difference in R2 
(train_r2 - test_r2)/train_r2 * 100

0.287469179545354

In [30]:
# Percentage difference RMSE 
(train_rmse - test_rmse)/train_rmse * 100

0.60314797526560082

### Begin using LIME 

In [33]:
import lime
import lime.lime_tabular

In [None]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train)