In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
fig = plt.figure(figsize=(18,6), dpi=1600) 

<matplotlib.figure.Figure at 0x780dd600c950>

In [3]:
data_raw = pd.read_csv("train.csv")

In [4]:
data_raw.head()

Unnamed: 0,ID,datetime,temperature,var1,pressure,windspeed,var2,electricity_consumption
0,0,2013-07-01 00:00:00,-11.4,-17.1,1003.0,571.91,A,216.0
1,1,2013-07-01 01:00:00,-12.1,-19.3,996.0,575.04,A,210.0
2,2,2013-07-01 02:00:00,-12.9,-20.0,1000.0,578.435,A,225.0
3,3,2013-07-01 03:00:00,-11.4,-17.1,995.0,582.58,A,216.0
4,4,2013-07-01 04:00:00,-11.4,-19.3,1005.0,586.6,A,222.0


### Let's one hot encode var2

In [5]:
np.unique(data_raw.var2)

array(['A', 'B', 'C'], dtype=object)

In [6]:
var2_one_hot = pd.get_dummies(data_raw.var2)
print("Var2 shape:", var2_one_hot.shape)

('Var2 shape:', (26496, 3))


In [7]:
var2_one_hot.sum()

A    25239
B      217
C     1040
dtype: int64

### Let's add the month as a 1 hot variable 

In [8]:
data_raw["month"] = pd.DatetimeIndex(data_raw.datetime).month
month_dummies = pd.get_dummies(data_raw["month"], prefix = "month")

### Combine the features with the original data 

In [9]:
data_edit = data_raw.drop(["var2"], axis = 1)
data_edit = data_edit.join(var2_one_hot)
data_edit = data_edit.join(month_dummies)

In [10]:
[item for item in data_edit.columns.tolist()]

['ID',
 'datetime',
 'temperature',
 'var1',
 'pressure',
 'windspeed',
 'electricity_consumption',
 'month',
 'A',
 'B',
 'C',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12']

## Let's begin the ML Process

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
keep_vars = ["temperature", "var1", "pressure", "windspeed", "A", "B", "C",
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12']

In [13]:
X = data_edit[keep_vars]
Y = data_edit["electricity_consumption"]

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 1234, shuffle = True)
# Do you want to have shuffle = False?

## Begin Using Algorithms 

Random Forest Grid Search 

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [16]:
np.random.seed(1234)
param_dist = {"n_estimators": [10, 100, 200, 400], 
              "max_depth": [3, 6, None],
              "min_samples_split": [10, 50, 100],
              "min_samples_leaf": [3,7,11], 
              "max_leaf_nodes": [2,4, None],
              "max_features": [None, 'auto', 'sqrt', 3]
              }
n_iter_search = 30

In [19]:
rf1_random_search = RandomizedSearchCV(RandomForestRegressor(), param_distributions=param_dist,
                                   n_iter=n_iter_search, cv = 3)

In [20]:
rf1_random_search.fit(X_train, Y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'n_estimators': [10, 100, 200, 400], 'max_features': [None, 'auto', 'sqrt', 3], 'max_depth': [3, 6, None], 'min_samples_leaf': [3, 7, 11]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [21]:
rf1_random_search.cv_results_.keys()

['std_train_score',
 'rank_test_score',
 'mean_train_score',
 'param_min_samples_leaf',
 'param_max_depth',
 'std_test_score',
 'split1_train_score',
 'split2_train_score',
 'split0_test_score',
 'mean_test_score',
 'split0_train_score',
 'param_max_features',
 'std_score_time',
 'params',
 'std_fit_time',
 'split2_test_score',
 'param_n_estimators',
 'mean_score_time',
 'mean_fit_time',
 'split1_test_score']

In [22]:
wanted_grid_params = ["mean_train_score" ,"mean_test_score", "param_n_estimators", "param_max_depth", 
                      "param_max_features", "param_min_samples_leaf", "rank_test_score"]
# Which params are in both lists 
set(wanted_grid_params) & set(rf1_random_search.cv_results_.keys())

{'mean_test_score',
 'mean_train_score',
 'param_max_depth',
 'param_max_features',
 'param_min_samples_leaf',
 'param_n_estimators',
 'rank_test_score'}

In [32]:
dict_temp = {x:rf1_random_search.cv_results_[x] for x in wanted_grid_params}
search_results = pd.DataFrame(dict_temp)
search_results["test_train_pct_diff"] = (search_results.mean_train_score - search_results.mean_test_score)/ search_results.mean_test_score
search_results[search_results.test_train_pct_diff < 0.10]\
.sort_values(by=["mean_test_score"], ascending = False).head(15)

Unnamed: 0,mean_test_score,mean_train_score,param_max_depth,param_max_features,param_min_samples_leaf,param_n_estimators,rank_test_score,test_train_pct_diff
2,0.375859,0.410918,6,auto,3,400,10,0.093275
10,0.375357,0.406961,6,auto,7,200,11,0.084197
12,0.374826,0.409758,6,,3,400,12,0.093195
11,0.374639,0.404017,6,,11,200,13,0.078418
27,0.371362,0.40476,6,,3,10,14,0.089934
25,0.296953,0.315054,6,sqrt,7,200,15,0.060953
14,0.295187,0.313507,6,sqrt,7,400,16,0.062064
24,0.271469,0.288213,6,3,7,200,17,0.061679
16,0.269781,0.287887,6,3,3,100,18,0.067115
3,0.261609,0.279583,6,3,7,10,19,0.068708


In [24]:
print("Best Estimator Score according to sklearn", rf1_random_search.best_score_)
#rf_best = rf1_random_search.best_estimator_
#rf_best

('Best Estimator Score according to sklearn', 0.46005379603111657)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=11, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### Refit the model

In [36]:
rf_model_fit = RandomForestRegressor(max_depth=6, max_features='auto', min_samples_leaf=3, n_estimators=400).fit(X_train, Y_train)

In [40]:
feat_imp = {feat:imp for (feat, imp) in zip(X_train.columns, rf_model_fit.feature_importances_)}

In [43]:
feat_imp

{'A': 0.00018917323931360878,
 'B': 0.00014788633020117674,
 'C': 9.4435652757077011e-06,
 'month_1': 8.7580417223566919e-05,
 'month_10': 0.00073958039324666601,
 'month_11': 0.00014461105154081892,
 'month_12': 0.00085924056971556297,
 'month_2': 0.00027764525527672313,
 'month_3': 0.00018915964942140565,
 'month_4': 0.044971437253506809,
 'month_5': 0.017351285324855106,
 'month_6': 0.0058910104901041573,
 'month_7': 0.019155369126155092,
 'month_8': 0.04133936779536896,
 'month_9': 0.061829943336394549,
 'pressure': 0.011444320811342105,
 'temperature': 0.28173865789882546,
 'var1': 0.3694470444871033,
 'windspeed': 0.14418724300512906}

### Model Evaluation 

In [44]:
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [45]:
y_train_pred = rf_model_fit.predict(X_train)

In [46]:
train_r2 = r2_score(Y_train, y_train_pred) * 100
train_rmse =  np.sqrt(mean_squared_error(Y_train, y_train_pred))

In [47]:
print("Train R- squared ", train_r2)  
print("Train RMSE: ", train_rmse)

('Train R- squared ', 39.890458828466436)
('Train RMSE: ', 83.936890772288706)


### Let's evaluate the model on the train set

In [48]:
y_test_preds = rf_model_fit.predict(X_test)

In [49]:
test_r2 =  r2_score(Y_test, y_test_preds) * 100
test_rmse = np.sqrt(mean_squared_error(Y_test, y_test_preds))

In [50]:
print("Test R- squared ", test_r2)  
print("Test RMSE: ", test_rmse)

('Test R- squared ', 36.699642619497361)
('Test RMSE: ', 85.353834827459565)


In [51]:
# Percentage difference in R2 
(train_r2 - test_r2)/train_r2 * 100

7.9989458699634213

In [52]:
# Percentage difference RMSE 
(train_rmse - test_rmse)/train_rmse * 100

-1.6881064358398357

### Prep the Test Set 

In [59]:
# Import Test set 
test_data = pd.read_csv("test.csv")

In [60]:
test_data.head()

Unnamed: 0,ID,datetime,temperature,var1,pressure,windspeed,var2
0,552,2013-07-24 00:00:00,-10.0,-16.4,1011.0,263.28,A
1,553,2013-07-24 01:00:00,-10.0,-20.7,1011.0,267.175,A
2,554,2013-07-24 02:00:00,-10.7,-17.1,1003.0,269.555,A
3,555,2013-07-24 03:00:00,-13.6,-20.7,1008.0,273.06,A
4,556,2013-07-24 04:00:00,-10.7,-17.1,1006.0,1.765,A


In [61]:
test_var2_onehot = pd.get_dummies(test_data.var2)
test_month_onehot = pd.get_dummies(pd.DatetimeIndex(test_data.datetime).month, prefix = "month")

In [62]:
test_data_edit = test_data.drop(["var2"], axis = 1)
test_data_edit = test_data_edit.join(test_var2_onehot)
test_data_edit = test_data_edit.join(test_month_onehot)
test_data_edit = test_data_edit[keep_vars]

In [63]:
test_data_edit.columns

Index([u'temperature', u'var1', u'pressure', u'windspeed', u'A', u'B', u'C',
       u'month_1', u'month_2', u'month_3', u'month_4', u'month_5', u'month_6',
       u'month_7', u'month_8', u'month_9', u'month_10', u'month_11',
       u'month_12'],
      dtype='object')

In [64]:
scores = rf_model_fit.predict(test_data_edit)


### Preparing to deliver scores 

In [65]:
submission_file = pd.read_csv("sample_submission_q0Q3I1Z.csv")

In [66]:
submission_file.head()

Unnamed: 0,ID,electricity_consumption


In [67]:
submission = pd.DataFrame({"ID" : test_data.ID, "electricity_consumption" : scores})
#submission.columns = submission_file.columns

In [68]:
submission.head()

Unnamed: 0,ID,electricity_consumption
0,552,204.446082
1,553,204.541955
2,554,204.576544
3,555,204.682612
4,556,269.365624


In [69]:
submission.to_csv("Random Forest predictions.csv", index = False)