In [26]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib as plt

%matplotlib inline

In [2]:
data = pd.read_csv('Data/ml.csv')

In [5]:
data.columns

Index(['id_assessment', 'id_student', 'date_submitted', 'is_banked', 'score',
       'code_module', 'code_presentation', 'gender', 'region',
       'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts',
       'studied_credits', 'disability', 'final_result',
       'module_presentation_length'],
      dtype='object')

In [37]:
data.shape

(207319, 58)

In [7]:
data['id_assessment'] = data['id_assessment'].apply(str)
data['id_student'] = data['id_student'].apply(str)
data['is_banked'] = data['is_banked'].apply(str)
data['code_module'] = data['code_module'].apply(str)
data['code_presentation'] = data['code_presentation'].apply(str)

In [8]:
data_ml = data[['date_submitted', 'is_banked', 'score',
       'code_module', 'code_presentation', 'gender', 'region',
       'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts',
       'studied_credits', 'disability', 'final_result',
       'module_presentation_length']]

In [10]:

to_dummies = ['is_banked','code_module', 'code_presentation', 'gender', 'region',
       'highest_education', 'imd_band', 'age_band', 'disability', 'final_result',]

data = pd.get_dummies(data_ml, columns=to_dummies)

In [14]:
data.columns

Index(['date_submitted', 'score', 'num_of_prev_attempts', 'studied_credits',
       'module_presentation_length', 'is_banked_0', 'is_banked_1',
       'code_module_AAA', 'code_module_BBB', 'code_module_CCC',
       'code_module_DDD', 'code_module_EEE', 'code_module_FFF',
       'code_module_GGG', 'code_presentation_2013B', 'code_presentation_2013J',
       'code_presentation_2014B', 'code_presentation_2014J', 'gender_F',
       'gender_M', 'region_East Anglian Region', 'region_East Midlands Region',
       'region_Ireland', 'region_London Region', 'region_North Region',
       'region_North Western Region', 'region_Scotland',
       'region_South East Region', 'region_South Region',
       'region_South West Region', 'region_Wales',
       'region_West Midlands Region', 'region_Yorkshire Region',
       'highest_education_A Level or Equivalent',
       'highest_education_HE Qualification',
       'highest_education_Lower Than A Level',
       'highest_education_No Formal quals',
      

In [15]:
data_data = data[['date_submitted', 'num_of_prev_attempts', 'studied_credits',
       'module_presentation_length', 'is_banked_0', 'is_banked_1',
       'code_module_AAA', 'code_module_BBB', 'code_module_CCC',
       'code_module_DDD', 'code_module_EEE', 'code_module_FFF',
       'code_module_GGG', 'code_presentation_2013B', 'code_presentation_2013J',
       'code_presentation_2014B', 'code_presentation_2014J', 'gender_F',
       'gender_M', 'region_East Anglian Region', 'region_East Midlands Region',
       'region_Ireland', 'region_London Region', 'region_North Region',
       'region_North Western Region', 'region_Scotland',
       'region_South East Region', 'region_South Region',
       'region_South West Region', 'region_Wales',
       'region_West Midlands Region', 'region_Yorkshire Region',
       'highest_education_A Level or Equivalent',
       'highest_education_HE Qualification',
       'highest_education_Lower Than A Level',
       'highest_education_No Formal quals',
       'highest_education_Post Graduate Qualification', 'imd_band_0-10%',
       'imd_band_10-20', 'imd_band_20-30%', 'imd_band_30-40%',
       'imd_band_40-50%', 'imd_band_50-60%', 'imd_band_60-70%',
       'imd_band_70-80%', 'imd_band_80-90%', 'imd_band_90-100%', 'imd_band_?',
       'age_band_0-35', 'age_band_35-55', 'age_band_55<=', 'disability_N',
       'disability_Y', 'final_result_Distinction', 'final_result_Fail',
       'final_result_Pass', 'final_result_Withdrawn']]

data_target = data["score"]

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(data_data, data_target, test_size=0.2, random_state=42)

In [38]:
#Grid search for parameter selection for a Random Forest Classifier model
param_grid = {
    'n_estimators': [50, 500],
    'max_features': ['auto','sqrt','log2'],
    'max_depth': [25, 15]
}

In [41]:
RFR = RandomForestRegressor(n_jobs=-1)
GS = GridSearchCV(RFR, param_grid, cv=5, verbose = 3)

In [42]:
GS.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] max_depth=25, max_features=auto, n_estimators=50 ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=25, max_features=auto, n_estimators=50, score=0.29908865993874645, total=  22.0s
[CV] max_depth=25, max_features=auto, n_estimators=50 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.1s remaining:    0.0s


[CV]  max_depth=25, max_features=auto, n_estimators=50, score=0.277646969771264, total=  21.4s
[CV] max_depth=25, max_features=auto, n_estimators=50 ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   46.1s remaining:    0.0s


[CV]  max_depth=25, max_features=auto, n_estimators=50, score=0.2968282408239381, total=  29.3s
[CV] max_depth=25, max_features=auto, n_estimators=50 ................
[CV]  max_depth=25, max_features=auto, n_estimators=50, score=0.2825015341788366, total=  22.8s
[CV] max_depth=25, max_features=auto, n_estimators=50 ................
[CV]  max_depth=25, max_features=auto, n_estimators=50, score=0.2941751848318128, total=  20.1s
[CV] max_depth=25, max_features=auto, n_estimators=500 ...............
[CV]  max_depth=25, max_features=auto, n_estimators=500, score=0.3077776836411067, total= 3.4min
[CV] max_depth=25, max_features=auto, n_estimators=500 ...............
[CV]  max_depth=25, max_features=auto, n_estimators=500, score=0.28674408737579915, total= 3.3min
[CV] max_depth=25, max_features=auto, n_estimators=500 ...............
[CV]  max_depth=25, max_features=auto, n_estimators=500, score=0.30594446600354386, total= 3.5min
[CV] max_depth=25, max_features=auto, n_estimators=500 .........

[CV]  max_depth=15, max_features=log2, n_estimators=50, score=0.25539660487187155, total=   3.0s
[CV] max_depth=15, max_features=log2, n_estimators=50 ................
[CV]  max_depth=15, max_features=log2, n_estimators=50, score=0.2636608131697411, total=   3.0s
[CV] max_depth=15, max_features=log2, n_estimators=50 ................
[CV]  max_depth=15, max_features=log2, n_estimators=50, score=0.2538639938654842, total=   2.8s
[CV] max_depth=15, max_features=log2, n_estimators=50 ................
[CV]  max_depth=15, max_features=log2, n_estimators=50, score=0.2553683624189743, total=   2.6s
[CV] max_depth=15, max_features=log2, n_estimators=500 ...............
[CV]  max_depth=15, max_features=log2, n_estimators=500, score=0.25794573727908976, total=  26.7s
[CV] max_depth=15, max_features=log2, n_estimators=500 ...............
[CV]  max_depth=15, max_features=log2, n_estimators=500, score=0.2567981791625332, total=  26.7s
[CV] max_depth=15, max_features=log2, n_estimators=500 ..........

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 52.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [25, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [43]:
GS.best_params_

{'max_depth': 25, 'max_features': 'sqrt', 'n_estimators': 500}

In [49]:
RFR = RandomForestRegressor(max_depth = 25, max_features='auto', n_estimators=500)
RFR.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=25,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [50]:
y_train_pred = RFR.predict(X_train)
y_pred = RFR.predict(X_test)

In [51]:
r2 = r2_score(y_train, y_train_pred)
mae = mean_absolute_error(y_train, y_train_pred)
print ('TRAIN MODEL METRICS:')
print('The R2 score is: ' + str(r2))
print('The MAE score is: ' + str(mae))

TRAIN MODEL METRICS:
The R2 score is: 0.7856626204020406
The MAE score is: 6.62809160199953


In [52]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print ('TEST MODEL METRICS:')
print('The R2 score is: ' + str(r2))
print('The MAE score is: ' + str(mae))

TEST MODEL METRICS:
The R2 score is: 0.30211487767079703
The MAE score is: 11.777427316308438


In [47]:
ks_rf = RandomForestRegressor(n_estimators=10).fit(X_train, y_train)

In [None]:
y_pred_test_rf = ks_rf.predict(X_test)
confusion_matrix(y_test, y_pred)