In [63]:
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sn


In [64]:
dframe = pd.read_csv("datasets_dataset.csv")

## Part 1: Processing

In [65]:
#dropping columns that tell nothing relevant about the grade (majority 0 values)
dframe = dframe.drop("Week1_Stat1",axis=1)
dframe = dframe.drop("Week1_Stat2",axis=1)
dframe = dframe.drop("Week1_Stat3",axis=1)
dframe = dframe.drop("Week2_Stat3",axis=1)
dframe = dframe.drop("Week3_Stat3",axis=1)
dframe = dframe.drop("Week4_Stat3",axis=1)
dframe = dframe.drop("Week5_Stat3",axis=1)
dframe = dframe.drop("Week6_Stat3",axis=1)
dframe = dframe.drop("Week7_Stat3",axis=1)
dframe = dframe.drop("Week8_Stat3",axis=1)
dframe = dframe.drop("Week9_Stat3",axis=1)
dframe = dframe.drop("Week3_PR1",axis=1)
dframe = dframe.drop("Week5_PR2",axis=1)
dframe = dframe.drop("Week7_PR3",axis=1)
dframe = dframe.drop("ID",axis=1)
dframe = dframe.drop("Week8_Stat2",axis=1)


#dropping rows for users that did not really participate (whose grade == 0)
dframe = dframe.drop(dframe[dframe.Grade == 0].index)

dframe

Unnamed: 0,Week2_Quiz1,Week3_MP1,Week5_MP2,Week7_MP3,Week4_Quiz2,Week6_Quiz3,Week8_Total,Week1_Stat0,Week2_Stat0,Week2_Stat1,...,Week6_Stat2,Week7_Stat0,Week7_Stat1,Week7_Stat2,Week8_Stat0,Week8_Stat1,Week9_Stat0,Week9_Stat1,Week9_Stat2,Grade
0,5.0,15.0,16.09,21.88,5.0,5.0,82.97,0,7,30,...,2,2,0,1,5,4,8,6,1,4
1,3.33,15.0,17.83,22.27,4.0,5.0,82.43,8,61,10,...,2,31,9,0,5,2,25,3,2,4
2,1.67,13.0,15.22,27.05,5.0,5.0,79.44,4,19,10,...,2,12,4,0,8,2,9,0,1,3
3,2.5,14.0,10.0,31.02,3.13,5.0,80.65,12,30,10,...,4,29,4,5,10,0,7,6,0,3
4,0.0,15.0,12.17,15.91,4.67,5.0,67.68,6,0,0,...,1,43,4,3,8,5,5,3,1,2
5,3.33,14.0,14.78,15.51,4.67,4.5,71.79,12,35,12,...,1,33,9,0,12,0,7,2,0,3
7,5.0,15.0,20.0,35.0,4.71,5.0,99.71,19,16,10,...,1,47,9,3,21,12,15,0,1,5
8,0.0,13.0,20.0,34.6,2.71,0.0,85.31,0,3,0,...,0,29,4,0,14,0,12,2,0,4
9,5.0,12.0,12.17,24.66,4.67,5.0,78.5,3,11,20,...,4,24,5,2,6,0,23,9,5,3
12,4.17,15.0,19.57,35.0,5.0,5.0,96.24,3,16,10,...,0,0,0,0,0,0,0,0,0,5


## Part 2: Splitting data, training the model

In [66]:
trainx, testx, trainy, testy = train_test_split(dframe.drop(["Grade"],axis=1), dframe.Grade, test_size=0.25)

In [67]:
model = RandomForestClassifier(max_features=len(dframe.columns)-1,n_estimators=10)
model.fit(trainx,trainy)

RandomForestClassifier(max_features=31, n_estimators=10)

In [68]:
model.score(testx,testy)

1.0

In [69]:
ypredict = model.predict(testx)
pd.crosstab(testy,ypredict,rownames=["Actual grade"],colnames=["Predicted grade"])

Predicted grade,2,3,4,5
Actual grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,2,0,0,0
3,0,3,0,0
4,0,0,8,0
5,0,0,0,2


## Part 3: Testing Hyperparameters

In [70]:
tuner = GridSearchCV(RandomForestClassifier(),{"n_estimators":[10,20,30,40,50,60,70,80,90,100],"max_features": ["auto",30]}, cv = 4)
tuner.fit(trainx,trainy)
resultframe = pd.DataFrame(tuner.cv_results_)
resultframe = resultframe.drop(["mean_fit_time","std_fit_time","mean_score_time","std_score_time"],axis=1)
resultframe



Unnamed: 0,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,auto,10,"{'max_features': 'auto', 'n_estimators': 10}",0.636364,0.636364,0.636364,0.818182,0.681818,0.07873,20
1,auto,20,"{'max_features': 'auto', 'n_estimators': 20}",0.727273,0.636364,1.0,0.636364,0.75,0.149033,17
2,auto,30,"{'max_features': 'auto', 'n_estimators': 30}",0.636364,0.818182,0.636364,0.818182,0.727273,0.090909,18
3,auto,40,"{'max_features': 'auto', 'n_estimators': 40}",0.818182,0.636364,0.636364,0.727273,0.704545,0.075378,19
4,auto,50,"{'max_features': 'auto', 'n_estimators': 50}",0.909091,0.727273,0.727273,0.818182,0.795455,0.075378,14
5,auto,60,"{'max_features': 'auto', 'n_estimators': 60}",0.636364,0.727273,1.0,0.909091,0.818182,0.14374,11
6,auto,70,"{'max_features': 'auto', 'n_estimators': 70}",0.727273,0.727273,0.818182,0.909091,0.795455,0.075378,12
7,auto,80,"{'max_features': 'auto', 'n_estimators': 80}",0.545455,0.727273,0.909091,0.909091,0.772727,0.150756,15
8,auto,90,"{'max_features': 'auto', 'n_estimators': 90}",0.636364,0.818182,0.727273,0.909091,0.772727,0.101639,15
9,auto,100,"{'max_features': 'auto', 'n_estimators': 100}",0.818182,0.636364,0.909091,0.818182,0.795455,0.099066,12


In [71]:
features = dframe.columns[:31]
#features

In [72]:
list(zip(trainx[features],model.feature_importances_))

[('Week2_Quiz1', 0.0),
 ('Week3_MP1', 0.0),
 ('Week5_MP2', 0.0),
 ('Week7_MP3', 0.037285704973600155),
 ('Week4_Quiz2', 0.0),
 ('Week6_Quiz3', 0.0),
 ('Week8_Total', 0.9232301791786426),
 ('Week1_Stat0', 0.0),
 ('Week2_Stat0', 0.0),
 ('Week2_Stat1', 0.0),
 ('Week2_Stat2', 0.0),
 ('Week3_Stat0', 0.0),
 ('Week3_Stat1', 0.0),
 ('Week3_Stat2', 0.0),
 ('Week4_Stat0', 0.020835447319465582),
 ('Week4_Stat1', 0.0),
 ('Week4_Stat2', 0.006482769910780063),
 ('Week5_Stat0', 0.0),
 ('Week5_Stat1', 0.012165898617511522),
 ('Week5_Stat2', 0.0),
 ('Week6_Stat0', 0.0),
 ('Week6_Stat1', 0.0),
 ('Week6_Stat2', 0.0),
 ('Week7_Stat0', 0.0),
 ('Week7_Stat1', 0.0),
 ('Week7_Stat2', 0.0),
 ('Week8_Stat0', 0.0),
 ('Week8_Stat1', 0.0),
 ('Week9_Stat0', 0.0),
 ('Week9_Stat1', 0.0),
 ('Week9_Stat2', 0.0)]

In [76]:
#running a new model - same dataframe except removed some least important features
#df2 = dframe.drop(["Week7_Stat1","Week7_Stat2","Week3_Stat1","Week3_Stat2","Week6_Stat0","Week6_Stat1","Week6_Quiz3"],axis=1)
df2 = dframe[["Week3_MP1","Week8_Total","Week2_Stat1","Week5_Stat1","Week7_Stat2","Grade"]].copy()
trX, teX, trY, teY = train_test_split(df2.drop(["Grade"],axis=1), df2.Grade, test_size=0.25)
model2 = RandomForestClassifier(n_estimators=10,max_features=len(df2.columns)-1)
model2.fit(trX,trY)
model2.score(teX,teY)

1.0

In [77]:
ypr = model2.predict(teX)
pd.crosstab(teY,ypr,rownames=["Actual grade"],colnames=["Predicted grade"])

Predicted grade,3,4,5
Actual grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,8,0,0
4,0,4,0
5,0,0,3


## Part 4: Evaluating model performance

In [78]:
modelscore = 0
model2score = 0
for i in range(1000):
    trainx, testx, trainy, testy = train_test_split(dframe.drop(["Grade"],axis=1), dframe.Grade, test_size=0.25)
    model = RandomForestClassifier(max_features=len(dframe.columns)-1,n_estimators=10)
    model.fit(trainx,trainy)
    modelscore += model.score(testx,testy)
    trX, teX, trY, teY = train_test_split(df2.drop(["Grade"],axis=1), df2.Grade, test_size=0.25)
    model2 = RandomForestClassifier(n_estimators=10,max_features=len(df2.columns)-1)
    model2.fit(teX,teY)
    model2score += model2.score(teX,teY)
modelscore /= 1000
model2score /= 1000
print(modelscore)
print(model2score)

0.9482666666666608
0.9869333333333287


# Part 5: New model without Week8_Total

In [82]:
df3 = dframe.drop("Week8_Total",axis=1)
model3score = 0
for i in range(1000):
    
    trainx, testx, trainy, testy = train_test_split(dframe.drop(["Grade"],axis=1), dframe.Grade, test_size=0.25)
    model3 = RandomForestClassifier(max_features=len(dframe.columns)-1,n_estimators=10)
    model3.fit(trainx,trainy)
    model3score += model3.score(testx,testy)
model3score /= 1000
print(model3score)

0.9547333333333262


In [81]:
#ypredict = model.predict(testx)
#pd.crosstab(testy,ypredict,rownames=["Actual grade"],colnames=["Predicted grade"])
features = df3.columns[:len(df3.columns)-1]
list(zip(trainx[features],model3.feature_importances_))

[('Week2_Quiz1', 0.0),
 ('Week3_MP1', 0.05107132948229048),
 ('Week5_MP2', 0.0),
 ('Week7_MP3', 0.0),
 ('Week4_Quiz2', 0.0),
 ('Week6_Quiz3', 0.0),
 ('Week1_Stat0', 0.9310424916559209),
 ('Week2_Stat0', 0.0),
 ('Week2_Stat1', 0.017886178861788622),
 ('Week2_Stat2', 0.0),
 ('Week3_Stat0', 0.0),
 ('Week3_Stat1', 0.0),
 ('Week3_Stat2', 0.0),
 ('Week4_Stat0', 0.0),
 ('Week4_Stat1', 0.0),
 ('Week4_Stat2', 0.0),
 ('Week5_Stat0', 0.0),
 ('Week5_Stat1', 0.0),
 ('Week5_Stat2', 0.0),
 ('Week6_Stat0', 0.0),
 ('Week6_Stat1', 0.0),
 ('Week6_Stat2', 0.0),
 ('Week7_Stat0', 0.0),
 ('Week7_Stat1', 0.0),
 ('Week7_Stat2', 0.0),
 ('Week8_Stat0', 0.0),
 ('Week8_Stat1', 0.0),
 ('Week9_Stat0', 0.0),
 ('Week9_Stat1', 0.0),
 ('Week9_Stat2', 0.0)]

In [85]:
ypredict = model3.predict(testx)
pd.crosstab(testy,ypredict,rownames=["Actual grade"],colnames=["Predicted grade"])


Predicted grade,2,3,4,5
Actual grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1,0,0,0
3,0,3,0,0
4,0,0,6,0
5,0,0,0,5
