# Proširenje

In [6]:
# Imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [7]:
# Loading data
df_assessments = pd.read_csv('assessments.csv')
df_courses = pd.read_csv('courses.csv')
df_studentAssessment = pd.read_csv('studentAssessment.csv')
df_studentInfo = pd.read_csv('studentInfo.csv')
df_studentRegistration = pd.read_csv('studentRegistration.csv')
df_studentVle = pd.read_csv('studentVle.csv')
df_vle = pd.read_csv('vle.csv')

# Dropout prediction

## Feature extraction & Preprocessing

In [3]:
df_dropout = df_studentInfo[['code_module', 'code_presentation', 'id_student', 'final_result']]
df_dropout['final_result'].replace({'Pass':0, 'Fail':0, 'Distinction':0, 'Withdrawn':1}, inplace=True)
# DF sa svim student-predmet kombinacijama i njihov final result; 1->dropout, 0->not dropout
df_dropout

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


Unnamed: 0,code_module,code_presentation,id_student,final_result
0,AAA,2013J,11391,0
1,AAA,2013J,28400,0
2,AAA,2013J,30268,1
3,AAA,2013J,31604,0
4,AAA,2013J,32885,0
...,...,...,...,...
32588,GGG,2014J,2640965,0
32589,GGG,2014J,2645731,0
32590,GGG,2014J,2648187,0
32591,GGG,2014J,2679821,1


In [14]:
print("Dropout rate is "+str(round(len(df_dropout[df_dropout['final_result'] == 1])/len(df_dropout), 5)))
print("Our model needs to have accuracy > "+str(1-0.3116))

Dropout rate is 0.3116
Our model needs to have accuracy > 0.6884


In [15]:
df_stud_demographic = pd.get_dummies(df_studentInfo, columns=['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability'], drop_first=True)
df_stud_demographic['final_result'].replace({'Pass':0, 'Fail':0, 'Distinction':0, 'Withdrawn':1}, inplace=True)
# DF sa demografskin znacajkama studenata
df_stud_demographic

Unnamed: 0,code_module,code_presentation,id_student,num_of_prev_attempts,studied_credits,final_result,gender_M,region_East Midlands Region,region_Ireland,region_London Region,...,imd_band_30-40%,imd_band_40-50%,imd_band_50-60%,imd_band_60-70%,imd_band_70-80%,imd_band_80-90%,imd_band_90-100%,age_band_35-55,age_band_55<=,disability_Y
0,AAA,2013J,11391,0,240,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,AAA,2013J,28400,0,60,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,AAA,2013J,30268,0,60,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
3,AAA,2013J,31604,0,60,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,AAA,2013J,32885,0,60,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32588,GGG,2014J,2640965,0,30,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32589,GGG,2014J,2645731,0,30,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
32590,GGG,2014J,2648187,0,30,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
32591,GGG,2014J,2679821,0,30,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [44]:
df_stud_Assessment = df_studentAssessment.merge(df_assessments.drop(['date', 'weight'], axis=1), how='left', on=['id_assessment'])
df_stud_Assessment = df_stud_Assessment[df_stud_Assessment['assessment_type'] != 'Exam']
df_stud_Assessment = pd.get_dummies(df_stud_Assessment, columns=['assessment_type'], drop_first=True).drop(['is_banked'], axis=1)
df_stud_Assessment = df_stud_Assessment.drop(['id_assessment', 'date_submitted'], axis=1).groupby(['id_student', 'code_module', 'code_presentation', 'assessment_type_TMA']).mean().reset_index()
df_stud_Assessment = df_stud_Assessment.set_index(['code_module', 'code_presentation', 'id_student', 'assessment_type_TMA']).unstack(['assessment_type_TMA'])
df_stud_Assessment.columns = ['{}_{}'.format(t, v) for v, t in df_stud_Assessment.columns]
df_stud_Assessment = df_stud_Assessment.reset_index()
df_stud_Assessment = df_stud_Assessment.fillna(0)
df_stud_Assessment = df_stud_Assessment.merge(df_dropout, how='left', on=['code_module', 'code_presentation', 'id_student'])
# DF sa avg score svakog studenta za dvije razlicite vrse assessmenta
#1_score -> TMA score
#0_score -> CMA score
df_stud_Assessment

Unnamed: 0,code_module,code_presentation,id_student,0_score,1_score,final_result
0,AAA,2013J,11391,0.000000,82.000000,0
1,AAA,2013J,28400,0.000000,66.400000,0
2,AAA,2013J,31604,0.000000,76.000000,0
3,AAA,2013J,32885,0.000000,54.400000,0
4,AAA,2013J,38053,0.000000,68.000000,0
...,...,...,...,...,...,...
25834,GGG,2014J,2620947,93.333333,80.000000,0
25835,GGG,2014J,2645731,93.333333,77.666667,0
25836,GGG,2014J,2648187,80.000000,70.000000,0
25837,GGG,2014J,2679821,100.000000,83.000000,1


In [17]:
df_activity_sum = df_studentVle.drop(['date'], axis=1).merge(df_vle.drop(['week_from', 'week_to'], axis=1), how='left', on=['id_site', 'code_module', 'code_presentation'])
df_activity_sum = df_activity_sum.drop(['id_site'], axis=1)
df_activity_sum = df_activity_sum.groupby(['code_module', 'code_presentation', 'id_student', 'activity_type']).sum()
df_activity_sum = df_activity_sum.reset_index()
df_activity_sum = df_activity_sum.set_index(['code_module', 'code_presentation', 'id_student', 'activity_type']).unstack(['activity_type'])
df_activity_sum.columns = ['{}_{}'.format(t, v) for v, t in df_activity_sum.columns]
df_activity_sum = df_activity_sum.reset_index()
df_activity_sum = df_activity_sum.fillna(0)
df_activity_sum = df_activity_sum.merge(df_dropout, how='left', on=['code_module', 'code_presentation', 'id_student'])
# DF sa sumom klikova po aktivnostima za svakog studenta i predmete
df_activity_sum

Unnamed: 0,code_module,code_presentation,id_student,dataplus_sum_click,dualpane_sum_click,externalquiz_sum_click,folder_sum_click,forumng_sum_click,glossary_sum_click,homepage_sum_click,...,ouwiki_sum_click,page_sum_click,questionnaire_sum_click,quiz_sum_click,repeatactivity_sum_click,resource_sum_click,sharedsubpage_sum_click,subpage_sum_click,url_sum_click,final_result
0,AAA,2013J,11391,0.0,0.0,0.0,0.0,193.0,0.0,138.0,...,0.0,0.0,0.0,0.0,0.0,13.0,0.0,32.0,5.0,0
1,AAA,2013J,28400,10.0,0.0,0.0,0.0,417.0,0.0,324.0,...,0.0,0.0,0.0,0.0,0.0,12.0,0.0,87.0,48.0,0
2,AAA,2013J,30268,0.0,0.0,0.0,0.0,126.0,0.0,59.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,22.0,4.0,1
3,AAA,2013J,31604,2.0,0.0,0.0,0.0,634.0,1.0,432.0,...,0.0,0.0,0.0,0.0,0.0,19.0,0.0,144.0,90.0,0
4,AAA,2013J,32885,0.0,0.0,0.0,0.0,194.0,4.0,204.0,...,0.0,0.0,0.0,0.0,0.0,45.0,0.0,79.0,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29223,GGG,2014J,2640965,0.0,0.0,0.0,0.0,0.0,0.0,22.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,9.0,0.0,0
29224,GGG,2014J,2645731,0.0,0.0,0.0,0.0,65.0,5.0,167.0,...,0.0,0.0,0.0,152.0,0.0,109.0,0.0,47.0,0.0,0
29225,GGG,2014J,2648187,0.0,0.0,0.0,0.0,0.0,1.0,63.0,...,0.0,0.0,0.0,130.0,0.0,19.0,0.0,20.0,0.0,0
29226,GGG,2014J,2679821,0.0,0.0,0.0,0.0,118.0,0.0,65.0,...,0.0,0.0,0.0,31.0,0.0,9.0,0.0,12.0,0.0,1


In [18]:
df_activity_num = df_studentVle.drop(['date'], axis=1).merge(df_vle.drop(['week_from', 'week_to'], axis=1), how='left', on=['id_site', 'code_module', 'code_presentation'])
df_activity_num = df_activity_num.drop(['id_site'], axis=1)
df_activity_num = df_activity_num.groupby(['code_module', 'code_presentation', 'id_student', 'activity_type']).count()
df_activity_num = df_activity_num.reset_index()
df_activity_num = df_activity_num.rename(columns={'sum_click':'num_click'})
df_activity_num = df_activity_num.set_index(['code_module', 'code_presentation', 'id_student', 'activity_type']).unstack(['activity_type'])
df_activity_num.columns = ['{}_{}'.format(t, v) for v, t in df_activity_num.columns]
df_activity_num = df_activity_num.reset_index()
df_activity_num = df_activity_num.fillna(0)
df_activity_num = df_activity_num.merge(df_dropout, how='left', on=['code_module', 'code_presentation', 'id_student'])
# DF sa brojem pristupa aktivnostima po aktivnostima za svakog studenta i predmete
df_activity_num

Unnamed: 0,code_module,code_presentation,id_student,dataplus_num_click,dualpane_num_click,externalquiz_num_click,folder_num_click,forumng_num_click,glossary_num_click,homepage_num_click,...,ouwiki_num_click,page_num_click,questionnaire_num_click,quiz_num_click,repeatactivity_num_click,resource_num_click,sharedsubpage_num_click,subpage_num_click,url_num_click,final_result
0,AAA,2013J,11391,0.0,0.0,0.0,0.0,52.0,0.0,40.0,...,0.0,0.0,0.0,0.0,0.0,11.0,0.0,11.0,4.0,0
1,AAA,2013J,28400,3.0,0.0,0.0,0.0,163.0,0.0,80.0,...,0.0,0.0,0.0,0.0,0.0,9.0,0.0,42.0,27.0,0
2,AAA,2013J,30268,0.0,0.0,0.0,0.0,30.0,0.0,12.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,12.0,4.0,1
3,AAA,2013J,31604,1.0,0.0,0.0,0.0,197.0,1.0,121.0,...,0.0,0.0,0.0,0.0,0.0,13.0,0.0,83.0,51.0,0
4,AAA,2013J,32885,0.0,0.0,0.0,0.0,77.0,3.0,68.0,...,0.0,0.0,0.0,0.0,0.0,23.0,0.0,53.0,13.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29223,GGG,2014J,2640965,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,6.0,0.0,0
29224,GGG,2014J,2645731,0.0,0.0,0.0,0.0,14.0,2.0,35.0,...,0.0,0.0,0.0,38.0,0.0,67.0,0.0,20.0,0.0,0
29225,GGG,2014J,2648187,0.0,0.0,0.0,0.0,0.0,1.0,22.0,...,0.0,0.0,0.0,36.0,0.0,14.0,0.0,14.0,0.0,0
29226,GGG,2014J,2679821,0.0,0.0,0.0,0.0,17.0,0.0,13.0,...,0.0,0.0,0.0,5.0,0.0,9.0,0.0,6.0,0.0,1


In [54]:
df_all = df_activity_num.merge(df_activity_sum, how='inner', on=['code_module', 'code_presentation', 'id_student', 'final_result'])
df_all = df_all.merge(df_stud_Assessment, how='inner', on=['code_module', 'code_presentation', 'id_student', 'final_result'])
df_all = df_all.merge(df_stud_demographic, how='inner', on=['code_module', 'code_presentation', 'id_student', 'final_result'])
df_all

Unnamed: 0,code_module,code_presentation,id_student,dataplus_num_click,dualpane_num_click,externalquiz_num_click,folder_num_click,forumng_num_click,glossary_num_click,homepage_num_click,...,imd_band_30-40%,imd_band_40-50%,imd_band_50-60%,imd_band_60-70%,imd_band_70-80%,imd_band_80-90%,imd_band_90-100%,age_band_35-55,age_band_55<=,disability_Y
0,AAA,2013J,11391,0.0,0.0,0.0,0.0,52.0,0.0,40.0,...,0,0,0,0,0,0,1,0,1,0
1,AAA,2013J,28400,3.0,0.0,0.0,0.0,163.0,0.0,80.0,...,0,0,0,0,0,0,0,1,0,0
2,AAA,2013J,31604,1.0,0.0,0.0,0.0,197.0,1.0,121.0,...,0,0,1,0,0,0,0,1,0,0
3,AAA,2013J,32885,0.0,0.0,0.0,0.0,77.0,3.0,68.0,...,0,0,1,0,0,0,0,0,0,0
4,AAA,2013J,38053,1.0,0.0,0.0,0.0,286.0,2.0,142.0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25785,GGG,2014J,2620947,0.0,0.0,0.0,0.0,7.0,1.0,35.0,...,0,0,0,0,0,1,0,0,0,1
25786,GGG,2014J,2645731,0.0,0.0,0.0,0.0,14.0,2.0,35.0,...,0,1,0,0,0,0,0,1,0,0
25787,GGG,2014J,2648187,0.0,0.0,0.0,0.0,0.0,1.0,22.0,...,0,0,0,0,0,0,0,0,0,1
25788,GGG,2014J,2679821,0.0,0.0,0.0,0.0,17.0,0.0,13.0,...,0,0,0,0,0,0,1,1,0,0


## Implementation

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

# Demographic data predicting dropout

In [29]:
# Mođda povecat n_estimators kod RFC
dem_RFC = []
dem_RFC_minmax = []
dem_RFC_standard = []
dem_GBM = []
dem_GBM_minmax = []
dem_GBM_standard = []
dem_LR = []
dem_LR_minmax = []
dem_LR_standard = []
X = df_stud_demographic[['num_of_prev_attempts','studied_credits','gender_M','region_East Midlands Region','region_Ireland','region_London Region','region_North Region','region_North Western Region','region_Scotland','region_South East Region','region_South Region','region_South West Region','region_Wales','region_West Midlands Region','region_Yorkshire Region','highest_education_HE Qualification','highest_education_Lower Than A Level','highest_education_No Formal quals','highest_education_Post Graduate Qualification','imd_band_10-20','imd_band_20-30%','imd_band_30-40%','imd_band_40-50%','imd_band_50-60%','imd_band_60-70%','imd_band_70-80%','imd_band_80-90%','imd_band_90-100%','age_band_35-55','age_band_55<=','disability_Y']].values
y = df_stud_demographic['final_result'].values

kf = KFold(n_splits=5)
kf.get_n_splits(X)
counter = 0
for train_index, test_index in kf.split(X):
    counter+= 1
    print(str(counter)+" Fold")
    scaler1 = MinMaxScaler()
    scaler2 = StandardScaler()

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_minmax = scaler1.fit_transform(X_train)
    X_test_minmax = scaler1.transform(X_test)
    X_train_standard = scaler2.fit_transform(X_train)
    X_test_standard = scaler2.transform(X_test)
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    clf3.fit(X_train, y_train)
    dem_RFC.append(clf1.score(X_test, y_test))
    dem_GBM.append(clf2.score(X_test, y_test))
    dem_LR.append(clf3.score(X_test, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_minmax, y_train)
    clf2.fit(X_train_minmax, y_train)
    clf3.fit(X_train_minmax, y_train)
    dem_RFC_minmax.append(clf1.score(X_test_minmax, y_test))
    dem_GBM_minmax.append(clf2.score(X_test_minmax, y_test))
    dem_LR_minmax.append(clf3.score(X_test_minmax, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_standard, y_train)
    clf2.fit(X_train_standard, y_train)
    clf3.fit(X_train_standard, y_train)
    dem_RFC_standard.append(clf1.score(X_test_standard, y_test))
    dem_GBM_standard.append(clf2.score(X_test_standard, y_test))
    dem_LR_standard.append(clf3.score(X_test_standard, y_test))

print("Avg 5-fold accuracy of RFC "+str(np.mean(np.array(dem_RFC))))
print("Avg 5-fold accuracy of GBM "+str(np.mean(np.array(dem_GBM))))
print("Avg 5-fold accuracy of LR "+str(np.mean(np.array(dem_LR))))

print("Avg 5-fold accuracy of RFC minmax "+str(np.mean(np.array(dem_RFC_minmax))))
print("Avg 5-fold accuracy of GBM minmax "+str(np.mean(np.array(dem_GBM_minmax))))
print("Avg 5-fold accuracy of LR minmax "+str(np.mean(np.array(dem_LR_minmax))))

print("Avg 5-fold accuracy of RFC standard "+str(np.mean(np.array(dem_RFC_standard))))
print("Avg 5-fold accuracy of GBM standard "+str(np.mean(np.array(dem_GBM_standard))))
print("Avg 5-fold accuracy of LR  standard "+str(np.mean(np.array(dem_LR_standard))))

1 Fold
2 Fold
3 Fold
4 Fold
5 Fold
Avg 5-fold accuracy of RFC 0.634094560893851
Avg 5-fold accuracy of GBM 0.688125963707662
Avg 5-fold accuracy of LR 0.6932193906630515
Avg 5-fold accuracy of RFC minmax 0.6353830691328733
Avg 5-fold accuracy of GBM minmax 0.6881566479666372
Avg 5-fold accuracy of LR minmax 0.6930659740750724
Avg 5-fold accuracy of RFC standard 0.6358433330175005
Avg 5-fold accuracy of GBM standard 0.6880952841555834
Avg 5-fold accuracy of LR  standard 0.6931273143516432


# Assessment score to predict dropout

In [46]:
ase_RFC = []
ase_RFC_minmax = []
ase_RFC_standard = []
ase_GBM = []
ase_GBM_minmax = []
ase_GBM_standard = []
ase_LR = []
ase_LR_minmax = []
ase_LR_standard = []
X = df_stud_Assessment[['0_score', '1_score']].values
y = df_stud_Assessment['final_result'].values

kf = KFold(n_splits=5)
kf.get_n_splits(X)
counter = 0
for train_index, test_index in kf.split(X):
    counter+= 1
    print(str(counter)+" Fold")
    scaler1 = MinMaxScaler()
    scaler2 = StandardScaler()

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_minmax = scaler1.fit_transform(X_train)
    X_test_minmax = scaler1.transform(X_test)
    X_train_standard = scaler2.fit_transform(X_train)
    X_test_standard = scaler2.transform(X_test)
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    clf3.fit(X_train, y_train)
    ase_RFC.append(clf1.score(X_test, y_test))
    ase_GBM.append(clf2.score(X_test, y_test))
    ase_LR.append(clf3.score(X_test, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_minmax, y_train)
    clf2.fit(X_train_minmax, y_train)
    clf3.fit(X_train_minmax, y_train)
    ase_RFC_minmax.append(clf1.score(X_test_minmax, y_test))
    ase_GBM_minmax.append(clf2.score(X_test_minmax, y_test))
    ase_LR_minmax.append(clf3.score(X_test_minmax, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_standard, y_train)
    clf2.fit(X_train_standard, y_train)
    clf3.fit(X_train_standard, y_train)
    ase_RFC_standard.append(clf1.score(X_test_standard, y_test))
    ase_GBM_standard.append(clf2.score(X_test_standard, y_test))
    ase_LR_standard.append(clf3.score(X_test_standard, y_test))

print("Avg 5-fold accuracy of RFC "+str(np.mean(np.array(ase_RFC))))
print("Avg 5-fold accuracy of GBM "+str(np.mean(np.array(ase_GBM))))
print("Avg 5-fold accuracy of LR "+str(np.mean(np.array(ase_LR))))

print("Avg 5-fold accuracy of RFC minmax "+str(np.mean(np.array(ase_RFC_minmax))))
print("Avg 5-fold accuracy of GBM minmax "+str(np.mean(np.array(ase_GBM_minmax))))
print("Avg 5-fold accuracy of LR minmax "+str(np.mean(np.array(ase_LR_minmax))))

print("Avg 5-fold accuracy of RFC standard "+str(np.mean(np.array(ase_RFC_standard))))
print("Avg 5-fold accuracy of GBM standard "+str(np.mean(np.array(ase_GBM_standard))))
print("Avg 5-fold accuracy of LR  standard "+str(np.mean(np.array(ase_LR_standard))))

1 Fold
2 Fold
3 Fold
4 Fold
5 Fold
Avg 5-fold accuracy of RFC 0.7950006321373853
Avg 5-fold accuracy of GBM 0.815357036288281
Avg 5-fold accuracy of LR 0.8266199494170255
Avg 5-fold accuracy of RFC minmax 0.7949618201002911
Avg 5-fold accuracy of GBM minmax 0.8153183291080991
Avg 5-fold accuracy of LR minmax 0.826774755668415
Avg 5-fold accuracy of RFC standard 0.7949619998549978
Avg 5-fold accuracy of GBM standard 0.8153957359786835
Avg 5-fold accuracy of LR  standard 0.8266199494170255


# Activity sum vle predicting dropout

In [51]:
sum_RFC = []
sum_RFC_minmax = []
sum_RFC_standard = []
sum_GBM = []
sum_GBM_minmax = []
sum_GBM_standard = []
sum_LR = []
sum_LR_minmax = []
sum_LR_standard = []
X = df_activity_sum[['dataplus_sum_click','dualpane_sum_click','externalquiz_sum_click','folder_sum_click','forumng_sum_click','glossary_sum_click','homepage_sum_click','htmlactivity_sum_click','oucollaborate_sum_click','oucontent_sum_click','ouelluminate_sum_click','ouwiki_sum_click','page_sum_click','questionnaire_sum_click','quiz_sum_click','repeatactivity_sum_click','resource_sum_click','sharedsubpage_sum_click','subpage_sum_click','url_sum_click',]].values
y = df_activity_sum['final_result'].values

kf = KFold(n_splits=5)
kf.get_n_splits(X)
counter = 0
for train_index, test_index in kf.split(X):
    counter+= 1
    print(str(counter)+" Fold")
    scaler1 = MinMaxScaler()
    scaler2 = StandardScaler()

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_minmax = scaler1.fit_transform(X_train)
    X_test_minmax = scaler1.transform(X_test)
    X_train_standard = scaler2.fit_transform(X_train)
    X_test_standard = scaler2.transform(X_test)
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    clf3.fit(X_train, y_train)
    sum_RFC.append(clf1.score(X_test, y_test))
    sum_GBM.append(clf2.score(X_test, y_test))
    sum_LR.append(clf3.score(X_test, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_minmax, y_train)
    clf2.fit(X_train_minmax, y_train)
    clf3.fit(X_train_minmax, y_train)
    sum_RFC_minmax.append(clf1.score(X_test_minmax, y_test))
    sum_GBM_minmax.append(clf2.score(X_test_minmax, y_test))
    sum_LR_minmax.append(clf3.score(X_test_minmax, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_standard, y_train)
    clf2.fit(X_train_standard, y_train)
    clf3.fit(X_train_standard, y_train)
    sum_RFC_standard.append(clf1.score(X_test_standard, y_test))
    sum_GBM_standard.append(clf2.score(X_test_standard, y_test))
    sum_LR_standard.append(clf3.score(X_test_standard, y_test))

print("Avg 10-fold accuracy of RFC "+str(np.mean(np.array(sum_RFC))))
print("Avg 10-fold accuracy of GBM "+str(np.mean(np.array(sum_GBM))))
print("Avg 10-fold accuracy of LR "+str(np.mean(np.array(sum_LR))))

print("Avg 10-fold accuracy of RFC minmax "+str(np.mean(np.array(sum_RFC_minmax))))
print("Avg 10-fold accuracy of GBM minmax "+str(np.mean(np.array(sum_GBM_minmax))))
print("Avg 10-fold accuracy of LR minmax "+str(np.mean(np.array(sum_LR_minmax))))

print("Avg 10-fold accuracy of RFC standard "+str(np.mean(np.array(sum_RFC_standard))))
print("Avg 10-fold accuracy of GBM standard "+str(np.mean(np.array(sum_GBM_standard))))
print("Avg 10-fold accuracy of LR  standard "+str(np.mean(np.array(sum_LR_standard))))

1 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


2 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


4 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


5 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Avg 10-fold accuracy of RFC 0.7239686308434887
Avg 10-fold accuracy of GBM 0.7266029516647268
Avg 10-fold accuracy of LR 0.7334801917595823
Avg 10-fold accuracy of RFC minmax 0.7237289694107704
Avg 10-fold accuracy of GBM minmax 0.7264661059582609
Avg 10-fold accuracy of LR minmax 0.7561611618657021
Avg 10-fold accuracy of RFC standard 0.7257818423072725
Avg 10-fold accuracy of GBM standard 0.7266029575178365
Avg 10-fold accuracy of LR  standard 0.733719589802361


# Activity count vle predicting dropout

In [52]:
num_RFC = []
num_RFC_minmax = []
num_RFC_standard = []
num_GBM = []
num_GBM_minmax = []
num_GBM_standard = []
num_LR = []
num_LR_minmax = []
num_LR_standard = []
X_num = df_activity_num[['dataplus_num_click','dualpane_num_click','externalquiz_num_click','folder_num_click','forumng_num_click','glossary_num_click','homepage_num_click','htmlactivity_num_click','oucollaborate_num_click','oucontent_num_click','ouelluminate_num_click','ouwiki_num_click','page_num_click','questionnaire_num_click','quiz_num_click','repeatactivity_num_click','resource_num_click','sharedsubpage_num_click','subpage_num_click','url_num_click']].values
y_num = df_activity_num['final_result'].values

kf = KFold(n_splits=5)
kf.get_n_splits(X_num)
counter = 0
for train_index, test_index in kf.split(X_num):
    counter+= 1
    print(str(counter)+" Fold")
    scaler1 = MinMaxScaler()
    scaler2 = StandardScaler()

    X_train, X_test = X_num[train_index], X_num[test_index]
    y_train, y_test = y_num[train_index], y_num[test_index]
    X_train_minmax = scaler1.fit_transform(X_train)
    X_test_minmax = scaler1.transform(X_test)
    X_train_standard = scaler2.fit_transform(X_train)
    X_test_standard = scaler2.transform(X_test)
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    clf3.fit(X_train, y_train)
    num_RFC.append(clf1.score(X_test, y_test))
    num_GBM.append(clf2.score(X_test, y_test))
    num_LR.append(clf3.score(X_test, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_minmax, y_train)
    clf2.fit(X_train_minmax, y_train)
    clf3.fit(X_train_minmax, y_train)
    num_RFC_minmax.append(clf1.score(X_test_minmax, y_test))
    num_GBM_minmax.append(clf2.score(X_test_minmax, y_test))
    num_LR_minmax.append(clf3.score(X_test_minmax, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_standard, y_train)
    clf2.fit(X_train_standard, y_train)
    clf3.fit(X_train_standard, y_train)
    num_RFC_standard.append(clf1.score(X_test_standard, y_test))
    num_GBM_standard.append(clf2.score(X_test_standard, y_test))
    num_LR_standard.append(clf3.score(X_test_standard, y_test))

print("Avg 10-fold accuracy of RFC "+str(np.mean(np.array(num_RFC))))
print("Avg 10-fold accuracy of GBM "+str(np.mean(np.array(num_GBM))))
print("Avg 10-fold accuracy of LR "+str(np.mean(np.array(num_LR))))

print("Avg 10-fold accuracy of RFC minmax "+str(np.mean(np.array(num_RFC_minmax))))
print("Avg 10-fold accuracy of GBM minmax "+str(np.mean(np.array(num_GBM_minmax))))
print("Avg 10-fold accuracy of LR minmax "+str(np.mean(np.array(num_LR_minmax))))

print("Avg 10-fold accuracy of RFC standard "+str(np.mean(np.array(num_RFC_standard))))
print("Avg 10-fold accuracy of GBM standard "+str(np.mean(np.array(num_GBM_standard))))
print("Avg 10-fold accuracy of LR  standard "+str(np.mean(np.array(num_LR_standard))))

1 Fold
2 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3 Fold
4 Fold
5 Fold
Avg 10-fold accuracy of RFC 0.7422367717524241
Avg 10-fold accuracy of GBM 0.7254041528399142
Avg 10-fold accuracy of LR 0.7793236907251916
Avg 10-fold accuracy of RFC minmax 0.7370710453390663
Avg 10-fold accuracy of GBM minmax 0.7257120615325724
Avg 10-fold accuracy of LR minmax 0.7776470030468363
Avg 10-fold accuracy of RFC standard 0.7360104033173084
Avg 10-fold accuracy of GBM standard 0.7252330781475025
Avg 10-fold accuracy of LR  standard 0.779289455886136


# All features predicting dropout

In [56]:
all_RFC = []
all_RFC_minmax = []
all_RFC_standard = []
all_GBM = []
all_GBM_minmax = []
all_GBM_standard = []
all_LR = []
all_LR_minmax = []
all_LR_standard = []
X = df_all[['dataplus_num_click','dualpane_num_click','externalquiz_num_click','folder_num_click','forumng_num_click','glossary_num_click','homepage_num_click','htmlactivity_num_click','oucollaborate_num_click','oucontent_num_click','ouelluminate_num_click','ouwiki_num_click','page_num_click','questionnaire_num_click','quiz_num_click','repeatactivity_num_click','resource_num_click','sharedsubpage_num_click','subpage_num_click','url_num_click','dataplus_sum_click','dualpane_sum_click','externalquiz_sum_click','folder_sum_click','forumng_sum_click','glossary_sum_click','homepage_sum_click','htmlactivity_sum_click','oucollaborate_sum_click','oucontent_sum_click','ouelluminate_sum_click','ouwiki_sum_click','page_sum_click','questionnaire_sum_click','quiz_sum_click','repeatactivity_sum_click','resource_sum_click',
 'sharedsubpage_sum_click','subpage_sum_click','url_sum_click','0_score','1_score','num_of_prev_attempts','studied_credits','gender_M','region_East Midlands Region','region_Ireland','region_London Region','region_North Region','region_North Western Region','region_Scotland','region_South East Region','region_South Region','region_South West Region','region_Wales','region_West Midlands Region','region_Yorkshire Region','highest_education_HE Qualification','highest_education_Lower Than A Level','highest_education_No Formal quals','highest_education_Post Graduate Qualification','imd_band_10-20','imd_band_20-30%','imd_band_30-40%','imd_band_40-50%','imd_band_50-60%','imd_band_60-70%','imd_band_70-80%','imd_band_80-90%','imd_band_90-100%','age_band_35-55','age_band_55<=','disability_Y']].values
y = df_all['final_result'].values

kf = KFold(n_splits=5)
kf.get_n_splits(X)
counter = 0
for train_index, test_index in kf.split(X):
    counter+= 1
    print(str(counter)+" Fold")
    scaler1 = MinMaxScaler()
    scaler2 = StandardScaler()

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_minmax = scaler1.fit_transform(X_train)
    X_test_minmax = scaler1.transform(X_test)
    X_train_standard = scaler2.fit_transform(X_train)
    X_test_standard = scaler2.transform(X_test)
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    clf3.fit(X_train, y_train)
    all_RFC.append(clf1.score(X_test, y_test))
    all_GBM.append(clf2.score(X_test, y_test))
    all_LR.append(clf3.score(X_test, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_minmax, y_train)
    clf2.fit(X_train_minmax, y_train)
    clf3.fit(X_train_minmax, y_train)
    all_RFC_minmax.append(clf1.score(X_test_minmax, y_test))
    all_GBM_minmax.append(clf2.score(X_test_minmax, y_test))
    all_LR_minmax.append(clf3.score(X_test_minmax, y_test))
    
    clf1 = RandomForestClassifier()
    clf2 = GradientBoostingClassifier(n_estimators = 500, learning_rate=0.1, max_depth=3, random_state=1)
    clf3 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train_standard, y_train)
    clf2.fit(X_train_standard, y_train)
    clf3.fit(X_train_standard, y_train)
    all_RFC_standard.append(clf1.score(X_test_standard, y_test))
    all_GBM_standard.append(clf2.score(X_test_standard, y_test))
    all_LR_standard.append(clf3.score(X_test_standard, y_test))

print("Avg 10-fold accuracy of RFC "+str(np.mean(np.array(all_RFC))))
print("Avg 10-fold accuracy of GBM "+str(np.mean(np.array(all_GBM))))
print("Avg 10-fold accuracy of LR "+str(np.mean(np.array(all_LR))))

print("Avg 10-fold accuracy of RFC minmax "+str(np.mean(np.array(all_RFC_minmax))))
print("Avg 10-fold accuracy of GBM minmax "+str(np.mean(np.array(all_GBM_minmax))))
print("Avg 10-fold accuracy of LR minmax "+str(np.mean(np.array(all_LR_minmax))))

print("Avg 10-fold accuracy of RFC standard "+str(np.mean(np.array(all_RFC_standard))))
print("Avg 10-fold accuracy of GBM standard "+str(np.mean(np.array(all_GBM_standard))))
print("Avg 10-fold accuracy of LR  standard "+str(np.mean(np.array(all_LR_standard))))

1 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


2 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


4 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


5 Fold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Avg 10-fold accuracy of RFC 0.8062039550213262
Avg 10-fold accuracy of GBM 0.8076773943388911
Avg 10-fold accuracy of LR 0.8155098875533152
Avg 10-fold accuracy of RFC minmax 0.8061264055835595
Avg 10-fold accuracy of GBM minmax 0.8076386196200078
Avg 10-fold accuracy of LR minmax 0.8172935246219465
Avg 10-fold accuracy of RFC standard 0.7991081814656843
Avg 10-fold accuracy of GBM standard 0.8075610701822413
Avg 10-fold accuracy of LR  standard 0.8149282667700659


## Prediction of success based on demographics and previous education

In [None]:
## imd_band, region, highest_education ----> pomocu decision tree
## ali probaj i logisticka regresija, SVM, NaiveBayes i random forest

## Prediction of success based on gender