# MOOC_Student_Drop_Rate_Prediction

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
import pickle

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math

# Loading the data
data = pd.read_csv('MOOC_Visual.csv', parse_dates=['startdate', 'enddate'])
# Removing duplicate rows
duplicate_index = data[data.drop(['enrollment_id', 'startdate', 'enddate'], axis=1).duplicated()].index
data = data.drop(duplicate_index)
# Removing Outliers
data = data[data['access']<700]
data = data[data['discussion']<1000]
data = data[data['navigate']<200]
data = data[data['page_close']<250]
data = data[data['problem']<750]
data = data[data['video']<250]
data = data[data['wiki']<120]
data = data[data['effective_time']<255]
# Droping independent features
data.drop(['page_close', 'video', 'proccess_period'], axis=1, inplace=True)
# Extracting extra feature from Start_Date and End_Date
duration_in_days = (data['enddate'] - data['startdate']).dt.days + 1
data.insert(8,"duration_in_days", duration_in_days)
# Splitting the data using train_test_split
train, test = train_test_split(data.iloc[:, 3:], test_size=0.3, random_state=0)
X_test = test.drop(['dropout_prob'], axis=1)
y_test = test['dropout_prob']
# Upsampling data i.e., Minor to Major
dropout_minor = train[train.dropout_prob==0]
dropout_major = train[train.dropout_prob==1]
dropout_upsampled = resample(dropout_minor,
                          replace=True, # sample with replacement
                          n_samples=len(dropout_major), # match number in majority class
                          random_state=27) # reproducible results
# combine majority and upsampled minority
upsampled = pd.concat([dropout_major, dropout_upsampled])
y_train = upsampled.dropout_prob
X_train = upsampled.drop(['dropout_prob'], axis=1)
X_train = X_train[['duration_in_days', 'access', 'discussion', 'navigate', 'problem', 'wiki', 'present_days', 'effective_time', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'holidays', 'course_enroll', 'user_enroll', 'course_drop_rate']]

# Predicting result with RandomForestClassifier
classifier = RandomForestClassifier(criterion = 'entropy', random_state = 10)
classifier.fit(X_train, y_train)
print("Training Score : ", classifier.score(X_train, y_train))
print("Testing Score : ", classifier.score(X_test, y_test))

# Generating Pickle file
pickle.dump(classifier, open('pkl_rfc_mim.pkl', 'wb'))

In [34]:
X_train.head()

Unnamed: 0,duration_in_days,access,discussion,navigate,problem,wiki,present_days,effective_time,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,holidays,course_enroll,user_enroll,course_drop_rate
54331,1,0,0,1,0,0,1,0.0,0,0,0,0,1,0,0,0,761,4,0.875546
40981,6,74,14,12,22,2,4,8.622222,1,1,1,0,1,0,0,0,1481,2,0.823991
22866,2,7,51,8,0,0,2,1.472778,0,0,1,1,0,0,0,0,2796,3,0.666866
57590,7,5,0,4,0,0,3,0.025278,1,0,1,0,0,0,1,0,2981,6,0.799671
51290,1,4,0,5,0,1,1,0.664444,0,1,0,0,0,0,0,0,4896,4,0.829657


In [35]:
y_train.head()

54331    1
40981    1
22866    1
57590    1
51290    1
Name: dropout_prob, dtype: int64

##### ==================================== ROUGH ====================================

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV  # using for "DecisionTreeClassifier" and "RandomForestClassifier"
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc, classification_report, confusion_matrix

In [28]:
tuned_parameters = {'min_samples_leaf': range(10,100,5), 
                    'n_estimators' : range(1,50,10),
                    'max_features':['auto','sqrt','log2']
                    }
RFC = RandomForestClassifier(class_weight= {0: 1, 1: 1})
model = RandomizedSearchCV(RFC, tuned_parameters,cv=10,scoring='accuracy',n_iter=20,n_jobs= -1,random_state=5)    
model.fit(X_train, y_train)
y_test_prob = model.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities  
y_test_pred = np.where(y_test_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob)
roc_auc = auc(fpr, tpr)
print("Model Score: ", model.score(X_test, y_test_pred))
print("Best Score : ", model.best_score_)
print("Best Params : ",model.best_params_)

Model Score:  1.0
Best Score :  0.8377868281473658
Best Params :  {'n_estimators': 41, 'min_samples_leaf': 10, 'max_features': 'log2'}


In [29]:
model.score(X_test, y_test)

0.8189429618001046