# MOOC_Student_Drop_Rate_Prediction

In [36]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
import pickle

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math

# Load the raw data; both date columns are parsed so a duration can be derived below.
data = pd.read_csv('MOOC_Visual.csv', parse_dates=['startdate', 'enddate'])

# Remove rows that are duplicates once the enrollment id and the two dates are ignored.
duplicate_index = data[data.drop(['enrollment_id', 'startdate', 'enddate'], axis=1).duplicated()].index
data = data.drop(duplicate_index)

# Remove outliers: keep only rows strictly below each per-column cut-off.
data = data[data['access']<700]
data = data[data['discussion']<1000]
data = data[data['navigate']<200]
data = data[data['page_close']<250]
data = data[data['problem']<750]
data = data[data['video']<250]
data = data[data['wiki']<120]
data = data[data['effective_time']<255]

# Drop the independent features that are not used by this model.
# (Re-assignment instead of inplace=True, so the filtered frame is not mutated in place.)
data = data.drop(['page_close', 'video', 'proccess_period'], axis=1)

# Derive an extra feature from the start and end dates: the inclusive number of days.
duration_in_days = (data['enddate'] - data['startdate']).dt.days + 1
data.insert(8,"duration_in_days", duration_in_days)

# Model inputs, in the exact order the classifier is fitted on.
feature_columns = ['duration_in_days', 'access', 'discussion', 'navigate', 'problem', 'wiki',
                   'present_days', 'effective_time', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                   'Friday', 'Saturday', 'Sunday', 'holidays', 'course_enroll', 'user_enroll',
                   'course_drop_rate']

# Split into train/test; the first three columns of `data` are left out of the split.
train, test = train_test_split(data.iloc[:, 3:], test_size=0.3, random_state=0)

# The test features must use the same columns, in the same order, as X_train.
# Previously only X_train was re-ordered, so the test score was computed on
# misaligned columns (or rejected by scikit-learn versions that validate feature names).
X_test = test[feature_columns]
y_test = test['dropout_prob']

# Upsampling (training rows only): rows with dropout_prob == 0 are treated as the
# minority class and are resampled up to the size of the dropout_prob == 1 group.
dropout_minor = train[train.dropout_prob==0]
dropout_major = train[train.dropout_prob==1]
dropout_upsampled = resample(dropout_minor,
                          replace=True, # sample with replacement
                          n_samples=len(dropout_major), # match number in majority class
                          random_state=27) # reproducible results

# Combine the majority rows with the upsampled minority rows.
upsampled = pd.concat([dropout_major, dropout_upsampled])
y_train = upsampled.dropout_prob
X_train = upsampled[feature_columns]



In [None]:
# Fit a RandomForestClassifier (entropy split criterion, fixed seed) on the training data
classifier = RandomForestClassifier(criterion = 'entropy', random_state = 10)
classifier.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on and on the held-out test split
print("Training Score : ", classifier.score(X_train, y_train))
print("Testing Score : ", classifier.score(X_test, y_test))

# Generating Pickle file.
# The context manager guarantees the file is flushed and closed even if dump() raises;
# the previous bare open() call left the handle open.
with open('pkl_rfc_mim.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

### Alternative Method

In [183]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import pickle

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math

# Read the raw CSV, parsing both date columns as datetimes
data = pd.read_csv('MOOC_Visual.csv', parse_dates=['startdate', 'enddate'])

# Discard rows that repeat another row in every column except enrollment_id
is_repeat = data.drop('enrollment_id', axis=1).duplicated()
duplicate_index = data[is_repeat].index
data = data.drop(duplicate_index)

# Outlier removal: a row survives only if it is strictly below every cap.
# The caps are applied one after another, in the order listed.
outlier_caps = {
    'access': 700,
    'discussion': 1000,
    'navigate': 200,
    'page_close': 250,
    'problem': 750,
    'video': 250,
    'wiki': 120,
    'effective_time': 255,
}
for column_name, cap in outlier_caps.items():
    data = data[data[column_name] < cap]

# Extra feature: inclusive number of days between startdate and enddate
duration_in_days = data['enddate'].sub(data['startdate']).dt.days.add(1)
data.insert(loc=11, column="duration_in_days", value=duration_in_days)

# Keep only the seven activity/duration features plus the target.
# Left out: 'page_close', 'video', 'proccess_period' (noted as highly correlated), and also
# 'effective_time', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
# 'Sunday', 'holidays', 'course_enroll', 'user_enroll', 'course_drop_rate'.
model_columns = ['duration_in_days', 'present_days', 'access', 'discussion',
                 'navigate', 'problem', 'wiki', 'dropout_prob']
data = data[model_columns]

# Separate the target (y) from the input features (X)
y = data['dropout_prob']
X = data.drop(columns='dropout_prob')


In [184]:
# Log transform of the inputs; every value is shifted by one first, so a zero maps to log(1) = 0
X_log = np.log(X.add(1))

In [185]:
# Normalising: normalize() is called with its defaults (in scikit-learn: each row is
# scaled to unit L2 norm). The ndarray result is wrapped back into a DataFrame that
# carries the original row index and column names.
X_norm = pd.DataFrame(
    preprocessing.normalize(X_log),
    index=X.index,
    columns=X.columns,
)

In [186]:
# Scaling the input features with a StandardScaler.
# Fix: the scaler is bound to `ss_scale`; the previous call used the undefined name
# `scale`, which raises NameError on a fresh kernel.
ss_scale = StandardScaler()
X_scale = ss_scale.fit_transform(X_norm)
# fit_transform returns a plain array; restore the row index and column names
X_scale = pd.DataFrame(X_scale, index=X.index, columns=X.columns)
# NOTE(review): the scaler is fitted on all rows before the train/test split, so test rows
# influence the scaling statistics — consider fitting on the training rows only.
# NOTE(review): only the classifier is pickled later; `ss_scale` would also need to be
# saved to transform new samples the same way at prediction time.

In [187]:
# Re-attach the target column to the scaled features
data = pd.concat([X_scale, y], axis=1)

# Hold out 30% of the rows for testing (fixed seed for a repeatable split)
train, test = train_test_split(data, test_size=0.3, random_state=0)
y_test = test['dropout_prob']
X_test = test.drop(columns='dropout_prob')

# Balance the training rows: the dropout_prob == 0 group (treated as the minority)
# is sampled with replacement until it is as large as the dropout_prob == 1 group.
dropout_minor = train[train['dropout_prob'] == 0]
dropout_major = train[train['dropout_prob'] == 1]
dropout_upsampled = resample(
    dropout_minor,
    replace=True,
    n_samples=dropout_major.shape[0],
    random_state=27,
)

# Majority rows first, then the upsampled minority rows
upsampled = pd.concat([dropout_major, dropout_upsampled])
X_train = upsampled.drop(columns='dropout_prob')
y_train = upsampled['dropout_prob']

In [188]:
# Fit a RandomForestClassifier (entropy split criterion, fixed seed) on the training data
classifier = RandomForestClassifier(criterion = 'entropy', random_state = 10)
classifier.fit(X_train, y_train)

# Generating Pickle file.
# The context manager guarantees the file is flushed and closed even if dump() raises;
# the previous bare open() call left the handle open.
with open('pkl_rfc_log_norm_scale_ggm.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

# Mean accuracy on the data the model was fitted on and on the held-out test split
print("Training Score : ", classifier.score(X_train, y_train))
print("Testing Score : ", classifier.score(X_test, y_test))

Training Score :  0.9362165008452734
Testing Score :  0.8066576155277316


In [192]:
# Re-read the CSV into a separate frame (`data1`) so the unfiltered, untransformed
# rows can be inspected; `data` has been overwritten by the preprocessing cells.
data1 = pd.read_csv('MOOC_Visual.csv', parse_dates=['startdate', 'enddate'])


In [204]:
# Show the names and order of the feature columns in the training frame
X_train.columns

Index(['duration_in_days', 'present_days', 'access', 'discussion', 'navigate',
       'problem', 'wiki'],
      dtype='object')

In [203]:
# Display the single raw row at position 30 (iloc[30:31] keeps it as a one-row frame),
# restricted to the two dates and the activity columns.
data1[['startdate', 'enddate', 'present_days', 'access', 'discussion', 'navigate', 'problem', 'wiki']].iloc[30:31]

Unnamed: 0,startdate,enddate,present_days,access,discussion,navigate,problem,wiki
30,2014-06-02,2014-06-03,2,8,0,6,0,0


In [205]:
# Reload the classifier that was pickled by the training cell.
# SECURITY: pickle.load can execute arbitrary code; only load pickle files you created or trust.
# The context manager closes the file handle, which the previous bare open() call leaked.
with open("pkl_rfc_log_norm_scale_ggm.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [223]:
# NOTE(review): hidden state. The scaling cell binds X_scale to a pandas DataFrame, which
# has no .reshape() method, yet the recorded output below is a 1x7 NumPy array.
# X_scale was presumably rebound to a single-sample array in a cell that is no longer in
# the notebook — TODO restore that step so this cell survives Restart & Run All.
X_scale.reshape(1,-1)

array([[ 0.8660254 ,  0.8660254 ,  0.8660254 , -1.15470054,  0.8660254 ,
        -1.15470054, -1.15470054]])

1