# MOOC_Student_Drop_Rate_Prediction

##### Method 1 : PREDICTIONS using rfc_mim.pkl -----------------------------------------------------------------

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
import pickle

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math

# Loading the data
data = pd.read_csv('MOOC_Visual.csv', parse_dates=['startdate', 'enddate'])
# Removing duplicate rows
duplicate_index = data[data.drop(['enrollment_id', 'startdate', 'enddate'], axis=1).duplicated()].index
data = data.drop(duplicate_index)
# Removing Outliers
data = data[data['access']<700]
data = data[data['discussion']<1000]
data = data[data['navigate']<200]
data = data[data['page_close']<250]
data = data[data['problem']<750]
data = data[data['video']<250]
data = data[data['wiki']<120]
data = data[data['effective_time']<255]
# Droping independent features
data.drop(['page_close', 'video', 'proccess_period'], axis=1, inplace=True)
# Extracting extra feature from Start_Date and End_Date
duration_in_days = (data['enddate'] - data['startdate']).dt.days + 1
data.insert(8,"duration_in_days", duration_in_days)
# Splitting the data using train_test_split
train, test = train_test_split(data.iloc[:, 3:], test_size=0.3, random_state=0)
X_test = test.drop(['dropout_prob'], axis=1)
y_test = test['dropout_prob']
# Upsampling data i.e., Minor to Major
dropout_minor = train[train.dropout_prob==0]
dropout_major = train[train.dropout_prob==1]
dropout_upsampled = resample(dropout_minor,
                          replace=True, # sample with replacement
                          n_samples=len(dropout_major), # match number in majority class
                          random_state=27) # reproducible results
# combine majority and upsampled minority
upsampled = pd.concat([dropout_major, dropout_upsampled])
y_train = upsampled.dropout_prob
X_train = upsampled.drop(['dropout_prob'], axis=1)
X_train = X_train[['duration_in_days', 'access', 'discussion', 'navigate', 'problem', 'wiki', 'present_days', 'effective_time', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'holidays', 'course_enroll', 'user_enroll', 'course_drop_rate']]



In [None]:
# Predicting result with RandomForestClassifier
classifier = RandomForestClassifier(criterion = 'entropy', random_state = 10)
classifier.fit(X_train, y_train)
print("Training Score : ", classifier.score(X_train, y_train))
print("Testing Score : ", classifier.score(X_test, y_test))

# Generating Pickle file
pickle.dump(classifier, open('pkl_rfc_mim.pkl', 'wb'))

##### Method 2 : PREDICTIONS using rf_model_feature_10.pkl ------------------------------------------------------

Model is created by Mr. Amar

##### Method 3 : PREDICTIONS using pkl_rfc_log_norm_scale_ggm.pkl ------------------------------------------------------

In [5]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import pickle

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math

# Loading the data
data = pd.read_csv('MOOC_Visual.csv', parse_dates=['startdate', 'enddate'])
# Removing duplicate rows
duplicate_index = data[data.drop('enrollment_id', axis=1).duplicated()].index
data = data.drop(duplicate_index)
# Removing Outliers
data = data[data['access']<700]
data = data[data['discussion']<1000]
data = data[data['navigate']<200]
data = data[data['page_close']<250]
data = data[data['problem']<750]
data = data[data['video']<250]
data = data[data['wiki']<120]
data = data[data['effective_time']<255]

# Extracting extra feature from Start_Date and End_Date
duration_in_days = (data['enddate'] - data['startdate']).dt.days + 1
data.insert(11,"duration_in_days", duration_in_days)

# Exclude independent features ('page_close', 'video', 'proccess_period') which are highly correlated 
# Include independent features ('effective_time', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'holidays', 'course_enroll', 'user_enroll', 'course_drop_rate') 
data = data[['duration_in_days', 'present_days', 'access', 'discussion', 'navigate', 'problem', 'wiki', 'dropout_prob']]

# Spliting Input and Output features and scaling X data and concatenate into "data"
y = data['dropout_prob']
X = data.drop('dropout_prob', axis=1)
# Log
X_log = np.log(X+1)
# Normalizing
X_norm = preprocessing.normalize(X_log)
X_norm = pd.DataFrame(X_norm, index= X.index, columns=X.columns)
# Scaling the Input features
ss_scale = StandardScaler()
X_scale = scale.fit_transform(X_norm)
X_scale = pd.DataFrame(X_scale, index=X.index, columns=X.columns)
# Concatenating
data = pd.concat([X_scale, y], axis=1)
# Splitting training and testing data using train_test_split()
train, test = train_test_split(data, test_size=0.3, random_state=0)
X_test = test.drop(['dropout_prob'], axis=1)
y_test = test['dropout_prob']
# Upsampling data i.e., Minor to Major
dropout_minor = train[train.dropout_prob==0]
dropout_major = train[train.dropout_prob==1]
dropout_upsampled = resample(dropout_minor,
                          replace=True, # sample with replacement
                          n_samples=len(dropout_major), # match number in majority class
                          random_state=27) # reproducible results
# combine majority and upsampled minority
upsampled = pd.concat([dropout_major, dropout_upsampled])
y_train = upsampled.dropout_prob
X_train = upsampled.drop(['dropout_prob'], axis=1)

In [188]:
# Predicting result with RandomForestClassifier
classifier = RandomForestClassifier(criterion = 'entropy', random_state = 10)
classifier.fit(X_train, y_train)
# Generating Pickle file
pickle.dump(classifier, open('pkl_rfc_log_norm_scale_ggm.pkl', 'wb'))
print("Training Score : ", classifier.score(X_train, y_train))
print("Testing Score : ", classifier.score(X_test, y_test))

Training Score :  0.9362165008452734
Testing Score :  0.8066576155277316


##### Method 4 : PREDICTIONS using pkl_rfc_log_norm_scale_ggm.pkl with input as Excel file ----------

In [34]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from datetime import datetime
import pickle

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math

##### Generating Testing data

In [46]:
# Loading the data
data = pd.read_csv('MOOC_Visual.csv', parse_dates=['startdate', 'enddate'])
# Removing duplicate rows
duplicate_index = data[data.drop('enrollment_id', axis=1).duplicated()].index
data = data.drop(duplicate_index)
# Removing Outliers
data = data[data['access']<700]
data = data[data['discussion']<1000]
data = data[data['navigate']<200]
data = data[data['page_close']<250]
data = data[data['problem']<750]
data = data[data['video']<250]
data = data[data['wiki']<120]
data = data[data['effective_time']<255]

X = data[['enrollment_id', 'startdate', 'enddate', 'access', 'discussion', 'navigate',  "page_close", 'problem', "video", 'wiki']]
X.columns = ['enrollment_id', 'start_date', 'end_date', 'access', 'discussion', 'navigate',  "page_close", 'problem', "video", 'wiki']
y = data['dropout_prob']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)


##### Testing input data with the pickle model

In [63]:
X_train.loc[57762]

enrollment_id                  57763
start_date       2013-11-12 00:00:00
end_date         2013-12-09 00:00:00
access                           118
discussion                         7
navigate                          27
page_close                        79
problem                           37
video                             36
wiki                               2
Name: 57762, dtype: object

In [77]:
Year = 2004
def Leap_Year(Year):
    C1 = Year%400==0 
    C2 = Year%4==0
    C3 = Year%100!=0

    if C1 | (C2 & C3):
        return "Leap Year"
    else:
        return "Not Leap Year"

In [83]:
import math
Num = 735382

count=0
for i in range(1, math.floor(Num/365)):
    if Leap_Year(i)=="Leap Year":
        count+=1
count

488

In [93]:
math.floor((Num-count)/365), (Num-count)%365

(2013, 149)

In [102]:
math.floor((Num-count)/365), math.floor((149-5)/30), (149-5)%30

(2013, 4, 24)

In [38]:
# Loading the data frame from input excel file
# df = pd.read_csv("sdo_test_csv_file")
df = X_test.copy()

# Converting data-time format
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
# Extracting new feature from start_date and end_date
present_days = (df["end_date"] - df["start_date"]).dt.days + 1
df.insert(3, "present_days", present_days)
df['start_date'] = df['start_date'].map(datetime.toordinal)
df['end_date'] = df['end_date'].map(datetime.toordinal)
X = df.drop('enrollment_id', axis=1)
# Scalling the data
scale = StandardScaler()
X_scale = scale.fit_transform(X)
X_scale = pd.DataFrame(X_scale, index=X.index, columns=X.columns)
# Initialising Pickle file
model = pickle.load(open("pkl_rf_model_feature_10.pkl", "rb"))
# Model Prediction
pred_val = model.predict(X_scale)
df['result'] = pred_val
df.to_csv('X_test_Pred.csv', index=False)

##### ==============================================================================================

In [None]:
class preprocessing:
    def __init__(self):
        pass
    
    def processing(self, df):
        

In [13]:
from sklearn.preprocessing import StandardScaler
import datetime
import pandas as pd
import numpy as np

class preprocessing:
    def __init__(self):
        pass
    
    def processing(self, df):
        # Converting Dates into ordinals
        df = self.dates_to_ordinals(df)
        # Extracting new "present_days" feature from "start_date" and "end_date" features
        df = self.extract_features(df)
        # Scaling the values
        df = self.stadardise_values(df)
        return df
    
    # Converting Dates into ordinals
    def dates_to_ordinals(self, df):
        df['start_date'] = pd.to_datetime(df['start_date']).apply(lambda x : x.toordinal())
        df['end_date'] = pd.to_datetime(df['end_date']).apply(lambda x : x.toordinal())
        return df

    # Extracting new "present_days" feature from "start_date" and "end_date" features
    def extract_features(self, df):
        present_days = df['end_date'] - df['start_date'] + 1
        df.insert(2, 'present_days', present_days)
        return df

    # Scaling the values
    def stadardise_values(self, df):
        scale = StandardScaler()
        arr = scale.fit_transform(df)
        df = pd.DataFrame(arr, columns=df.columns)
        return df

In [2]:
data = pd.read_csv('X_test1.csv')

In [30]:
df =data.iloc[2:3, 1:]
X = df.copy()
df

Unnamed: 0,start_date,end_date,access,discussion,navigate,page_close,problem,video,wiki
2,2014-05-29,2014-06-24,185,70,36,124,36,50,1


In [31]:
p = preprocessing()
df = p.processing(df)
df

Unnamed: 0,start_date,end_date,present_days,access,discussion,navigate,page_close,problem,video,wiki
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
import pickle

# Initialising Pickle file
model = pickle.load(open("pkl_rf_model_feature_10.pkl", "rb"))
# Model Prediction
X['result'] = model.predict(df)

In [17]:
df

Unnamed: 0,start_date,end_date,present_days,access,discussion,navigate,page_close,problem,video,wiki
0,0.428159,0.588445,1.245585,1.094722,2.295614,1.303439,2.281465,0.777817,2.161336,0.538816
1,-1.985379,-1.92554,0.653624,0.381211,0.294595,0.452214,0.788377,1.107801,0.912199,0.538816
2,-1.412468,-1.518866,-0.727619,-0.66604,0.383529,-0.399012,-0.447283,1.814907,-0.20545,0.538816
3,0.720709,0.563798,-1.319581,-1.011287,-0.817083,-1.108367,-0.884913,-0.919239,-1.060123,-0.898027
4,0.793847,0.637739,-1.319581,-0.999779,-0.817083,-1.179302,-0.884913,-0.919239,-1.060123,-0.898027
5,0.440349,0.576121,1.048265,1.923315,0.161193,1.729052,0.041832,-0.494975,-0.139706,1.975658
6,0.40378,0.514504,0.850944,-0.021578,-0.817083,-0.328077,-0.498769,-0.730677,-0.271194,-0.898027
7,0.611003,0.563798,-0.431638,-0.700564,-0.683682,-0.469948,-0.395797,-0.636396,-0.336938,-0.898027


In [33]:
X

Unnamed: 0,start_date,end_date,access,discussion,navigate,page_close,problem,video,wiki,result
2,2014-05-29,2014-06-24,185,70,36,124,36,50,1,1


In [19]:
X.to_dict(orient='records')

[{'start_date': '2014-05-29',
  'end_date': '2014-06-24',
  'access': 185,
  'discussion': 70,
  'navigate': 36,
  'page_close': 124,
  'problem': 36,
  'video': 50,
  'wiki': 1,
  'result': 0},
 {'start_date': '2013-11-12',
  'end_date': '2013-12-02',
  'access': 123,
  'discussion': 25,
  'navigate': 24,
  'page_close': 66,
  'problem': 43,
  'video': 31,
  'wiki': 1,
  'result': 1},
 {'start_date': '2013-12-29',
  'end_date': '2014-01-04',
  'access': 32,
  'discussion': 27,
  'navigate': 12,
  'page_close': 18,
  'problem': 58,
  'video': 14,
  'wiki': 1,
  'result': 1},
 {'start_date': '2014-06-22',
  'end_date': '2014-06-22',
  'access': 2,
  'discussion': 0,
  'navigate': 2,
  'page_close': 1,
  'problem': 0,
  'video': 1,
  'wiki': 0,
  'result': 1},
 {'start_date': '2014-06-28',
  'end_date': '2014-06-28',
  'access': 3,
  'discussion': 0,
  'navigate': 1,
  'page_close': 1,
  'problem': 0,
  'video': 1,
  'wiki': 0,
  'result': 1},
 {'start_date': '2014-05-30',
  'end_date': 

In [239]:
cols = ['start_date', 'end_date', 'access', 'discussion', 'navigate', 'page_close', 'problem', 'video', 'wiki']
in_features = [['2014-05-29', '2014-06-24', 185, 70, 36, 124, 36, 50, 1]]
#in_features = np.array(in_features).reshape(1,-1)
pd.DataFrame(in_features, columns=cols )

Unnamed: 0,start_date,end_date,access,discussion,navigate,page_close,problem,video,wiki
0,2014-05-29,2014-06-24,185,70,36,124,36,50,1


In [21]:
from pymongo import MongoClient
client = MongoClient("mongodb+srv://gowtham136:user136@cluster0.heyil.mongodb.net/<dbname>?retryWrites=true&w=majority")

In [22]:
db = client.get_database('stdDropoutDB')

In [23]:
collectionD = db['MOOC_Visual']

In [24]:
record = {'start_date': '2014-05-29',
  'end_date': '2014-06-24',
  'access': 185,
  'discussion': 70,
  'navigate': 36,
  'page_close': 124,
  'problem': 36,
  'video': 50,
  'wiki': 1,
  'result': 0}
collectionD.insert_one(record)

<pymongo.results.InsertOneResult at 0x1f552054780>

In [25]:
for record in enumerate(collectionD.find()):
    print(record)

(0, {'_id': ObjectId('5ff21b9174b6e4ef98407410'), 'start_date': '2014-05-29', 'end_date': '2014-06-24', 'access': 185, 'discussion': 70, 'navigate': 36, 'page_close': 124, 'problem': 36, 'video': 50, 'wiki': 1, 'result': 0})


In [27]:
client.close()

In [41]:
from pymongo import MongoClient

class database:
    def __init__(self):
        try:
            # self.client = MongoClient('localhost', 27017)
            # self.db = self.client['stdDropoutDB']
            # self.collectionD = self.db['MOOC_Visual']
            self.client = MongoClient("mongodb+srv://gowtham136:user136@cluster0.heyil.mongodb.net/<dbname>?retryWrites=true&w=majority")
            self.db = self.client.get_database('stdDropoutDB')
            self.collectionT = self.db['MOOC_Visual']
        except Exception as ex:
            print(ex)

    # To add new row
    def update_one(self, df):
        record = df.to_dict(orient='records')[0]
        self.collectionT.insert_one(record)     # Inserting Record
        countOfrecords = self.collectionT.find().count()    # Finding number of records
        message = f"Record is successfully inserted at place {countOfrecords}"  # Sending Message
        self.client.close()
        return message

In [42]:
db = database()
DbMessage = db.update_one(X)
print(DbMessage)

  countOfrecords = self.collectionT.find().count()    # Finding number of records


Record is successfully inserted at place 4


In [40]:
DbMessage = db.update_one(X)
print(DbMessage)

  countOfrecords = self.collectionT.find().count()    # Finding number of records


Record is successfully inserted at place 3


In [45]:
path = 'D:\\DataScience\\02 INEURON\\ML_Projects\\Intership (iNeuron)\\Projects\\ml_education\\student_dropout\\Data\\Batch_Files\\'
'D:\\DataScience\\02 INEURON\\ML_Projects\\Intership (iNeuron)\\Projects\\ml_education\\student_dropout\\Data\\Batch_Files\\X_test1.csv'.split(path)[1]

'X_test1.csv'