In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Seed NumPy's global random generator so the random train/test flags drawn
# later in the notebook are reproducible on a fresh "Run All".
SEED = 0
np.random.seed(SEED)

In [110]:
# Load the dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv('datasets_dataset.csv')

# Report whether any cell of the frame is null.
has_missing_values = df.isnull().values.any()
print("THERE IS MISSING DATA" if has_missing_values else "NO MISSING DATA")

NO MISSING DATA


In [111]:
##### REPORTS #####

# Number of all students: every row of the dataset counts as one registration.
number_of_all_registered_std = len(df)
print('{} students have registered in this course.'.format(number_of_all_registered_std) )

# Failed students: rows whose Grade is 0.
failed_std = df[df['Grade'] == 0]
number_of_failed_std = len(failed_std)

# 'Tourist' students: failed students whose Week8_Total is 0.
tourist_std = failed_std[failed_std['Week8_Total'] == 0]
number_of_tourist_std = len(tourist_std)


# Share of failed students among all registered students.
# Guarded so that an empty dataset reports 0% instead of raising ZeroDivisionError.
if number_of_all_registered_std:
    percentage_of_failed_std = round((number_of_failed_std * 100) / number_of_all_registered_std)
else:
    percentage_of_failed_std = 0
print('{} students(about {}%) of all students have been failed.'.format(number_of_failed_std, percentage_of_failed_std))

# Share of 'tourist' students among the failed students.
# Guarded so that a dataset without failed students reports 0% instead of
# raising ZeroDivisionError.
if number_of_failed_std:
    percentage_of_tourist_std = round((number_of_tourist_std * 100) / number_of_failed_std)
else:
    percentage_of_tourist_std = 0
print('{} students(about {}%) of failed students have been \'tourist\' students without any activity.'.format(number_of_tourist_std, percentage_of_tourist_std))


107 students have registered in this course.
48 students(about 45%) of all students have been failed.
35 students(about 73%) of failed students have been 'tourist' students without any activity.


In [112]:
# Drop irrelevant columns: Stat0..Stat3 for each of weeks 1-9, plus the ID column.
# The 36 Stat names follow a single pattern, so they are generated rather than
# spelled out one by one (same names, same order as the hand-written list).
stat_columns = ['Week{}_Stat{}'.format(week, stat)
                for week in range(1, 10)
                for stat in range(4)]

# Reassign instead of using inplace=True: the cell then produces a new frame from
# its input rather than mutating an object other cells may already have displayed.
df = df.drop(columns=stat_columns + ['ID'])

In [113]:
# Drop the rows in which every column is zero.
# .copy() makes the filtered result an independent frame, so adding a column to
# `df` in a later cell does not raise pandas' SettingWithCopyWarning.
df = df.loc[~(df == 0).all(axis=1)].copy()

# Show the remaining rows.
df

Unnamed: 0,Week2_Quiz1,Week3_MP1,Week3_PR1,Week5_MP2,Week5_PR2,Week7_MP3,Week7_PR3,Week4_Quiz2,Week6_Quiz3,Week8_Total,Grade
0,5.00,15.0,5.0,16.09,5.00,21.88,5.0,5.00,5.0,82.97,4
1,3.33,15.0,5.0,17.83,5.00,22.27,5.0,4.00,5.0,82.43,4
2,1.67,13.0,5.0,15.22,5.00,27.05,2.5,5.00,5.0,79.44,3
3,2.50,14.0,5.0,10.00,5.00,31.02,5.0,3.13,5.0,80.65,3
4,0.00,15.0,5.0,12.17,4.93,15.91,5.0,4.67,5.0,67.68,2
...,...,...,...,...,...,...,...,...,...,...,...
72,4.17,15.0,5.0,18.26,5.00,24.66,5.0,4.25,5.0,86.34,4
73,2.50,14.0,5.0,18.26,5.00,28.64,0.0,3.00,5.0,81.40,4
74,5.00,15.0,5.0,19.57,5.00,34.60,5.0,5.00,5.0,99.17,5
75,4.17,11.0,5.0,18.26,4.90,31.02,5.0,5.00,4.5,88.85,4


In [114]:
# Names of the feature columns: the first nine columns of df (the Quiz, MP and PR
# scores). The columns after them, Week8_Total and Grade, are not used as features.
N_FEATURE_COLUMNS = 9
features = df.columns[:N_FEATURE_COLUMNS]
features

Index(['Week2_Quiz1', 'Week3_MP1', 'Week3_PR1', 'Week5_MP2', 'Week5_PR2',
       'Week7_MP3', 'Week7_PR3', 'Week4_Quiz2', 'Week6_Quiz3'],
      dtype='object')

In [115]:
# Draw one uniform random number between 0 and 1 for every row of df.
# A row is flagged True (training set) when its draw is less than or equal to 0.75
# and False (test set) otherwise; the flag is stored in the new 'is_train' column.
random_draws = np.random.uniform(0, 1, len(df))
df['is_train'] = random_draws <= .75

In [116]:
# Split df into two frames by the boolean 'is_train' flag:
# flagged rows go to `train`, the remaining rows go to `test`.
is_train_mask = df['is_train']
train = df[is_train_mask]
test = df[~is_train_mask]

In [117]:
# Training set: the feature columns as inputs and the Grade column as the target.
X_train, Y_train = train[features], train['Grade']

In [118]:
# Test set: the same feature columns and target, taken from the test rows.
X_test, Y_test = test[features], test['Grade']

In [119]:
# Random forest classifier.
# random_state is fixed so repeated runs build the same forest; n_jobs=2 lets
# scikit-learn use two parallel jobs.
clf = RandomForestClassifier(
    random_state=0,
    n_jobs=2,
)

In [120]:
# Fit the classifier on the training features and the training grades.
# fit() returns the estimator itself, so `model` and `clf` name the same fitted object.
model = clf.fit(X=X_train, y=Y_train)

In [121]:
# Apply the trained classifier to the test features, which were not used for fitting.
prediction = model.predict(X=X_test)

In [122]:
#Y_train = pd.factorize(train['Grade'])[0]

In [123]:
# Predicted class probabilities for the first 10 test observations
# (one row per observation, one column per grade class).
model.predict_proba(X_test)[:10]

array([[0.02, 0.04, 0.1 , 0.52, 0.32],
       [0.  , 0.22, 0.53, 0.2 , 0.05],
       [0.  , 0.  , 0.08, 0.14, 0.78],
       [0.  , 0.  , 0.08, 0.37, 0.55],
       [0.64, 0.11, 0.2 , 0.04, 0.01],
       [1.  , 0.  , 0.  , 0.  , 0.  ],
       [0.06, 0.06, 0.7 , 0.04, 0.14],
       [0.  , 0.1 , 0.61, 0.23, 0.06],
       [0.  , 0.03, 0.23, 0.62, 0.12],
       [0.24, 0.04, 0.61, 0.1 , 0.01]])

In [124]:
# Cross-tabulate actual grades (rows) against predicted grades (columns).
pd.crosstab(
    index=Y_test,
    columns=prediction,
    rownames=['Actual Grades'],
    colnames=['Predicted Grades'],
)

Predicted Grades,0,3,4,5
Actual Grades,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4,0,0,0
2,0,1,0,0
3,0,3,0,0
4,0,2,3,0
5,0,0,0,4


In [125]:
# Pair every feature column name with its importance score from the fitted forest.
[(name, score) for name, score in zip(X_train.columns, clf.feature_importances_)]

[('Week2_Quiz1', 0.06130661660852681),
 ('Week3_MP1', 0.14707976144676863),
 ('Week3_PR1', 0.026771000407506754),
 ('Week5_MP2', 0.2181015106426134),
 ('Week5_PR2', 0.031097760386755286),
 ('Week7_MP3', 0.31949469415232423),
 ('Week7_PR3', 0.040793030657896245),
 ('Week4_Quiz2', 0.10052088893139616),
 ('Week6_Quiz3', 0.054834736766212526)]

In [126]:
# Accuracy of the prediction: actual test grades compared with the predicted grades.
Accuracy = metrics.accuracy_score(y_true=Y_test, y_pred=prediction)

In [127]:
# Importance scores labelled by feature name, ordered from the most to the least
# important feature.
feature_imp = pd.Series(model.feature_importances_, index=features)
feature_imp = feature_imp.sort_values(ascending=False)

In [128]:
# Print the results: overall accuracy and the three highest-ranked features.
top_three_features = feature_imp.iloc[:3]
print("Accuracy = ", Accuracy)
print("The 3 most important features are = \n{}".format(top_three_features))

Accuracy =  0.8235294117647058
The 3 most important features are = 
Week7_MP3    0.319495
Week5_MP2    0.218102
Week3_MP1    0.147080
dtype: float64
