In [33]:
## Importing necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [34]:
## Load the raw dataset from a CSV file.
## The path is relative, so it is resolved against the notebook's current working directory.
data = pd.read_csv('student_performance_large_dataset.csv')

In [35]:
## Preview the first rows of the raw frame to sanity-check the load
data.head()

Unnamed: 0,Student_ID,Age,Gender,Study_Hours_per_Week,Preferred_Learning_Style,Online_Courses_Completed,Participation_in_Discussions,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Use_of_Educational_Tech,Self_Reported_Stress_Level,Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night,Final_Grade
0,S00001,18,Female,48,Kinesthetic,14,Yes,100,69,66,Yes,High,9,8,C
1,S00002,29,Female,30,Reading/Writing,20,No,71,40,57,Yes,Medium,28,8,D
2,S00003,20,Female,47,Kinesthetic,11,No,60,43,79,Yes,Low,13,7,D
3,S00004,23,Female,13,Auditory,0,Yes,63,70,60,Yes,Low,24,10,B
4,S00005,19,Female,24,Auditory,19,Yes,59,63,93,Yes,Medium,26,8,C


In [36]:
## Column names, non-null counts and dtypes of the raw frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   Student_ID                               10000 non-null  object
 1   Age                                      10000 non-null  int64 
 2   Gender                                   10000 non-null  object
 3   Study_Hours_per_Week                     10000 non-null  int64 
 4   Preferred_Learning_Style                 10000 non-null  object
 5   Online_Courses_Completed                 10000 non-null  int64 
 6   Participation_in_Discussions             10000 non-null  object
 7   Assignment_Completion_Rate (%)           10000 non-null  int64 
 8   Exam_Score (%)                           10000 non-null  int64 
 9   Attendance_Rate (%)                      10000 non-null  int64 
 10  Use_of_Educational_Tech                  10000 non-null  ob

In [37]:
## Missing-value audit: number of NaN entries in each column
data.isna().sum()

Student_ID                                 0
Age                                        0
Gender                                     0
Study_Hours_per_Week                       0
Preferred_Learning_Style                   0
Online_Courses_Completed                   0
Participation_in_Discussions               0
Assignment_Completion_Rate (%)             0
Exam_Score (%)                             0
Attendance_Rate (%)                        0
Use_of_Educational_Tech                    0
Self_Reported_Stress_Level                 0
Time_Spent_on_Social_Media (hours/week)    0
Sleep_Hours_per_Night                      0
Final_Grade                                0
dtype: int64

In [38]:
## Checking for duplicates: counts rows that are exact copies (across all columns) of an earlier row
data.duplicated().sum()

np.int64(0)

In [39]:
## Dropping unnecessary columns
## Student_ID looks like a per-row identifier (S00001, S00002, ...) and is not used as a feature.
## Rebinding `data` (instead of inplace=True) together with errors='ignore' keeps this cell
## idempotent: re-running it once the column is gone is a no-op rather than a KeyError.
data = data.drop(columns='Student_ID', errors='ignore')

In [40]:
## Collect every object-dtype (string) column as a candidate for one-hot encoding.
## At this point the list still contains the target column 'Final_Grade'.
categorical_cols = [name for name, dtype in data.dtypes.items() if dtype == 'object']

In [41]:
## Exclude the target from the one-hot list; Final_Grade is encoded separately.
## Filtering (rather than list.remove) makes the cell safe to re-run: a second execution
## leaves the list unchanged instead of raising ValueError.
categorical_cols = [col for col in categorical_cols if col != 'Final_Grade']

In [42]:
## Display the feature columns that will be one-hot encoded
categorical_cols

['Gender',
 'Preferred_Learning_Style',
 'Participation_in_Discussions',
 'Use_of_Educational_Tech',
 'Self_Reported_Stress_Level']

In [43]:
## One-hot encode the categorical feature columns.
## drop_first=True drops one level per feature so the dummy columns are not perfectly collinear.
data = pd.get_dummies(
    data=data,
    columns=categorical_cols,
    drop_first=True,
)

In [44]:
## Applying ordinal mapping on the final grade (letter -> integer class label)
grades = {'A':0,'B':1,'C':2,'D':3}
## Series.map() silently turns any value missing from the mapping into NaN, so fail loudly
## instead. This also catches an accidental re-run of this cell on already-encoded values,
## which would otherwise wipe the target column to all-NaN.
if not data['Final_Grade'].isin(list(grades)).all():
    raise ValueError(
        "Final_Grade contains values outside the grades mapping "
        "(unexpected label, or this cell has already been run)"
    )
data['Final_Grade'] = data['Final_Grade'].map(grades)

In [45]:
## Preview the frame after one-hot encoding and target mapping
data.head()

Unnamed: 0,Age,Study_Hours_per_Week,Online_Courses_Completed,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night,Final_Grade,Gender_Male,Gender_Other,Preferred_Learning_Style_Kinesthetic,Preferred_Learning_Style_Reading/Writing,Preferred_Learning_Style_Visual,Participation_in_Discussions_Yes,Use_of_Educational_Tech_Yes,Self_Reported_Stress_Level_Low,Self_Reported_Stress_Level_Medium
0,18,48,14,100,69,66,9,8,2,False,False,True,False,False,True,True,False,False
1,29,30,20,71,40,57,28,8,3,False,False,False,True,False,False,True,False,True
2,20,47,11,60,43,79,13,7,3,False,False,True,False,False,False,True,True,False
3,23,13,0,63,70,60,24,10,1,False,False,False,False,False,True,True,True,False
4,19,24,19,59,63,93,26,8,2,False,False,False,False,False,True,True,False,True


In [46]:
## Split the frame into the target (y) and the predictor matrix (x)
## NOTE(review): the evaluation further down reports 100% test accuracy, which usually signals
## target leakage. Presumably Final_Grade is derived directly from 'Exam_Score (%)' -- confirm,
## and consider excluding that column from x if so.
y = data['Final_Grade']
x = data.drop(columns='Final_Grade')

In [47]:
## Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [48]:
## Sanity check: (rows, columns) of the training and test feature matrices
(x_train.shape, x_test.shape)

((6700, 17), (3300, 17))

In [49]:
## Standard Scaler: standardises each feature column using statistics learned in fit()
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [50]:
## Learn the scaling statistics from the training split only, then apply them to it
x_train_scaled = scaler.fit(x_train).transform(x_train)

In [51]:
## Apply the scaler fitted on the training split; transform() only, so no test-set
## statistics are learned
x_test_scaled = scaler.transform(x_test)

In [52]:
## Model Training
from sklearn.ensemble import RandomForestClassifier
## Seed the forest (same value as the train/test split) so bootstrap sampling and per-split
## feature sub-sampling are repeatable; without it every run yields a different model and
## the saved metrics cannot be reproduced.
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [53]:
## Fit the forest on the scaled training features and the integer-encoded grades
model.fit(x_train_scaled,y_train)

In [54]:
## Predict grade classes for the held-out (scaled) test features
y_pred = model.predict(x_test_scaled)

In [55]:
## Evaluate on the held-out split: overall accuracy, per-class precision/recall/F1,
## and the confusion matrix (rows = true class, columns = predicted class)
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       884
           1       1.00      1.00      1.00       801
           2       1.00      1.00      1.00       805
           3       1.00      1.00      1.00       810

    accuracy                           1.00      3300
   macro avg       1.00      1.00      1.00      3300
weighted avg       1.00      1.00      1.00      3300

[[884   0   0   0]
 [  0 801   0   0]
 [  0   0 805   0]
 [  0   0   0 810]]


In [56]:
import pickle
## Persist the trained model. The context manager guarantees the file handle is flushed and
## closed even if pickling fails; a bare open() would leave that to garbage collection.
## NOTE(review): the model was fitted on features transformed by `scaler`, so inference code
## will need the same fitted scaler -- consider persisting it alongside the model.
with open('model.pkl','wb') as model_file:
    pickle.dump(model, model_file)