In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# We Model Directly

In [101]:
df = pd.read_csv('StudentPerformanceFactors.csv')

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11281 entries, 0 to 11280
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               11281 non-null  int64 
 1   Attendance                  11281 non-null  int64 
 2   Parental_Involvement        11281 non-null  object
 3   Access_to_Resources         11281 non-null  object
 4   Extracurricular_Activities  11281 non-null  object
 5   Sleep_Hours                 11281 non-null  int64 
 6   Previous_Scores             11281 non-null  int64 
 7   Motivation_Level            11281 non-null  object
 8   Internet_Access             11281 non-null  object
 9   Tutoring_Sessions           11281 non-null  int64 
 10  Family_Income               11281 non-null  object
 11  Teacher_Quality             11170 non-null  object
 12  School_Type                 11281 non-null  object
 13  Peer_Influence              11281 non-null  ob

In [103]:
df.drop(inplace = True, labels = ['Parental_Involvement', 'Internet_Access', 'Family_Income', 'Parental_Education_Level', 'Gender', 'Tutoring_Sessions'], axis = 1)

In [104]:
df.columns

Index(['Hours_Studied', 'Attendance', 'Access_to_Resources',
       'Extracurricular_Activities', 'Sleep_Hours', 'Previous_Scores',
       'Motivation_Level', 'Teacher_Quality', 'School_Type', 'Peer_Influence',
       'Physical_Activity', 'Learning_Disabilities', 'Distance_from_Home',
       'Exam_Score'],
      dtype='object')

## Modeling, CatBoost

In [105]:
categorical = df.select_dtypes(include = 'object')
categorical

Unnamed: 0,Access_to_Resources,Extracurricular_Activities,Motivation_Level,Teacher_Quality,School_Type,Peer_Influence,Learning_Disabilities,Distance_from_Home
0,High,No,Low,Medium,Public,Positive,No,Near
1,Medium,No,Low,Medium,Public,Negative,No,Moderate
2,Medium,Yes,Medium,Medium,Public,Neutral,No,Near
3,Medium,Yes,Medium,Medium,Public,Negative,No,Moderate
4,Medium,Yes,Medium,High,Public,Neutral,No,Near
...,...,...,...,...,...,...,...,...
11276,High,Yes,High,High,Private,Positive,No,Moderate
11277,Medium,Yes,High,Medium,Private,Negative,Yes,Moderate
11278,Medium,No,High,High,Public,Positive,No,Near
11279,High,No,Medium,Medium,Private,Positive,No,Moderate


In [106]:
for col in categorical.columns:
    df[col] = categorical[col].map(lambda x: x.lower() if type(x) == str else str(x))

df.head()

Unnamed: 0,Hours_Studied,Attendance,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Distance_from_Home,Exam_Score
0,23,84,high,no,7,73,low,medium,public,positive,3,no,near,67
1,19,64,medium,no,8,59,low,medium,public,negative,4,no,moderate,61
2,24,98,medium,yes,7,91,medium,medium,public,neutral,4,no,near,74
3,29,89,medium,yes,8,98,medium,medium,public,negative,4,no,moderate,71
4,19,92,medium,yes,6,65,medium,high,public,neutral,4,no,near,70


In [107]:
def group(x):
    if x >= 90:
        return 'A'
    
    if x >= 80:
        return 'B'
    
    if x >= 70:
        return 'C'
    
    if x >= 60:
        return 'D'
    
    if x >= 50:
        return 'F'

df['Exam_Score'] = df['Exam_Score'].map(group)


In [108]:
from sklearn.model_selection import train_test_split

x, y = df[df.columns[:-1]], df['Exam_Score']

x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle = True, random_state = 420)

In [109]:
y_train.value_counts()

Exam_Score
D    5601
C    1689
A     703
B     409
F      58
Name: count, dtype: int64

In [110]:
# from imblearn.over_sampling import RandomOverSampler

# sampler = RandomOverSampler()
# x_train, y_train = sampler.fit_resample(x_train, y_train)

In [111]:
y_train.value_counts()

Exam_Score
D    5601
C    1689
A     703
B     409
F      58
Name: count, dtype: int64

In [112]:
from imblearn.over_sampling import SMOTENC

smote = SMOTENC(categorical_features = categorical.columns.tolist())

x_train, y_train = smote.fit_resample(x_train, y_train)

In [113]:
y_train.value_counts()

Exam_Score
A    5601
D    5601
C    5601
B    5601
F    5601
Name: count, dtype: int64

In [114]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier()

cat.fit(x_train, y_train, cat_features = categorical.columns.tolist())

Learning rate set to 0.093785
0:	learn: 1.4800586	total: 63.7ms	remaining: 1m 3s
1:	learn: 1.3816887	total: 117ms	remaining: 58.5s
2:	learn: 1.3103057	total: 165ms	remaining: 55s
3:	learn: 1.2502160	total: 213ms	remaining: 53s
4:	learn: 1.1953681	total: 264ms	remaining: 52.6s
5:	learn: 1.1481455	total: 346ms	remaining: 57.4s
6:	learn: 1.1094009	total: 400ms	remaining: 56.7s
7:	learn: 1.0754102	total: 455ms	remaining: 56.4s
8:	learn: 1.0463148	total: 506ms	remaining: 55.7s
9:	learn: 1.0162132	total: 556ms	remaining: 55s
10:	learn: 0.9896077	total: 610ms	remaining: 54.8s
11:	learn: 0.9662781	total: 662ms	remaining: 54.5s
12:	learn: 0.9436180	total: 720ms	remaining: 54.7s
13:	learn: 0.9231415	total: 777ms	remaining: 54.7s
14:	learn: 0.9045205	total: 827ms	remaining: 54.3s
15:	learn: 0.8853884	total: 875ms	remaining: 53.8s
16:	learn: 0.8720273	total: 933ms	remaining: 54s
17:	learn: 0.8606745	total: 991ms	remaining: 54.1s
18:	learn: 0.8463349	total: 1.04s	remaining: 53.9s
19:	learn: 0.83117

<catboost.core.CatBoostClassifier at 0x3111ec410>

In [115]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, precision_score, recall_score, classification_report

In [118]:
x_test, y_test = smote.fit_resample(x_test, y_test)

In [119]:
y_hat = cat.predict(x_test)

p = classification_report(y_test, y_hat)
# r = recall_score(y_test, y_hat)

print(p)
# print(r)

# mse = mean_squared_error(y_test, y_hat)
# mae = mean_absolute_error(y_test, y_hat)
# r2 = r2_score(y_test, y_hat)

# print("R2: ", r2)
# print("MSE: ", mse)
# print("MAE: ", mae)

              precision    recall  f1-score   support

           A       0.51      0.59      0.55      1842
           B       0.54      0.34      0.42      1842
           C       0.74      0.75      0.74      1842
           D       0.54      0.81      0.65      1842
           F       0.97      0.68      0.80      1842

    accuracy                           0.63      9210
   macro avg       0.66      0.63      0.63      9210
weighted avg       0.66      0.63      0.63      9210



# Save the model

In [122]:
cat.save_model('catboost_ml_model', format = 'cbm')