In [1]:
import numpy as np
import pandas as pd
import missingno as msno
import warnings

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../data/semi_cleaned_StudentPerformanceFactors.csv")

In [3]:
df.shape

(6378, 20)

In [4]:
df.columns

Index(['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Distance_from_Home', 'Gender',
       'Exam_Score'],
      dtype='object')

In [5]:
df.head(15)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67.0
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61.0
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74.0
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71.0
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70.0
5,19,88,Medium,Medium,Yes,8,89,Medium,Yes,3,Medium,Medium,Public,Positive,3,No,Postgraduate,Near,Male,71.0
6,29,84,Medium,Low,Yes,7,68,Low,Yes,1,Low,Medium,Private,Neutral,2,No,High School,Moderate,Male,67.0
7,25,78,Low,High,Yes,6,50,Medium,Yes,1,High,High,Public,Negative,2,No,High School,Far,Male,66.0
8,17,94,Medium,High,No,6,80,High,Yes,0,Medium,Low,Private,Neutral,1,No,College,Near,Male,69.0
9,23,98,Medium,Medium,Yes,8,71,Medium,Yes,0,High,High,Public,Positive,5,No,High School,Moderate,Male,72.0


In [6]:
df.tail(5)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
6373,25,69,High,Medium,No,7,76,Medium,Yes,1,High,Medium,Public,Positive,2,No,High School,Near,Female,68.0
6374,23,76,High,Medium,No,8,81,Medium,Yes,3,Low,High,Public,Positive,2,No,High School,Near,Female,69.0
6375,20,90,Medium,Low,Yes,6,65,Low,Yes,3,Low,Medium,Public,Negative,2,No,Postgraduate,Near,Female,68.0
6376,10,86,High,High,Yes,6,91,High,Yes,2,Low,Medium,Private,Positive,3,No,High School,Far,Female,68.0
6377,15,67,Medium,Low,Yes,9,94,Medium,Yes,0,Medium,Medium,Public,Positive,4,No,Postgraduate,Near,Male,64.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6378 entries, 0 to 6377
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Hours_Studied               6378 non-null   int64  
 1   Attendance                  6378 non-null   int64  
 2   Parental_Involvement        6378 non-null   object 
 3   Access_to_Resources         6378 non-null   object 
 4   Extracurricular_Activities  6378 non-null   object 
 5   Sleep_Hours                 6378 non-null   int64  
 6   Previous_Scores             6378 non-null   int64  
 7   Motivation_Level            6378 non-null   object 
 8   Internet_Access             6378 non-null   object 
 9   Tutoring_Sessions           6378 non-null   int64  
 10  Family_Income               6378 non-null   object 
 11  Teacher_Quality             6378 non-null   object 
 12  School_Type                 6378 non-null   object 
 13  Peer_Influence              6378 

In [8]:
# ==== IMPORTS ====
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer, LabelEncoder
from category_encoders import BinaryEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score, classification_report,auc,roc_curve


In [9]:
num_cols =  df.select_dtypes(include=['int64', 'float64']).columns.drop(['Tutoring_Sessions','Sleep_Hours','Physical_Activity','Exam_Score'])
bin_cat_cols = ['Extracurricular_Activities', 'Internet_Access', 'School_Type', 'Peer_Influence','Peer_Influence','Learning_Disabilities','Gender']
ord_cat_cols = ['Parental_Involvement', 'Access_to_Resources', 'Sleep_Hours', 'Motivation_Level', 'Tutoring_Sessions','Family_Income',
'Teacher_Quality','Physical_Activity','Parental_Education_Level','Distance_from_Home']
# ord_cat_cols = ['Family_Income']
for col in bin_cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

for col in ord_cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

In [10]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,1,0,0,3,73,1,1,0,1,2,1,2,3,0,1,2,1,67.0
1,19,64,1,2,0,4,59,1,1,2,2,2,1,0,4,0,0,1,0,61.0
2,24,98,2,2,1,3,91,2,1,2,2,2,1,1,4,0,2,2,1,74.0
3,29,89,1,2,1,4,98,2,1,1,2,2,1,0,4,0,1,1,1,71.0
4,19,92,2,2,1,2,65,2,1,3,2,0,1,1,4,0,0,2,0,70.0


In [11]:
# Data Splitting

X = df.drop(columns=['Exam_Score']).copy()
y = df['Exam_Score'].copy()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6378 entries, 0 to 6377
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   Hours_Studied               6378 non-null   int64
 1   Attendance                  6378 non-null   int64
 2   Parental_Involvement        6378 non-null   int32
 3   Access_to_Resources         6378 non-null   int32
 4   Extracurricular_Activities  6378 non-null   int32
 5   Sleep_Hours                 6378 non-null   int64
 6   Previous_Scores             6378 non-null   int64
 7   Motivation_Level            6378 non-null   int32
 8   Internet_Access             6378 non-null   int32
 9   Tutoring_Sessions           6378 non-null   int64
 10  Family_Income               6378 non-null   int32
 11  Teacher_Quality             6378 non-null   int32
 12  School_Type                 6378 non-null   int32
 13  Peer_Influence              6378 non-null   int64
 14  Physical

In [14]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((5102, 19), (1276, 19), (5102,), (1276,))

In [15]:
df.columns

Index(['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Distance_from_Home', 'Gender',
       'Exam_Score'],
      dtype='object')

In [16]:
# for col in X.columns:
#     print(col, X[col].unique())
# unique_values = {col: X[col].unique() for col in X.columns}


In [17]:
X.columns

Index(['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Distance_from_Home', 'Gender'],
      dtype='object')

In [18]:
# num_cols =  df.select_dtypes(include=['int64', 'float64']).columns.drop(['Tutoring_Sessions','Sleep_Hours','Physical_Activity','Exam_Score'])
# bin_cat_cols = ['Extracurricular_Activities', 'Internet_Access', 'School_Type', 'Peer_Influence','Peer_Influence','Learning_Disabilities','Gender']
# lab_cat_cols = ['Parental_Involvement', 'Access_to_Resources', 'Sleep_Hours', 'Motivation_Level', 'Tutoring_Sessions','Family_Income',
# 'Teacher_Quality','Physical_Activity','Parental_Education_Level','Distance_from_Home']
# ord_cat_cols = ['Family_Income']

num_cols=['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Distance_from_Home', 'Gender']

# Pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler())
])

transformer = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    # ('bin_enc', BinaryEncoder(), bin_cat_cols),
    # ('lab_cat', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), lab_cat_cols)
], remainder='passthrough')

# Model
lr = LinearRegression()
pipe = Pipeline([
    ('preprocessor', transformer),
    ('model', lr)
])



scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2')
print("Cross-validated R²: %.2f (± %.2f)" % (scores.mean(), scores.std() * 2))

Cross-validated R²: 0.65 (± 0.10)


In [19]:
import plotly.express as px

X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)
lr.fit(X_train_transformed, y_train)

feature_importance = dict(zip(transformer.get_feature_names_out(), lr.coef_))
feature_importance = pd.Series(feature_importance).sort_values()
px.bar(feature_importance, width=800, height=900, orientation='h', title='Feature Importance')

In [20]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_absolute_percentage_error

y_pred = cross_val_predict(pipe, X_train, y_train, cv=5)
mape = mean_absolute_percentage_error(y_train, y_pred)
accuracy = 1 - mape
print("Approximate Accuracy (1 - MAPE): %.2f%%" % (accuracy * 100))

Approximate Accuracy (1 - MAPE): 98.46%


In [22]:
# Save the preprocessed data and Log Price
import pickle

# Save the unprocessed data
with open('../data/unprocessed_data.pkl', 'wb') as f:
    pickle.dump((X_train, y_train, X_test, y_test), f)

with open('../data/processed_data.pkl', 'wb') as f:
    pickle.dump((X_train_transformed, y_train, X_test_transformed, y_test), f)