In [1]:
import os 
import pandas as pd
import csv
import numpy as np
import joblib
l=[]

In [2]:
with open("student_habits_performance.csv",mode='r') as file:
    r=csv.reader(file)
    for i in r:
        l.append(i)

In [3]:
df=pd.DataFrame(l[1::],columns=["student_id","age","gender","study_hours_per_day","social_media_hours","netflix_hours","part_time_job","attendance_percentage","sleep_hours","diet_quality","exercise_frequency","parental_education_level","internet_quality","mental_health_rating","extracurricular_participation","exam_score"])

In [4]:
df.value_counts('extracurricular_participation')

extracurricular_participation
No     682
Yes    318
Name: count, dtype: int64

In [5]:
cols=["age","study_hours_per_day","social_media_hours","netflix_hours","attendance_percentage","sleep_hours","exam_score"]
df[cols]=df[cols].astype(float)

In [6]:
cols=["exercise_frequency","mental_health_rating"]
df[cols]=df[cols].astype(int)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   float64
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       1000 non-null   object 
 12  internet_quality               1000 non-null   ob

In [8]:
df_shuffled = df.sample(frac=1, random_state=42)
split_index=int(len(df_shuffled)*0.8)
df_train=df_shuffled.iloc[:split_index]
df_test=df_shuffled.iloc[split_index:]

In [9]:
X_train = df_train.drop(columns='exam_score').values
y_train = df_train['exam_score'].values
train = (X_train, y_train)

In [10]:
X_test = df_test.drop(columns='exam_score').values
y_test = df_test['exam_score'].values
test = (X_test, y_test)


In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
from sklearn.preprocessing import LabelEncoder
df_train_encoded=df_train.drop(columns='student_id').copy()
df_test_encoded=df_test.drop(columns='student_id').copy()
cols=['gender','part_time_job', 'diet_quality', 'internet_quality', 'parental_education_level','extracurricular_participation']
label_encoders = {}

for col in cols:
    le = LabelEncoder()
    df_train_encoded[col] = le.fit_transform(df_train_encoded[col])
    df_test_encoded[col] = le.transform(df_test_encoded[col])
    label_encoders[col] = le
joblib.dump(label_encoders, 'label_encoders.pkl')

['label_encoders.pkl']

In [13]:
X_train = df_train_encoded.drop(columns='exam_score').values.tolist()
y_train = df_train_encoded['exam_score'].tolist()

X_test = df_test_encoded.drop(columns='exam_score').values.tolist()
y_test = df_test_encoded['exam_score'].tolist()

train = (X_train, y_train)
test = (X_test, y_test)

In [14]:
model=LinearRegression()
model.fit(train[0],train[1])

In [15]:
predictions = model.predict(test[0])

In [16]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(test[1], predictions)  # y_test vs predicted
r2 = r2_score(test[1], predictions)

print("📊 Model Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

📊 Model Evaluation:
Mean Squared Error (MSE): 35.19146786829487
R² Score: 0.8685848774767356


In [17]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_test, y_test, cv=5, scoring='r2')
print("Average R²:", scores.mean())

Average R²: 0.8598156926388343


In [18]:

joblib.dump(model, 'student_score_predictor.pkl')

['student_score_predictor.pkl']

In [19]:
def preprocess_input(user_input_dict):
    label_encoders=joblib.load('label_encoders.pkl')
    df=pd.DataFrame([user_input_dict])
    df=df.drop(columns=['student_id'],errors='ignore')
    for col in cols:
        if col in df:
            le=label_encoders[col]
            df[col]=le.transform(df[col])
    return df.values.tolist()[0]

def predict_exam_score(user_input_dict):
    input_features=preprocess_input(user_input_dict)
    model=joblib.load('student_score_predictor.pkl')
    prediction=model.predict([input_features])
    return prediction[0]


In [20]:


label_encoders = joblib.load('label_encoders.pkl')

for col, encoder in label_encoders.items():
    print(f"{col}: {encoder.classes_}")


gender: ['Female' 'Male' 'Other']
part_time_job: ['No' 'Yes']
diet_quality: ['Fair' 'Good' 'Poor']
internet_quality: ['Average' 'Good' 'Poor']
parental_education_level: ['Bachelor' 'High School' 'Master' 'None']
extracurricular_participation: ['No' 'Yes']
