In [23]:
# 📌 Step 1: Data Loading & Exploration
import pickle
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings("ignore")

In [24]:
# Load the student dataset from the CSV next to the notebook.
df = pd.read_csv("new_student_dataset_25.csv")
# Drop columns whose name starts with "Unnamed" (the name pandas gives to
# header-less columns, typically an index that was written into the CSV).
# .copy() makes `df` an independent frame: without it the boolean-mask
# selection can be flagged as a copy of the raw frame, and the column
# assignments in later cells would raise SettingWithCopyWarning.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()
df

Unnamed: 0,student_id,hours_studied,previous_scores,learning_style,interest_subjects,assignments_done,test_score
0,S001,3.6,83.2,Kinesthetic,Math,92.5,76.3
1,S002,7.7,51.0,Auditory,Math,72.5,72.4
2,S003,6.1,68.3,Auditory,History,54.8,52.2
3,S004,5.2,72.6,Auditory,Math,68.5,96.6
4,S005,2.1,42.6,Auditory,English,83.4,75.9
5,S006,2.1,73.4,Auditory,Math,83.3,81.7
6,S007,1.4,49.4,Auditory,English,79.6,92.8
7,S008,7.1,43.6,Kinesthetic,English,63.7,77.5
8,S009,5.2,92.2,Kinesthetic,English,78.1,57.7
9,S010,6.0,93.1,Auditory,History,69.1,46.3


In [25]:
# 📌 Encode categorical features
# Use one LabelEncoder per column. Re-fitting a single encoder for both
# columns overwrites its learned classes, so the learning_style mapping could
# no longer be applied to new data or inverted back to the original labels.
learning_style_encoder = LabelEncoder()
interest_subject_encoder = LabelEncoder()

df['learning_style_encoded'] = learning_style_encoder.fit_transform(df['learning_style'])
df['interest_subject_encoded'] = interest_subject_encoder.fit_transform(df['interest_subjects'])

# Backward-compatible alias: `le` previously ended this cell fitted on
# `interest_subjects`, so keep that binding for any code that still uses it.
le = interest_subject_encoder

In [26]:
# Re-display the frame to check the two *_encoded columns added by the encoding cell.
df

Unnamed: 0,student_id,hours_studied,previous_scores,learning_style,interest_subjects,assignments_done,test_score,learning_style_encoded,interest_subject_encoded
0,S001,3.6,83.2,Kinesthetic,Math,92.5,76.3,1,2
1,S002,7.7,51.0,Auditory,Math,72.5,72.4,0,2
2,S003,6.1,68.3,Auditory,History,54.8,52.2,0,1
3,S004,5.2,72.6,Auditory,Math,68.5,96.6,0,2
4,S005,2.1,42.6,Auditory,English,83.4,75.9,0,0
5,S006,2.1,73.4,Auditory,Math,83.3,81.7,0,2
6,S007,1.4,49.4,Auditory,English,79.6,92.8,0,0
7,S008,7.1,43.6,Kinesthetic,English,63.7,77.5,1,0
8,S009,5.2,92.2,Kinesthetic,English,78.1,57.7,1,0
9,S010,6.0,93.1,Auditory,History,69.1,46.3,0,1


In [27]:
# 📌 Features & Target
# Predictors: the three numeric study columns plus the two label-encoded
# categorical columns. Target: the test score.
features = [
    'hours_studied',
    'previous_scores',
    'assignments_done',
    'learning_style_encoded',
    'interest_subject_encoded',
]
X = df.loc[:, features]
y = df.loc[:, 'test_score']


In [28]:
# Show the feature matrix selected above (five predictor columns, target excluded).
X

Unnamed: 0,hours_studied,previous_scores,assignments_done,learning_style_encoded,interest_subject_encoded
0,3.6,83.2,92.5,1,2
1,7.7,51.0,72.5,0,2
2,6.1,68.3,54.8,0,1
3,5.2,72.6,68.5,0,2
4,2.1,42.6,83.4,0,0
5,2.1,73.4,83.3,0,2
6,1.4,49.4,79.6,0,0
7,7.1,43.6,63.7,1,0
8,5.2,92.2,78.1,1,0
9,6.0,93.1,69.1,0,1


In [29]:
# Split the data into independent (predictor) and dependent (target) variables.
# NOTE(review): `x` selects the same five columns as `X`, and `y` is rebound
# here from the Series defined in the Features & Target cell to a one-column
# DataFrame — confirm which form downstream code expects.
x = df.loc[:, [
    'hours_studied',
    'previous_scores',
    'assignments_done',
    'learning_style_encoded',
    'interest_subject_encoded',
]]
y = df.loc[:, ['test_score']]

In [30]:
# Print how many distinct values each numeric study column holds, one count
# per line, in the order listed.
for column_name in ('hours_studied', 'previous_scores', 'assignments_done'):
    distinct_values = df[column_name].unique()
    print(len(distinct_values))

20
25
25


In [31]:
# Fit a one-hot encoder on the three numeric study columns.
# OneHotEncoder and make_column_transformer are imported with the other
# dependencies in the notebook's first cell rather than mid-notebook.
# NOTE(review): these columns are continuous (20, 25 and 25 distinct values
# per the counts printed above), so nearly every observation becomes its own
# category. Within the visible cells the encoder is only inspected through
# `ohe.categories_` and never used to transform model inputs — confirm whether
# this step is still needed.
numeric_columns = ["hours_studied", "previous_scores", "assignments_done"]
ohe = OneHotEncoder()
ohe.fit(x[numeric_columns])

In [32]:
# List the distinct values the encoder learned, one sorted array per fitted column.
ohe.categories_

[array([1.1, 1.4, 2. , 2.1, 2.3, 2.5, 3. , 3.1, 3.6, 4. , 4.2, 4.7, 5.2,
        5.3, 6. , 6.1, 6.8, 7.1, 7.7, 7.8]),
 array([41.9, 42.6, 43.6, 45.4, 46.7, 49.4, 50.2, 51. , 54.2, 56.8, 57.1,
        64.2, 67.2, 68.3, 68.6, 70.1, 72.6, 73.4, 76.4, 77.6, 83.2, 84.5,
        90. , 92.2, 93.1]),
 array([52. , 54.8, 55.5, 57. , 60.1, 61.8, 62.8, 63.7, 68.5, 69.1, 72. ,
        72.5, 73.8, 78.1, 78.2, 79.6, 83.3, 83.4, 84.8, 85.5, 86.1, 92.4,
        92.5, 94.8, 98.6])]

In [34]:
# 📌 Train Regression Model
# Standardize the feature matrix before fitting. No visible cell defines
# `scaler` or `X_scaled` (execution counts skip 33 and 35), so the notebook
# failed on a fresh kernel; fitting the scaler here restores
# Restart & Run All.
# NOTE(review): assumes the removed cell scaled all five columns of `X` with a
# default StandardScaler — confirm against the original preprocessing.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# `y` is the one-column DataFrame bound in the split cell, so the fitted
# model's predictions are 2-D with shape (n_samples, 1).
reg_model = LinearRegression()
reg_model.fit(X_scaled, y)

In [36]:
# 📌 Train KMeans Clustering
# Group the standardized feature rows into three clusters; the fixed seed
# keeps centroid initialisation repeatable between runs.
# NOTE(review): `n_init` is left at the library default, which differs between
# scikit-learn releases — consider pinning it if cluster assignments must be
# stable across environments.
kmeans_model = KMeans(
    n_clusters=3,
    random_state=42,
)
kmeans_model.fit(X_scaled)

In [37]:
# 📌 Save the models and scaler
# Pickle each fitted object to its own file in the working directory, in the
# order: regression model, clustering model, feature scaler.
artifacts = {
    "model.pkl": reg_model,
    "clustering.pkl": kmeans_model,
    "scaler.pkl": scaler,
}

for file_name, fitted_object in artifacts.items():
    with open(file_name, "wb") as handle:
        pickle.dump(fitted_object, handle)

print("✅ model.pkl, clustering.pkl, and scaler.pkl files created successfully!")

✅ model.pkl, clustering.pkl, and scaler.pkl files created successfully!
