In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the two raw student-performance CSVs (semicolon-delimited files):
# one for the Math course and one for the Portuguese course.
math_df = pd.read_csv(
    'student+performance/student/student-mat.csv',
    sep=';',
)
por_df = pd.read_csv(
    'student+performance/student/student-por.csv',
    sep=';',
)

In [3]:
# Tag every row with the course it came from so the origin survives the
# concatenation below. The label-encoding cell later turns these strings
# into integers (Math -> 0, Portuguese -> 1).
for frame, subject_name in ((math_df, 'Math'), (por_df, 'Portuguese')):
    frame['subject'] = subject_name

In [4]:
# Stack the Math rows on top of the Portuguese rows and renumber the index
# from 0 so row labels are unique in the combined frame.
combined_df = pd.concat((math_df, por_df), axis=0, ignore_index=True)
# Preview the first five rows.
combined_df.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,subject
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,Math
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,Math
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,Math
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,Math
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,Math


In [5]:
# Create a binary classification target: pass = 1 when G3 >= 10, else 0.
# Uses a vectorized comparison rather than a per-row Python lambda; the
# explicit int64 cast keeps the same integer dtype that apply() produced.
combined_df['pass'] = (combined_df['G3'] >= 10).astype('int64')

In [6]:
# Integer-encode every object-dtype column of combined_df in place.
# Each fitted encoder is stored in label_encoders under its column name,
# so the integer codes can be traced back to the original labels.
label_encoders = {}
categorical_cols = combined_df.select_dtypes(include='object').columns
for col in categorical_cols:
    le = LabelEncoder().fit(combined_df[col])
    combined_df[col] = le.transform(combined_df[col])
    label_encoders[col] = le

In [7]:
# Separate the two prediction targets from the model inputs.
#   target_reg: final grade G3 (regression target)
#   target_cls: binary pass flag (classification target)
# The feature matrix excludes both targets, and also G1 and G2, which are
# dropped to avoid leakage.
target_reg = combined_df['G3']
target_cls = combined_df['pass']
features = combined_df.drop(columns=['G1', 'G2', 'G3', 'pass'])

In [8]:
# Persist the cleaned data for later steps: the feature columns followed by
# the G3 and pass target columns, written without the index.
# ('pass' is a Python keyword, so it is supplied through dict unpacking.)
processed_df = features.assign(G3=target_reg, **{'pass': target_cls})
processed_df.to_csv(
    'student+performance/student/student_combined_processed.csv',
    index=False,
)

In [9]:
# Sanity checks: report the feature-matrix shape and the first five values
# of each target.
reg_preview = target_reg.head().tolist()
cls_preview = target_cls.head().tolist()

print("Preprocessing complete!")
print("Features shape:", features.shape)
print("Sample regression targets:", reg_preview)
print("Sample classification targets:", cls_preview)

Preprocessing complete!
Features shape: (1044, 31)
Sample regression targets: [6, 6, 10, 15, 10]
Sample classification targets: [0, 0, 1, 1, 1]
