In [None]:
# Install the third-party packages this notebook imports.
# %pip (rather than !pip) targets the environment of the running kernel.
# NOTE(review): versions are unpinned — consider pinning them for reproducibility.
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib
%pip install joblib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Scikit-learn components
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.impute import SimpleImputer

In [None]:
# For saving/loading the model
import joblib

In [None]:
# Load the dataset
# Both CSV files are read with ';' as the field separator.
mat_df = pd.read_csv("Data/PerformanceCheck/student-mat.csv", sep=';')
por_df = pd.read_csv("Data/PerformanceCheck/student-por.csv", sep=';')

# Only the last expression of a cell is rendered automatically, so the preview
# needs an explicit display() call (made available by the IPython kernel);
# a bare `mat_df.head()` in the middle of the cell is evaluated and discarded.
display(mat_df.head())
mat_df.shape, por_df.shape  # (rows, columns) of each frame

In [None]:
# Data exploration
# Concise summary of the math frame: column names, dtypes and non-null counts
mat_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
mat_df.describe()

In [None]:
# Frequency of each final-grade value (G3), ordered by grade rather than by count
mat_df['G3'].value_counts().sort_index()

In [None]:
# Distribution of the final math grade (G3)
# G3 appears to be an integer grade on a 0-20 scale, so discrete=True is used:
# it gives every grade a unit-width bar centred on its integer value. The
# previous bins=21 split the observed range into 21 equal slices, whose edges
# do not line up with the integer grades.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(mat_df['G3'], discrete=True, kde=False, ax=ax)
ax.set_xlabel('Final grade (G3, 0–20)')  # en dash restored; it was mis-encoded
ax.set_ylabel('Count')
ax.set_title('Distribution of final Math grades')
plt.show()

In [None]:
# Binary classification target derived from the final grade:
#   pass_math = 1 when G3 >= 10
#   pass_math = 0 when G3 <  10
# assign() returns a new frame, so mat_df is rebound to a fresh object that
# carries the extra column (no accidental view issues).
mat_df = mat_df.assign(
    pass_math=lambda frame: frame['G3'].ge(10).astype(int)
)

# Preview the grade next to the label derived from it
mat_df.loc[:, ['G3', 'pass_math']].head(10)

In [None]:
# Check class balance: fraction (not raw count) of rows in each pass_math class
mat_df['pass_math'].value_counts(normalize=True)

In [None]:
# Build the feature matrix X and the target vector y.
# G3 defines the label and pass_math is the label itself, so keeping either
# would leak the target. G1 and G2 are left out as well — presumably the
# earlier-period grades; confirm against the dataset description.
non_feature_cols = {'G1', 'G2', 'G3', 'pass_math'}
feature_cols = [name for name in mat_df.columns if name not in non_feature_cols]

y = mat_df['pass_math']
X = mat_df[feature_cols]

X.head()

In [None]:
# Identify numeric vs categorical columns.
# Splitting on "number" vs "everything else" puts every feature in exactly one
# list. Matching the literal dtypes 'int64' / 'float64' / 'object' can miss
# columns (other integer/float widths, pandas' dedicated string dtype), and a
# column that is in neither list is silently dropped by the ColumnTransformer
# built from these lists (its default is remainder='drop').
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

numeric_cols, categorical_cols

In [None]:
# Train/test split: hold out 20% of the rows for evaluation.
# stratify=y keeps the pass/fail proportions the same in both parts, and the
# fixed random_state makes the shuffle reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X_train.shape, X_test.shape

In [None]:
# Build the model pipeline

# Numeric features: standardize (zero mean, unit variance)
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical features: one-hot encode; a category not seen during fit is
# ignored at transform time instead of raising an error
categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full pipeline: preprocessing followed by the classifier
log_reg_clf = Pipeline([
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(max_iter=1000)),
])

In [None]:
# Train the model: fits the preprocessing steps and then the classifier,
# using the training split only
log_reg_clf.fit(X_train, y_train)

In [None]:
# Inspect the class labels stored on the fitted classifier step
# (pass_math was built as 0/1 integers, so these are the fail/pass labels)
log_reg_clf.named_steps['clf'].classes_