#### 2️⃣ 🔧  Data Transformation & Feature Engineering
<small>

⚡ **Goal:** Make data model-ready and prevent leakage.  

-  **Encoding:** One-hot encode categorical features (school, sex, address, Mjob, etc.).  
-  **Scaling:** Standardize numeric variables.  
-  **Feature Engineering:**  
  - Attendance proxy (from absences).  
  - Average grade (mean of G1–G3).  
  - Risk target → 3-class (Low, Medium, High).  
-  **Leakage Control:**  
  - Variant A: Include G1 & G2 → strong predictors.  
  - Variant B: Exclude G1 & G2 → early-stage scenario.  

---

In [2]:
# ===============================
# 📚 Essential Libraries for Project
# ===============================

# Data handling
import pandas as pd
import numpy as np

# Fetch UCI ML Repository datasets
from ucimlrepo import fetch_ucirepo

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import plotly.graph_objects as go
import missingno as msno

# Handle Warning
import warnings
warnings.filterwarnings("ignore")


# Machine Learning (Supervised & Unsupervised)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import (
    LogisticRegression,
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
)
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    AdaBoostRegressor,
)
from sklearn.svm import SVC, SVR
from sklearn.cluster import KMeans
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    silhouette_score,
    make_scorer,
    f1_score,
    precision_score,
    recall_score,
    mean_squared_error,
    r2_score,
    mean_absolute_error,
)


from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Dimensionality Reduction & Feature Selection
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2

# Stats & Hypothesis Testing
import scipy.stats as stats

# Dashboard
import streamlit as st

# Save Models
from joblib import dump, load
from pickle import dump, load


# Set style for consistent plotting
plt.style.use("default")
sns.set_palette("husl")

In [5]:
# Load cleaned data
df = pd.read_csv("student_data_cleaned.csv")

In [6]:
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [10]:
# Identify column types
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [11]:
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")

Categorical columns (17): ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
Numerical columns (16): ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']


In [None]:
# Convert appropriate columns to categories for memory efficiency
for col in categorical_cols:
    df[col] = df[col].astype("category")

print(f"Optimized categorical columns for memory efficiency")

Optimized categorical columns for memory efficiency


In [None]:
# Check data types
df.dtypes

school        category
sex           category
age              int64
address       category
famsize       category
Pstatus       category
Medu             int64
Fedu             int64
Mjob          category
Fjob          category
reason        category
guardian      category
traveltime       int64
studytime        int64
failures         int64
schoolsup     category
famsup        category
paid          category
activities    category
nursery       category
higher        category
internet      category
romantic      category
famrel           int64
freetime         int64
goout            int64
Dalc             int64
Walc             int64
health           int64
absences         int64
G1               int64
G2               int64
G3               int64
dtype: object

2.1 ONE-HOT ENCODING 

In [15]:
# List of categorical columns to encode
cats_to_encode = [col for col in categorical_cols if col in df.columns]
print(f"Columns to encode: {cats_to_encode}")

Columns to encode: ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [16]:
# Apply one-hot encoding
df_encoded = pd.get_dummies(
    df, columns=cats_to_encode, prefix=cats_to_encode, drop_first=True
)

print(f"Shape before encoding: {df.shape}")
print(f"Shape after encoding: {df_encoded.shape}")
print(f"New features created: {df_encoded.shape[1] - df.shape[1]}")

Shape before encoding: (649, 33)
Shape after encoding: (649, 42)
New features created: 9


In [59]:
df_encoded.sample(3)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,internet_yes,romantic_yes,attendance_rate,grade_avg,pass_binary,risk_category,study_efficiency,has_failures,family_edu_avg,family_edu_max
191,17,3,3,1,2,0,4,3,4,1,...,True,False,0.733333,10.0,1,Medium_Risk,3.333333,0,3.0,3
244,17,4,3,1,2,0,3,2,3,1,...,True,True,1.0,12.666667,1,Medium_Risk,4.222222,0,3.5,4
270,16,4,4,1,1,0,5,3,2,1,...,True,False,0.733333,14.666667,1,Low_Risk,7.333333,0,4.0,4


2.2 FEATURE ENGINEERING

In [None]:
# Create attendance proxy from absences
df_encoded["attendance_rate"] = 1 - (
    df_encoded["absences"] / df_encoded["absences"].max()
)
print("Created 'attendance_rate' feature")

Created 'attendance_rate' feature


In [None]:
# Create average grade from G1, G2, G3
if all(col in df_encoded.columns for col in ["G1", "G2", "G3"]):
    df_encoded["grade_avg"] = (
        df_encoded["G1"] + df_encoded["G2"] + df_encoded["G3"]
    ) / 3
    print("Created 'grade_avg' feature")

Created 'grade_avg' feature


In [25]:
# Create binary pass/fail target
df_encoded["pass_binary"] = (df_encoded["G3"] >= 10).astype(int)
pass_rate = df_encoded["pass_binary"].mean()
print(f"Created 'pass_binary' target (pass rate: {pass_rate:.1%})")

Created 'pass_binary' target (pass rate: 84.6%)


In [26]:
# Create 3-tier risk classification
def create_risk_categories(g3_score):
    if g3_score >= 14:
        return "Low_Risk"
    elif g3_score >= 10:
        return "Medium_Risk"
    else:
        return "High_Risk"

In [None]:
# Create risk category from G3
df_encoded['risk_category'] = df_encoded['G3'].apply(create_risk_categories)
risk_dist = df_encoded['risk_category'].value_counts(normalize=True)

In [None]:
# Display risk category distribution
print(f"Created 'risk_category' target:")
for risk, percent in risk_dist.items():
    print(f"{risk}: {percent:.1%}")

Created 'risk_category' target:
Medium_Risk: 54.7%
Low_Risk: 29.9%
High_Risk: 15.4%


In [None]:
# Create study efficiency metric
df_encoded["study_efficiency"] = df_encoded["grade_avg"] / (
    df_encoded["studytime"] + 1
)  # +1 to avoid division by zero
print(" Created 'study_efficiency' feature")

Created 'study_efficiency' feature


In [None]:
# Create failure history indicator
df_encoded["has_failures"] = (df_encoded["failures"] > 0).astype(int)
print("Created 'has_failures' feature")

Created 'has_failures' feature


In [None]:
# Create family education level (combine Medu and Fedu)
df_encoded["family_edu_avg"] = (df_encoded["Medu"] + df_encoded["Fedu"]) / 2
df_encoded["family_edu_max"] = df_encoded[["Medu", "Fedu"]].max(axis=1)

print(f"\nFinal shape after feature engineering: {df_encoded.shape}")


Final shape after feature engineering: (649, 50)


In [36]:
df_encoded.sample(5)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,internet_yes,romantic_yes,attendance_rate,grade_avg,pass_binary,risk_category,study_efficiency,has_failures,family_edu_avg,family_edu_max
408,18,2,2,1,2,0,4,4,4,1,...,True,False,0.266667,10.666667,1,Medium_Risk,3.555556,0,2.0,2
188,16,3,3,2,2,0,4,4,5,1,...,True,False,1.0,14.333333,1,Low_Risk,4.777778,0,3.0,3
299,20,2,1,2,2,0,1,2,3,1,...,True,True,0.466667,11.333333,1,Medium_Risk,3.777778,0,1.5,2
82,15,3,2,1,2,0,4,4,4,1,...,True,False,0.733333,11.333333,1,Medium_Risk,3.777778,0,2.5,3
283,16,3,3,3,2,1,5,3,3,1,...,True,False,0.733333,10.0,1,Medium_Risk,3.333333,1,3.0,3


In [37]:
# Display number of rows and columns
print('NNumber of rows:', df_encoded.shape[0])
print('Number of columns:', df_encoded.shape[1])

NNumber of rows: 649
Number of columns: 50


2.3 PREPARE DATASETS FOR ANALYSIS

In [None]:
# Dataset Without G1/G2 (no leakage)
leakage_cols = ["G1", "G2", "grade_avg"]  # grade_avg contains G1/G2
df_no_leakage = df_encoded.drop(
    columns=[col for col in leakage_cols if col in df_encoded.columns]
)

In [None]:
# Dataset With G1/G2 (potential leakage)
df_with_leakage = df_encoded.copy()

print(f"Dataset WITHOUT G1/G2 leakage: {df_no_leakage.shape}")
print(f"Dataset WITH G1/G2 included: {df_with_leakage.shape}")

Dataset WITHOUT G1/G2 leakage: (649, 47)
Dataset WITH G1/G2 included: (649, 50)


2.4 FEATURE SCALING FOR ML 

In [40]:
def prepare_features_and_targets(df, target_col="G3", exclude_cols=None):
    """Prepare features and targets with scaling"""
    if exclude_cols is None:
        exclude_cols = ["G3", "pass_binary", "risk_category"]

    # Separate features from targets
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    X = df[feature_cols]

    # Get numerical columns for scaling
    numerical_features = X.select_dtypes(include=["int64", "float64"]).columns

    # Scale numerical features
    scaler = StandardScaler()
    X_scaled = X.copy()
    X_scaled[numerical_features] = scaler.fit_transform(X[numerical_features])

    # Prepare targets
    y_continuous = df[target_col] if target_col in df.columns else None
    y_binary = df["pass_binary"] if "pass_binary" in df.columns else None
    y_multiclass = df["risk_category"] if "risk_category" in df.columns else None

    return X_scaled, y_continuous, y_binary, y_multiclass, scaler

In [41]:
print("Preparing dataset WITHOUT G1/G2 leakage...")
X_no_leak, y_cont_no_leak, y_bin_no_leak, y_multi_no_leak, scaler_no_leak = (
    prepare_features_and_targets(df_no_leakage)
)

Preparing dataset WITHOUT G1/G2 leakage...


In [42]:
print("Preparing dataset WITH G1/G2...")
X_with_leak, y_cont_with_leak, y_bin_with_leak, y_multi_with_leak, scaler_with_leak = (
    prepare_features_and_targets(df_with_leakage)
)

Preparing dataset WITH G1/G2...


In [43]:
print(f"Features (no leakage): {X_no_leak.shape}")
print(f"Features (with G1/G2): {X_with_leak.shape}")

Features (no leakage): (649, 44)
Features (with G1/G2): (649, 47)


2.5 CLUSTERING FEATURE SELECTION

In [44]:
# Select behavioral features for clustering
clustering_features = ["studytime", "absences", "goout", "freetime", "attendance_rate"]

In [45]:
# Add encoded versions of categorical behavioral features
behavioral_cats = [
    "famsup",
    "schoolsup",
    "paid",
    "activities",
    "higher",
    "internet",
    "romantic",
]
for cat in behavioral_cats:
    # Find encoded columns that start with the category name
    encoded_cols = [col for col in df_encoded.columns if col.startswith(f"{cat}_")]
    clustering_features.extend(encoded_cols)

In [46]:
# Select available features
available_clustering_features = [
    col for col in clustering_features if col in df_encoded.columns
]
print(f"Selected clustering features ({len(available_clustering_features)}):")
for feat in available_clustering_features:
    print(f"  - {feat}")

Selected clustering features (12):
  - studytime
  - absences
  - goout
  - freetime
  - attendance_rate
  - famsup_yes
  - schoolsup_yes
  - paid_yes
  - activities_yes
  - higher_yes
  - internet_yes
  - romantic_yes


In [47]:
# Prepare clustering dataset
X_clustering = df_encoded[available_clustering_features].copy()

In [48]:
# Scale clustering features
scaler_clustering = StandardScaler()
X_clustering_scaled = pd.DataFrame(
    scaler_clustering.fit_transform(X_clustering),
    columns=X_clustering.columns,
    index=X_clustering.index,
)

In [49]:
print(f"Clustering dataset prepared: {X_clustering_scaled.shape}")

Clustering dataset prepared: (649, 12)


In [66]:
X_clustering_scaled.sample(3)

Unnamed: 0,studytime,absences,goout,freetime,attendance_rate,famsup_yes,schoolsup_yes,paid_yes,activities_yes,higher_yes,internet_yes,romantic_yes
305,-1.122808,1.09974,0.693785,0.780478,-1.09974,-1.259229,-0.34211,-0.252853,-0.97114,-2.899275,0.550648,-0.763496
93,0.083653,-0.369851,-0.15738,-0.171647,0.369851,0.794137,-0.34211,-0.252853,1.029717,0.344914,0.550648,-0.763496
408,0.083653,1.834535,0.693785,0.780478,-1.834535,0.794137,-0.34211,-0.252853,1.029717,-2.899275,0.550648,-0.763496


2.6 TRAIN-TEST SPLITS

In [50]:
# No leakage dataset
X_train_no_leak, X_test_no_leak, y_train_no_leak, y_test_no_leak = train_test_split(
    X_no_leak,
    y_bin_no_leak,
    test_size=.2,
    random_state=42,
    stratify=y_bin_no_leak,
)

In [51]:
# With leakage dataset
X_train_with_leak, X_test_with_leak, y_train_with_leak, y_test_with_leak = (
    train_test_split(
        X_with_leak,
        y_bin_with_leak,
        test_size=.2,
        random_state=42,
        stratify=y_bin_with_leak,
    )
)

In [52]:
print(f"  No leakage - Train: {X_train_no_leak.shape}, Test: {X_test_no_leak.shape}")
print(f"  With G1/G2 - Train: {X_train_with_leak.shape}, Test: {X_test_with_leak.shape}")

  No leakage - Train: (519, 44), Test: (130, 44)
  With G1/G2 - Train: (519, 47), Test: (130, 47)


2.7 SAVE TRANSFORMED DATASETS

In [53]:
# Save main datasets
df_no_leakage.to_csv("student_data_no_leakage.csv", index=False)
df_with_leakage.to_csv("student_data_with_leakage.csv", index=False)

In [54]:
# Save clustering data
X_clustering_scaled.to_csv("student_data_clustering.csv", index=False)

In [55]:
# Save train-test splits
X_train_no_leak.to_csv("X_train_no_leak.csv", index=False)
X_test_no_leak.to_csv("X_test_no_leak.csv", index=False)
y_train_no_leak.to_csv("y_train_no_leak.csv", index=False)
y_test_no_leak.to_csv("y_test_no_leak.csv", index=False)

X_train_with_leak.to_csv("X_train_with_leak.csv", index=False)
X_test_with_leak.to_csv("X_test_with_leak.csv", index=False)
y_train_with_leak.to_csv("y_train_with_leak.csv", index=False)
y_test_with_leak.to_csv("y_test_with_leak.csv", index=False)

print("All transformed datasets saved successfully")

All transformed datasets saved successfully


In [67]:
# 10. TRANSFORMATION SUMMARY
# ==========================

print("\n" + "=" * 70)
print("DATA TRANSFORMATION SUMMARY")
print("=" * 70)

print(f"\nORIGINAL DATASET:")
print(f"- Shape: {df.shape}")
print(f"- Categorical features: {len(categorical_cols)}")
print(f"- Numerical features: {len(numerical_cols)}")

print(f"\nTRANSFORMED DATASET:")
print(f"- Shape: {df_encoded.shape}")
print(f"- Total features: {df_encoded.shape[1]}")
print(f"- New features created: {df_encoded.shape[1] - df.shape[1]}")

print(f"\nFEATURE ENGINEERING RESULTS:")
engineered_features = [
    "attendance_rate",
    "grade_avg",
    "pass_binary",
    "risk_category",
    "study_efficiency",
    "has_failures",
    "family_edu_avg",
    "family_edu_max",
]
available_engineered = [f for f in engineered_features if f in df_encoded.columns]
print(f"- Features created: {len(available_engineered)}")
for feat in available_engineered:
    print(f"  • {feat}")

print(f"\nTARGET VARIABLES:")
print(f"- G3 (continuous): {df_encoded['G3'].min():.0f}-{df_encoded['G3'].max():.0f}")
print(f"- Pass/Fail (binary): {df_encoded['pass_binary'].value_counts().to_dict()}")
print(f"- Risk categories: {df_encoded['risk_category'].value_counts().to_dict()}")

print(f"\nDATA LEAKAGE ANALYSIS:")
print(f"- Dataset without G1/G2: {df_no_leakage.shape[1]} features")
print(f"- Dataset with G1/G2: {df_with_leakage.shape[1]} features")
print(
    f"- Feature difference: {df_with_leakage.shape[1] - df_no_leakage.shape[1]} features"
)

print(f"\nCLUSTERING PREPARATION:")
print(f"- Behavioral features selected: {len(available_clustering_features)}")
print(f"- Clustering dataset shape: {X_clustering_scaled.shape}")


print("\n" + "=" * 70)
print("DATA TRANSFORMATION COMPLETE ✓")
print("=" * 70)


DATA TRANSFORMATION SUMMARY

ORIGINAL DATASET:
- Shape: (649, 33)
- Categorical features: 17
- Numerical features: 16

TRANSFORMED DATASET:
- Shape: (649, 50)
- Total features: 50
- New features created: 17

FEATURE ENGINEERING RESULTS:
- Features created: 8
  • attendance_rate
  • grade_avg
  • pass_binary
  • risk_category
  • study_efficiency
  • has_failures
  • family_edu_avg
  • family_edu_max

TARGET VARIABLES:
- G3 (continuous): 0-19
- Pass/Fail (binary): {1: 549, 0: 100}
- Risk categories: {'Medium_Risk': 355, 'Low_Risk': 194, 'High_Risk': 100}

DATA LEAKAGE ANALYSIS:
- Dataset without G1/G2: 47 features
- Dataset with G1/G2: 50 features
- Feature difference: 3 features

CLUSTERING PREPARATION:
- Behavioral features selected: 12
- Clustering dataset shape: (649, 12)

DATA TRANSFORMATION COMPLETE ✓
