 ## Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [2]:
# Load the processed dataset; every later cell reads it through `df`.
data_path = '../data/processed_data.csv'
try:
    df = pd.read_csv(data_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): the failure then shows up as a normal
    # traceback naming the path that was actually tried, and no later cell
    # can run against an undefined `df`.
    raise FileNotFoundError(
        f"'{data_path}' not found. Check the path relative to the notebook's working directory."
    ) from err
print("CSV file loaded successfully!")

CSV file loaded successfully!


In [3]:
# Preview the first rows of the loaded frame to sanity-check its columns.
print(df.head())

   anxiety_level  self_esteem  mental_health_history  depression  headache  \
0             14           20                      0          11         2   
1             15            8                      1          15         5   
2             12           18                      1          14         2   
3             16           12                      1          15         4   
4             16           28                      0           7         2   

   blood_pressure  sleep_quality  breathing_problem  noise_level  \
0               1              2                  4            2   
1               3              1                  4            3   
2               1              2                  2            2   
3               3              1                  3            4   
4               3              5                  1            3   

   living_conditions  ...  basic_needs  academic_performance  study_load  \
0                  3  ...            2        

### Define student performance metrics using the most important features

In [8]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd

# STEP 1: hand-picked predictor columns used for the rest of the notebook
top_features = [
    'anxiety_level',
    'stress_level',
    'headache',
    'safety',
    'blood_pressure',
    'teacher_student_relationship',
    'bullying',
    'academic_performance',
    'extracurricular_activities',
]

# STEP 2: rescale the predictors together with the target column
# ('sleep_quality') using MinMaxScaler, so everything shares one scale.
# NOTE(review): the scaler is fit on every row here, i.e. before the
# train/test split done further down — confirm that is acceptable.
scaled_columns = top_features + ['sleep_quality']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[scaled_columns])
df_scaled = pd.DataFrame(scaled_values, columns=scaled_columns)

print(df_scaled.head())

   anxiety_level  stress_level  headache  safety  blood_pressure  \
0       0.666667           0.5       0.4     0.6             0.0   
1       0.714286           1.0       1.0     0.4             1.0   
2       0.571429           0.5       0.4     0.6             0.0   
3       0.761905           1.0       0.8     0.4             1.0   
4       0.761905           0.5       0.4     0.8             1.0   

   teacher_student_relationship  bullying  academic_performance  \
0                           0.6       0.4                   0.6   
1                           0.2       1.0                   0.2   
2                           0.6       0.4                   0.4   
3                           0.2       1.0                   0.4   
4                           0.2       1.0                   0.8   

   extracurricular_activities  sleep_quality  
0                         0.6            0.4  
1                         1.0            0.2  
2                         0.4            0.4  


In [10]:
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
# STEP 3: feature engineering on the already-scaled columns.

# Two-column sums: composite name -> the pair of columns that are added.
composite_sums = {
    'mentalHealth': ('anxiety_level', 'stress_level'),
    'relationship': ('teacher_student_relationship', 'academic_performance'),
    'phihealth': ('headache', 'blood_pressure'),
}
for composite_name, (left_col, right_col) in composite_sums.items():
    df_scaled[composite_name] = df_scaled[left_col] + df_scaled[right_col]

# Straight copies under a new name.
df_scaled['acedemic_positive'] = df_scaled['extracurricular_activities']
df_scaled['acedemic_negative'] = df_scaled['bullying']

# Hand-weighted score: academic and social columns count positively,
# stress, anxiety and headache count negatively.
performance_weights = {
    'academic_performance': 0.3,           # direct academic
    'teacher_student_relationship': 0.2,   # social
    'stress_level': -0.2,                  # mental pressure
    'anxiety_level': -0.2,
    'headache': -0.1,                      # health symptom
}
df_scaled['student_performance_weighted'] = sum(
    weight * df_scaled[column] for column, weight in performance_weights.items()
)

# Unsupervised score: first principal component of three composites.
pca_inputs = ['mentalHealth', 'relationship', 'phihealth']
pca = PCA(n_components=1)
pca_scores = pca.fit_transform(df_scaled[pca_inputs])
df_scaled['student_performance_pca'] = pca_scores

# Supervised score: first PLS component of all five composites, fitted
# against 'academic_performance'. Note that 'relationship' already contains
# 'academic_performance' as one of its two summands.
# NOTE(review): PCA and PLS are both fit on every row, before the
# train/test split used by the models below — confirm this is intended.
pls_inputs = pca_inputs + ['acedemic_positive', 'acedemic_negative']
pls = PLSRegression(n_components=1)
pls_scores = pls.fit_transform(df_scaled[pls_inputs], df_scaled[['academic_performance']])
df_scaled['student_performance_pls'] = pls_scores[0]  # first element = X scores

print(df_scaled.head())

   anxiety_level  stress_level  headache  safety  blood_pressure  \
0       0.666667           0.5       0.4     0.6             0.0   
1       0.714286           1.0       1.0     0.4             1.0   
2       0.571429           0.5       0.4     0.6             0.0   
3       0.761905           1.0       0.8     0.4             1.0   
4       0.761905           0.5       0.4     0.8             1.0   

   teacher_student_relationship  bullying  academic_performance  \
0                           0.6       0.4                   0.6   
1                           0.2       1.0                   0.2   
2                           0.6       0.4                   0.4   
3                           0.2       1.0                   0.4   
4                           0.2       1.0                   0.8   

   extracurricular_activities  sleep_quality  mentalHealth  relationship  \
0                         0.6            0.4      1.166667           1.2   
1                         1.0       

## RandomForestRegressor model
- ✅ When it's better:
- You want higher accuracy.
- You have complex or noisy data.

🚗 How it's better:
It uses multiple trees and averages them for better performance and less overfitting.

Complex data, better accuracy	Averages many trees for robust predictions

In [12]:
# Imports first, so the cell never uses a name before the line that imports it.
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Model input: the single PLS-based composite score.
# Target: the scaled 'sleep_quality' column.
X = df_scaled[['student_performance_pls']]
y = df_scaled['sleep_quality']

# 80/20 train-test split with a fixed seed; later model cells reuse these four names.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest tuned with a randomized search:
# the grid holds 2*2*2*2 = 16 combinations, of which n_iter=5 are tried with 3-fold CV.
rf = RandomForestRegressor(random_state=42)
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}
rf_search = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=5, cv=3, random_state=42)
rf_search.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test rows.
rf_best = rf_search.best_estimator_
y_pred_rf = rf_best.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"🌲RandomForest MAE: {mae_rf:.3f}, R²: {r2_rf:.3f}")



🌲RandomForest MAE: 0.126, R²: 0.604


## Gradient Boosting Regressor
- When it's better:
- You need state-of-the-art performance.
- You’re okay with a bit more training time.

🚗 How it's better:
It learns from mistakes of previous trees step-by-step, often giving top accuracy.

High performance, Learns from previous errors, better accuracy

In [96]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

# Base estimator; the seed keeps the boosting runs repeatable.
model_gb = GradientBoostingRegressor(random_state=42)

# Exhaustive search space: six hyper-parameters with three values each,
# i.e. 3**6 = 729 candidates, each cross-validated 5 times below.
param_grid = dict(
    n_estimators=[100, 200, 300],        # number of boosting stages (trees)
    learning_rate=[0.01, 0.05, 0.1],     # contribution of each tree
    max_depth=[3, 5, 7],                 # maximum depth of each tree
    min_samples_split=[2, 5, 10],        # samples required to split a node
    min_samples_leaf=[1, 2, 4],          # samples required at a leaf
    subsample=[0.8, 0.9, 1.0],           # fraction of rows used per tree
)

# Grid search over every combination, using all CPU cores.
grid_search = GridSearchCV(model_gb, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")

# Evaluate the refit best model on the held-out test rows.
gb_best = grid_search.best_estimator_
y_pred_gb = gb_best.predict(X_test)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f"🚀 GradientBoosting MAE: {mae_gb:.3f}, R²: {r2_gb:.3f}")



Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best parameters found: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300, 'subsample': 1.0}
🚀 GradientBoosting MAE: 0.128, R²: 0.637


## XGBoost / LightGBM
- When it's better:
You're dealing with large datasets.

You need fast and accurate results.

🚗 How it's better:
These models are super fast and often win machine learning competitions for regression tasks.

Big data, need for speed and power	Extremely fast and accurate

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Same single feature, target and split as the other model cells.
X = df_scaled[['student_performance_pls']]
y = df_scaled['sleep_quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Early stopping needs a monitoring set. Carve it out of the training rows so
# the test set is only touched once, for the final score. Separate names are
# used so X_train / y_train stay intact for the cells that follow.
X_fit_xgb, X_val_xgb, y_fit_xgb, y_val_xgb = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

xgb = XGBRegressor(
    n_estimators=1000,          # upper bound; early stopping decides the real count
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    early_stopping_rounds=50,   # constructor argument; requires xgboost >= 1.6
    random_state=42
)

# Stop adding trees once the validation score has not improved for 50 rounds.
xgb.fit(X_fit_xgb, y_fit_xgb,
        eval_set=[(X_val_xgb, y_val_xgb)],
        verbose=False)

# Final, untouched evaluation on the test rows.
y_pred_xgb = xgb.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"🏎️XGBoost (EarlyStop) MAE: {mae_xgb:.3f}, R²: {r2_xgb:.3f}")


In [None]:
from lightgbm import LGBMRegressor, early_stopping
from sklearn.model_selection import train_test_split

# Validation rows for early stopping come out of the training data, so the
# test set is used only for the final score. Separate names keep
# X_train / y_train intact for the cells that follow.
X_fit_lgbm, X_val_lgbm, y_fit_lgbm, y_val_lgbm = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

lgbm = LGBMRegressor(
    n_estimators=1000,      # upper bound; early stopping decides the real count
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    subsample_freq=1,       # row subsampling is only applied when this is non-zero
    colsample_bytree=0.8,
    random_state=42
)

# Stop once the validation metric has not improved for 50 rounds.
lgbm.fit(X_fit_lgbm, y_fit_lgbm,
         eval_set=[(X_val_lgbm, y_val_lgbm)],
         callbacks=[early_stopping(stopping_rounds=50, verbose=False)])

# Final evaluation on the held-out test rows.
y_pred_lgbm = lgbm.predict(X_test)
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
r2_lgbm = r2_score(y_test, y_pred_lgbm)

print(f"💡LightGBM MAE: {mae_lgbm:.3f}, R²: {r2_lgbm:.3f}")


## Ridge Regression
- When it's better:
Your dataset has many features (columns).

There's multicollinearity (features are highly correlated).

🚲 How it's better:
Ridge adds a penalty to large coefficients, preventing overfitting and making the model more stable.

    Many features, correlated data	Prevents overfitting, stabilizes coefficients

In [None]:
from sklearn.linear_model import RidgeCV

# Candidate regularisation strengths; RidgeCV picks one with 3-fold CV.
ridge_alphas = [0.01, 0.1, 1.0, 10.0, 100.0]
ridge = RidgeCV(alphas=ridge_alphas, cv=3)
ridge.fit(X_train, y_train)

# Score on the held-out test rows.
y_pred_ridge = ridge.predict(X_test)
mae_ridge, r2_ridge = (
    mean_absolute_error(y_test, y_pred_ridge),
    r2_score(y_test, y_pred_ridge),
)

print(f"🏔️RidgeCV MAE: {mae_ridge:.3f}, R²: {r2_ridge:.3f}")


## Lasso Regression 
✅ When it's better:
You have many irrelevant or noisy features.

You want feature selection built-in.

🚲 How it's better:
Lasso automatically shrinks some coefficients to zero, removing unimportant features.

    Feature selection	Automatically removes unimportant variables

In [None]:
from sklearn.linear_model import LassoCV

# Candidate regularisation strengths; LassoCV picks one with 3-fold CV.
lasso_alphas = [0.001, 0.01, 0.1, 1.0, 10.0]
lasso = LassoCV(alphas=lasso_alphas, cv=3, random_state=42)
lasso.fit(X_train, y_train)

# Score on the held-out test rows.
y_pred_lasso = lasso.predict(X_test)
mae_lasso, r2_lasso = (
    mean_absolute_error(y_test, y_pred_lasso),
    r2_score(y_test, y_pred_lasso),
)

print(f"⚙️LassoCV MAE: {mae_lasso:.3f}, R²: {r2_lasso:.3f}")


## ElasticNet 
✅ When it's better:
You want a balance of Ridge's stability and Lasso's feature selection.

Data is high-dimensional (many columns).

🚲 How it's better:
ElasticNet combines the strengths of Ridge and Lasso into one model.

A mix of Ridge and Lasso	Balanced regularization and selection

In [None]:
from sklearn.linear_model import ElasticNetCV

# Search space: regularisation strength and the L1/L2 mixing ratio,
# selected jointly by ElasticNetCV with 3-fold CV.
elastic_alphas = [0.01, 0.1, 1.0]
elastic_l1_ratios = [0.2, 0.5, 0.8]
elastic = ElasticNetCV(
    alphas=elastic_alphas,
    l1_ratio=elastic_l1_ratios,
    cv=3,
    random_state=42,
)
elastic.fit(X_train, y_train)

# Score on the held-out test rows.
y_pred_elastic = elastic.predict(X_test)
mae_elastic, r2_elastic = (
    mean_absolute_error(y_test, y_pred_elastic),
    r2_score(y_test, y_pred_elastic),
)

print(f"🧬ElasticNetCV MAE: {mae_elastic:.3f}, R²: {r2_elastic:.3f}")


## Decision Tree Regressor
✅ When it's better:
Data has non-linear relationships.

You want interpretable rules (if-then).

🚗 How it's better:
It splits data into decision paths and handles non-linearity very well.

Non-linear data, interpretable rules	Splits data into simple if-then decision paths

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree, tuned with a randomized search:
# 5 sampled settings out of the 3*3*3 = 27 in the grid, each with 3-fold CV.
tree = DecisionTreeRegressor(random_state=42)
param_grid = dict(
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
tree_search = RandomizedSearchCV(
    tree,
    param_distributions=param_grid,
    n_iter=5,
    cv=3,
    random_state=42,
)
tree_search.fit(X_train, y_train)

# Score the best tree found by the search on the held-out test rows.
tree_best = tree_search.best_estimator_
y_pred_tree = tree_best.predict(X_test)
mae_tree, r2_tree = (
    mean_absolute_error(y_test, y_pred_tree),
    r2_score(y_test, y_pred_tree),
)

print(f"🌳DecisionTree MAE: {mae_tree:.3f}, R²: {r2_tree:.3f}")
