In [None]:
#================================================================
# Add all the libraries imports needed in the project here
#================================================================
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import f_oneway
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder,LabelEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

from pathlib import Path
import sys
# Point to the project root (adjust parents[1] to parents[2] if your notebook is deeper)
PROJECT_ROOT = Path.cwd().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

#================================================================
# Imports custom preprocessing functions from 'functions.py'
#================================================================

from lib.functions import (is_binary_like,
                           _pretty_p,
                           evaluate_classifier
                           )

#================================================================
#  Initialize config
#================================================================
config = None
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Yaml configuration file not found!")

In [2]:
# =============================================================================
# DATA LOADING
# =============================================================================
df = pd.read_csv(config['input_data']['file1'])
df.head()

Unnamed: 0,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,...,basic_needs,academic_performance,study_load,teacher_student_relationship,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying,stress_level
0,14,20,0,11,2,1,2,4,2,3,...,2,3,2,3,3,2,3,3,2,1
1,15,8,1,15,5,3,1,4,3,1,...,2,1,4,1,5,1,4,5,5,2
2,12,18,1,14,2,1,2,2,2,2,...,2,2,3,3,2,2,3,2,2,1
3,16,12,1,15,4,3,1,3,4,2,...,2,2,4,1,4,1,4,4,5,2
4,16,28,0,7,2,3,5,1,3,2,...,3,4,3,1,2,1,5,0,5,1


In [3]:
# =============================================================================
# CONFIG
# =============================================================================
target_col = "stress_level"   # 
alpha = 0.05                  # significance level

In [4]:
# =============================================================================
# 1) Identify numeric features and exclude target
# =============================================================================
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols = [c for c in num_cols if c != target_col]

In [5]:
# =============================================================================
# 2) Exclude "binary-like" numerics (e.g., 0/1 flags)
# =============================================================================

binary_like = [c for c in num_cols if is_binary_like(df[c])]
cand_cols   = [c for c in num_cols if c not in binary_like]

print("Binary-like numeric columns (excluded from ANOVA):", binary_like)
print("Candidate numeric columns for ANOVA:", cand_cols)

Binary-like numeric columns (excluded from ANOVA): ['mental_health_history']
Candidate numeric columns for ANOVA: ['anxiety_level', 'self_esteem', 'depression', 'headache', 'blood_pressure', 'sleep_quality', 'breathing_problem', 'noise_level', 'living_conditions', 'safety', 'basic_needs', 'academic_performance', 'study_load', 'teacher_student_relationship', 'future_career_concerns', 'social_support', 'peer_pressure', 'extracurricular_activities', 'bullying']


In [10]:
# 3) Run one-way ANOVA per feature across stress levels 0,1,2

rows = []
levels = [0, 1, 2]

for col in cand_cols:
    groups = [df.loc[df[target_col] == g, col].dropna().values for g in levels]
    # require at least 2 observations per group
    if all(len(g) > 1 for g in groups):
        f_stat, p_val = stats.f_oneway(*groups)

        # Optional effect size: eta-squared (η²) = SS_between / SS_total
        overall_mean = df[col].mean()
        nks   = [len(g) for g in groups]
        means = [g.mean() for g in groups]
        ss_between = sum(nk * (mk - overall_mean) ** 2 for nk, mk in zip(nks, means))
        ss_total   = ((df[col] - overall_mean) ** 2).sum()
        eta2 = ss_between / ss_total if ss_total > 0 else np.nan

        decision = "discard" if p_val >= alpha else "keep"
        rows.append({"feature": col, "f_stat": f_stat, "p_value": _pretty_p(p_val), "eta_sq": eta2, "decision": decision})
    else:
        rows.append({"feature": col, "f_stat": np.nan, "p_value": np.nan, "eta_sq": np.nan, "decision": "insufficient data"})

anova_df = pd.DataFrame(rows).sort_values("p_value", na_position="last")
display(anova_df)

Unnamed: 0,feature,f_stat,p_value,eta_sq,decision
0,anxiety_level,655.453225,< 0.0001,0.544418,keep
16,peer_pressure,539.338954,< 0.0001,0.495789,keep
15,social_support,430.279745,< 0.0001,0.439608,keep
14,future_career_concerns,684.270952,< 0.0001,0.555067,keep
13,teacher_student_relationship,513.033251,< 0.0001,0.483295,keep
12,study_load,372.595924,< 0.0001,0.404514,keep
11,academic_performance,639.222921,< 0.0001,0.538192,keep
10,basic_needs,600.12007,< 0.0001,0.52247,keep
17,extracurricular_activities,549.800151,< 0.0001,0.500592,keep
9,safety,613.63475,< 0.0001,0.528024,keep


In [None]:
# 4) Lists to keep/discard
to_discard = anova_df.loc[anova_df["decision"] == "discard", "feature"].tolist()
to_keep    = anova_df.loc[anova_df["decision"] == "keep", "feature"].tolist()

print(f"\nalpha = {alpha}")
print(f"KEEP (p_value < alpha): {len(to_keep)} features")
print(to_keep)
print(f"\nDISCARD (pp_value >= alpha): {len(to_discard)} features")
print(to_discard)



alpha = 0.05
KEEP (p < alpha): 19 features
['anxiety_level', 'peer_pressure', 'social_support', 'future_career_concerns', 'teacher_student_relationship', 'study_load', 'academic_performance', 'basic_needs', 'extracurricular_activities', 'safety', 'noise_level', 'breathing_problem', 'sleep_quality', 'blood_pressure', 'headache', 'depression', 'self_esteem', 'living_conditions', 'bullying']

DISCARD (p >= alpha): 0 features
[]


**Insight (ANOVA selection):**
- We tested each numeric feature across stress levels (0/1/2). Features with p < 0.05 show significantly different means across groups and were kept; features with p ≥ 0.05 were discarded as they don’t separate stress levels on average.