In [145]:
import pandas as pd
import kagglehub
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [146]:
# 1. Fetch the Kaggle dataset through kagglehub. The returned value is kept
#    in `path` and is treated below as the directory that holds the CSV.
path = kagglehub.dataset_download(
    "mabubakrsiddiq/eyesight-and-vision-health-synthetic-dataset"
)

In [147]:
# Locate the CSV inside the downloaded directory. The listing is sorted so the
# same file is chosen on every run when several CSVs are present, and a missing
# CSV fails with an explicit message instead of a bare IndexError.
csv_candidates = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not csv_candidates:
    raise FileNotFoundError(f"No .csv file found in downloaded dataset directory: {path}")
csv_file = csv_candidates[0]
file_path = os.path.join(path, csv_file)

# Load the dataset into a Pandas DataFrame
df = pd.read_csv(file_path)

# Printed before any column is dropped, so the raw columns are visible.
print(df)

# 2. Preprocessing
# Drop identifier-style columns. errors='ignore' already skips names that are
# not present, so no separate membership check is needed.
cols_to_drop = ['id', 'Unnamed: 0']
df = df.drop(columns=cols_to_drop, errors='ignore')

      Unnamed: 0     id  exercise_hours  mental_health_score  \
0              0      1        3.441116            50.112741   
1              1      2        7.494288            66.181801   
2              2      3        2.733887            69.674360   
3              3      4        8.122516            70.996764   
4              4      5        1.769984            50.017834   
...          ...    ...             ...                  ...   
9995        9995   9996        1.338703            56.768322   
9996        9996   9997        0.525389            85.442520   
9997        9997   9998        3.190260            87.468755   
9998        9998   9999        7.222904            74.331541   
9999        9999  10000        1.737612            94.941119   

      screen_time_hours  screen_brightness_avg  age   height_cm  \
0              4.387540              68.531464   56  172.766324   
1              9.596943              54.460165   19  180.683155   
2             12.272036       

In [148]:

# Regression target is 'eye_health_score'; every remaining column is a feature.
y = df['eye_health_score']
X = df.drop(columns=['eye_health_score'])

# 3. Train-Test Split
# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Feature Scaling
# The scaler's statistics are learned from the training rows only and then
# re-applied to the test rows.
# NOTE(review): ordinary least-squares predictions do not depend on feature
# scale; standardizing mainly makes the fitted coefficients comparable.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Model Training (Baseline Linear Regression)
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# 6. Evaluation on the held-out rows
y_pred = model.predict(X_test_scaled)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("--- Simple Linear Regression Baseline ---")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.4f}")

--- Simple Linear Regression Baseline ---
Mean Absolute Error (MAE): 4.07
Mean Squared Error (MSE): 26.00
R-squared (R2): 0.8077


In [149]:
# Build the classification frame on a copy so `df` itself is not modified.
df_xgb = df.copy()

# Binary target: 1 when glasses_number is positive, 0 otherwise.
df_xgb['wear_glasses'] = (df_xgb['glasses_number'] > 0).astype(int)

# Class balance check: counts per class and the negatives-to-positives ratio.
wear_glasses = (df_xgb["wear_glasses"] == 1).sum()
no_glasses = (df_xgb["wear_glasses"] == 0).sum()
ratio = no_glasses / wear_glasses
print(f"wear glasses: {wear_glasses}, dont wear glasses: {no_glasses}, ratio: {ratio}")

# Correlation matrix taken before the engineered columns are added
# (not used further in this cell).
corr = df_xgb.corr()

# NOTE(review): from here on the 'age' column holds age squared, so
# mh_age_interaction below is mental_health_score * age**2 — confirm this is
# intended rather than an interaction with the raw age.
df_xgb["age"] = df_xgb["age"] ** 2

# test features - test one by one
df_xgb["near_work_intensity"] = df_xgb["screen_time_hours"] * (1 / df_xgb["screen_distance_cm"])
df_xgb["light_dose_near"] = df_xgb["screen_time_hours"] * df_xgb["screen_brightness_avg"]
# The +0.1 keeps the denominator away from zero for very small screen times.
df_xgb["night_mode_ratio"] = df_xgb["night_mode_usage"] / (df_xgb["screen_time_hours"] + 0.1)
df_xgb["mh_age_interaction"] = df_xgb["mental_health_score"] * df_xgb["age"]
# eyesight literature recommends to bucket screen time
# include_lowest=True puts a screen time of exactly 0 into the first bucket;
# with the default it would fall outside every bin and become NaN.
df_xgb["screen_bin"] = pd.cut(
    df_xgb["screen_time_hours"],
    bins=[0, 1, 4, 8, np.inf],
    labels=["<=1h", "1–4h", "4–8h", ">8h"],
    include_lowest=True,
)

# Remove the regression target, the source of the new label, and the raw
# columns that are not wanted as classifier inputs.
df_xgb = df_xgb.drop(columns=['eye_health_score', 'glasses_number', 'height_cm', 'screen_time_hours'])

df_xgb

wear glasses: 5225, dont wear glasses: 4775, ratio: 0.9138755980861244


Unnamed: 0,exercise_hours,mental_health_score,screen_brightness_avg,age,outdoor_light_exposure_hours,night_mode_usage,screen_distance_cm,wear_glasses,near_work_intensity,light_dose_near,night_mode_ratio,mh_age_interaction,screen_bin
0,3.441116,50.112741,68.531464,3136,1.821210,79.091607,33.408167,1,0.131331,300.684546,17.624713,157153.555934,4–8h
1,7.494288,66.181801,54.460165,361,0.455726,90.535187,54.127821,0,0.177301,522.651088,9.336467,23891.630295,>8h
2,2.733887,69.674360,74.334277,5776,0.301454,11.488773,50.769790,0,0.241719,912.232925,0.928608,402439.101422,>8h
3,8.122516,70.996764,56.450697,4225,1.226576,83.373275,51.267787,1,0.172039,497.898805,9.346712,299961.329684,>8h
4,1.769984,50.017834,87.181496,625,0.521827,89.394952,54.595573,0,0.200803,955.764419,8.080587,31261.145941,>8h
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.338703,56.768322,58.164274,121,0.407882,74.331800,63.556015,0,0.013441,49.685391,77.897542,6868.966907,<=1h
9996,0.525389,85.442520,86.873212,1681,4.117332,29.781284,37.564927,1,0.183400,598.504895,4.260917,143628.876867,4–8h
9997,3.190260,87.468755,63.268931,3249,2.321639,76.794358,52.927722,1,0.147305,493.276116,9.725115,284185.983544,4–8h
9998,7.222904,74.331541,84.804226,484,2.794297,76.676388,43.333935,0,0.082083,301.648222,20.967046,35976.465687,1–4h


In [150]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve

# Features / label for the glasses classifier.
X = df_xgb.drop(columns=['wear_glasses'])
y = df_xgb['wear_glasses']

# 'screen_bin' is the only categorical column; everything else is passed
# through to the model unchanged.
cat_cols = ['screen_bin']
num_cols = [c for c in X.columns if c not in cat_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# Stratified 80/20 split so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Class weight (negatives / positives) derived from the training labels only,
# so it neither looks at the test rows nor depends on a variable left over
# from an earlier cell.
n_pos_train = int((y_train == 1).sum())
n_neg_train = int((y_train == 0).sum())
if n_pos_train == 0:
    raise ValueError("y_train contains no positive (wear_glasses == 1) samples")
pos_weight = n_neg_train / n_pos_train

clf = XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=pos_weight,
    n_jobs=-1,
    random_state=42,
)

# Preprocessing and classifier are fitted together as one pipeline.
model = Pipeline(steps=[("preprocess", preprocess), ("clf", clf),])

model.fit(X_train, y_train)

# Column 1 of predict_proba is the predicted probability of class 1.
# The decision threshold is the cut-off applied to that probability: a row is
# labelled 1 when its probability is >= the threshold. 0.5 is used here as the
# reference point.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred_05 = (y_proba >= 0.5).astype(int)

print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("Confusion (thr=0.5):")
print(confusion_matrix(y_test, y_pred_05))
print(classification_report(y_test, y_pred_05, digits=3))

# Grid search over decision thresholds to prioritize recall
def evaluate_threshold(y_true, y_scores, thr):
    """Score a single decision threshold for a binary classifier.

    A sample is predicted positive when its score is >= ``thr``; label 1 is
    the positive class and any other label counts as negative.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (list, ndarray or pandas Series).
    y_scores : array-like
        Scores / probabilities for the positive class, same length as
        ``y_true``.
    thr : float
        Decision threshold.

    Returns
    -------
    dict
        Keys ``threshold``, ``recall``, ``precision``, ``fn`` and ``fp``.
        ``recall`` / ``precision`` are 0.0 when their denominator is zero.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_scores`` do not have the same number of elements.
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_scores).ravel()
    if labels.shape != scores.shape:
        raise ValueError(
            f"y_true and y_scores must have the same length, "
            f"got {labels.shape[0]} and {scores.shape[0]}"
        )

    predicted_pos = scores >= thr
    actual_pos = labels == 1

    # Counting directly keeps this working when only one class is present,
    # where unpacking a confusion matrix into four cells would fail.
    tp = int(np.sum(predicted_pos & actual_pos))
    fp = int(np.sum(predicted_pos & ~actual_pos))
    fn = int(np.sum(~predicted_pos & actual_pos))

    # Explicit zero guards instead of an epsilon in the denominator, so the
    # reported ratios are exact.
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    return {"threshold": thr, "recall": recall, "precision": precision,
            "fn": fn, "fp": fp}

# Candidate thresholds start at 0.1 and advance in steps of 0.05, stopping
# before 0.91; each one is scored against the held-out predictions.
thresholds = np.arange(0.1, 0.91, 0.05)
results = [
    evaluate_threshold(y_test, y_proba, candidate)
    for candidate in thresholds
]
res_df = pd.DataFrame(results)

# Show the ten rows with the highest recall.
top_by_recall = res_df.sort_values(by="recall", ascending=False).head(10)
print(top_by_recall)


ROC AUC: 0.7371958215386156
Confusion (thr=0.5):
[[665 290]
 [366 679]]
              precision    recall  f1-score   support

           0      0.645     0.696     0.670       955
           1      0.701     0.650     0.674      1045

    accuracy                          0.672      2000
   macro avg      0.673     0.673     0.672      2000
weighted avg      0.674     0.672     0.672      2000

   threshold    recall  precision   fn   fp
0       0.10  0.994258   0.527679    6  930
1       0.15  0.977033   0.536803   24  881
2       0.20  0.963636   0.558204   38  797
3       0.25  0.924402   0.578790   79  703
4       0.30  0.876555   0.601445  129  607
5       0.35  0.829665   0.625993  178  518
6       0.40  0.777990   0.657235  232  424
7       0.45  0.728230   0.685586  284  349
8       0.50  0.649761   0.700722  366  290
9       0.55  0.575120   0.724096  444  229
