In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Global plot styling: seaborn white grid and a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [None]:
# Location of the raw CSV, relative to the notebook's working directory
DATA_NAME = "student_depression_dataset.csv"
DATA_DIR = Path("..") / "data"

# I. Giới thiệu về tập dữ liệu

## Khái quát về dữ liệu
TODO

## Mô tả về dữ liệu
TODO

## Nguồn dữ liệu
TODO

## Thu thập
TODO

## Lí do chọn tập dữ liệu
TODO

## Giấy phép sử dụng
TODO


In [None]:
# Read the raw CSV into the working frame and preview its first rows
df = pd.read_csv(DATA_DIR.joinpath(DATA_NAME))
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts)
df.info()

Dữ liệu bao gồm `27901` dòng và `18` cột, bao gồm:
- `id`:
- `Gender`:
- `Age`:
- `City`:
- `Profession`:
- `Academic Pressure`:
- `Work Pressure`:
- `CGPA`:
- `Study Satisfaction`:
- `Job Satisfaction`:
- `Sleep Duration`:
- `Dietary Habits`:
- `Degree`:
- `Have you ever had suicidal thoughts ?`:
- `Work/Study Hours`:
- `Financial Stress`:
- `Family History of Mental Illness`:
- `Depression`:

# II. Phân tích dữ liệu cơ bản

## 2.1. Kiểm tra Missing Values & Data Quality

In [None]:
# Data-quality overview: nulls per column, duplicated rows, frame size and
# the mean of the Depression column reported as a rate.
section_break = "\n" + "=" * 50 + "\n"

# Nulls per column
missing_per_column = df.isna().sum()
print("Missing Values:")
print(missing_per_column)
print(section_break)

# Fully duplicated rows
duplicate_count = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_count}")
print(section_break)

# Size and target prevalence
print(f"Shape: {df.shape}")
print(f"Depression rate: {df['Depression'].mean():.2%}")

## 2.2. Phân bố các biến số

In [None]:
# Histograms of the numeric features plus a bar chart of the target classes.
# `numerical_cols` is reused by later cells.
numerical_cols = [
    "Age",
    "Academic Pressure",
    "Work Pressure",
    "CGPA",
    "Study Satisfaction",
    "Job Satisfaction",
    "Work/Study Hours",
]

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.ravel()

# One histogram per numeric column, filling the grid in reading order
for panel, col in zip(axes, numerical_cols):
    panel.hist(df[col], bins=30, edgecolor="black", alpha=0.7)
    panel.set_title(col, fontsize=12, fontweight="bold")
    panel.set_xlabel(col)
    panel.set_ylabel("Frequency")
    panel.grid(alpha=0.3)

# Eighth panel: number of rows per Depression value (sorted by value)
target_panel = axes[7]
class_counts = df["Depression"].value_counts().sort_index().values
target_panel.bar(
    ["No Depression", "Depression"],
    class_counts,
    color=["#2ecc71", "#e74c3c"],
    alpha=0.7,
    edgecolor="black",
)
target_panel.set_title("Depression Distribution", fontsize=12, fontweight="bold")
target_panel.set_ylabel("Count")
target_panel.grid(alpha=0.3)

# The ninth panel has nothing to show
fig.delaxes(axes[8])

plt.tight_layout()
plt.show()

# Summary statistics for the same columns
print("\nNumerical Features Statistics:")
print(df[numerical_cols].describe())

## 2.3. Phân bố các biến phân loại

In [None]:
# Horizontal bar chart of level frequencies for every categorical feature.
# `categorical_cols` is reused by later cells.
categorical_cols = [
    "Gender",
    "Profession",
    "Sleep Duration",
    "Dietary Habits",
    "Degree",
    "Have you ever had suicidal thoughts ?",
    "Financial Stress",
    "Family History of Mental Illness",
]

fig, axes = plt.subplots(3, 3, figsize=(16, 12))
axes = axes.ravel()

for panel, col in zip(axes, categorical_cols):
    value_counts = df[col].value_counts()
    positions = range(len(value_counts))

    panel.barh(positions, value_counts.values, alpha=0.7, edgecolor="black")
    panel.set_yticks(positions)
    panel.set_yticklabels(value_counts.index, fontsize=9)
    panel.set_title(col, fontsize=11, fontweight="bold")
    panel.set_xlabel("Count")
    panel.grid(alpha=0.3, axis="x")

    # Write the count at the end of each bar
    for pos, n_rows in zip(positions, value_counts.values):
        panel.text(n_rows, pos, f" {n_rows}", va="center", fontsize=8)

# Eight features on a 3x3 grid: drop the unused last panel
fig.delaxes(axes[8])

plt.tight_layout()
plt.show()

# III. Phân tích mối quan hệ với Depression

## 3.1. Tỷ lệ trầm cảm theo các yếu tố phân loại

In [None]:
# Depression rate (mean of the Depression column) for every level of each
# categorical feature, one panel per feature, levels sorted by rate.
fig, axes = plt.subplots(3, 3, figsize=(16, 12))
axes = axes.ravel()

for panel, col in zip(axes, categorical_cols):
    depression_rate = (
        df.groupby(col)["Depression"]
        .agg(["mean", "count"])
        .sort_values("mean", ascending=False)
    )
    positions = range(len(depression_rate))

    # Traffic-light fill: red above 50 %, orange above 30 %, green otherwise.
    # The colours are passed at creation time; calling bar.set_color()
    # afterwards would also repaint the edge and discard edgecolor="black".
    bar_colors = [
        "#e74c3c" if rate > 0.5 else "#f39c12" if rate > 0.3 else "#2ecc71"
        for rate in depression_rate["mean"]
    ]

    panel.barh(
        positions,
        depression_rate["mean"] * 100,
        color=bar_colors,
        alpha=0.7,
        edgecolor="black",
    )
    panel.set_yticks(positions)
    panel.set_yticklabels(depression_rate.index, fontsize=9)
    panel.set_title(f"Depression Rate by {col}", fontsize=11, fontweight="bold")
    panel.set_xlabel("Depression Rate (%)")
    panel.grid(alpha=0.3, axis="x")

    # Annotate each bar with its rate and the group size
    for pos, (rate, count) in enumerate(
        zip(depression_rate["mean"], depression_rate["count"])
    ):
        panel.text(
            rate * 100, pos, f" {rate*100:.1f}% (n={count})", va="center", fontsize=8
        )

# Eight features on a 3x3 grid: drop the unused last panel
fig.delaxes(axes[8])

plt.tight_layout()
plt.show()

## 3.2. So sánh các biến số giữa nhóm có và không có trầm cảm

In [None]:
# Box plots of every numeric feature, split by Depression == 0 / == 1,
# followed by a table comparing the group means.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.ravel()

# Split once instead of re-filtering the frame for every column
no_depression_rows = df[df["Depression"] == 0]
depression_rows = df[df["Depression"] == 1]
group_labels = ["No Depression", "Depression"]
group_colors = ["#2ecc71", "#e74c3c"]

for panel, col in zip(axes, numerical_cols):
    # Drop NaNs so a single missing value cannot blank out a whole box
    data_to_plot = [no_depression_rows[col].dropna(), depression_rows[col].dropna()]
    bp = panel.boxplot(data_to_plot, patch_artist=True, widths=0.6)
    # Label the ticks explicitly: the `labels=` keyword of boxplot() is
    # deprecated in Matplotlib 3.9 in favour of `tick_labels=`, which older
    # versions do not accept. set_xticklabels works on both.
    panel.set_xticklabels(group_labels)

    for box, box_color in zip(bp["boxes"], group_colors):
        box.set_facecolor(box_color)

    panel.set_title(col, fontsize=12, fontweight="bold")
    panel.set_ylabel(col)
    panel.grid(alpha=0.3, axis="y")

    # Group means, written near the top of the panel above each box
    # (boxes sit at x = 1 and x = 2)
    label_y = panel.get_ylim()[1] * 0.95
    for x_pos, sample, box_color in zip((1, 2), data_to_plot, group_colors):
        panel.text(
            x_pos,
            label_y,
            f"μ={sample.mean():.2f}",
            ha="center",
            fontsize=9,
            bbox=dict(boxstyle="round", facecolor=box_color, alpha=0.3),
        )

# Seven features on a 3x3 grid: drop the two unused panels
fig.delaxes(axes[7])
fig.delaxes(axes[8])

plt.tight_layout()
plt.show()

# Mean of each numeric feature per group, with absolute and relative gap
print("\nMean values comparison:")
print("=" * 80)
comparison_df = pd.DataFrame(
    {
        "No Depression": no_depression_rows[numerical_cols].mean(),
        "Depression": depression_rows[numerical_cols].mean(),
    }
)
comparison_df["Difference"] = (
    comparison_df["Depression"] - comparison_df["No Depression"]
)
# Relative to the non-depressed mean
comparison_df["% Change"] = (
    comparison_df["Difference"] / comparison_df["No Depression"] * 100
).round(2)
print(comparison_df)

## 3.3. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric features and the target,
# shown as a lower-triangle heatmap and as a ranked bar chart.
correlation_features = numerical_cols + ["Depression"]
corr_matrix = df[correlation_features].corr()

# Hide the upper triangle (diagonal included): it mirrors the lower one
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(corr_matrix, ax=heatmap_ax, **heatmap_options)
heatmap_ax.set_title("Correlation Matrix", fontsize=14, fontweight="bold", pad=20)
plt.tight_layout()
plt.show()

# Correlation of every feature with the target, strongest positive first
depression_corr = (
    corr_matrix["Depression"].drop("Depression").sort_values(ascending=False)
)
print("\nCorrelation with Depression (sorted):")
print("=" * 50)
print(depression_corr)

# Same ranking as bars: red for positive, green for zero or negative
fig, ax = plt.subplots(figsize=(10, 6))
colors = []
for coefficient in depression_corr.values:
    colors.append("#e74c3c" if coefficient > 0 else "#2ecc71")

positions = range(len(depression_corr))
bars = ax.barh(
    positions,
    depression_corr.values,
    color=colors,
    alpha=0.7,
    edgecolor="black",
)
ax.set_yticks(positions)
ax.set_yticklabels(depression_corr.index)
ax.set_xlabel("Correlation Coefficient", fontsize=12)
ax.set_title("Features Correlation with Depression", fontsize=14, fontweight="bold")
ax.axvline(x=0, color="black", linestyle="-", linewidth=0.8)
ax.grid(alpha=0.3, axis="x")

# Print the coefficient at the tip of each bar
for pos, coefficient in zip(positions, depression_corr.values):
    ax.text(
        coefficient,
        pos,
        f" {coefficient:.3f}",
        va="center",
        fontsize=10,
        fontweight="bold",
    )

plt.tight_layout()
plt.show()

# IV. Insights chuyên sâu

## 4.1. Nhóm nguy cơ cao: Kết hợp nhiều yếu tố

In [None]:
# High-risk combinations: depression rate (%) for pairs of risk factors.
# Adds the helper columns Study_Sat_Group, Pressure_Group and Age_Group to df.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))


def _plot_rate_heatmap(ax, row_factor, col_factor, title, xlabel, ylabel):
    """Draw the mean of df["Depression"] (in %) per row/column cell on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    row_factor, col_factor : Series aligned with `df`, used as crosstab
        rows and columns.
    title, xlabel, ylabel : text for the panel.

    Returns
    -------
    DataFrame holding the percentages that were plotted.
    """
    rate_table = (
        pd.crosstab(row_factor, col_factor, df["Depression"], aggfunc="mean") * 100
    )
    sns.heatmap(
        rate_table,
        annot=True,
        fmt=".1f",
        cmap="RdYlGn_r",
        ax=ax,
        cbar_kws={"label": "Depression Rate (%)"},
    )
    ax.set_title(title, fontweight="bold")
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    return rate_table


# 1. Suicidal thoughts x Financial Stress
cross_tab1 = _plot_rate_heatmap(
    axes[0, 0],
    df["Have you ever had suicidal thoughts ?"],
    df["Financial Stress"],
    "Depression Rate: Suicidal Thoughts × Financial Stress",
    "Financial Stress",
    "Suicidal Thoughts",
)

# 2. Sleep Duration x Study Satisfaction (binned).
# include_lowest=True keeps a satisfaction of exactly 0 in the "Low (0-2)"
# bin; with the default half-open first interval (0, 2] those rows become
# NaN and silently vanish from the crosstab.
df["Study_Sat_Group"] = pd.cut(
    df["Study Satisfaction"],
    bins=[0, 2, 3, 5],
    labels=["Low (0-2)", "Medium (2-3)", "High (3-5)"],
    include_lowest=True,
)
cross_tab2 = _plot_rate_heatmap(
    axes[0, 1],
    df["Sleep Duration"],
    df["Study_Sat_Group"],
    "Depression Rate: Sleep Duration × Study Satisfaction",
    "Study Satisfaction",
    "Sleep Duration",
)

# 3. Combined academic / work pressure: "High" if either score is >= 4,
# "Very High" if both are.
high_academic = df["Academic Pressure"] >= 4
high_work = df["Work Pressure"] >= 4
df["Pressure_Group"] = "Low Pressure"
df.loc[high_academic | high_work, "Pressure_Group"] = "High Pressure"
df.loc[high_academic & high_work, "Pressure_Group"] = "Very High Pressure"

# Severity order and matching colour. groupby() sorts the group names
# alphabetically ("High", "Low", "Very High"), so a positional colour list
# would paint "High Pressure" green and "Low Pressure" orange.
pressure_palette = {
    "Low Pressure": "#2ecc71",
    "High Pressure": "#f39c12",
    "Very High Pressure": "#e74c3c",
}
pressure_dep = df.groupby("Pressure_Group")["Depression"].agg(["mean", "count"])
# Reorder by severity, keeping only the groups that actually occur
pressure_dep = pressure_dep.loc[
    [group for group in pressure_palette if group in pressure_dep.index]
]

bars = axes[1, 0].bar(
    pressure_dep.index,
    pressure_dep["mean"] * 100,
    color=[pressure_palette[group] for group in pressure_dep.index],
    alpha=0.7,
    edgecolor="black",
)
axes[1, 0].set_title("Depression Rate by Combined Pressure Levels", fontweight="bold")
axes[1, 0].set_ylabel("Depression Rate (%)")
axes[1, 0].set_xlabel("Pressure Group")
axes[1, 0].grid(alpha=0.3, axis="y")
for bar, count in zip(bars, pressure_dep["count"]):
    height = bar.get_height()
    axes[1, 0].text(
        bar.get_x() + bar.get_width() / 2.0,
        height,
        f"{height:.1f}%\n(n={count})",
        ha="center",
        va="bottom",
        fontsize=10,
        fontweight="bold",
    )

# 4. Age group x Family history.
# right=False makes the bins [0, 20), [20, 25), [25, 30), [30, 100) so they
# agree with the labels; with right-closed bins an age of exactly 20 would
# be filed under "<20".
df["Age_Group"] = pd.cut(
    df["Age"],
    bins=[0, 20, 25, 30, 100],
    labels=["<20", "20-25", "25-30", "30+"],
    right=False,
)
cross_tab4 = _plot_rate_heatmap(
    axes[1, 1],
    df["Age_Group"],
    df["Family History of Mental Illness"],
    "Depression Rate: Age Group × Family History",
    "Family History of Mental Illness",
    "Age Group",
)

plt.tight_layout()
plt.show()

# Headline rates for the three single risk groups
suicidal_rate = (
    df.loc[df["Have you ever had suicidal thoughts ?"] == "Yes", "Depression"].mean()
    * 100
)
family_rate = (
    df.loc[df["Family History of Mental Illness"] == "Yes", "Depression"].mean() * 100
)
very_high_rate = (
    df.loc[df["Pressure_Group"] == "Very High Pressure", "Depression"].mean() * 100
)

print("\nHigh-Risk Group Analysis:")
print("=" * 80)
print(f"Students with suicidal thoughts: {suicidal_rate:.1f}% depression rate")
print(f"Students with family history: {family_rate:.1f}% depression rate")
print(f"Very high pressure group: {very_high_rate:.1f}% depression rate")

## 4.2. Phân tích theo Nghề nghiệp và Bằng cấp

In [None]:
# Depression by profession and degree: a rate heatmap on the left and a
# grouped bar chart of normalised per-profession averages on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Depression rate (%) for every Profession x Degree cell
cross_prof_degree = (
    pd.crosstab(df["Profession"], df["Degree"], df["Depression"], aggfunc="mean") * 100
)
sns.heatmap(
    cross_prof_degree,
    annot=True,
    fmt=".1f",
    cmap="RdYlGn_r",
    ax=axes[0],
    cbar_kws={"label": "Depression Rate (%)"},
)
axes[0].set_title(
    "Depression Rate: Profession × Degree", fontsize=13, fontweight="bold"
)
axes[0].set_xlabel("Degree", fontsize=11)
axes[0].set_ylabel("Profession", fontsize=11)

# Metrics to average per profession, each with the divisor that brings it to
# roughly 0-1: Depression is already 0-1, pressures/satisfactions are divided
# by 5 and CGPA by 10 (same divisors as before, now keyed by column name
# instead of by position).
metric_scales = {
    "Depression": 1,
    "Academic Pressure": 5,
    "Work Pressure": 5,
    "CGPA": 10,
    "Study Satisfaction": 5,
    "Job Satisfaction": 5,
}
prof_analysis = df.groupby("Profession")[list(metric_scales)].mean().round(2)

# Grouped bar chart: one group per metric, one bar per profession
prof_sorted = prof_analysis.sort_values("Depression", ascending=False)
normalized = prof_sorted / pd.Series(metric_scales)

n_professions = len(prof_sorted)
x = np.arange(len(prof_sorted.columns))
# Share 80 % of each unit-wide slot between the bars. A fixed width of 0.15
# only fits four professions; any more and neighbouring groups overlap.
width = 0.8 / max(n_professions, 1)

for i, profession in enumerate(prof_sorted.index):
    axes[1].bar(
        x + width * i,
        normalized.loc[profession, prof_sorted.columns].to_numpy(),
        width,
        label=profession,
        alpha=0.8,
        edgecolor="black",
    )

axes[1].set_xlabel("Metrics", fontsize=11)
axes[1].set_ylabel("Normalized Score (0-1)", fontsize=11)
axes[1].set_title("Normalized Metrics by Profession", fontsize=13, fontweight="bold")
# Centre each tick under its group whatever the number of professions
axes[1].set_xticks(x + width * (n_professions - 1) / 2)
axes[1].set_xticklabels(prof_sorted.columns, rotation=45, ha="right")
axes[1].legend(title="Profession", bbox_to_anchor=(1.05, 1), loc="upper left")
axes[1].grid(alpha=0.3, axis="y")

plt.tight_layout()
plt.show()

print("\nDetailed Analysis by Profession:")
print("=" * 80)
print(prof_analysis.sort_values("Depression", ascending=False))

## 4.3. Tác động của giấc ngủ và thói quen ăn uống

In [None]:
# Sleep and dietary habits: a joint heatmap, a rate chart per factor and a
# combined lifestyle score. Adds Good_Sleep, Good_Diet and Lifestyle_Score
# to df.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Sleep Duration x Dietary Habits
cross_sleep_diet = (
    pd.crosstab(
        df["Sleep Duration"], df["Dietary Habits"], df["Depression"], aggfunc="mean"
    )
    * 100
)
sns.heatmap(
    cross_sleep_diet,
    annot=True,
    fmt=".1f",
    cmap="RdYlGn_r",
    ax=axes[0, 0],
    cbar_kws={"label": "Depression Rate (%)"},
)
axes[0, 0].set_title(
    "Depression Rate: Sleep × Dietary Habits", fontweight="bold", fontsize=12
)
axes[0, 0].set_xlabel("Dietary Habits")
axes[0, 0].set_ylabel("Sleep Duration")

# 2 & 3. Depression rate and group size per level, highest rate first
rate_columns = {"mean": "Depression Rate", "count": "Count"}
sleep_dist = (
    df.groupby("Sleep Duration")["Depression"]
    .agg(["mean", "count"])
    .round(3)
    .rename(columns=rate_columns)
    .sort_values("Depression Rate", ascending=False)
)
diet_dist = (
    df.groupby("Dietary Habits")["Depression"]
    .agg(["mean", "count"])
    .round(3)
    .rename(columns=rate_columns)
    .sort_values("Depression Rate", ascending=False)
)

# Both panels are drawn by the same code
for panel, rate_table, panel_title in (
    (axes[0, 1], sleep_dist, "Depression Rate by Sleep Duration"),
    (axes[1, 0], diet_dist, "Depression Rate by Dietary Habits"),
):
    rates = rate_table["Depression Rate"]
    positions = range(len(rate_table))

    # Red above 50 %, orange above 30 %, green otherwise. Passed at creation
    # so the black edge survives; bar.set_color() would repaint the edge too.
    bar_colors = [
        "#e74c3c" if rate > 0.5 else "#f39c12" if rate > 0.3 else "#2ecc71"
        for rate in rates
    ]
    panel.barh(
        positions,
        rates * 100,
        color=bar_colors,
        alpha=0.7,
        edgecolor="black",
    )
    panel.set_yticks(positions)
    panel.set_yticklabels(rate_table.index)
    panel.set_xlabel("Depression Rate (%)", fontsize=11)
    panel.set_title(panel_title, fontweight="bold", fontsize=12)
    panel.grid(alpha=0.3, axis="x")

    # Rate and group size at the tip of each bar
    for pos, (rate, count) in enumerate(zip(rates, rate_table["Count"])):
        panel.text(
            rate * 100, pos, f" {rate*100:.1f}% (n={int(count)})", va="center", fontsize=9
        )

# 4. Combined lifestyle score = good sleep (0/1) + good diet (0/1).
# NOTE(review): the category strings are trimmed of surrounding whitespace
# and quote characters before matching. This guards against the raw CSV
# storing values such as "'7-8 hours'", in which case an exact isin() match
# would silently produce all zeros. Confirm against df["Sleep Duration"].unique().
sleep_levels = df["Sleep Duration"].astype(str).str.strip(" '\"")
diet_levels = df["Dietary Habits"].astype(str).str.strip(" '\"")
df["Good_Sleep"] = sleep_levels.isin(["7-8 hours", "More than 8 hours"]).astype(int)
df["Good_Diet"] = diet_levels.isin(["Healthy", "Moderate"]).astype(int)
df["Lifestyle_Score"] = df["Good_Sleep"] + df["Good_Diet"]

lifestyle_analysis = df.groupby("Lifestyle_Score").agg(
    {"Depression": ["mean", "count"]}
)
lifestyle_analysis.columns = ["Depression Rate", "Count"]

# One colour per score, keyed by the score itself. The previous positional
# list held four colours for three scores, so the best score (2) was drawn
# yellow and green was never used.
score_palette = {0: "#e74c3c", 1: "#f39c12", 2: "#2ecc71"}
bars = axes[1, 1].bar(
    lifestyle_analysis.index,
    lifestyle_analysis["Depression Rate"] * 100,
    color=[score_palette[int(score)] for score in lifestyle_analysis.index],
    alpha=0.7,
    edgecolor="black",
)
axes[1, 1].set_xlabel("Lifestyle Score\n(0=Poor, 2=Good)", fontsize=11)
axes[1, 1].set_ylabel("Depression Rate (%)", fontsize=11)
axes[1, 1].set_title(
    "Depression Rate by Lifestyle Score", fontweight="bold", fontsize=12
)
axes[1, 1].set_xticks(lifestyle_analysis.index)
axes[1, 1].grid(alpha=0.3, axis="y")

# Rate and group size above each bar
for bar, count in zip(bars, lifestyle_analysis["Count"]):
    height = bar.get_height()
    axes[1, 1].text(
        bar.get_x() + bar.get_width() / 2.0,
        height,
        f"{height:.1f}%\n(n={int(count)})",
        ha="center",
        va="bottom",
        fontsize=10,
    )

plt.tight_layout()
plt.show()

# Depression rate for each of the four sleep/diet combinations
good_sleep = df["Good_Sleep"] == 1
good_diet = df["Good_Diet"] == 1

print("\nLifestyle Impact Summary:")
print("=" * 80)
for summary_label, selector in (
    ("Good sleep only", good_sleep & ~good_diet),
    ("Good diet only", ~good_sleep & good_diet),
    ("Both good", good_sleep & good_diet),
    ("Both poor", ~good_sleep & ~good_diet),
):
    print(
        f"{summary_label}: {df.loc[selector, 'Depression'].mean()*100:.1f}% depression"
    )

## 4.4. Phân tích CGPA và Work/Study Hours

In [None]:
# CGPA and Work/Study Hours versus depression: per-group histograms, a
# scatter plot and a rate heatmap over binned CGPA x hours. Adds
# CGPA_Category and Hours_Category to df.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Split once and reuse for the histograms, the scatter and the printed means
depressed = df[df["Depression"] == 1]
not_depressed = df[df["Depression"] == 0]

# 1 & 2. Side-by-side histograms of each measure per depression status
for panel, measure, panel_title in (
    (axes[0, 0], "CGPA", "CGPA Distribution by Depression Status"),
    (axes[0, 1], "Work/Study Hours", "Work/Study Hours by Depression Status"),
):
    panel.hist(
        [not_depressed[measure], depressed[measure]],
        bins=30,
        label=["No Depression", "Depression"],
        color=["#2ecc71", "#e74c3c"],
        alpha=0.6,
        edgecolor="black",
    )
    panel.set_xlabel(measure, fontsize=11)
    panel.set_ylabel("Frequency", fontsize=11)
    panel.set_title(panel_title, fontweight="bold", fontsize=12)
    panel.legend()
    panel.grid(alpha=0.3)

# 3. CGPA vs Work/Study Hours, one point per row, coloured by status
for group_rows, point_color, group_label in (
    (not_depressed, "#2ecc71", "No Depression"),
    (depressed, "#e74c3c", "Depression"),
):
    axes[1, 0].scatter(
        group_rows["CGPA"],
        group_rows["Work/Study Hours"],
        alpha=0.3,
        s=10,
        c=point_color,
        label=group_label,
    )
axes[1, 0].set_xlabel("CGPA", fontsize=11)
axes[1, 0].set_ylabel("Work/Study Hours", fontsize=11)
axes[1, 0].set_title("CGPA vs Work/Study Hours", fontweight="bold", fontsize=12)
axes[1, 0].legend()
axes[1, 0].grid(alpha=0.3)

# 4. Depression rate by CGPA and hours category.
# Bins are right-closed. include_lowest=True also closes the first bin on the
# left; without it a CGPA or an hours value of exactly 0 falls outside (0, x]
# and is dropped as NaN from every table below.
df["CGPA_Category"] = pd.cut(
    df["CGPA"],
    bins=[0, 6, 7.5, 10],
    labels=["Low (<6)", "Medium (6-7.5)", "High (7.5-10)"],
    include_lowest=True,
)
df["Hours_Category"] = pd.cut(
    df["Work/Study Hours"],
    bins=[0, 5, 8, 24],
    labels=["Low (<5h)", "Medium (5-8h)", "High (>8h)"],
    include_lowest=True,
)

cross_cgpa_hours = (
    pd.crosstab(
        df["Hours_Category"], df["CGPA_Category"], df["Depression"], aggfunc="mean"
    )
    * 100
)
sns.heatmap(
    cross_cgpa_hours,
    annot=True,
    fmt=".1f",
    cmap="RdYlGn_r",
    ax=axes[1, 1],
    cbar_kws={"label": "Depression Rate (%)"},
)
axes[1, 1].set_title(
    "Depression Rate: Work/Study Hours × CGPA", fontweight="bold", fontsize=12
)
axes[1, 1].set_xlabel("CGPA Category")
axes[1, 1].set_ylabel("Work/Study Hours Category")

plt.tight_layout()
plt.show()

# Group means and per-category rates
print("\nCGPA and Hours Analysis:")
print("=" * 80)
print(f"Average CGPA (No Depression): {not_depressed['CGPA'].mean():.2f}")
print(f"Average CGPA (Depression): {depressed['CGPA'].mean():.2f}")
print(
    f"\nAverage Work/Study Hours (No Depression): {not_depressed['Work/Study Hours'].mean():.2f}"
)
print(
    f"Average Work/Study Hours (Depression): {depressed['Work/Study Hours'].mean():.2f}"
)

# observed=False is spelled out: grouping by a categorical column without it
# raises a FutureWarning on recent pandas, and this keeps empty categories
# listed as before.
print("\n" + "=" * 80)
print("Depression rate by CGPA category:")
print(
    df.groupby("CGPA_Category", observed=False)["Depression"].agg(["mean", "count"])
)
print("\nDepression rate by Hours category:")
print(
    df.groupby("Hours_Category", observed=False)["Depression"].agg(["mean", "count"])
)

# V. Tổng kết Insights

## Key Findings:

### 1. **Yếu tố nguy cơ cao nhất:**
- Suicidal thoughts (ý nghĩ tự tử)
- Family history of mental illness (tiền sử gia đình)
- Financial stress (căng thẳng tài chính)
- Poor sleep duration (ngủ kém)

### 2. **Áp lực học tập & công việc:**
- Sinh viên với cả academic và work pressure cao có nguy cơ trầm cảm cao hơn đáng kể
- Correlation dương với Depression

### 3. **Lối sống lành mạnh:**
- Giấc ngủ tốt (từ 7 giờ trở lên) và chế độ ăn lành mạnh (Healthy/Moderate) đi kèm với tỷ lệ trầm cảm thấp hơn
- Kết hợp cả hai yếu tố có tác động mạnh nhất

### 4. **CGPA & Study Hours:**
- CGPA thấp tương quan với tỷ lệ trầm cảm cao hơn
- Học/làm quá nhiều giờ cũng tăng nguy cơ

### 5. **Nghề nghiệp:**
- Một số nghề nghiệp có tỷ lệ trầm cảm cao hơn đáng kể
- Sự kết hợp giữa profession và degree level ảnh hưởng đến mental health

### 6. **Nhóm nguy cơ cực cao:**
- Sinh viên có suicidal thoughts + financial stress
- Có family history + young age
- Poor sleep + poor diet + high pressure