In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [13]:
# Banner cell: prints a fixed message only; nothing later in the notebook reads its result.
print("Cluster Analysis Script Loaded")

Cluster Analysis Script Loaded


In [14]:
# --- CONFIGURATION ---
DATA_PATH = "clustered_dataset_k4.csv"      # input file with one row per respondent
SEP = ","                                   # field delimiter; the file is read as comma-separated
CLUSTER_COL = "cluster"                     # column holding the cluster assignment
OUTPUT_CSV = "cluster_summary_table.csv"
OUTPUT_XLSX = "cluster_summary_table.xlsx"

# --- LOAD DATA ---
# Use the configured delimiter so the constant above and the actual read
# cannot drift apart (previously SEP said ";" while the read used ",").
df = pd.read_csv(DATA_PATH, sep=SEP)

# Clean possible spaces/BOM from column names
df.columns = (
    df.columns
      .str.strip()                             # remove spaces at start/end
      .str.replace('\ufeff', '', regex=False)  # remove BOM if present
)

# Fail here, with the real column names in the message, rather than deep
# inside a later groupby when the delimiter or column name is wrong.
if CLUSTER_COL not in df.columns:
    raise KeyError(
        f"Column {CLUSTER_COL!r} not found in {DATA_PATH!r}; "
        f"columns are {df.columns.tolist()} (check SEP={SEP!r})"
    )

print("Columns found:", df.columns.tolist())  # debug: see real names


Columns found: ['age', 'gender', 'education', 'marital', 'income', 'AUDIT_total', 'DAST_total', 'IAT_total', 'PGSI_total', 'PCL5_total', 'MSPSS_total', 'SWLS_total', 'WHO5_total', 'cluster', 'AUDIT_total_z', 'DAST_total_z', 'IAT_total_z', 'PGSI_total_z', 'PCL5_total_z', 'MSPSS_total_z', 'SWLS_total_z', 'WHO5_total_z']


In [None]:

# Categorical demographic variables, listed in the order their rows are emitted.
DEMO_CATEGORICAL = ["gender", "education", "marital", "income_cat"]

# Numeric score columns; every name is "<scale>_total".
OTHER_NUMERIC = [
    f"{scale}_total"
    for scale in ("AUDIT", "DAST", "IAT", "PGSI", "PCL5", "MSPSS", "SWLS", "WHO5")
]

# Code -> label lookup for the categorical variables that are stored as numbers.
# "gender" and "marital" use consecutive codes starting at 0, so the label's
# position in the list is its code. "education" uses non-consecutive codes
# (presumably approximate years of schooling -- TODO confirm with the codebook).
CATEGORY_LABELS = {
    "gender": dict(enumerate([
        "Male",
        "Female",
        "Non-binary",
        "Prefer not to say",
    ])),
    "education": dict(zip(
        (5, 8, 13, 18, 22, 25),
        (
            "Elementary school",
            "Middle school",
            "High School",
            "Bachelor's Degree",
            "Master's Degree",
            "Doctoral Degree",
        ),
    )),
    "marital": dict(enumerate([
        "Single",
        "Married",
        "Divorced",
        "Widowed",
        "Separated",
        "Prefer not to say",
    ])),
}

# Human-readable row labels shown in the first column of the summary table.
DISPLAY_NAMES = dict(
    gender="Gender",
    education="Education level",
    marital="Marital status",
    income_cat="Income bracket",
    AUDIT_total="Alcohol use (AUDIT)",
    DAST_total="Drug use (DAST)",
    IAT_total="Internet Addiction (IAT)",
    PGSI_total="Gambling (PGSI)",
    PCL5_total="Trauma (PCL-5)",
    MSPSS_total="Social Support (MSPSS)",
    SWLS_total="Life Satisfaction (SWLS)",
    WHO5_total="Well-being (WHO-5)",
)



In [None]:
def format_p_value(p):
    """Render a p-value as the text shown in the table's "P value" column.

    Returns an empty string when ``p`` is missing (as judged by ``pd.isna``),
    the literal ``"<0.001"`` when ``p`` is below 0.001, and otherwise ``p``
    formatted with three decimal places.
    """
    if not pd.isna(p):
        return "<0.001" if p < 0.001 else f"{p:.3f}"
    return ""

def summarize_numeric(df, var, cluster_col):
    """Summarise a numeric variable per cluster and test for group differences.

    Parameters
    ----------
    df : DataFrame containing ``var`` and ``cluster_col``.
    var : name of the numeric column to summarise.
    cluster_col : name of the column whose values define the groups.

    Returns
    -------
    dict
        One entry per cluster value, holding ``"median (Q1; Q3)"`` with one
        decimal (or ``""`` when the cluster has no non-missing values), plus
        the key ``"p"`` with the formatted Kruskal-Wallis p-value. ``"p"`` is
        ``""`` when fewer than two clusters have data or the test cannot be
        computed.
    """
    result = {}
    samples = []  # non-empty per-cluster samples passed to the test

    for cl, sub in df.groupby(cluster_col):
        values = sub[var].dropna().astype(float)
        if len(values) == 0:
            result[cl] = ""
            continue

        samples.append(values)
        median = np.median(values)
        q1, q3 = np.percentile(values, [25, 75])
        result[cl] = f"{median:.1f} ({q1:.1f}; {q3:.1f})"

    p = np.nan
    if len(samples) >= 2:
        try:
            _, p = stats.kruskal(*samples)
        except ValueError:
            # SciPy can raise when every pooled observation is identical
            # (no variation to rank). Report "no p-value" for this row
            # instead of aborting the whole summary table.
            p = np.nan

    result["p"] = format_p_value(p)
    return result

def summarize_categorical(df, var, cluster_col):
    """Summarise a categorical variable per cluster and test for association.

    Parameters
    ----------
    df : DataFrame containing ``var`` and ``cluster_col``.
    var : name of the categorical column. If it has an entry in
        ``CATEGORY_LABELS`` its codes are translated to labels; codes without
        a label become missing for the mode/percentage. Otherwise the values
        are shown as strings.
    cluster_col : name of the column whose values define the groups.

    Returns
    -------
    dict
        One entry per cluster value, holding ``"<most frequent label> (<pct>%)"``
        where the percentage is relative to that cluster's non-missing values
        (``""`` when there are none), plus the key ``"p"`` with the formatted
        chi-square p-value. The test is run on the raw values of ``var``
        cross-tabulated against ``cluster_col``; ``"p"`` is ``""`` when the
        table is smaller than 2x2 or the test cannot be computed.
    """
    result = {}
    raw = df[var]

    if var in CATEGORY_LABELS:
        series_all = raw.map(CATEGORY_LABELS[var])
    else:
        # astype(str) alone turns missing values into the literal string "nan",
        # which dropna() below cannot remove -- "nan" could then be reported as
        # the most frequent category. Restore real missing values first.
        series_all = raw.astype(str).where(raw.notna())

    for cl, sub in df.groupby(cluster_col):
        series = series_all.loc[sub.index].dropna()
        if len(series) == 0:
            result[cl] = ""
        else:
            counts = series.value_counts()
            mode_label = counts.idxmax()
            pct = 100 * counts.max() / len(series)
            result[cl] = f"{mode_label} ({pct:.0f}%)"

    contingency = pd.crosstab(raw, df[cluster_col])
    # A row or column that sums to zero yields zero expected frequencies,
    # which chi2_contingency rejects; such levels carry no information.
    contingency = contingency.loc[
        contingency.sum(axis=1) > 0,
        contingency.sum(axis=0) > 0,
    ]

    p = np.nan
    if contingency.shape[0] >= 2 and contingency.shape[1] >= 2:
        try:
            _, p, _, _ = stats.chi2_contingency(contingency)
        except ValueError:
            # Keep building the table even if the test is not computable.
            p = np.nan

    result["p"] = format_p_value(p)
    return result

In [22]:
# Normalise column names (surrounding spaces, BOM) before any column lookup below.
df.columns = df.columns.str.strip().str.replace("\ufeff", "", regex=False)

# Convert cluster to string for consistent grouping
df[CLUSTER_COL] = df[CLUSTER_COL].astype(str)


# Bin income into four left-closed brackets (right=False): [0, 20000),
# [20000, 40000), [40000, 60000), [60000, inf). Values outside the bins
# (e.g. negative or missing income) become NaN.
df["income_cat"] = pd.cut(
    df["income"],
    bins=[0, 20000, 40000, 60000, np.inf],
    labels=["<20k", "20k–40k", "40k–60k", ">60k"],
    right=False
)


def _cluster_sort_key(label):
    """Sort purely numeric labels by value so that "10" comes after "2"."""
    if label.isdecimal():
        return (0, int(label), "")
    return (1, 0, label)  # non-numeric labels go last, alphabetically


clusters = sorted(df[CLUSTER_COL].unique(), key=_cluster_sort_key)
# int(...) keeps the counts as plain Python ints (they print as 54, not np.int64(54)).
cluster_counts = {cl: int((df[CLUSTER_COL] == cl).sum()) for cl in clusters}
cluster_col_names = [f"Cluster {cl} (n={cluster_counts[cl]})" for cl in clusters]

# One table row per variable: categorical demographics first, then numeric
# scores. Both summarisers return {cluster: text, ..., "p": text}, so a single
# loop builds both sections.
rows = []
for variables, summarize in (
    (DEMO_CATEGORICAL, summarize_categorical),
    (OTHER_NUMERIC, summarize_numeric),
):
    for var in variables:
        summary = summarize(df, var, CLUSTER_COL)
        row = {"Variable": DISPLAY_NAMES.get(var, var)}
        for cl, col_name in zip(clusters, cluster_col_names):
            row[col_name] = summary.get(cl, "")
        row["P value"] = summary["p"]
        rows.append(row)


In [23]:
# Assemble the summary table: label column, one column per cluster, p-value last.
column_order = ["Variable", *cluster_col_names, "P value"]
table_df = pd.DataFrame(rows, columns=column_order)

# Write the table to a single-sheet workbook via openpyxl, without the index.
table_df.to_excel(
    OUTPUT_XLSX,
    sheet_name="Cluster Summary",
    index=False,
    engine="openpyxl",
)

print("Excel file created:", OUTPUT_XLSX)
print("Cluster sizes:", cluster_counts)

Excel file created: cluster_summary_table.xlsx
Cluster sizes: {'0': np.int64(54), '1': np.int64(55), '2': np.int64(57), '3': np.int64(55)}


Results reflect:

Cluster 0 : Mostly female and possibly married, with completed high-school education and moderate income. They have low gambling risk and moderate scores for trauma, social support, and well-being. (Mia, the Overwhelmed Student looking for support)

Cluster 1 : Predominantly male and married, with elementary-school education. Very high alcohol use and gambling scores, low drug use, and the lowest life satisfaction and well-being. (Marko, the Social Drinker on the Edge)

Cluster 2 : Mostly single people, with middle school education and lower income. They show the highest internet addiction scores and gambling risk, with high life satisfaction and social support. (Alex, the Always-Online Friend)

Cluster 3 : Predominantly women, with Master’s degrees and the highest income. Moderate substance/gambling scores, high trauma exposure, strong social support, and the highest life satisfaction and well-being. (Sara, the Caring Professional with High well-being)