In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import kruskal, mannwhitneyu, chi2_contingency, fisher_exact

Statistical analysis means that we want to see which columns are relevant and which are not that significantly relevant. I decided to look at the first 5 columns and do my reseach over them.

First I have to decide on which column I wantto group my data first.

In [36]:
file_path = "filled_personal_medians.csv" 
df = pd.read_csv(file_path)

personal_cols = ["age", "gender", "education", "marital", "income"]
group_col = personal_cols[2]  # grouping variable (ordinal)
alpha = 0.05              # significance level

df = df[df[group_col].notna()].copy()

# Select only numeric variables (exclude the grouping variable)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if group_col in personal_cols:
    num_cols.remove(group_col)

Kruskal–Wallis tests across education levels
We do Kruskal-Wallis because education is a quantitative variable, its value raises as the individual studied more:

[5] Elementary school [8] Middle school [13] High School [18]Bachelor's Degree [22] Master's Degree [25] Doctoral Degree


In [37]:
kw_results = []
for col in num_cols:
    grouped = [g[col].dropna().values for _, g in df[[group_col, col]].dropna().groupby(group_col)]
    valid_groups = [arr for arr in grouped if len(arr) > 0]
    if len(valid_groups) >= 2:
        try:
            stat, p = kruskal(*valid_groups)
        except Exception:
            stat, p = np.nan, np.nan
        n = int(df[col].notna().sum())
        kw_results.append((col, len(valid_groups), n, stat, p))

kw_df = pd.DataFrame(kw_results, columns=["variable", "k_groups", "n_used", "kw_stat", "p_value"])
kw_df.sort_values("p_value", inplace=True)

Bonferroni correction (across all variables)

In [38]:
m_tests = len(kw_df)
kw_df["p_bonf"] = (kw_df["p_value"] * m_tests).clip(upper=1.0)
kw_df["significant_bonf"] = kw_df["p_bonf"] < alpha

print("\n=== Kruskal–Wallis (top 15 variables) ===")
print(kw_df.head(15))
print("\nNumber of significant variables after Bonferroni:", kw_df["significant_bonf"].sum())



=== Kruskal–Wallis (top 15 variables) ===
       variable  k_groups  n_used     kw_stat       p_value        p_bonf  \
3        income         6     221  115.473575  2.851070e-23  2.936602e-21   
96   DAST_total         6     221  103.924631  7.860045e-21  8.095847e-19   
98   PGSI_total         6     221   99.398720  7.075754e-20  7.288026e-18   
30       pgsi_7         6     221   65.289292  9.760915e-13  1.005374e-10   
97    IAT_total         6     221   64.769401  1.251218e-12  1.288755e-10   
50       iat_18         6     221   63.195152  2.652409e-12  2.731982e-10   
14       dast_1         6     221   61.242997  6.725908e-12  6.927686e-10   
40        iat_8         6     221   60.860068  8.071392e-12  8.313533e-10   
102  WHO5_total         6     221   60.422840  9.939088e-12  1.023726e-09   
27       pgsi_4         6     221   58.899338  2.051553e-11  2.113099e-09   
101  SWLS_total         6     221   54.740921  1.475786e-10  1.520060e-08   
20       dast_7         6     221

Post-hoc Mann–Whitney tests for significant variables

In [30]:
posthoc_rows = []
sig_vars = kw_df.loc[kw_df["significant_bonf"], "variable"].tolist()

edu_levels = sorted(df[group_col].dropna().unique())
pairs = list(combinations(edu_levels, 2))
m_pairs = len(pairs)

for col in sig_vars:
    for a, b in pairs:
        A = df.loc[df[group_col] == a, col].dropna().values
        B = df.loc[df[group_col] == b, col].dropna().values
        if len(A) > 0 and len(B) > 0:
            try:
                u_stat, p = mannwhitneyu(A, B, alternative="two-sided")
            except Exception:
                u_stat, p = np.nan, np.nan
            p_corr = min(p * m_pairs, 1.0)
            sig = p_corr < alpha
            # rank-biserial correlation effect size
            if not np.isnan(u_stat):
                n1, n2 = len(A), len(B)
                r_rb = 1 - (2*u_stat)/(n1*n2)
            else:
                r_rb = np.nan
            posthoc_rows.append([col, a, b, len(A), len(B), u_stat, p, p_corr, sig, r_rb])

posthoc_df = pd.DataFrame(
    posthoc_rows,
    columns=[
        "variable", "edu_a", "edu_b", "n_a", "n_b", "u_stat",
        "p_value", "p_bonf", "significant_bonf", "rank_biserial_r"
    ]
)


In [None]:
kw_df.to_csv("statistical-analysis-results/education_kruskal_bonferroni.csv", index=False)
posthoc_df.to_csv("statistical-analysis-results/education_posthoc_mannwhitney_bonferroni.csv", index=False)

print("\n=== Significant variables after Bonferroni ===")
print(kw_df[kw_df["significant_bonf"]].head(20))

print("\n=== Post-hoc significant pairs (Bonferroni corrected) ===")
print(posthoc_df[posthoc_df["significant_bonf"]].head(20))

print("\n✅ Results saved as:")
print(" - education_kruskal_bonferroni.csv")
print(" - education_posthoc_mannwhitney_bonferroni.csv")


=== Significant variables after Bonferroni ===
        variable  k_groups  n_used     kw_stat       p_value        p_bonf  \
98    PGSI_total        69     221  185.253722  8.390192e-13  8.641897e-11   
96    DAST_total        69     221  184.097586  1.220510e-12  1.257125e-10   
20        dast_7        69     221  164.882089  5.092588e-10  5.245366e-08   
58        pcl5_6        69     221  161.247957  1.523499e-09  1.569204e-07   
100  MSPSS_total        69     221  156.320665  6.561273e-09  6.758112e-07   
102   WHO5_total        69     221  156.228211  6.741564e-09  6.943811e-07   
56        pcl5_4        69     221  155.938173  7.339413e-09  7.559595e-07   
53        pcl5_1        69     221  153.415594  1.529695e-08  1.575586e-06   
101   SWLS_total        69     221  152.476920  2.006124e-08  2.066308e-06   
87        swls_3        69     221  151.790289  2.444379e-08  2.517711e-06   
2      education        69     221  151.737084  2.482025e-08  2.556486e-06   
65       pcl5_13

KRUSKAL-WALLIS RESULTS

variable - the name of the tested variable (column of the table)
kgroups - nr of possible values for education (6)
kw-stat - the statistic from doing Kruskal-Wallis (higher -> more difference between groups)
p_Values - prob. or getting this difference by chaqnce
p_bonf - prob after bonferroni correction
significant_bonf - true if education influences the variable significantly

interpretation of results: For 78 different variables in dataset, there are real, statistically significant differences between education levels.

POST-HOC RESULTS
variable - same
edu_a, edu_b - the 2 education levels being compared
p_value - from mann-whitney (low means significant difference)
p_bonf - bonferroni (<0.05 means significant difference)
rank_biserial_r - 
    0.1 - small effect - weak difference
    0.3 - medium - moderate difference
    >0.5 - large - strong difference
    >0 - higher scores in edu_a
    <0 - higher scores in edu_b

interpretation: it looked at only the 78 columns with differences, and then compared for each....category X of education vs category Y or education. For each pair it used Mann-Whitney U test

Kruskal–Wallis: “Education matters for this variable.”

Mann–Whitney (post-hoc): “Okay, which education levels differ, and by how much?”

** Ok now let's do the statistical analysis for a nominal variable (marital status)

In [8]:
file_path = "filled_personal_medians.csv"   # <-- put your CSV filename here
df = pd.read_csv(file_path)

group_col = "marital"   # grouping variable (nominal)
alpha = 0.05

# Keep only rows with a valid marital value
df = df[df[group_col].notna()].copy()

# Select only numeric variables (exclude the grouping variable)
personal_cols = ["age", "gender", "education", "marital", "income"]
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if group_col in personal_cols:
    num_cols.remove(group_col)

In [9]:
# Chi-square and Fisher's tests work on categories, not continuous numbers.
# So we’ll discretize each numeric variable into LOW / HIGH groups using the median.

categorized = df.copy()
for col in num_cols:
    median_val = categorized[col].median()
    categorized[col + "_cat"] = np.where(categorized[col] > median_val, "High", "Low")

# Apply Chi-Square or Fisher’s Exact Test for each variable
results = []

for col in num_cols:
    cat_col = col + "_cat"
    contingency = pd.crosstab(categorized[group_col], categorized[cat_col])

    # Check expected frequencies
    if contingency.size == 0:
        continue
    chi2, p, dof, expected = chi2_contingency(contingency)

    # If any expected cell < 5 → use Fisher (only works for 2x2)
    if (expected < 5).any() and contingency.shape == (2, 2):
        _, p_fisher = fisher_exact(contingency)
        test_used = "Fisher"
        p_final = p_fisher
    else:
        test_used = "Chi-square"
        p_final = p

    results.append({
        "variable": col,
        "test_used": test_used,
        "p_value": p_final,
        "significant": p_final < alpha
    })


  categorized[col + "_cat"] = np.where(categorized[col] > median_val, "High", "Low")
  categorized[col + "_cat"] = np.where(categorized[col] > median_val, "High", "Low")
  categorized[col + "_cat"] = np.where(categorized[col] > median_val, "High", "Low")
  categorized[col + "_cat"] = np.where(categorized[col] > median_val, "High", "Low")
  categorized[col + "_cat"] = np.where(categorized[col] > median_val, "High", "Low")


In [10]:

results_df = pd.DataFrame(results)
results_df.sort_values("p_value", inplace=True)
results_df["p_bonf"] = (results_df["p_value"] * len(results_df)).clip(upper=1.0)
results_df["significant_bonf"] = results_df["p_bonf"] < alpha

results_df.to_csv("statistical-analysis-results/marital_chi_fisher_results.csv", index=False)

print("\n=== Nominal Analysis (Marital) ===")
print(results_df.head(20))
print(f"\nNumber of significant variables (after Bonferroni): {results_df['significant_bonf'].sum()}")
print("\n✅ Results saved as 'marital_chi_fisher_results.csv'")


=== Nominal Analysis (Marital) ===
        variable   test_used       p_value  significant    p_bonf  \
3         income  Chi-square  6.587040e-08         True  0.000007   
47        iat_15  Chi-square  1.630044e-07         True  0.000017   
33         iat_1  Chi-square  1.792521e-07         True  0.000018   
96    DAST_total  Chi-square  2.123482e-07         True  0.000022   
43        iat_11  Chi-square  7.272538e-07         True  0.000075   
39         iat_7  Chi-square  1.008457e-06         True  0.000104   
41         iat_9  Chi-square  1.130483e-06         True  0.000116   
35         iat_3  Chi-square  1.721982e-06         True  0.000177   
97     IAT_total  Chi-square  3.041681e-06         True  0.000313   
49        iat_17  Chi-square  3.388475e-06         True  0.000349   
2      education  Chi-square  4.509672e-06         True  0.000464   
42        iat_10  Chi-square  6.032723e-06         True  0.000621   
51        iat_19  Chi-square  6.159800e-06         True  0.000634  

columns were mentioned before
here only 30 rows (out of 91 left) are significant. Means only 30 questiones of the quiestionnaire seem to have been influenced by the marital status of the person

Conclusion:

There are in total 95 variables (100 questions in the questionaire but we omit the personal cols, which are used here as gbrouping variables)

age: zero relevant variables, could be removed
gender: one relevant variable, could be removed
education: 78 relevant vars, very important
marital satus: 30 relevant vars, could be important but not really
income: 66 relevant variables, pretty important