In [None]:
### Import Libraries.

import os
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

from scipy.stats import kruskal, mannwhitneyu, shapiro, spearmanr
from statsmodels.stats.multitest import multipletests
import statsmodels.api as sm

In [None]:
### Load data.

# Work from the project folder so that the relative paths used in this notebook
# (the input workbook here, the figures saved by later cells) resolve there.
os.chdir("/folder/")
file_path = "merged_output.xlsx"
xls = pd.ExcelFile(file_path)

# Read the "All" sheet and replace every empty cell with the sentinel string
# 'Missing'. Downstream cells filter on this sentinel and re-coerce the
# measurement columns to numeric.
merged_df = xls.parse("All").fillna('Missing')

In [None]:
### Normality Check per Status (for column 'Marker')

# Drop rows whose Marker was filled with the 'Missing' sentinel, then coerce the
# column to numeric. After fillna('Missing') the column may be object dtype;
# coercing here keeps this cell consistent with the main analysis cell, so the
# normality diagnostics run on the same numeric values that are tested later.
# Entries that cannot be parsed become NaN and are removed by dropna() below.
df = merged_df[merged_df['Marker'] != 'Missing'].copy()
df['Marker'] = pd.to_numeric(df['Marker'], errors = 'coerce')

for cond in df['Status'].unique():
    # Numeric, non-missing Marker values for this status group.
    data = df[df['Status'] == cond]['Marker'].dropna()
    print(f"\nStatus: {cond} (n = {len(data)})")

    # Shapiro-Wilk needs at least three observations.
    if len(data) < 3:
        print("  Too few samples for Shapiro–Wilk (n < 3); skipping normality test.")
    else:
        stat, p = shapiro(data)
        print(f"  Shapiro-Wilk W = {stat:.4f}, p = {p:.4g}")

    # Unbiased sample variance (ddof = 1); undefined for fewer than two values.
    variance = np.var(data, ddof = 1) if len(data) > 1 else np.nan
    print(f"  Sample variance: {variance:.4f}")

    # Histogram with a kernel density overlay.
    sns.histplot(data, kde = True)
    plt.title(f'Histogram for {cond}')
    plt.xlabel('Marker')
    plt.ylabel('Frequency')
    plt.show()

    # Q-Q plot against a normal distribution with a standardized reference
    # line; skipped for the same small groups as the Shapiro-Wilk test.
    if len(data) >= 3:
        sm.qqplot(data, line = 's')
        plt.title(f'Q-Q Plot for {cond}')
        plt.show()

In [None]:
### Main Plots and Nonparametric Tests

# Fixed group order used for the categorical axis, the tests and the palette.
status_order = ['C9ALS', 'Control', 'sALS']

# Keep rows with a recorded Marker, make Marker numeric (unparseable entries
# become NaN) and turn Status into an ordered categorical restricted to
# `status_order` (any other status value becomes NaN).
df = merged_df.loc[merged_df['Marker'] != 'Missing'].assign(
    Marker = lambda frame: pd.to_numeric(frame['Marker'], errors = 'coerce'),
    Status = lambda frame: pd.Categorical(
        frame['Status'], categories = status_order, ordered = True
    ),
)

sns.set(style="white", font_scale=1.2)

# One RGBA hex colour per status, in the same order as `status_order`.
status_colors = dict(zip(status_order, ["#B1746FFF", "#ADB17DFF", "#5B8FA8FF"]))

In [None]:
### Prepare groups for Kruskal-Wallis

# One array of non-missing Marker values per status, ordered as `status_order`.
groups = [
    df.loc[df['Status'] == s, 'Marker'].dropna().values
    for s in status_order
]

# Omnibus test across all status groups; keep the full result and its p-value.
kw_res = kruskal(*groups)
kw_p = kw_res.pvalue
print(f"\nKruskal–Wallis test p-value: {kw_p:.4g}")

In [None]:
### Pairwise Mann–Whitney U tests (raw p-values; FDR correction follows).

# Walk every unordered pair of statuses once, in `status_order` order, and
# record the pair together with its two-sided Mann–Whitney U p-value.
comparisons, pvals = [], []
for i, g1 in enumerate(status_order):
    for g2 in status_order[i + 1:]:
        a = df.loc[df['Status'] == g1, 'Marker'].dropna()
        b = df.loc[df['Status'] == g2, 'Marker'].dropna()
        if len(a) and len(b):
            p = mannwhitneyu(a, b, alternative = 'two-sided').pvalue
        else:
            # A comparison with an empty group is recorded as NaN.
            p = np.nan
        comparisons.append((g1, g2))
        pvals.append(p)

In [None]:
### Filter out NaN p-values before correction

# Positions (and values) of the comparisons that produced a usable p-value.
valid_idx = [i for i, p in enumerate(pvals) if not (p is None or np.isnan(p))]
pvals_valid = [pvals[i] for i in valid_idx]

# Start from "no adjusted p-value / not rejected" for every comparison and
# overwrite only the positions that took part in the correction.
p_adj_full = [np.nan] * len(pvals)
reject_full = [False] * len(pvals)

if pvals_valid:
    # Benjamini-Hochberg FDR at alpha = 0.05 on the valid p-values only.
    reject, pvals_corrected, _, _ = multipletests(pvals_valid, alpha = 0.05, method = 'fdr_bh')
    for position, adjusted, rejected in zip(valid_idx, pvals_corrected, reject):
        p_adj_full[position] = adjusted
        reject_full[position] = rejected

# One row per pairwise comparison, aligned with `comparisons`.
group1, group2 = [], []
for first, second in comparisons:
    group1.append(first)
    group2.append(second)

posthoc = pd.DataFrame({
    'Group1': group1,
    'Group2': group2,
    'p_raw': pvals,
    'p_adj': p_adj_full,
    'reject': reject_full
})

In [None]:
### Box + Strip Plot with Pairwise Significance.


fig, ax = plt.subplots(figsize = (3, 4))

# Semi-transparent boxes, one per status. `hue` repeats the x variable only to
# apply the per-status palette, so dodging is switched off explicitly: this
# keeps each box centred on its integer tick position (matching the strip plot
# below and the significance brackets) regardless of the library's default.
sns.boxplot(
    data = df,
    x = 'Status',
    y = 'Marker',
    order = status_order,
    hue = 'Status',
    palette = status_colors,
    dodge = False,
    width = 0.6,
    fliersize = 0,  # outlier markers hidden; every point is drawn by the strip plot
    linewidth = 0.8,
    boxprops = dict(alpha = 0.3),
    whiskerprops = dict(linewidth = 1.5, alpha = 0.4),
    capprops = dict(linewidth = 1.5, alpha = 0.4),
    medianprops = dict(linewidth = 1.5, alpha = 0.6),
    ax = ax
)

# Individual observations drawn on top of the boxes.
sns.stripplot(
    data = df,
    x = 'Status',
    y = 'Marker',
    order = status_order,
    hue = 'Status',
    palette = status_colors,
    dodge = False,
    size = 6,
    alpha = 0.8,
    ax = ax
)

# The hue legend only repeats the x-axis labels, so drop it if one was created.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title('Marker', fontsize = 16, weight = 'bold', pad = 12)
ax.set_xlabel('Status', fontsize = 13, weight = 'bold', color = '#404040')
ax.set_ylabel('Marker (ng/ml)', fontsize = 13, weight = 'bold', color = '#404040')

# Open frame: hide the top/right spines, emphasise the bottom/left ones.
for spine in ['top', 'right']:
    ax.spines[spine].set_visible(False)
for spine in ['bottom', 'left']:
    ax.spines[spine].set_linewidth(1.5)
    ax.spines[spine].set_color("#404040")

# Bold, dark-grey tick labels on both axes.
for label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    label.set_fontweight('bold')
    label.set_color('#404040')

def get_significance_label(p):
    """Map a p-value to a star annotation.

    Returns '***' for p < 0.001, '**' for p < 0.01, '*' for p < 0.05 and an
    empty string otherwise, including when `p` is None or NaN.
    """
    if p is None or np.isnan(p):
        return ''
    # Thresholds are checked from strictest to loosest; the first match wins.
    for threshold, stars in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if p < threshold:
            return stars
    return ''

# Vertical layout of the significance brackets. Every distance is derived from
# the data range (via `height_step`) so the bracket ticks and labels stay
# proportionate to the spacing between brackets even when the values sit far
# from zero or are negative.
ymax = df['Marker'].max()
ymin = df['Marker'].min()
height_step = (ymax - ymin) * 0.08 if (ymax - ymin) > 0 else 0.1
baseline = ymax + height_step
current_height = baseline
# These ratios reproduce the previous 1% / 1.5%-of-ymax offsets for data that
# starts at zero.
tick_height = 0.125 * height_step
text_offset = 0.1875 * height_step

# Draw one bracket per comparison that survived FDR correction, stacking each
# new bracket one step above the previous one.
for _, row in posthoc.iterrows():
    if not row['reject']:
        continue
    label = get_significance_label(row['p_adj'])
    if label == '':
        continue
    x1 = status_order.index(row['Group1'])
    x2 = status_order.index(row['Group2'])
    y = current_height
    ax.plot([x1, x1, x2, x2], [y, y + tick_height, y + tick_height, y], lw = 1.5, c = 'black')
    ax.text((x1 + x2)/2, y + text_offset, label, ha = 'center', va = 'bottom', fontsize = 12)
    current_height += height_step

# Omnibus test result shown as a small super-title above the axes title.
p_text = f"Kruskal–Wallis p = {kw_p:.3g}"
plt.suptitle(p_text, fontsize = 10, y = 0.94)
sns.despine()
plt.grid(False)
# Leave the top 5% of the figure free for the super-title.
plt.tight_layout(rect = [0, 0, 1, 0.95])
plt.savefig("Marker_nonparametric.png", dpi = 800, bbox_inches = 'tight', transparent = True)
plt.show()

print("\n=== Non-parametric test summary ===")
print(f"Kruskal–Wallis p-value: {kw_p:.4g}")
print(posthoc)

In [None]:




# ---- Correlations: MBP vs NEFH per Status (selected) ----
# Keep rows where both measurements were recorded, then make them numeric;
# entries that cannot be parsed become NaN.
df_corr = merged_df[(merged_df['MBP'] != 'Missing') & (merged_df['NEFH'] != 'Missing')].copy()
df_corr['MBP'] = pd.to_numeric(df_corr['MBP'], errors='coerce')
df_corr['NEFH'] = pd.to_numeric(df_corr['NEFH'], errors='coerce')

selected_statuses = ["sALS", "Control"]
df_filtered = df_corr[df_corr['Status'].isin(selected_statuses)].copy()

# One facet per status actually present, at most three per row.
num_statuses = df_filtered['Status'].nunique()
col_wrap = min(num_statuses, 3)

sns.set(style="white")
dot_color = "#175c87"
line_color = "grey"

g = sns.FacetGrid(df_filtered, col="Status", col_wrap=col_wrap, height=4, aspect=1.1, despine=True)
g.map_dataframe(
    sns.regplot,
    x="MBP",
    y="NEFH",
    scatter_kws={'s': 60, 'alpha': 0.5, 'color': dot_color},
    line_kws={'color': line_color, 'alpha': 0.7},
)

for ax, status in zip(g.axes.flat, g.col_names):
    # Use complete (MBP, NEFH) pairs only: a NaN introduced by the numeric
    # coercion would otherwise make spearmanr return NaN, and incomplete rows
    # should not count towards the minimum sample size.
    subset = df_filtered.loc[df_filtered['Status'] == status, ['MBP', 'NEFH']].dropna()
    if len(subset) >= 2:
        r, p = spearmanr(subset['MBP'], subset['NEFH'])
        annotation = f"ρ = {r:.2f}, p = {p:.2g}"
    else:
        annotation = "Insufficient data"
    ax.set_title(f"{status}\n{annotation}", fontsize=11, weight='medium')
    ax.set_xlabel("MBP (ng/ml)", fontsize=10)
    ax.set_ylabel("NEFH (pg/ml)", fontsize=10)
    ax.tick_params(axis='both', which='major', labelsize=9)

plt.tight_layout()
plt.savefig("Main_2_MBP_NEFH_sALS.png", dpi=800, bbox_inches='tight')
plt.show()
