In [None]:
# Standard libraries
import os
import time
import warnings
from copy import deepcopy
from itertools import combinations, chain

# Progress bar
from tqdm import tqdm

# Data manipulation
import numpy as np
import pandas as pd
from collections import Counter

# Statistical analysis
from scipy.stats import norm
from scipy.stats import spearmanr
from pygam import LogisticGAM, s, f

# Visualization
import shap
import seaborn as sns

# Serialization
import joblib

# Custom
from pre_processing import *
from training import *
from plotting import *
from testing import *

import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, fisher_exact, chi2_contingency
from statsmodels.stats.multitest import multipletests

# Suppress warnings
warnings.filterwarnings('ignore')


In [None]:
# Selects which Study 1 responsiveness file the data-loading cell reads.
# Values handled there: -1, -0.5, -2, -0.9, -1.5, -0.1, or the string 'who'.
# NOTE(review): presumably the threshold on change in drinks per occasion used to
# define a "responder" — confirm against the script that produced those CSVs.
def_response_drink_occasions = -1

In [None]:
# Load all raw inputs: intervention-period data, Study 1 baseline feature buckets,
# Study 2 test inputs, and the responsiveness labels (prediction target).
output_dir = "../../results"

data_study1 = pd.read_csv('../data/intervention_time/osf_study1.csv')
data_study2 = pd.read_csv('../data/intervention_time/osf_study2.csv')

# Study 1 baseline data (train/val input)
b1_alcohol_self = pd.read_csv('../data/baseline/alcoholself_bucket280225.csv', index_col=0)
b2_group_subjective = pd.read_csv('../data/baseline/subjective_grouperceptions_280225.csv', index_col=0)
b3_group_sociometric = pd.read_csv('../data/baseline/data_social.csv')
b4_brain = pd.read_csv('../data/baseline/brain_bucket_280225.csv', index_col=0)
b5_demographic = pd.read_csv('../data/baseline/demographic_bucket280225.csv', index_col=0)
b6_psychometric = pd.read_csv('../data/baseline/psychometrics_bucket280225.csv', index_col=0)

# # Added analysis - To evaluate performance of objective drinking metrics
# b7_objective_group_drinking = pd.read_csv('../data/added_analysis/social_group_drinking.csv', index_col=0)

# Study 2 peer perception data (test input)
# NOTE(review): these two paths are absolute and machine-specific, so the notebook
# only runs on the original author's machine. Copy the files under ../data/ and
# switch to relative paths once their canonical location is confirmed.
b2_group_subjective_study2 = pd.read_csv('/Users/fmagdalena/Documents/GitHub/shine-network-analysis/SHINE/final_buckets/subjective_grouperceptions_test.csv')
baseline_demo_study2 = pd.read_csv('/Users/fmagdalena/Documents/GitHub/SHINE-responsiveness-analysis/data/baseline/demo_study2_full.csv')

# Study 1 & 2 drinking/responsiveness data (output -> prediction target)
# One file per supported responder definition, keyed by def_response_drink_occasions.
_responsive_study1_files = {
    -1: '../data/intervention_time/responsiveness_study1.csv',
    -0.5: '../data/intervention_time/responsiveness_study1_-0_5.csv',
    -2: '../data/intervention_time/responsiveness_study1_-2.csv',
    'who': '../data/intervention_time/responsiveness_study1_who_rdl.csv',
    -0.9: '../data/intervention_time/responsiveness_study1_-0.9.csv',
    -1.5: '../data/intervention_time/responsiveness_study1_-1.5.csv',
    -0.1: '../data/intervention_time/responsiveness_study1_-0.1.csv',
}

# Fail loudly on an unsupported setting: otherwise responsive_study1 would be left
# undefined, or silently keep a stale value from an earlier run of this cell.
if def_response_drink_occasions not in _responsive_study1_files:
    raise ValueError(
        f"Unsupported def_response_drink_occasions={def_response_drink_occasions!r}; "
        f"expected one of {list(_responsive_study1_files)}"
    )

responsive_study1 = pd.read_csv(
    _responsive_study1_files[def_response_drink_occasions], index_col=0
).reset_index()

responsive_study2 = pd.read_csv('../data/intervention_time/responsiveness_study2.csv', index_col=0).reset_index()

In [None]:
# Sanity checks on participant IDs: no repeated IDs within a study, and no ID
# shared between Study 1 (train) and Study 2 (test).
study1_id_col = responsive_study1['id']
study2_id_col = responsive_study2['id']

# Within-study duplicates
duplicates_study1 = study1_id_col.duplicated().any()
duplicates_study2 = study2_id_col.duplicated().any()

print(f"Study 1 has duplicates: {duplicates_study1}")
print(f"Study 2 has duplicates: {duplicates_study2}")

# Between-study overlap
ids_study1 = set(study1_id_col)
ids_study2 = set(study2_id_col)
overlap = ids_study1 & ids_study2

print(f"Number of overlapping IDs: {len(overlap)}") # Should be zero
if len(overlap) > 0:
    print(f"Overlapping IDs: {overlap}")


In [None]:
# Columns removed from the responsiveness tables before merging with the feature
# buckets. Missing columns are skipped (errors='ignore'), so the cell is safe to
# run again.
EXCLUDE_VARS = [
    'group',
    'condition',
    'active',
    'control',
    'difference_drinks_occasions',
]

# Optional restriction to the 'mindful' condition (currently disabled):
# responsive_study1 = responsive_study1[responsive_study1.condition == 'mindful']
# responsive_study2 = responsive_study2[responsive_study2.condition == 'mindful']

# Dropped in place so the existing responsive_study1 / responsive_study2 objects are updated.
for _responsive_table in (responsive_study1, responsive_study2):
    _responsive_table.drop(columns=EXCLUDE_VARS, inplace=True, errors='ignore')


In [None]:
# Training datasets -> Study 1
# Each baseline bucket is inner-joined with the Study 1 labels on 'id', so only
# participants present in both tables are kept.
b1_alcohol_self_response = b1_alcohol_self.merge(responsive_study1, on='id', how='inner')
b2_group_subjective_response = b2_group_subjective.merge(responsive_study1, on='id', how='inner')
# NOTE(review): this joins responsive_study1 with itself, so it contains no
# baseline features (only label columns with _x/_y suffixes). The left frame was
# presumably meant to be an older subjective-perception table — confirm or remove.
b2_group_subjective_response_old = responsive_study1.merge(responsive_study1, on='id', how='inner')
b3_group_sociometric_response = b3_group_sociometric.merge(responsive_study1, on='id', how='inner')
b4_brain_response = b4_brain.merge(responsive_study1, on='id', how='inner')
b5_demographic_response = b5_demographic.merge(responsive_study1, on='id', how='inner')
b6_psychometric_response = b6_psychometric.merge(responsive_study1, on='id', how='inner')

# b7_objective_group_drinking_response = pd.merge(b7_objective_group_drinking, responsive_study1, on='id', how='inner')

# Sample-size summary: row count of the merged frame, and number of distinct
# responder IDs (responsive == 1).
print(f'Total IDs Study 1: {len(b1_alcohol_self_response)}')
print(f'Responsive IDs Study 1: {b1_alcohol_self_response.loc[b1_alcohol_self_response["responsive"] == 1, "id"].nunique()}')
print('----------')

# Testing dataset -> Study 2
b2_group_subjective_test = b2_group_subjective_study2.merge(responsive_study2, on='id', how='inner')
print(f'Total IDs Study 2: {len(b2_group_subjective_test)}')
print(f'Responsive IDs Study 2: {b2_group_subjective_test.loc[b2_group_subjective_test["responsive"] == 1, "id"].nunique()}')

In [None]:
# Registry of the Study 1 feature buckets (already merged with the labels), keyed
# by a short bucket name. The values are the same objects as the b*_response
# frames, not copies.
dataframes = dict(
    alc_self=b1_alcohol_self_response,
    group_sub=b2_group_subjective_response,
    group_socio=b3_group_sociometric_response,
    brain=b4_brain_response,
    demo=b5_demographic_response,
    psych=b6_psychometric_response,
)

In [None]:
# Remove highly correlated features within-bucket
# The drop is done in place, so the b*_response frames referenced by `dataframes`
# lose these columns too. Running the cell a second time raises KeyError because
# the columns are already gone.
_correlated_features = {
    'brain': [
        'reward',
        'ROI_alc_react_v_rest_neurosynth_cogcontrol',
        'ROI_alc_react_v_rest_neurosynth_craving',
        'ROI_alc_react_v_rest_neurosynth_emoreg',
    ],
    # 'group_socio': ['leaders_deg_in', 'goToBad_deg_in'],
    'psych': ['ACS_focus', 'DERS_strategies', 'BIS_attention_total'],
}

for _bucket_name, _redundant_cols in _correlated_features.items():
    dataframes[_bucket_name].drop(columns=_redundant_cols, inplace=True)

## Non-parametric significance tests for differences between responders and non-responders

In [None]:
# Univariate comparison of responders vs. non-responders for every feature in
# every bucket of `dataframes`, with Benjamini-Hochberg FDR correction applied
# separately within each bucket.
# NOTE(review): a later cell repeats this analysis (adding group mean/SD columns)
# and writes to the same CSV path, overwriting this cell's output file.
results = []

for name, df in dataframes.items():
    if 'responsive' not in df.columns:
        continue

    df_results = []  # store results for this dataframe

    for col in df.columns:
        if col in ['id', 'responsive']:
            continue

        # Complete cases for this feature only; y is taken from the same rows.
        x = df[col].dropna()
        y = df.loc[x.index, 'responsive']

        n_levels = x.nunique()
        if n_levels <= 1:
            continue  # constant feature: nothing to test

        if np.issubdtype(x.dtype, np.number) and n_levels > 2:
            # Numeric feature with more than two values: rank-based two-group test.
            try:
                stat, p = mannwhitneyu(
                    x[y == 0], x[y == 1], alternative='two-sided'
                )
                test_type = 'Mann-Whitney U'
            except Exception:
                # Best effort: e.g. one group is empty. The row is kept and
                # flagged in the 'test' column rather than aborting the loop.
                stat, p, test_type = np.nan, np.nan, 'error'

        else:
            # Binary or non-numeric feature: test on the contingency table.
            contingency = pd.crosstab(y, x)
            if contingency.shape == (2, 2):
                stat, p = fisher_exact(contingency)
                test_type = 'Fisher exact'
            else:
                stat, p, _, _ = chi2_contingency(contingency)
                test_type = 'Chi-square'

        df_results.append({
            'dataframe': name,
            'variable': col,
            'test': test_type,
            'statistic': stat,
            'p_value': p
        })

    # FDR correction within this dataframe
    df_results = pd.DataFrame(df_results)
    if not df_results.empty:
        # Correct only the tests that produced a p-value. A NaN passed to
        # multipletests propagates through the BH step and turns every adjusted
        # p-value of the bucket into NaN.
        df_results['p_value_adj'] = np.nan
        has_p = df_results['p_value'].notna()
        if has_p.any():
            _, p_adj, _, _ = multipletests(
                df_results.loc[has_p, 'p_value'], method='fdr_bh'
            )
            df_results.loc[has_p, 'p_value_adj'] = p_adj
        results.append(df_results)

results_df = pd.concat(results, ignore_index=True)
# Rows without an adjusted p-value (failed tests) sort last within their bucket.
results_df = results_df.sort_values(['dataframe', 'p_value_adj'])
results_df.to_csv('./nonparametric_tests_results_fdr.csv', index=False)
results_df.head(20)


In [None]:
# Univariate comparison of responders vs. non-responders for every feature in
# every bucket of `dataframes`, reporting the group-wise mean (SD) for numeric
# features alongside each test, with Benjamini-Hochberg FDR correction applied
# separately within each bucket.
# NOTE(review): this writes to the same CSV path as the preceding tests cell and
# therefore overwrites that cell's output.
results = []

for name, df in dataframes.items():
    if 'responsive' not in df.columns:
        continue

    df_results = []  # store results for this dataframe

    for col in df.columns:
        if col in ['id', 'responsive']:
            continue

        # Complete cases for this feature only; y is taken from the same rows.
        x = df[col].dropna()
        y = df.loc[x.index, 'responsive']

        n_levels = x.nunique()
        if n_levels <= 1:
            continue  # constant feature: nothing to test

        is_numeric = np.issubdtype(x.dtype, np.number)

        # default: no stats
        resp0_stats = ""
        resp1_stats = ""

        # compute group-wise mean (sd) for numeric features
        if is_numeric:
            x0 = x[y == 0]
            x1 = x[y == 1]

            if len(x0) > 0:
                m0 = x0.mean()
                s0 = x0.std(ddof=1)  # sample SD; NaN when the group has one value
                resp0_stats = f"{m0:.2f} ({s0:.2f})"
            if len(x1) > 0:
                m1 = x1.mean()
                s1 = x1.std(ddof=1)
                resp1_stats = f"{m1:.2f} ({s1:.2f})"

        # choose test
        if is_numeric and n_levels > 2:
            # Numeric feature with more than two values: rank-based two-group test.
            try:
                stat, p = mannwhitneyu(
                    x[y == 0], x[y == 1], alternative='two-sided'
                )
                test_type = 'Mann-Whitney U'
            except Exception:
                # Best effort: e.g. one group is empty. The row is kept and
                # flagged in the 'test' column rather than aborting the loop.
                stat, p, test_type = np.nan, np.nan, 'error'

        else:
            # Binary or non-numeric feature: test on the contingency table.
            contingency = pd.crosstab(y, x)
            if contingency.shape == (2, 2):
                stat, p = fisher_exact(contingency)
                test_type = 'Fisher exact'
            else:
                stat, p, _, _ = chi2_contingency(contingency)
                test_type = 'Chi-square'

        df_results.append({
            'dataframe': name,
            'variable': col,
            'test': test_type,
            'statistic': stat,
            'p_value': p,
            'non_responders_mean_sd': resp0_stats,
            'responders_mean_sd': resp1_stats
        })

    # FDR correction within this dataframe
    df_results = pd.DataFrame(df_results)
    if not df_results.empty:
        # Correct only the tests that produced a p-value. A NaN passed to
        # multipletests propagates through the BH step and turns every adjusted
        # p-value of the bucket into NaN.
        df_results['p_value_adj'] = np.nan
        has_p = df_results['p_value'].notna()
        if has_p.any():
            _, p_adj, _, _ = multipletests(
                df_results.loc[has_p, 'p_value'], method='fdr_bh'
            )
            df_results.loc[has_p, 'p_value_adj'] = p_adj
        results.append(df_results)

results_df = pd.concat(results, ignore_index=True)
# Rows without an adjusted p-value (failed tests) sort last within their bucket.
results_df = results_df.sort_values(['dataframe', 'p_value_adj'])
results_df.to_csv('./nonparametric_tests_results_fdr.csv', index=False)
results_df.head(20)


In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score

# Multivariable logistic regression of `responsive` on all remaining columns of
# the subjective peer-perception bucket, using complete cases only.
df = b2_group_subjective_response.drop(columns=['id']).dropna()

# Outcome, and design matrix with an intercept column added.
y = df['responsive']
X = sm.add_constant(df.drop(columns=['responsive']))

# Fit quietly (disp=False), then report coefficients and p-values.
model = sm.Logit(y, X).fit(disp=False)
print(model.summary())

# AUC is computed on the same rows the model was fitted on, so it is an
# in-sample (apparent) estimate, not held-out performance.
y_pred = model.predict(X)
auc = roc_auc_score(y, y_pred)
print(f"AUC = {auc:.2f}")

# 

A logistic regression model including all subjective peer perception variables significantly predicted responsiveness (LLR p < .001, Pseudo-R² = .35).

**Table X. Logistic regression predicting responsiveness from subjective peer perception variables (group_sub).**

| Predictor            | β (SE)    | z       | p       | 95% CI (Lower, Upper) |
|----------------------|-----------|---------:|---------:|-----------------------:|
| Intercept            | -4.56 (2.53) | -1.80 | 0.072 | [-9.52, 0.41] |
| avg_alcmost_freq     | 0.02 (0.01)  | 1.20  | 0.231 | [-0.01, 0.04] |
| avg_alcmost          | **-0.92 (0.33)** | **-2.81** | **0.005** | **[-1.56, -0.28]** |
| alc_norm_5_r         | 0.70 (0.36)  | 1.92  | 0.055 | [-0.01, 1.41] |
| groupAtt_alc         | 0.43 (0.63)  | 0.68  | 0.500 | [-0.81, 1.66] |
| groupAtt_binge       | 0.61 (0.43)  | 1.41  | 0.158 | [-0.24, 1.45] |

Model fit: χ²(5) = 26.07, *p* < .001, Pseudo-*R*² = .35, AUC = 0.89.  


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Boxplots of every feature in `df` split by responder status, three per row.
# `df` is the complete-case frame built in the logistic-regression cell.
vars_to_plot = [col for col in df.columns if col not in ['responsive', 'id']]

n_vars = len(vars_to_plot)
n_cols = 3
n_rows = int(np.ceil(n_vars / n_cols))

fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows)
)

axes = axes.flatten()

for i, col in enumerate(vars_to_plot):
    sns.boxplot(
        x='responsive', y=col, data=df,
        palette=['gray', 'green'], ax=axes[i]
    )
    axes[i].set_title(col)
    axes[i].set_xlabel('Responsive (0=No, 1=Yes)')
    axes[i].set_ylabel('Value')

# Hide leftover panels when the number of variables does not fill the last row;
# otherwise they are drawn as empty axes.
for unused_ax in axes[n_vars:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()
