# Predict Intervention Responsiveness

This notebook holds the code for the main analysis of the paper "Peer Perceptions Emerge as Key Predictors in Multimodal Models of Digital Alcohol Intervention Effectiveness" submitted to npj Digital Medicine.

**Data modalities for the main analysis are:**
- `b1_alcohol_self`: Self-reported individual alcohol use and related behaviors.
- `b2_group_subjective`: Participant perceptions of their social group’s drinking norms, attitudes, and approval. How much a participant perceives their peers to drink.
- `b3_group_sociometric`: Social network–derived measures of group structure and connections.
- `b4_brain`: Preprocessed MRI-derived measures of brain activity and connectivity in alcohol-related tasks.
- `b5_demographic`: Basic participant characteristics such as age, gender, and income.
- `b6_psychometric`: Standardized questionnaire-based measures of psychological traits and states.

**Additional analysis:**
- `b7_objective_group_drinking`: Aggregated group-level drinking data (how much a participant's peers actually drink).
(Analyses with this dataframe are commented out; they are left in the code to keep the processing of this data transparent. This analysis was added after the main analysis to compare the predictive utility of objective group drinking with that of group perceptions, and is reported in the supplement of the above paper.)

**External test set:**
- `b2_group_subjective_study2`: Participant perceptions of their social group’s drinking norms, attitudes, and approval in an independent sample.


In [None]:
# Standard libraries
import os
import time
import warnings
from copy import deepcopy
from itertools import combinations, chain

# Progress bar
from tqdm import tqdm

# Data manipulation
import numpy as np
import pandas as pd
from collections import Counter

# Statistical analysis
from scipy.stats import norm

# Visualization
import shap

# Serialization
import joblib

# Custom
from pre_processing import *
from training import *
from plotting import *
from testing import *

# Suppress warnings
warnings.filterwarnings('ignore')


Set seed to replicate the exact results from the paper. Set to `None` to run without random seed (note that this can lead to small deviations in outcome metrics due to randomness in the CV and test process).

Please note that the social network data (`b3_group_sociometric`) cannot be publicly provided due to privacy concerns. Running the script without this feature domain may cause some deviations in results from those reported in the main manuscript. The code that loads and processes this dataframe was left in the notebook but commented out, so that the processing of this data remains transparent. The results are nevertheless qualitatively reproducible, and the code documents our procedure for processing and analysing the data.

In [None]:
# Global random seed: it seeds NumPy's RNG in the next cell and is embedded in
# the name of the results directory created by run_rf_train_test.
SEED = 321
# SEED = None  # uncomment to run without a fixed seed

In [None]:
# Seed NumPy's global RNG. Compare against None explicitly so that a seed of 0
# is honoured instead of being treated as "no seed".
if SEED is not None:
    np.random.seed(SEED)

## Pre-Processing

### Define threshold for responsiveness

Indicate change threshold that qualifies a participant as responsive vs non-responsive

In [None]:
# DEFINE RESPONSIVENESS
# avg reduction in drinking occasions between active and control weeks
# (the data-loading cell below has a pre-computed Study 1 label file for each
# of the values -1, -0.5 and -2)
def_response_drink_occasions = -1

### Load data

In [None]:
output_dir = "../../results"

# Intervention-period data of both studies
data_study1 = pd.read_csv('../data/intervention_time/osf_study1.csv')
data_study2 = pd.read_csv('../data/intervention_time/osf_study2.csv')

# Study 1 baseline data (train/val input)
b1_alcohol_self = pd.read_csv('../data/baseline/alcoholself_bucket280225.csv', index_col=0)
b2_group_subjective = pd.read_csv('../data/baseline/subjective_grouperceptions_280225.csv', index_col=0)
# b3_group_sociometric = pd.read_csv('../data/baseline/data_social.csv')
b4_brain = pd.read_csv('../data/baseline/brain_bucket_280225.csv', index_col=0)
b5_demographic = pd.read_csv('../data/baseline/demographic_bucket280225.csv', index_col=0)
b6_psychometric = pd.read_csv('../data/baseline/psychometrics_bucket280225.csv', index_col=0)

# # Added analysis - To evaluate performance of objective drinking metrics
# b7_objective_group_drinking = pd.read_csv('../data/added_analysis/social_group_drinking.csv', index_col=0)

# Study 2 peer perception data (test input)
b2_group_subjective_study2 = pd.read_csv('../data/baseline/subjective_grouperceptions_test.csv')
baseline_demo_study2 = pd.read_csv('../data/baseline/demo_study2_full.csv')

# Study 1 & 2 drinking/responsiveness data (output -> prediction target).
# A separate pre-computed label file is read for each supported threshold.
if def_response_drink_occasions == -1:
    responsive_study1 = pd.read_csv('../data/intervention_time/responsiveness_study1.csv', index_col=0).reset_index()
elif def_response_drink_occasions == -0.5:
    responsive_study1 = pd.read_csv('../data/intervention_time/responsiveness_study1_-0_5.csv', index_col=0).reset_index()
elif def_response_drink_occasions == -2:
    responsive_study1 = pd.read_csv('../data/intervention_time/responsiveness_study1_-2.csv', index_col=0).reset_index()
else:
    # Fail here with a clear message instead of a NameError on
    # `responsive_study1` in a later cell.
    raise ValueError(
        "def_response_drink_occasions must be one of -1, -0.5 or -2; "
        f"got {def_response_drink_occasions!r}"
    )

responsive_study2 = pd.read_csv('../data/intervention_time/responsiveness_study2.csv', index_col=0).reset_index()

In [None]:
# Keep only the control-condition rows of each study's intervention-period data
data_study1_control = data_study1[data_study1.condition == 'control']
data_study2_control = data_study2[data_study2.condition == 'control']

# A notebook cell only echoes its last bare expression, so both row counts are
# printed explicitly instead of relying on the cell output.
print(f"Study 1 control rows: {len(data_study1_control)}")
print(f"Study 2 control rows: {len(data_study2_control)}")

In [None]:
# Within-study check: does any participant ID occur more than once?
duplicates_study1 = responsive_study1.duplicated(subset='id').any()
duplicates_study2 = responsive_study2.duplicated(subset='id').any()

print(f"Study 1 has duplicates: {duplicates_study1}")
print(f"Study 2 has duplicates: {duplicates_study2}")

# Between-study check: is any participant ID used in both studies?
ids_study1 = set(responsive_study1['id'])
ids_study2 = set(responsive_study2['id'])
overlap = ids_study1 & ids_study2

print(f"Number of overlapping IDs: {len(overlap)}")
if len(overlap) > 0:
    print(f"Overlapping IDs: {overlap}")


In [None]:
# Columns removed from the responsiveness tables before they are merged with
# the baseline features
EXCLUDE_VARS = ['group', 'condition', 'active', 'control', 'difference_drinks_occasions']

# Drop them from both studies in place
for response_table in (responsive_study1, responsive_study2):
    response_table.drop(columns=EXCLUDE_VARS, inplace=True)

In [None]:
# Preview the first rows of the Study 2 responsiveness table
responsive_study2.head()

### Merge Baseline and Target Data

In [None]:
# Training datasets -> Study 1
# Each baseline feature table is inner-joined with the responsiveness labels,
# so only participants present in both tables are kept.
b1_alcohol_self_response = pd.merge(b1_alcohol_self, responsive_study1, on='id', how='inner')
b2_group_subjective_response = pd.merge(b2_group_subjective, responsive_study1, on='id', how='inner')
# b3_group_sociometric_response = pd.merge(b3_group_sociometric, responsive_study1, on='id', how='inner')
b4_brain_response = pd.merge(b4_brain, responsive_study1, on='id', how='inner')
b5_demographic_response = pd.merge(b5_demographic, responsive_study1, on='id', how='inner')
b6_psychometric_response = pd.merge(b6_psychometric, responsive_study1, on='id', how='inner')

# b7_objective_group_drinking_response = pd.merge(b7_objective_group_drinking, responsive_study1, on='id', how='inner')

print(f'Total IDs Study 1: {len(b1_alcohol_self_response)}')
print(f'Responsive IDs Study 1: {b1_alcohol_self_response[b1_alcohol_self_response["responsive"] == 1]["id"].nunique()}')
print('----------')
# Testing dataset -> Study 2
b2_group_subjective_test = pd.merge(b2_group_subjective_study2, responsive_study2, on='id', how='inner')
print(f'Total IDs Study 2: {len(b2_group_subjective_test)}')
print(f'Responsive IDs Study 2: {b2_group_subjective_test[b2_group_subjective_test["responsive"] == 1]["id"].nunique()}')

In [None]:
# Feature categories that are available (the sociometric entry stays commented out)
dataframes = dict(
    alc_self=b1_alcohol_self_response,
    group_sub=b2_group_subjective_response,
    # group_socio=b3_group_sociometric_response,
    brain=b4_brain_response,
    demo=b5_demographic_response,
    psych=b6_psychometric_response,
)

# Per-column count of missing values for every category, followed by a blank line
for category, frame in dataframes.items():
    print(f"Missing values in '{category}':", frame.isna().sum(), "", sep="\n")

## Feature Selection

### Find highly correlated features within buckets
Find redundancy in features if they are highly correlated

In [None]:
# Category name -> merged (features + target) dataframe, used for feature selection
dataframes = dict(
    alc_self=b1_alcohol_self_response,
    group_sub=b2_group_subjective_response,
    # group_socio=b3_group_sociometric_response,
    brain=b4_brain_response,
    demo=b5_demographic_response,
    psych=b6_psychometric_response,
)

In [None]:
# Name of the outcome column carried by every merged dataframe
# (presumably binary, 1 = responsive — confirm against the label files)
TARGET_VAR = 'responsive'

In [None]:
# Search every category for feature pairs whose correlation reaches the 0.8 threshold
correlated_features = find_highly_correlated_features(
    dataframes,
    threshold=0.8,
    target_var=TARGET_VAR,
)

# Display results
for category, feature_pairs in correlated_features.items():
    print(f"\n{category} - Highly Correlated Features:")
    for feature_a, feature_b, rho in feature_pairs:
        print(f"  {feature_a} ↔ {feature_b} : Correlation = {rho:.2f}")


### Remove highly correlated features:

In [None]:
# Choice is made manually: one feature of each highly correlated pair is removed.
# The drops are in place, so the underlying *_response dataframes change too.

dataframes['brain'].drop(
    columns=[
        'reward',
        'ROI_alc_react_v_rest_neurosynth_cogcontrol',
        'ROI_alc_react_v_rest_neurosynth_craving',
        'ROI_alc_react_v_rest_neurosynth_emoreg',
    ],
    inplace=True,
)

# dataframes['group_socio'].drop(columns=['leaders_deg_in', 'goToBad_deg_in'], inplace=True)

dataframes['psych'].drop(
    columns=[
        'ACS_focus',
        'DERS_strategies',
        'BIS_attention_total',
    ],
    inplace=True,
)

In [None]:
# Check that all within-category correlations are gone.
# The target column is passed exactly as in the initial search so that both
# runs use the same settings.
correlated_features = find_highly_correlated_features(dataframes, threshold=0.8, target_var=TARGET_VAR)

for name, pairs in correlated_features.items():
    print(f"\n{name} - Highly Correlated Features:")
    for col1, col2, corr_value in pairs:
        print(f"  {col1} ↔ {col2} : Correlation = {corr_value:.2f}")


In [None]:
# Column count of every category dataframe after the manual removal
{category: len(frame.columns) for category, frame in dataframes.items()}

### Data cleaning

In [None]:
# 1) Unique-ID count per dataframe before any exclusion
print("Initial ID counts per dataframe:")
initial_counts = {category: frame['id'].nunique() for category, frame in dataframes.items()}
for category, n_ids in initial_counts.items():
    print(f"  {category}: {n_ids}")

# 2) Tally the missing cells of every ID, summed over all dataframes
missing_counts = Counter()
for frame in dataframes.values():
    per_id_missing = frame.set_index('id').isna().sum(axis=1)
    for participant_id, n_missing in per_id_missing[per_id_missing > 0].items():
        missing_counts[participant_id] += n_missing

# IDs whose total exceeds 10 missing values are excluded everywhere
bad_ids = {participant_id for participant_id, total in missing_counts.items() if total > 10}
for category, frame in list(dataframes.items()):
    dataframes[category] = frame[~frame['id'].isin(bad_ids)]

print("\nID counts after removing IDs with >10 total missing features:")
post_clean_counts = {category: frame['id'].nunique() for category, frame in dataframes.items()}
for category, n_ids in post_clean_counts.items():
    print(f"  {category}: {n_ids}")

# 3) Keep only the IDs that occur in every dataframe
common_ids = set.intersection(*(set(frame['id']) for frame in dataframes.values()))
for category, frame in list(dataframes.items()):
    dataframes[category] = frame[frame['id'].isin(common_ids)]

# 4) Final N
final_N = len(common_ids)
print(f"\nFinal number of participants present in all dataframes: N = {final_N}")


In [None]:
## Study 2

# 1) Unique IDs before any exclusion
initial_counts = b2_group_subjective_test['id'].nunique()
print("Initial ID counts study 2:")
print(f"  group_subjective_test: {initial_counts}")

# 2) Keep only the rows that have at most one missing value
n_missing_per_row = b2_group_subjective_test.isna().sum(axis=1)
b2_group_subjective_test = b2_group_subjective_test.loc[n_missing_per_row <= 1]

# 3) Report what is left
remaining_ids = b2_group_subjective_test['id'].nunique()
print(f"  group_subjective_test (after removing >1 missing): {remaining_ids}")
print(f"Responsive: {b2_group_subjective_test['responsive'].sum()} out of {len(b2_group_subjective_test)}")


## Sample Characteristics

In [None]:
# Report how many distinct participant IDs each category dataframe holds
for category, frame in dataframes.items():
    print(f"{category}: {frame['id'].nunique()} unique IDs")


In [None]:
# IDs that occur in every category dataframe
common_ids = set.intersection(*(set(frame['id']) for frame in dataframes.values()))

# Restrict each dataframe to those shared IDs
for category, frame in list(dataframes.items()):
    dataframes[category] = frame.loc[frame['id'].isin(common_ids)]


In [None]:
# Sample characteristics of Study 1, restricted to participants that are
# present in every category dataframe.
common_ids = set.intersection(*[set(df['id']) for df in dataframes.values()])
demo_filtered = dataframes['demo'][dataframes['demo']['id'].isin(common_ids)]

# Sample size
N = demo_filtered.shape[0]
print(f"N = {N}")

# Age (kept unrounded; rounding happens only when printing)
age_mean = demo_filtered['age'].mean()
age_sd = demo_filtered['age'].std()
age_min = demo_filtered['age'].min()
age_max = demo_filtered['age'].max()
print(f"Age: M = {round(age_mean, 2)}, SD = {round(age_sd, 2)}, Min = {age_min}, Max = {age_max}")

# Gender distribution
gender_counts = demo_filtered['gender_numeric'].value_counts().sort_index()
gender_percent = round(100 * gender_counts / gender_counts.sum(), 1)
print("Gender distribution:")
for gender, count in gender_counts.items():
    percent = gender_percent[gender]
    print(f"  Gender {gender}: {count} ({percent}%)")

# Number of responsive participants
num_responsive = demo_filtered['responsive'].sum()
print(f"Responsive participants: {num_responsive} out of {N} ({round(100 * num_responsive / N, 1)}%)")

# Income distribution
income_median = demo_filtered['income_numeric'].median()
income_mean = round(demo_filtered['income_numeric'].mean(), 2)
income_sd = round(demo_filtered['income_numeric'].std(), 2)
income_min = demo_filtered['income_numeric'].min()
income_max = demo_filtered['income_numeric'].max()
print(f"Income: Median = {income_median}, Mean = {income_mean}, SD = {income_sd}")
print(f"The household income for Study 1 participants was between {income_min} and {income_max}, with a median of {income_median}.")


# Race distribution (including missing)
race_counts = demo_filtered['race_numeric'].value_counts(dropna=False).sort_index()
race_percent = round(100 * race_counts / race_counts.sum(), 1)
print("Race distribution:")
for race, count in race_counts.items():
    label = "Missing" if pd.isna(race) else f"Race {race}"
    percent = race_percent[race]
    print(f"  {label}: {count} ({percent}%)")

# College year distribution (including missing).
# dropna=False keeps NaN as its own category; without it the "Missing" label
# below could never be reached and the percentages would silently exclude
# participants without a college year.
college_counts = demo_filtered['college_year'].value_counts(dropna=False).sort_index()
college_percent = round(100 * college_counts / college_counts.sum(), 1)
print("College year distribution:")
for year, count in college_counts.items():
    label = "Missing" if pd.isna(year) else f"Year {year}"
    percent = college_percent[year]
    print(f"  {label}: {count} ({percent}%)")


In [None]:
# Distinct participant IDs per category dataframe
unique_id_lines = [f"{category}: {frame['id'].nunique()} unique IDs" for category, frame in dataframes.items()]
for line in unique_id_lines:
    print(line)

In [None]:
# Study 2
# Attach the Study 2 baseline demographics to the cleaned test set (left join,
# so every test participant is kept even without demographic data).
merged = pd.merge(b2_group_subjective_test, baseline_demo_study2, on='id', how='left')

# Age
age_mean = merged['age'].mean()
age_sd = merged['age'].std()
age_min = merged['age'].min()
age_max = merged['age'].max()
print(f"Age: M = {round(age_mean, 2)}, SD = {round(age_sd, 2)}, Min = {age_min}, Max = {age_max}")

# Gender distribution (including missing)
gender_counts = merged['gender_numeric'].value_counts(dropna=False).sort_index()
gender_percent = round(100 * gender_counts / gender_counts.sum(), 1)
print("Gender distribution:")
for gender, count in gender_counts.items():
    label = "Missing" if pd.isna(gender) else f"Gender {gender}"
    percent = gender_percent[gender]
    print(f"  {label}: {count} ({percent}%)")

# Responsive distribution (including missing)
responsive_counts = merged['responsive'].value_counts(dropna=False).sort_index()
responsive_percent = round(100 * responsive_counts / responsive_counts.sum(), 1)
print("Responsive distribution:")
for resp, count in responsive_counts.items():
    label = "Missing" if pd.isna(resp) else f"Responsive = {resp}"
    percent = responsive_percent[resp]
    print(f"  {label}: {count} ({percent}%)")

# Race distribution (including missing)
race_counts = merged['race_numeric'].value_counts(dropna=False).sort_index()
race_percent = round(100 * race_counts / race_counts.sum(), 1)
print("Race distribution:")
for race, count in race_counts.items():
    label = "Missing" if pd.isna(race) else f"Race {race}"
    percent = race_percent[race]
    print(f"  {label}: {count} ({percent}%)")

# College year distribution (including missing)
college_counts = merged['college_year'].value_counts(dropna=False).sort_index()
college_percent = round(100 * college_counts / college_counts.sum(), 1)
print("College year distribution:")
for year, count in college_counts.items():
    label = "Missing" if pd.isna(year) else f"Year {year}"
    percent = college_percent[year]
    print(f"  {label}: {count} ({percent}%)")

# Income distribution
income_median = merged['income_numeric'].median()
income_mean = round(merged['income_numeric'].mean(), 2)
income_sd = round(merged['income_numeric'].std(), 2)
income_min = merged['income_numeric'].min()
income_max = merged['income_numeric'].max()
print(f"Income: Median = {income_median}, Mean = {income_mean}, SD = {income_sd}")
# This cell summarises the Study 2 sample, so the sentence names Study 2.
print(f"The household income for Study 2 participants was between {income_min} and {income_max}, with a median of {income_median}.")


## Training / Validation

### Nested CV

In [None]:
# Initialise SHAP's JavaScript support in the notebook (needed by SHAP's
# interactive plots; presumably not by the static summary plots — confirm)
shap.initjs()

def _summarize_scores(scores_by_metric, z):
    """Reduce per-repetition scores to a mean and a normal-approximation interval.

    Parameters
    ----------
    scores_by_metric : dict
        Maps a metric name to the list of scores collected over the outer repetitions.
    z : float
        Critical value of the standard normal distribution for the interval.

    Returns
    -------
    dict
        ``{metric: {'mean': mean, '95%_CI': (lower, upper)}}``.
    """
    summary = {}
    for metric, scores in scores_by_metric.items():
        mean_score = np.mean(scores)
        # Standard error of the mean based on the sample standard deviation
        std_error = np.std(scores, ddof=1) / np.sqrt(len(scores))
        summary[metric] = {
            'mean': mean_score,
            '95%_CI': (mean_score - z * std_error, mean_score + z * std_error)
        }
    return summary


def run_rf_train_test(dataframes, param_grid, eval_metrics, outer_reps=50, k=5, CV_reps=5, model_choice_metric='f1',
                      res_dir="./results/", model_type='xgb', test_set=0.3, permutation=False):
    """Train and evaluate models for every single feature category and every pair of categories.

    For each combination the categories are inner-joined on ``id`` and the target
    column. Then, ``outer_reps`` times, the data is split into train and test
    parts, hyperparameters are chosen by ``random_forest_kfold_grid_search`` on
    the training part, and the chosen model is refit and scored on the test part.
    Models, SHAP/PDP plots and score tables are written to a run directory.

    Parameters
    ----------
    dataframes : dict
        Category name -> dataframe containing ``id``, the ``TARGET_VAR`` column
        and the features of that category.
    param_grid : dict
        Hyperparameter grid, forwarded to ``random_forest_kfold_grid_search``.
    eval_metrics : list of str
        Metric names to collect for validation and test scores.
    outer_reps : int
        Number of repeated train/test splits per combination (must be >= 1).
    k, CV_reps : int
        Forwarded to ``random_forest_kfold_grid_search``.
    model_choice_metric : str
        Key of the validation score used to pick the best model of a combination.
    res_dir : str
        Parent directory; a run-specific subdirectory is created inside it.
    model_type : str
        Forwarded to ``random_forest_kfold_grid_search`` and used in the directory name.
    test_set : float
        Forwarded to ``prepare_features_and_targets`` (presumably the test fraction).
    permutation : bool
        If true, training and test labels are shuffled (permutation test).

    Returns
    -------
    str
        Path of the run directory that holds all outputs.

    Raises
    ------
    ValueError
        If ``outer_reps`` is smaller than 1, or if the target column is missing
        from a merged dataframe.
    """
    # Without at least one repetition there is no model to save and no score to average.
    if outer_reps < 1:
        raise ValueError(f"outer_reps must be >= 1, got {outer_reps}")

    timestamp = int(time.time())
    res_dir = f"{res_dir}/{timestamp}_{SEED}_{model_type}_outer{outer_reps}_cvrep{CV_reps}_k{k}_{model_choice_metric}_testsize{test_set}_perm{permutation}/"
    os.makedirs(res_dir, exist_ok=True)

    # All single categories plus all unordered pairs of categories
    keys = list(dataframes.keys())
    combinations_keys = list(chain.from_iterable(combinations(keys, r) for r in range(1, 3)))

    z = norm.ppf(0.975)  # critical value for a two-sided 95% interval
    n_top_models = 100   # number of group_sub models written to top100_group_sub_models

    combo_validation_scores = {}
    combo_test_scores = {}
    all_val_scores = {}
    all_test_scores = {}

    for combo in tqdm(combinations_keys):
        validation_scores = {metric: [] for metric in eval_metrics}
        test_scores = {metric: [] for metric in eval_metrics}
        top_models_group_sub = []

        # Join the categories of this combination on participant and target
        merged_df = dataframes[combo[0]].copy()
        for key in combo[1:]:
            merged_df = merged_df.merge(dataframes[key].copy(), how='inner', on=['id', TARGET_VAR])
        if TARGET_VAR not in merged_df.columns:
            raise ValueError(f"Target variable '{TARGET_VAR}' not found in merged dataframe for combo: {combo}")

        all_shap_values = []
        all_test_data = []
        best_overall_score = -np.inf
        best_model_for_combo = None

        for _ in range(outer_reps):  # repeated train/test splits

            # Prepare train/test split for this repetition (random & stratified)
            X_data, Y_data, X_test, Y_test = prepare_features_and_targets(merged_df.copy(), test_set=test_set, target_var=TARGET_VAR)

            # Shuffle labels for permutation tests
            if permutation:
                Y_data = Y_data.sample(frac=1, random_state=None).reset_index(drop=True)
                Y_test = Y_test.sample(frac=1, random_state=None).reset_index(drop=True)

            best_model, best_scores, best_params = random_forest_kfold_grid_search(X_data, Y_data,
                                                                                    param_grid, k=k,
                                                                                    CV_reps=CV_reps,
                                                                                    eval_metric=eval_metrics,
                                                                                    model_choice_metric=model_choice_metric,
                                                                                    res_dir=res_dir,
                                                                                    model_type=model_type,
                                                                                    combo=combo)
            # Collect validation metrics of the selected configuration
            for metric, score in best_scores.items():
                validation_scores[metric].append(score)

            # Retrain the best model on the full training dataset and evaluate on the test set
            best_model.fit(X_data, Y_data)
            test_predictions = best_model.predict(X_test)
            proba_predictions = best_model.predict_proba(X_test)[:, 1]

            explainer = shap.TreeExplainer(best_model)
            shap_values = explainer.shap_values(X_test)
            # Keep the values for class index 1; assumes a 3-D
            # (samples, features, classes) array — TODO confirm for the
            # installed shap version and for every supported model_type.
            shap_values = shap_values[:, :, 1]

            # Keep SHAP values and test data for later aggregation
            all_shap_values.append(shap_values)
            all_test_data.append(pd.DataFrame(X_test))

            # Track the repetition with the highest validation score
            if best_scores[model_choice_metric] > best_overall_score:
                best_overall_score = best_scores[model_choice_metric]
                best_model_for_combo = best_model

            if combo == ('group_sub',):
                top_models_group_sub.append((best_scores[model_choice_metric], deepcopy(best_model)))

            # Calculate and append metrics for the test set
            test_scores = compute_test_metrics(Y_test, test_predictions, proba_predictions, test_scores)

        # Persist the best model of this combination
        joblib.dump(best_model_for_combo, f"{res_dir}/model_{'_'.join(combo)}.joblib")

        # Save the top-ranked models of the group_sub combination
        if combo == ('group_sub',):
            # NOTE(review): the best model is already in the list from the loop
            # above, so this adds it a second time and it is saved under two
            # ranks. Kept as is so that the saved files stay unchanged; confirm
            # whether the duplicate is intended.
            top_models_group_sub.append((best_overall_score, deepcopy(best_model_for_combo)))

            # Highest validation score first
            top_models_group_sub.sort(key=lambda entry: entry[0], reverse=True)

            subdir = os.path.join(res_dir, "top100_group_sub_models")
            os.makedirs(subdir, exist_ok=True)

            for rank, (score, model) in enumerate(top_models_group_sub[:n_top_models], start=1):
                joblib.dump(model, f"{subdir}/model_rank{rank}_score{score:.4f}.joblib")

        top2_features = plot_shap_summary_with_percentages(all_shap_values, all_test_data, res_dir, combo)

        plot_pdp_across_runs(
            best_model=best_model_for_combo,
            res_dir=res_dir,
            all_test_data=all_test_data,
            interaction_pair=tuple(top2_features)
        )

        # Mean and 95% CI of the validation scores; files are rewritten after
        # every combination so that partial results survive an interrupted run
        combo_validation_scores[combo] = _summarize_scores(validation_scores, z)
        all_val_scores[combo] = validation_scores
        save_metrics_to_csv(all_val_scores, res_dir, 'all_val_scores.csv')

        # Mean and 95% CI of the test scores
        combo_test_scores[combo] = _summarize_scores(test_scores, z)
        all_test_scores[combo] = test_scores
        save_metrics_to_csv(all_test_scores, res_dir, 'all_test_scores.csv')

        # Called with a target file name; the returned frames are not needed here
        flatten_score_dict(combo_validation_scores, res_dir=res_dir, filename="validation_scores.csv")
        flatten_score_dict(combo_test_scores, res_dir=res_dir, filename="test_scores.csv")

    return res_dir

### Run Analyses

In this section, all feature categories and their combinations are used to train Random Forest models and evaluate them in a nested CV.

Results of every run are stored, alongside SHAP and PDP plots using the functions above.

In [None]:
# Feature categories handed to run_rf_train_test; only the uncommented entries
# are used for training.
# NOTE(review): this rebuilds `dataframes` from the merged `*_response`
# variables. The row-level exclusions of the "Data cleaning" section were
# applied by re-assigning `dataframes[name]` to filtered copies, so they do not
# appear to carry over to the frames used here (the in-place column drops do).
# Confirm that this is intended.
dataframes = {
    'demo': b5_demographic_response,
    # 'alc_self': b1_alcohol_self_response,
    # 'psych': b6_psychometric_response,
    # 'group_sub': b2_group_subjective_response,
    # # 'group_socio': b3_group_sociometric_response,
    # 'brain': b4_brain_response,
    # 'group_selfreport': b7_objective_group_drinking_response
}

Defining the model parameter grid to use in k-fold grid search.

In [None]:
# Hyperparameter grid passed to run_rf_train_test, which forwards it to
# random_forest_kfold_grid_search (the keys look like scikit-learn random
# forest parameters — confirm against training.py)
param_grid = {
    "n_estimators": [50],
    "max_depth": [3, 5],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [2, 3, 5]
}

# Metric names collected for both the validation and the test scores
eval_metrics = ['auc', 'f1', 'accuracy', 'specificity', 'sensitivity', 'PPV', 'NPV', 'MCC', 'balancedAcc', 'pr_auc', 'tn', 'fn', 'tp', 'fp']

#### 3-fold CV

##### Normal Run

In [None]:
# Main run: 3-fold CV with AUC as the model-selection metric and unshuffled
# labels. `res_dir` receives the run directory created by run_rf_train_test.
res_dir = run_rf_train_test(
    dataframes=dataframes,
    param_grid=param_grid,
    eval_metrics=eval_metrics,
    outer_reps=100, # reduce for faster run --> this affects the results
    k=3,
    CV_reps=5,
    model_choice_metric='auc',
    res_dir="../results/",
    model_type='rf',
    test_set=0.3,
    permutation=False
)

##### Permutation Test

In [None]:
# res_dir = "../results/1750853719_321_rf_outer100_cvrep5_k3_auc_testsize0.3_permFalse"
# run_rf_train_test(
#     dataframes=dataframes,
#     param_grid=param_grid,
#     eval_metrics=eval_metrics,
#     outer_reps=100, # reduce for faster run --> this affects the results
#     k=3,
#     CV_reps=5,
#     model_choice_metric='auc',
#     res_dir=os.path.join(res_dir, 'permutation_test'),
#     model_type='rf',
#     test_set=0.3,
#     permutation=True
# )

## External Test

As detailed in the accompanying paper, test data is available for the feature category `group subjective (GRP)`. This independent follow-up data sample is used as a test set.

In [None]:
# NOTE(review): hard-coded directory of an earlier run (note the `_og` suffix);
# it replaces the `res_dir` returned by the training cell. Point it at your own
# run directory when re-running the notebook.
res_dir = '../results/1750853719_321_rf_outer100_cvrep5_k3_auc_testsize0.3_permFalse_og'
# Best saved model of the group_sub category from that run
loaded_model = joblib.load(f'{res_dir}/model_group_sub.joblib')
# Out-of sample testing, without resampling
res_dir = os.path.join(res_dir, 'oos_test') 
scores, best_params = test_oos(b2_group_subjective_test, res_dir, loaded_model, None, plot=True, target_var=TARGET_VAR)
print(scores)

In [None]:
# Out of sample testing with resampling to the positive rate of the training data
# Re-seed NumPy here so the resampling does not depend on which cells ran
# before. Compare against None explicitly so that a seed of 0 is not skipped.
if SEED is not None:
    np.random.seed(SEED)

res_dir = '../results/1750853719_321_rf_outer100_cvrep5_k3_auc_testsize0.3_permFalse_og'
summary_df, all_scores_df = evaluate_top_models(
    res_dir=res_dir,
    test_df=b2_group_subjective_test,
    top_n=1,
    n_iterations=100,
    desired_positive_rate=0.22,
    plot=False,
    target_var=TARGET_VAR
)

In [None]:
# Display the summary table returned by evaluate_top_models
summary_df

### Permutation test

In [None]:
import os
import joblib
import pandas as pd
from tqdm import tqdm

# Directory holding the models that were trained on permuted labels
model_dir = "../results/1750853719_321_rf_outer100_cvrep5_k3_auc_testsize0.3_permFalse/permutation_test/1754476983_321_rf_outer100_cvrep5_k3_auc_testsize0.3_permTrue/top100_group_sub_models"
model_files = sorted([f for f in os.listdir(model_dir) if f.endswith(".joblib")])

# Without models there is no permutation distribution to compare against;
# stop here instead of failing later on an empty summary table.
if not model_files:
    raise FileNotFoundError(f"No .joblib models found in {model_dir}")

# Storage for all permutation scores
perm_model_scores = []

# Evaluate each permutation-trained model on the real test set
for model_file in tqdm(model_files):
    model_path = os.path.join(model_dir, model_file)
    loaded_model = joblib.load(model_path)

    # Same call as in the external test above, including the target column
    scores, _ = test_oos(b2_group_subjective_test, model_dir, loaded_model, None, plot=False, target_var=TARGET_VAR)

    # Remember which model produced these scores
    scores['model_file'] = model_file
    perm_model_scores.append(scores)

# One row per permutation model, one column per metric
perm_model_df = pd.DataFrame(perm_model_scores)

# Compare each metric in summary_df to the permutation distribution:
# count the permutation models that score at least as high as the real mean
comparison_results = {}
for metric in summary_df.index:
    if metric not in perm_model_df.columns:
        continue
    threshold = summary_df.loc[metric, 'Mean']
    count = (perm_model_df[metric] >= threshold).sum()
    comparison_results[metric] = {
        'NumPermutations >= True': count,
        'Proportion': count / len(perm_model_df)
    }

# Format as DataFrame
perm_test_summary = pd.DataFrame(comparison_results).T
print(perm_test_summary.sort_values("Proportion"))


In [None]:
from statsmodels.stats.multitest import multipletests

# Size of the permutation distribution
n_perm = len(perm_model_df)

# Only metrics that exist in both the summary and the permutation scores are tested
shared_metrics = [metric for metric in summary_df.index if metric in perm_model_df.columns]

records = []
for metric in shared_metrics:
    thr = summary_df.loc[metric, 'Mean']
    count = int((perm_model_df[metric] >= thr).sum())
    records.append({
        'Metric': metric,
        'Threshold': thr,
        'NumPermutations >= Threshold': count,
        'Proportion': count / n_perm,
        # permutation p with +1 correction
        'p_value': (count + 1) / (n_perm + 1)
    })

# FDR correction across metrics
pvals = [rec['p_value'] for rec in records]
_, qvals, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh')

# Attach corrected p values
for rec, q in zip(records, qvals):
    rec['p_value_fdr_bh'] = q

perm_test_summary = pd.DataFrame(records).set_index('Metric').sort_values('p_value_fdr_bh')
print(perm_test_summary)


## Sensitivity Analyses

Run CV with different values for k and different train/validation splits.

In [None]:
# # Sensitivity 
# run_rf_train_test(
#     dataframes=dataframes,
#     param_grid=param_grid,
#     eval_metrics=eval_metrics,
#     outer_reps=100,
#     k=3,
#     CV_reps=5,
#     model_choice_metric='auc',
#     res_dir="../results/",
#     model_type='rf',
#     test_set=0.4,
#     permutation=False
# )

# run_rf_train_test(
#     dataframes=dataframes,
#     param_grid=param_grid,
#     eval_metrics=eval_metrics,
#     outer_reps=100,
#     k=5,
#     CV_reps=5,
#     model_choice_metric='auc',
#     res_dir="../results/",
#     model_type='rf',
#     test_set=0.3,
#     permutation=False
# )