In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import mannwhitneyu, spearmanr, ttest_ind, wilcoxon

In [None]:
# Columns used for the drinking-context summary: the SHINEID identifier, the
# HadAlcohol flag, and one Alcohol_* column per social context.
cols_to_keep = [
    "SHINEID",
    "HadAlcohol",
    "Alcohol_Alone",
    "Alcohol_Friend",
    "Alcohol_Coworker",
    "Alcohol_Family",
    "Alcohol_Stranger",
    "Alcohol_SigOther",
]

# ID tables used to restrict each study's EMA rows to the listed participants.
study_1_ids = pd.read_csv("../data/study_1_ids.csv")
study_2_ids = pd.read_csv("../data/study_2_ids.csv")

# EMA responses for each study (files live under intervention_time/).
ema_study1_df = pd.read_csv("../data/intervention_time/EMA_study1.csv")
ema_study2_df = pd.read_csv("../data/intervention_time/EMA_study2.csv")

In [None]:
# Study 1: keep the context columns, restrict to IDs listed in study_1_ids
# (inner join id <-> SHINEID), then keep only rows where HadAlcohol == 1.
ema_study1_df = (
    study_1_ids
    .merge(ema_study1_df[cols_to_keep], left_on='id', right_on='SHINEID', how='inner')
    .loc[lambda merged: merged['HadAlcohol'] == 1]
)
len(ema_study1_df)

In [None]:
# Count and share of Study 1 rows (already filtered to HadAlcohol == 1) per
# drinking context. The denominator is taken from the frame itself rather than
# a hard-coded row count, so the percentages stay correct if the data changes.
# NOTE(review): assumes the former literal 546 was len(ema_study1_df) as shown
# by the previous cell — confirm.
study1_context_columns = {
    "drank alone": "Alcohol_Alone",
    "drank with friend(s)": "Alcohol_Friend",
    "drank with coworker": "Alcohol_Coworker",
    "drank with family": "Alcohol_Family",
    "drank with stranger(s)": "Alcohol_Stranger",
    "drank with significant other": "Alcohol_SigOther",
}

n_study1_rows = len(ema_study1_df)
if n_study1_rows == 0:
    # Avoid a meaningless division when the filter left no rows.
    print("No Study 1 rows with HadAlcohol == 1.")
else:
    for label, column in study1_context_columns.items():
        count = ema_study1_df[column].sum()
        print(f"{label}: {count}, {count / n_study1_rows * 100}%")

Study 1: Participants drank with friends 61% of the time, versus 15% of the time with family, 10% with their significant other, 6% with strangers, 5% alone, and 3% with coworkers.



# Study 2: keep the context columns, restrict to IDs listed in study_2_ids
# (inner join id <-> SHINEID), then keep only rows where HadAlcohol == 1.
# Fixes two copy-paste slips from the Study 1 cell: the merge used
# study_1_ids and the row mask was built from ema_study1_df.
# NOTE(review): assumes study_2_ids has an `id` column like study_1_ids — confirm.
ema_study2_df = ema_study2_df[cols_to_keep]
ema_study2_df = pd.merge(study_2_ids, ema_study2_df, left_on='id', right_on='SHINEID', how='inner')
ema_study2_df = ema_study2_df[ema_study2_df['HadAlcohol'] == 1]
len(ema_study2_df)

In [None]:
# Count and share of Study 2 rows (already filtered to HadAlcohol == 1) per
# drinking context. The denominator is taken from the frame itself rather than
# a hard-coded row count, so the percentages stay correct if the data changes.
# NOTE(review): assumes the former literal 1433 was len(ema_study2_df) as shown
# by the previous cell — confirm.
study2_context_columns = {
    "drank alone": "Alcohol_Alone",
    "drank with friend(s)": "Alcohol_Friend",
    "drank with coworker": "Alcohol_Coworker",
    "drank with family": "Alcohol_Family",
    "drank with stranger(s)": "Alcohol_Stranger",
    "drank with significant other": "Alcohol_SigOther",
}

n_study2_rows = len(ema_study2_df)
if n_study2_rows == 0:
    # Avoid a meaningless division when the filter left no rows.
    print("No Study 2 rows with HadAlcohol == 1.")
else:
    for label, column in study2_context_columns.items():
        count = ema_study2_df[column].sum()
        print(f"{label}: {count}, {count / n_study2_rows * 100}%")

Study 2: Participants drank with friends 48% of the time, versus 23% of the time with family, 12% with their significant other, 3% with strangers, 13% alone, and 1% with coworkers.



# Do underestimators differ systematically from other participants?

In [None]:
# IDs from the Study 1 underestimators file, stored as a list of strings.
underestimator_ids = (
    pd.read_csv("../data/added_analysis/underestimators_study_1.csv")["id"]
    .astype(str)
    .tolist()
)

# Per-study tables from the intervention_time folder.
data_study1 = pd.read_csv('../data/intervention_time/osf_study1.csv')
data_study2 = pd.read_csv('../data/intervention_time/osf_study2.csv')

# Study 1 baseline data (train/val input), one frame per feature bucket.
# All but the sociometric file are read with their first column as the index.
b1_alcohol_self = pd.read_csv(
    '../data/baseline/alcoholself_bucket280225.csv', index_col=0
)
b2_group_subjective = pd.read_csv(
    '../data/baseline/subjective_grouperceptions_280225.csv', index_col=0
)
b3_group_sociometric = pd.read_csv('../data/baseline/data_social.csv')
b4_brain = pd.read_csv(
    '../data/baseline/brain_bucket_280225.csv', index_col=0
)
b5_demographic = pd.read_csv(
    '../data/baseline/demographic_bucket280225.csv', index_col=0
)
b6_psychometric = pd.read_csv(
    '../data/baseline/psychometrics_bucket280225.csv', index_col=0
)

In [None]:
# Short bucket name -> baseline frame; later cells look frames up by these keys.
dataframes = dict(
    alc_self=b1_alcohol_self,
    group_sub=b2_group_subjective,
    group_socio=b3_group_sociometric,
    brain=b4_brain,
    demo=b5_demographic,
    psych=b6_psychometric,
)

In [None]:
def _coerce_numeric(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if that fails.

    Stands in for ``pd.to_numeric(..., errors='ignore')``, which recent pandas
    releases deprecate in favour of catching the exception explicitly.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def compare_underestimators(frame, ids, min_group_size=3):
    """Test each numeric column of ``frame`` for a difference between two groups.

    Rows whose ``id`` (compared as a string) is in ``ids`` form the
    underestimator group; all remaining rows form the comparison group.

    Parameters
    ----------
    frame : pd.DataFrame
        Table with an ``id`` column. It is not modified.
    ids : iterable
        IDs of the underestimator group; each is compared as ``str``.
    min_group_size : int, default 3
        A column is skipped when either group has fewer non-missing values.

    Returns
    -------
    pd.DataFrame
        One row per tested column with ``feature``, ``t_test_p`` (Welch
        t-test) and ``mannwhitney_p`` (two-sided Mann-Whitney U). Empty when
        no column could be tested. P-values are not corrected for multiple
        comparisons.
    """
    id_strings = {str(i) for i in ids}
    is_under = frame['id'].astype(str).isin(id_strings)

    # Coerce columns to numeric where possible; `id` and any pre-existing
    # `under` column are never tested.
    candidates = frame.drop(columns=['id', 'under'], errors='ignore').apply(_coerce_numeric)
    numeric = candidates.select_dtypes(include=np.number)

    rows = []
    for col in numeric.columns:
        group1 = numeric.loc[is_under, col].dropna()
        group0 = numeric.loc[~is_under, col].dropna()
        if len(group1) < min_group_size or len(group0) < min_group_size:
            continue

        rows.append({
            'feature': col,
            # t-test (Welch)
            't_test_p': ttest_ind(group1, group0, equal_var=False).pvalue,
            # Mann-Whitney U
            'mannwhitney_p': mannwhitneyu(group1, group0, alternative='two-sided').pvalue,
        })

    return pd.DataFrame(rows)


# Later cells match IDs as strings, so keep the list normalised to str.
underestimator_ids = [str(i) for i in underestimator_ids]

for name, frame in dataframes.items():
    print(f"\n=== {name} ===")
    if 'id' not in frame.columns:
        # Report the skip instead of dropping the table silently.
        print("Skipped: no 'id' column.")
        continue

    res_df = compare_underestimators(frame, underestimator_ids)
    if res_df.empty:
        print("No numeric features or insufficient data.")
    else:
        print(res_df.to_string(index=False))

In [None]:
# Collect relevant features into a single dictionary:
# panel title -> (key in `dataframes`, column to plot)
features_to_plot = {
    "Alcohol attitude (ALC)": ("alc_self", "alcohol_alc_att_2"),
    "Attentional Control: atten shifting (PSY)": ("psych", "ACS_attentionshifting"),
    "Interpersonal autonomy (PSY)": ("psych", "IAS_mean")
}

# Explicit group labels and order, so tick labels come from the data rather
# than from an assumed False/True ordering of a boolean column.
group_order = ["Others", "Underestimators"]
group_palette = dict(zip(group_order, ["#4A4E69", "#9A8C98"]))

# Create figure: one panel per feature. squeeze=False keeps `axes` 2-D even
# when only one feature is listed.
n_panels = len(features_to_plot)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), sharey=False, squeeze=False)
for ax, (title, (df_name, col)) in zip(axes[0], features_to_plot.items()):
    plot_df = dataframes[df_name].copy()
    is_under = plot_df["id"].astype(str).isin(underestimator_ids)
    plot_df["group"] = np.where(is_under, "Underestimators", "Others")

    # Assigning `hue` avoids seaborn's deprecated "palette without hue" path;
    # dodge=False keeps each box centred on its tick.
    sns.boxplot(
        data=plot_df,
        x="group",
        y=col,
        order=group_order,
        hue="group",
        hue_order=group_order,
        palette=group_palette,
        dodge=False,
        ax=ax,
    )
    # The legend would only repeat the x tick labels; remove it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_xlabel("")
    ax.set_ylabel(col)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# Test-retest reliability of drinking perceptions

## Self-reports

In [None]:
# Follow-up 
# Study 1 data (each file's first column becomes the index).
b1_alcohol_self = pd.read_csv(
    '../data/baseline/alcoholself_bucket280225.csv', index_col=0
)
b2_group_subjective = pd.read_csv(
    '../data/baseline/subjective_grouperceptions_280225.csv', index_col=0
)
baseline_self = pd.read_csv(
    '../data/added_analysis/baseline_alc_self.csv', index_col=0
)

# Follow-up file with peers' and own drinking (per its file name).
followup_drinking = pd.read_csv(
    '../data/added_analysis/followup1_peers_and_self_drinking.csv', index_col=0
)

In [None]:
# Pair each participant's baseline self-report row with their follow-up row
# (inner join on pID, so only participants present in both tables remain).
drinking_self_comparison = pd.merge(
    baseline_self,
    followup_drinking,
    on='pID',
    how='inner',
)

In [None]:
# Display the merged baseline / follow-up self-report table for inspection.
drinking_self_comparison

In [None]:
# Test-retest check for self-reported drinking: Spearman rank correlation
# between each `*_self` column and its `*_self_numeric` counterpart in the
# merged table. Pairs with a missing value are omitted (nan_policy='omit').
# `spearmanr` is imported in the notebook's top import cell.
# NOTE(review): which column is baseline and which is follow-up is not visible
# in this cell — confirm against the source files.
rho_freq, p_freq = spearmanr(
    drinking_self_comparison['freq_self'],
    drinking_self_comparison['freq_self_numeric'],
    nan_policy='omit'
)

rho_amount, p_amount = spearmanr(
    drinking_self_comparison['amount_self'],
    drinking_self_comparison['amount_self_numeric'],
    nan_policy='omit'
)

# p is printed with three decimals, so very small p-values show as 0.000.
print(f"Frequency: ρ = {rho_freq:.2f}, p = {p_freq:.3f}")
print(f"Amount:    ρ = {rho_amount:.2f}, p = {p_amount:.3f}")

## Peer perceptions

In [None]:
# Join baseline group perceptions to the follow-up table (id <-> pID, inner).
# Overlapping column names keep the baseline name unchanged and get the
# `_followup1` suffix on the follow-up side.
# Note: this reuses (overwrites) the `drinking_self_comparison` name.
drinking_self_comparison = pd.merge(
    b2_group_subjective,
    followup_drinking,
    how='inner',
    left_on='id',
    right_on='pID',
    suffixes=('', '_followup1'),
)

In [None]:
# Keep pID plus the baseline and `_followup1` versions of the two avg_alcmost columns.
drinking_self_comparison = drinking_self_comparison.loc[
    :,
    [
        'pID',
        'avg_alcmost_freq',
        'avg_alcmost',
        'avg_alcmost_freq_followup1',
        'avg_alcmost_followup1',
    ],
]

In [None]:
# Display the trimmed baseline / follow-up peer-perception table for inspection.
drinking_self_comparison

In [None]:
# Test-retest check for peer drinking perceptions: Spearman rank correlation
# between each baseline column and its `_followup1` counterpart. Pairs with a
# missing value are omitted (nan_policy='omit').
# `spearmanr` is imported in the notebook's top import cell.
rho_freq, p_freq = spearmanr(
    drinking_self_comparison['avg_alcmost_freq'],
    drinking_self_comparison['avg_alcmost_freq_followup1'],
    nan_policy='omit'
)

rho_amount, p_amount = spearmanr(
    drinking_self_comparison['avg_alcmost'],
    drinking_self_comparison['avg_alcmost_followup1'],
    nan_policy='omit'
)

# p is printed with three decimals, so very small p-values show as 0.000.
print(f"Frequency: ρ = {rho_freq:.2f}, p = {p_freq:.3f}")
print(f"Amount:    ρ = {rho_amount:.2f}, p = {p_amount:.3f}")

## Stability of peer nominations

In [None]:
# Nominations table covering baseline and follow-up 1 (per its file name).
nominations_baseline_followup = pd.read_csv(
    '../data/added_analysis/alcmost_nominations_baseline_followup1.csv',
    index_col=0,
)

# this has only the 67 final participants
b4_brain = pd.read_csv(
    '../data/baseline/brain_bucket_280225.csv',
    index_col=0,
)

In [None]:
# Display the loaded nominations table for inspection.
nominations_baseline_followup

In [None]:
# Inner join against b4_brain (pID <-> id) keeps only nomination rows whose
# participant appears in that frame.
nominations_baseline_followup_study1 = pd.merge(
    nominations_baseline_followup,
    b4_brain,
    how='inner',
    left_on='pID',
    right_on='id',
)

In [None]:
# Drop the merged-in b4_brain columns; keep only the ID, the two cleaned_*
# columns, and the jaccard_similarity column.
nominations_baseline_followup_study1 = nominations_baseline_followup_study1.loc[
    :,
    ['id', 'cleaned_baseline', 'cleaned_followup1', 'jaccard_similarity'],
]

In [None]:
# Mean of the jaccard_similarity column over the retained rows (NaNs skipped by default).
nominations_baseline_followup_study1.loc[:, 'jaccard_similarity'].mean()