In [1]:
import sys
sys.path.append(r"../")
#sys.path.append(r"../utils/data_manipulation")
import pandas as pd
from utils.data_manipulation.data_imputation import impute_from_column
from utils.consts.pathology_variables import pathology_variables_times
from utils.target_variable import TargetVariable
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import pingouin as pg
import plotly.graph_objs as go

import os

from statsmodels.stats.contingency_tables import mcnemar
import statsmodels.api as sm
from statsmodels.stats.contingency_tables import Table


import matplotlib
%matplotlib inline

In [2]:
# Create the output directory tree used by the export cells below.
# os.makedirs(..., exist_ok=True) also creates the parent directory and is
# safe to re-run, so no separate existence checks are needed.
for output_dir in (
    "mcnemar_data_with_control/short_effect",
    "mcnemar_data_with_control/long_effect",
):
    os.makedirs(output_dir, exist_ok=True)


In [3]:
# Forward slashes resolve on Windows as well as POSIX systems; the previous
# backslash-separated literal was only a valid path on Windows.
df = pd.read_csv("data/treatment_group/DeppClinic_patient_data.csv")

# Keep only the three study arms. The explicit copy makes `df` an independent
# frame, so adding columns to it later does not write into a slice of the
# originally loaded data.
df = df[df['group'].isin(['ipt', 'tau', 'control'])].copy()

print(df.measurement.unique())
print(df.group.unique())
print(df.group.unique())

['Time 1' 'Time 2' 'Time 3']
['control' 'ipt' 'tau']


In [4]:
# Spreadsheet whose 'ID REDCap' column is matched against each row's 'id' by used_app().
app_ids = pd.read_excel(io=r"../helper_docs/APP_ID.xlsx")
def used_app(x, app_ids):
    """Tell whether a row's id appears in the app-id table.

    Parameters
    ----------
    x : mapping-like
        A single row; only ``x['id']`` is read.
    app_ids : table-like
        Object whose ``'ID REDCap'`` entry holds the ids to match against.

    Returns
    -------
    bool
        True when ``x['id']`` equals one of the ``'ID REDCap'`` values,
        False otherwise.
    """
    known_ids = list(app_ids['ID REDCap'])
    return x['id'] in known_ids
# Row-wise flag: True when the row's 'id' is listed in app_ids['ID REDCap'].
df['used_app'] = df.apply(used_app, axis=1, args=(app_ids,))

In [5]:
# Outcome columns for the 'Time 1' measurement: the 'intake' pathology
# variables plus the C-SSRS intake column.
intake_target_variables = [
    *pathology_variables_times['intake'].keys(),
    'c_ssrs_intake_life_stb',
]

# Outcome columns for the 'Time 2' and 'Time 3' measurements: the 'time2'
# pathology variables plus the C-SSRS score column.
time2_target_variables = [
    *pathology_variables_times['time2'].keys(),
    'c_ssrs_stb_score',
]

# Which outcome columns apply to which measurement label.
target_variables_per_time = {
    'Time 1': intake_target_variables,
    'Time 2': time2_target_variables,
    'Time 3': time2_target_variables,
}

# Intake column name -> corresponding follow-up column name, used to put the
# 'Time 1' rows under the same column names as the later measurements.
intake_to_time2_map = {
    'c_ssrs_intake_life_stb': 'c_ssrs_stb_score',
    'suicidal_behavior_intake': 'suicidal_behavior_time2',
    'nssi_intake': 'nssi_time2',
    'suicidal_ideation_life_intake': 'suicidal_ideation_time2',
    'ER_intake': 'ER_time2',
    'Psychiatric_hospitalization_intake': 'Psychiatric_hospitalization_time2',
}

In [6]:

# 'Time 1' rows: drop the follow-up outcome columns, then rename the intake
# outcome columns to the follow-up names so all three frames share a schema.
time1_df = (
    df.loc[df.measurement.isin(['Time 1'])]
    .drop(columns=time2_target_variables)
    .rename(columns=intake_to_time2_map)
)

# 'Time 2' and 'Time 3' rows are taken as-is.
time2_df = df.loc[df.measurement.isin(['Time 2'])]
time3_df = df.loc[df.measurement.isin(['Time 3'])]


In [7]:
#df_short.groupby(['measurement', 'used_app', 'suicidal_behavior_time2']).id.nunique()

In [8]:
# Stack the 'Time 1' rows on top of each later measurement:
# df_short = Time 1 + Time 2, df_long = Time 1 + Time 3.
df_short, df_long = (
    pd.concat([time1_df, followup_df]) for followup_df in (time2_df, time3_df)
)

In [10]:
# Outcome columns analysed in the cells below.
current_target_vars = [
    'suicidal_ideation_time2',
    'suicidal_behavior_time2',
    'nssi_time2',
    'c_ssrs_stb_score',
]

# Identifying / grouping columns exported alongside the outcomes.
info_cols = ['group', 'id', 'used_app', 'measurement']

# Write the raw long-effect and short-effect tables (outcomes + info columns).
for raw_df, csv_path in (
    (df_long, "mcnemar_data_with_control/long_effect_raw_data.csv"),
    (df_short, "mcnemar_data_with_control/short_effect_raw_data.csv"),
):
    raw_df[current_target_vars + info_cols].to_csv(csv_path, index=False)


In [11]:
def do_mcnemar_test(df, target_variable):
    """Run statsmodels' McNemar test on a used_app x measurement table.

    The table is built with ``pd.crosstab``: rows are the values of
    ``df['used_app']``, columns are the values of ``df['measurement']``, and
    each cell is the sum of ``df[target_variable]`` for that combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'used_app'``, ``'measurement'`` and
        ``target_variable``.
    target_variable : str
        Name of the column summed inside each table cell.

    Returns
    -------
    tuple
        ``(contingency_table, pvalue)`` - the crosstab DataFrame and the
        p-value reported by ``mcnemar``.

    Raises
    ------
    ValueError
        If the crosstab is not 2x2. ``mcnemar`` only reads cells ``[0, 1]``
        and ``[1, 0]``, so a larger table would otherwise be tested silently
        on a subset of its cells.

    Notes
    -----
    NOTE(review): McNemar's test is defined for paired nominal data, where the
    2x2 table cross-classifies each subject's before/after outcome. The table
    built here instead holds outcome sums per (used_app, measurement) cell -
    confirm this is the intended input for the test.
    """
    contingency_table = pd.crosstab(
        df['used_app'], df['measurement'], values=df[target_variable], aggfunc='sum'
    )

    if contingency_table.shape != (2, 2):
        raise ValueError(
            f"McNemar test needs a 2x2 table, got shape {contingency_table.shape} "
            f"for target variable {target_variable!r}"
        )

    # Compute the test once and reuse it for both the printout and the return value.
    results = mcnemar(contingency_table.values)

    print(f'{target_variable = }\nmcnemar(contingency_table.values) =\n')
    print(results)
    print('\t---------------------\n\n\n\n\n')

    return contingency_table, results.pvalue

## Short Effect

In [12]:
# Short effect: test every outcome on df_short and save each table, with the
# rounded p-value embedded in the file name.
for target_variable in current_target_vars:
    if target_variable == 'c_ssrs_stb_score':
        # Not run through the McNemar test; an ANOVA for this column was
        # sketched here earlier but is currently disabled.
        continue

    contingency_table, pval = do_mcnemar_test(df_short, target_variable)
    csv_path = f"mcnemar_data_with_control/short_effect/{target_variable} - pval = {pval.round(decimals=3)}.csv"
    contingency_table.to_csv(csv_path, index=False)
    

target_variable = 'suicidal_ideation_time2'
mcnemar(contingency_table.values) =

pvalue      0.6682658523131814
statistic   65.0
	---------------------





target_variable = 'suicidal_behavior_time2'
mcnemar(contingency_table.values) =

pvalue      0.04216544399495113
statistic   30.0
	---------------------





target_variable = 'nssi_time2'
mcnemar(contingency_table.values) =

pvalue      0.01861775143319464
statistic   26.0
	---------------------







## Long Effect

In [13]:
# Long effect: test every outcome on df_long and save each table, with the
# rounded p-value embedded in the file name.
for target_variable in current_target_vars:
    if target_variable == 'c_ssrs_stb_score':
        # Not run through the McNemar test; an ANOVA for this column was
        # sketched here earlier but is currently disabled.
        continue

    contingency_table, pval = do_mcnemar_test(df_long, target_variable)
    csv_path = f"mcnemar_data_with_control/long_effect/{target_variable} - pval = {pval.round(decimals=3)}.csv"
    contingency_table.to_csv(csv_path, index=False)
    

target_variable = 'suicidal_ideation_time2'
mcnemar(contingency_table.values) =

pvalue      0.24488359344375762
statistic   65.0
	---------------------





target_variable = 'suicidal_behavior_time2'
mcnemar(contingency_table.values) =

pvalue      0.2353798622110552
statistic   37.0
	---------------------





target_variable = 'nssi_time2'
mcnemar(contingency_table.values) =

pvalue      0.59428829514694
statistic   41.0
	---------------------





