# EDA file
This file contains code for the important EDA plots. 

### Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import groupby
#!conda remove scipy scikit-learn -y
#!conda install scipy scikit-learn -y
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_recall_curve, average_precision_score
#%pip install mord
from mord import LogisticAT
from sklearn.metrics import classification_report, roc_auc_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.svm import SVC
#%pip install panelsplit
#from panelsplit import PanelSplit
from sklearn.model_selection import BaseCrossValidator
from sklearn.utils.validation import indexable
from itertools import chain
from statsmodels.miscmodels.ordinal_model import OrderedModel
from sklearn.base import BaseEstimator, RegressorMixin
from pandas.plotting import autocorrelation_plot
#%pip install imblearn
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingRegressor
#%pip install xgboost
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from scipy.stats import boxcox
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC
#%pip install smogn
from smogn import smoter
from sklearn.metrics import f1_score, roc_auc_score, classification_report, RocCurveDisplay
#%pip install lifelines
from lifelines import CoxPHFitter
from lifelines.utils import k_fold_cross_validation
from lifelines import CoxPHFitter
from lifelines.statistics import proportional_hazard_test
#%pip install scikit-survival
from sklearn.metrics import f1_score, roc_auc_score, classification_report, precision_recall_curve, auc
from sklearn.preprocessing import QuantileTransformer
import pickle
from scipy.stats import chi2_contingency
from scipy.stats import pointbiserialr


### Data load

In [None]:
import os

# Both Stata files sit in the same directory; the relative paths below are resolved against it.
os.chdir('C:/Users/Nik/Documents/thesis/PEP data copy/Original Stata files/')
path, path2 = "./PEP_all data_long_20220316.dta", "./PEP_demographics_20220316.dta"

# The second file spells the participant key 'StudyID'; rename it so both frames share 'studyID'.
df2 = pd.read_stata(path2).rename(columns={'StudyID': 'studyID'})

# Left join on studyID: every row of the first file is kept and gets the matching columns of the second.
df_merged = pd.read_stata(path).merge(df2, on='studyID', how='left')
df = df_merged


### Data pre-processing

In [None]:
# Column groups that the rest of the notebook refers to by name.
disease_columns = [
    'heartattack', 'strokes', 'congestiveHD', 'carcinoma', 'hipfracture',
    'chronicLung', 'Hypertension', 'arthritis', 'DM_TG',
]
outcome_columns = ['N_adl4dis', 'N_IADL5', 'N_mob4dis']
other_columns = ['SCESD_ge16', 'BMI_ge30']
confounder_columns = [
    'BMI_3cp', 'age_fu', 'female', 'white', 'vis3cat',
    'hear3cat', 'smoker_fu', 'mu3', 'rx_fu', 'hospstay',
]
# Every column of the merged frame whose name starts with 'prob_b'.
prob_b_columns = [name for name in df.columns if name.startswith('prob_b')]

In [None]:
def impute_disease(df, disease_columns, date_col):
    """
    Impute NaN values in disease columns from each participant's own history.

    Rows are ordered by studyID and date_col. Within each studyID a disease
    indicator is then carried forward: every row at or after the first
    reported 1 becomes 1 (a later 0 is overwritten as well, because the
    disease is assumed to persist). A NaN that has no earlier report of 1 for
    that participant becomes 0; this includes NaNs that follow a reported 0
    and participants whose values are all NaN.

    Args:
    df (DataFrame): The dataframe containing the data. Must have a 'studyID' column.
    disease_columns (list): List of columns that are disease indicators.
    date_col (str): Column name for the date to ensure chronological order.

    Returns:
    DataFrame: A copy of df sorted by studyID and date_col, with NaNs in the
    disease columns imputed. The frame passed in is not modified.
    """
    disease_columns = list(disease_columns)

    # sort_values returns a new frame, so the assignments below never touch the caller's data
    df = df.sort_values(by=['studyID', date_col])
    if not disease_columns:
        return df

    # Treat "missing" as "not reported" (0) first, then take the running maximum
    # inside each participant. This works on row order rather than index labels,
    # so it stays correct after sorting has shuffled the index.
    not_reported_as_zero = df[disease_columns].fillna(0)
    df[disease_columns] = not_reported_as_zero.groupby(df['studyID']).cummax()

    return df

# Impute the disease indicators, ordering each participant's rows by 'intdate';
# note that the returned frame is sorted by studyID and intdate
df = impute_disease(df, disease_columns, 'intdate')

# Check the results
#print(df.head())


In [None]:
# Symptom, screening and outcome columns: the small amount of missing data is
# filled with 0, on the assumption that a missing entry means "not reported".
# The disease columns are not zero-filled here; they were imputed by
# impute_disease() above.
for column_group in (prob_b_columns, other_columns, outcome_columns):
    df[column_group] = df[column_group].fillna(0)

# With the gaps filled, store all four column groups as integers.
for column_group in (prob_b_columns, disease_columns, other_columns, outcome_columns):
    df[column_group] = df[column_group].astype(int)


## EDA

### Distributions and additional prep

#### TR

In [None]:

# Ensure intdate is in datetime format
df['intdate'] = pd.to_datetime(df['intdate'])

# Sort by studyID and interview date
df = df.sort_values(by=['studyID', 'intdate'])

# TR: for every row whose restrict is not 0, the number of consecutive
# restrict == 0 rows of the same participant that directly precede it
df['TR'] = 0

for study_id, group in df.groupby('studyID'):
    tr_counter = 0
    for idx, restrict_value in group['restrict'].items():
        if restrict_value == 0:
            tr_counter += 1  # one more row without a restriction
        else:
            df.loc[idx, 'TR'] = tr_counter
            tr_counter = 0  # start counting again after this row

# Keep only the rows with restrict == 1. Rows with restrict == 0 are dropped
# here, so a run of them at the end of a participant's series needs no TR value.
# .copy() makes the column assignment below act on an independent frame.
df = df[df['restrict'] == 1.0].copy()

# A TR of 0 (restrict was also non-zero on the previous row) is replaced by
# the participant's most recent non-zero TR. The forward fill is done per
# studyID so a value is never carried over from the previous participant;
# zeros with nothing to inherit stay 0.
df['TR'] = (
    df['TR']
    .replace(0, np.nan)
    .groupby(df['studyID'])
    .ffill()
    .fillna(0)
    .astype(int)
)

# Plot the new TR distribution (one bin per integer value)
tr_min, tr_max = int(df['TR'].min()), int(df['TR'].max())
plt.figure(figsize=(10, 6))
df['TR'].plot(kind='hist', bins=range(tr_min, tr_max + 2), alpha=0.7, color='blue')
plt.title('Distribution of TR')
plt.xlabel('Time Since Last Report (months)')
plt.ylabel('Frequency')
plt.xticks(range(tr_min, tr_max + 1))
plt.tight_layout()
plt.show()


In [None]:
# How often each integer TR value occurs, ordered by TR
tr_counts = df['TR'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(15, 10))
tr_counts.plot(kind='bar', edgecolor='k', alpha=0.7, ax=ax)
ax.set_title('Frequency of Each Integer Value of TR')
ax.set_xlabel('TR')
ax.set_ylabel('Frequency')
plt.xticks(rotation=0)
fig.tight_layout()
plt.show()

# log(1 + TR): log1p keeps TR = 0 finite
df['TR_log'] = np.log1p(df['TR'])

# Histogram of the log-transformed values
fig, ax = plt.subplots(figsize=(15, 10))
ax.hist(df['TR_log'], bins=30, edgecolor='k', alpha=0.7)
ax.set_title('Distribution of Log-Transformed TR')
ax.set_xlabel('Log-Transformed TR')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()


In [None]:
# NOTE(review): this cell originally relied on `scaler`, `df['TR_lag1']` and
# `X[numeric_features]`, none of which exist at this point of the notebook.
# They are built here so the cell runs on its own. The transformer settings
# are an assumption (scikit-learn defaults, fixed seed) - TODO confirm against
# the modelling notebook.
n_quantiles = max(1, min(1000, len(df)))
scaler = QuantileTransformer(n_quantiles=n_quantiles, random_state=0)
df['TR_quantile'] = scaler.fit_transform(df[['TR']]).ravel()

# TR of the participant's previous row; reuse the column if it was already created
if 'TR_lag1' in df.columns:
    tr_lag1 = df['TR_lag1']
else:
    tr_lag1 = df.groupby('studyID')['TR'].shift(1)

# A separate transformer, so `scaler` stays fitted on TR itself
lag_scaler = QuantileTransformer(n_quantiles=n_quantiles, random_state=0)
tr_lag1_quantile = pd.Series(
    lag_scaler.fit_transform(tr_lag1.to_frame()).ravel(),
    index=tr_lag1.index,
    name='TR_lag1_quantile',
)

fig, axes = plt.subplots(2, 2, figsize=(12,10))

sns.histplot(df['TR'], ax=axes[0,0])
axes[0,0].set_title('Original TR distribution')

sns.histplot(tr_lag1, ax=axes[0,1])
axes[0,1].set_title('Original lagged TR distribution (lag=1)')

sns.histplot(df['TR_quantile'], ax=axes[1,0])
axes[1,0].set_title('Quantile-transformed TR distribution')

sns.histplot(tr_lag1_quantile, ax=axes[1,1])
axes[1,1].set_title('Quantile-transformed lagged TR distribution (lag=1)')

plt.tight_layout()
plt.show()
             

#### Additional prep

In [None]:

# Parse the interview date as datetime, then order the rows by participant and interview date
df = df.assign(intdate=pd.to_datetime(df['intdate'])).sort_values(by=['studyID', 'intdate'])



#### Map column names
# Symptom wording based on the PEP study, in the order of the prob_b columns
detailed_symptoms = [
    "pain or stiffness in your joints",
    "pain or stiffness in your back",
    "leg pain on walking",
    "weakness of your arms or legs",
    "swelling in your feet or ankles",
    "been fatigued (no energy/very tired)",
    "difficulty breathing or shortness of breath",
    "chest pain or tightness",
    "poor or decreased vision",
    "been dizzy or unsteady on your feet",
    "a fall or injury",
    "been afraid of falling",
    "cold or flu symptoms",
    "difficulty with sleeping",
    "nausea, vomiting, diarrhea, or other stomach (abdominal) problem",
    "a problem with your memory or difficulty thinking",
    "been depressed",
    "been anxious or worried",
    "frequent or painful urination",
    "lost control of your urine and wet yourself",
    "has a family member or friend become seriously ill or had an accident",
    "experienced the death or loss of a family member or friend",
    "a change in your medications",
    "a problem with alcohol",
]

# The first description belongs to prob_b3, the next to prob_b4, and so on
start_index = 3

# prob_b<number> -> readable symptom description
column_mapping = {
    f'prob_b{number}': description
    for number, description in enumerate(detailed_symptoms, start=start_index)
}

df.rename(columns=column_mapping, inplace=True)

# From here on the symptom columns are addressed by their descriptions
prob_b_columns = [*column_mapping.values()]

# Outcome columns
outcome_columns = ['N_adl4dis', 'N_IADL5', 'N_mob4dis']

# Per outcome: difference to the participant's previous row, and a 0/1 change flag
for col in outcome_columns:
    delta = df[col] - df.groupby('studyID')[col].shift()
    df['delta_' + col] = delta
    # A participant's first row has no previous row (delta is NaN). NaN != 0 is
    # True, so the flag has to exclude NaN explicitly or every first row would
    # count as a change.
    df['change_' + col] = (delta.notna() & (delta != 0)).astype(int)

# 1 if at least one of the outcomes changed on this row
df['combined_change'] = df[[f'change_{col}' for col in outcome_columns]].max(axis=1).astype(int)

# Running total of TR for each individual
df['cumulative_time'] = df.groupby('studyID')['TR'].cumsum()

# Initialize actual time until change columns
df['actual_time_until_change'] = np.nan

# For each change row: the sum of TR over the participant's rows since the
# previous change row (inclusive) or since the start of the series, not
# counting the change row's own TR
for study_id, group in df.groupby('studyID'):
    changed = group['combined_change'] == 1
    if not changed.any():
        continue
    change_time = 0
    for idx, tr_value, is_change in zip(group.index, group['TR'], changed):
        if is_change:
            df.loc[idx, 'actual_time_until_change'] = change_time
            change_time = 0
        change_time += tr_value

# Calculate final intbloc value for each studyID
final_intbloc = df.groupby('studyID')['intbloc'].transform('max')

# Rows without a change get the participant's final intbloc value. Assign the
# result back instead of using an in-place fillna on a column selection, which
# pandas does not guarantee to write through to df.
df['actual_time_until_change'] = df['actual_time_until_change'].fillna(final_intbloc)


# Keep only participants with at least two observations (len(x) >= 2)
df = df.groupby('studyID').filter(lambda x: len(x) >= 2)

# Print the number of participants left after filtering
num_participants = df['studyID'].nunique()
print(f"Number of participants with at least two observations: {num_participants}")


# Calculate symptom frequency and duration per participant
#   frequency: running sum of the symptom column within the participant
#   duration:  length of the current run of consecutive non-zero rows (0 on a zero row)
for col in prob_b_columns:
    df[f'{col}_frequency'] = df.groupby('studyID')[col].cumsum()
    df[f'{col}_duration'] = df.groupby('studyID')[col].transform(lambda x: (x != 0).astype(int).groupby((x == 0).astype(int).cumsum()).cumsum())

# Calculate symptom onset for each symptom: TR accumulated over the current run
# of rows on which the symptom is non-zero (0 on rows where it is 0).
# Built from index-aligned grouped operations, so the result does not depend on
# the row order of df matching the order in which groupby returns the groups.
for col in prob_b_columns:
    weighted_tr = df[col] * df['TR']
    running_total = weighted_tr.groupby(df['studyID']).cumsum()
    # running total as it stood at the participant's most recent zero row
    total_at_last_zero = (
        running_total.where(df[col] == 0)
        .groupby(df['studyID'])
        .ffill()
        .fillna(0)
    )
    df[f'{col}_onset'] = running_total - total_at_last_zero

# Calculate symptom recurrence for each symptom: running count of rows on which
# the value rose by exactly 1 compared with the participant's previous row
for col in prob_b_columns:
    df[f'{col}_recurrence'] = df.groupby('studyID')[col].transform(lambda x: (x.diff().fillna(0) == 1).cumsum())

# Lags to build (currently only lag 1)
lag_range = range(1, 2)

# Names of the per-symptom temporal columns, grouped by kind
temporal_kinds = ('recurrence', 'onset', 'frequency', 'duration')
temporal_base_columns = [f'{symptom}_{kind}' for kind in temporal_kinds for symptom in prob_b_columns]

# Shift every source column within its participant: symptoms, TR and outcomes
# first, then the temporal columns
for col in prob_b_columns + ['TR'] + outcome_columns + temporal_base_columns:
    for lag in lag_range:
        df[f'{col}_lag{lag}'] = df.groupby('studyID')[col].shift(lag)

# Names of the lagged features, kept in variables for later use
lagged_symptom_features = [f'{symptom}_lag{lag}' for symptom in prob_b_columns for lag in lag_range]
lagged_outcomes = [f'{outcome}_lag{lag}' for outcome in outcome_columns for lag in lag_range]
lagged_tr_features = [f'TR_lag{lag}' for lag in lag_range]
lagged_temporal_features = [f'{base}_lag{lag}' for base in temporal_base_columns for lag in lag_range]

# Rows that lack any lagged symptom, outcome or temporal value are dropped
# (the lagged TR columns are not part of this check)
lagged_columns = lagged_symptom_features + lagged_outcomes
lagged_temporal_columns = list(lagged_temporal_features)
df = df.dropna(subset=lagged_columns + lagged_temporal_columns)

# Row-wise sum of the lagged symptom indicators
df['total_symptoms_lagged'] = df[lagged_symptom_features].sum(axis=1)

# Size of each outcome change, ignoring its direction
for col in outcome_columns:
    df[f'magnitude_change_{col}'] = df[f'delta_{col}'].abs()

# Feature sets
binary_features = prob_b_columns
# NOTE(review): numeric_features is every other column of df, whatever it holds
# (identifier and date columns included) - confirm that this is intended.
excluded_from_numeric = set(binary_features) | {'actual_time_until_change', 'total_symptoms_lagged'}
numeric_features = [col for col in df.columns if col not in excluded_from_numeric]
temporal_features = [
    f'{col}_{kind}'
    for kind in ('frequency', 'duration', 'onset', 'recurrence')
    for col in prob_b_columns
]

# The non-lagged temporal columns
current_temporal_features = temporal_features

# Final dataframe for modeling (remaining NaNs are kept)
df = df.reset_index(drop=True)
# Missing values per column
df.isna().sum()

# Plot the distribution of actual_time_until_change
#plt.figure(figsize=(10, 6))
#df['actual_time_until_change'].dropna().plot(kind='hist', bins=range(int(df['actual_time_until_change'].min()), int(df['actual_time_until_change'].max()) + 2), alpha=0.7, color='blue')
#plt.title('Distribution of Actual Time Until Change')
#plt.xlabel('Time Until Change (months)')
#plt.ylabel('Frequency')
#plt.xticks(range(int(df['actual_time_until_change'].min()), int(df['actual_time_until_change'].max()) + 1))
#plt.tight_layout()
#plt.show()


In [None]:

# Rows per participant
participant_lengths = df.groupby('studyID').size()

# Histogram of those counts
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(participant_lengths, bins=50, edgecolor='black')
ax.set_xlabel('Number of Observations')
ax.set_ylabel('Number of Participants')
ax.set_title('Distribution of Time Series Lengths')
plt.show()

#### Removing end of life data

In [None]:
# Date five months before 'ddate'
eol_window = pd.DateOffset(months=5)
df['end_of_life_threshold'] = df['ddate'] - eol_window

# Rows of participants with Died == 1 whose interview falls on or after that date
died = df['Died'] == 1
within_window = df['intdate'] >= df['end_of_life_threshold']

# Drop those rows and continue with the reduced frame
df_no_eol = df[~(died & within_window)]
df = df_no_eol

In [None]:
# Calculate the end-of-life threshold (last 5 months)
df['end_of_life_threshold'] = df['ddate'] - pd.DateOffset(months=5)

# NOTE(review): if the cell above has already run, df no longer contains the
# end-of-life rows and the two histograms below are identical. Run this cell
# instead of the one above to see a before/after difference.

# Plot the distribution of TR before removing end-of-life period.
# Bin edges run to max + 1 so the largest TR value gets its own bin
# (the same binning as the first TR histogram).
bins_before = range(int(df['TR'].min()), int(df['TR'].max()) + 2)
plt.figure(figsize=(15, 10))
plt.hist(df['TR'], bins=bins_before, edgecolor='k', alpha=0.7)
plt.title('Distribution of TR before Removing End-of-Life Period')
plt.xlabel('TR')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Remove the last 5 months of life for each participant who has died (Died == 1)
df_no_eol = df[~((df['Died'] == 1) & (df['intdate'] >= df['end_of_life_threshold']))]

# Plot the distribution of TR after removing end-of-life period
bins_after = range(int(df_no_eol['TR'].min()), int(df_no_eol['TR'].max()) + 2)
plt.figure(figsize=(15, 10))
plt.hist(df_no_eol['TR'], bins=bins_after, edgecolor='k', alpha=0.7)
plt.title('Distribution of TR after Removing End-of-Life Period')
plt.xlabel('TR')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()


#### Outcomes

In [None]:

# One histogram per outcome score, side by side
outcome_colors = {'N_adl4dis': 'blue', 'N_IADL5': 'green', 'N_mob4dis': 'red'}

plt.figure(figsize=(15, 5))

for position, (outcome, color) in enumerate(outcome_colors.items(), start=1):
    plt.subplot(1, 3, position)
    sns.histplot(df[outcome], color=color, label=outcome)
    plt.title(f'Distribution of {outcome}')
    plt.xlabel(outcome)
    plt.ylabel('Frequency')
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
n_rows = 8
n_cols = 3

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 26))
axes = axes.flatten()  # Flatten the axes array for easier iteration

# One bar plot per symptom column
for i, col in enumerate(prob_b_columns):
    # value_counts() orders by frequency, so the first bar is not always "No".
    # Reindexing to [0, 1] pins the bar order to the tick labels and also
    # yields two bars when one of the values never occurs.
    # NOTE(review): assumes the symptom columns only hold 0 and 1.
    value_counts = df[col].value_counts().reindex([0, 1], fill_value=0)

    # Create bar plot
    value_counts.plot(kind='bar', ax=axes[i], color=['blue', 'orange'])
    axes[i].set_title(col, fontsize=8, wrap=True)  # say which symptom the panel shows
    axes[i].set_xticklabels(['No', 'Yes'], rotation=0)
    axes[i].set_xlabel('')
    axes[i].set_ylabel('Frequency')
    axes[i].set_ylim([0, df.shape[0]])

# Hide panels that have no symptom assigned
for unused_ax in axes[len(prob_b_columns):]:
    unused_ax.set_visible(False)

# Figure-level title; plt.title would only label the last panel
fig.suptitle('Frequencies of all 24 pre-specified symptoms')
fig.tight_layout(rect=[0, 0, 1, 0.98])
plt.show()


In [None]:
# Histogram of the time-until-change (TUC) column
fig, ax = plt.subplots(figsize=(12,10))
sns.histplot(df['actual_time_until_change'], ax=ax)
ax.set_title('Distribution of TUC')
ax.set_xlabel('TUC')
ax.set_ylabel('Frequency')
plt.show()

### Associations

In [None]:

# TR together with its lag-1 version (both columns must already exist in df)
tr_columns = ['TR', 'TR_lag1']
tr_df = df.loc[:, tr_columns]

# Spearman rank correlation between the two
tr_spearman_corr = tr_df.corr(method='spearman')

# Heatmap of the 2x2 correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(tr_spearman_corr, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Spearman Correlation Heatmap for TR and Lagged TR')
plt.show()


In [None]:
# Pearson correlations among the lagged symptoms and TR; print the TR column, strongest first
sym_tr_corr = df[lagged_symptom_features + ['TR']].corr(method='pearson')
tr_correlations = sym_tr_corr['TR'].sort_values(ascending=False)
print(tr_correlations)

In [None]:
# Pairwise correlations between the symptom columns
symptom_frame = df[prob_b_columns]
correlation_matrix = symptom_frame.corr()

# Annotated heatmap of the matrix
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix of Symptoms')
plt.show()

In [None]:
disease_columns = ['heartattack', 'strokes', 'congestiveHD', 'carcinoma', 'hipfracture', 'chronicLung', 'Hypertension', 'arthritis', 'DM_TG']


def _symptom_share(disease):
    """Sum of each symptom column over rows with disease == 1, divided by the sum of the disease column."""
    rows_with_disease = df[disease] == 1
    return df.loc[rows_with_disease, prob_b_columns].sum() / df[disease].sum()


# Rows: symptoms, columns: diseases
result_df = pd.DataFrame(
    {disease: _symptom_share(disease) for disease in disease_columns},
    index=prob_b_columns,
)

# Heatmap of the table
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(result_df, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title('Prevalence of prob_b_columns for each Disease')
ax.set_ylabel('Symptom columns')
ax.set_xlabel('Disease')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Calculate correlation matrix for total symptoms and other columns
# NOTE(review): neither the 'total_symptoms' column nor `columns_to_analyze` is
# created anywhere in this notebook (only 'total_symptoms_lagged' is), so this
# cell raises unless both were defined elsewhere in the session - TODO confirm
# where they come from and define them before this line.
correlation_matrix = df[['total_symptoms'] + columns_to_analyze].corr()

# Plot the correlation matrix, with the colour scale fixed to [-1, 1]
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Total Symptoms and Other Variables')
plt.show()


In [None]:
def _point_biserial_row(symptom):
    """Point-biserial correlation of one lagged symptom column with TR, as a result row."""
    corr, p_value = pointbiserialr(df[symptom], df['TR'])
    return {'Symptom': symptom, 'Correlation': corr, 'P-Value': p_value}


# One result row per lagged symptom
correlation_results = [_point_biserial_row(symptom) for symptom in lagged_symptom_features]

# Collect the rows in a DataFrame
correlation_df = pd.DataFrame(correlation_results)

# Bar chart of the correlations
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Symptom', y='Correlation', data=correlation_df, palette='viridis', ax=ax)
ax.set_title('Point-Biserial Correlation between Lagged Symptoms (lag=1) and TR')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Spearman rank correlations between the lagged symptoms and the outcome scores
rank_columns = lagged_symptom_features + outcome_columns
spearman = df[rank_columns].corr(method='spearman')

# Keep only the symptom-by-outcome part of the full matrix
corr_mat = spearman.loc[lagged_symptom_features, outcome_columns]

fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(corr_mat, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Spearman Rank Correlations between lagged symptoms (lag=1) and outcome scores')
plt.show()

In [None]:
# Chi-squared test of independence: each lagged symptom vs. the event indicator.
# NOTE(review): 'event_observed' is not created anywhere in this notebook. When
# the column is absent, 'combined_change' (1 if any outcome changed) is used
# instead, which matches the plot title - TODO confirm this is the intended column.
event_column = 'event_observed' if 'event_observed' in df.columns else 'combined_change'

chi2_results = {}

for feature in lagged_symptom_features:
    cont_table = pd.crosstab(df[feature], df[event_column])
    chi2, p, dof, _expected = chi2_contingency(cont_table)
    chi2_results[feature] = {'chi2': chi2, 'p-value': p, 'degrees of freedom': dof}

# Only the p-values are plotted
chi2_p_values = {feature: result['p-value'] for feature, result in chi2_results.items()}

chi2_df = pd.DataFrame.from_dict(chi2_p_values, orient='index', columns=['p-value'])
chi2_df = chi2_df.sort_values(by='p-value')

plt.figure(figsize=(12,8))
chi2_df['p-value'].plot(kind='bar', color='skyblue')
plt.axhline(y=0.05, color='r', label='Significance Threshold is 0.05')
plt.title('Chi-Squared Test p-values for lagged symptom features (lag=1) versus Change in outcome observed')
plt.xlabel('Lagged Symptoms')
plt.ylabel('p-value')
plt.legend()
plt.show()

In [None]:

# Function to calculate Cramér's V
def cramers_v(x, y):
    """
    Bias-corrected Cramér's V between two categorical variables.

    The chi-squared statistic of the x-by-y contingency table is divided by the
    number of observations, reduced by the correction term
    (k-1)(r-1)/(n-1), and scaled by the smaller of the corrected table
    dimensions minus one.

    Args:
    x (Series): First categorical variable.
    y (Series): Second categorical variable, same length as x.

    Returns:
    float: Cramér's V, or NaN when it is undefined (fewer than two observations,
    a variable with a single level, or a zero denominator after correction).
    """
    confusion_matrix = pd.crosstab(x, y)
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape

    # A single observation or a constant variable leaves nothing to associate
    if n <= 1 or r < 2 or k < 2:
        return np.nan

    # chi2_contingency is imported at the top of the file; the statistic is its first return value
    chi2 = chi2_contingency(confusion_matrix)[0]
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return np.nan
    return np.sqrt(phi2corr / denominator)

# Lagged and current symptom columns are compared with each other
categorical_columns = lagged_symptom_features + prob_b_columns

# Cramér's V for every ordered pair of columns, one list per matrix row
pairwise_values = [
    [cramers_v(df[row_name], df[col_name]) for col_name in categorical_columns]
    for row_name in categorical_columns
]
cramers_v_matrix = pd.DataFrame(
    pairwise_values,
    index=categorical_columns,
    columns=categorical_columns,
    dtype=object,
)

print(cramers_v_matrix)


In [None]:
# The matrix was filled cell by cell, so convert it to float before plotting
cramers_v_matrix = cramers_v_matrix.astype(float)
plt.figure(figsize=(20,16))
# `cbar` is a boolean switch; the colour bar label has to be passed through `cbar_kws`
sns.heatmap(cramers_v_matrix, annot=False, cmap='coolwarm', cbar_kws={'label': "Cramer's V"})
plt.show()

### Other

In [None]:

# Interview date as datetime
df['intdate'] = pd.to_datetime(df['intdate'])

# Outcome columns
outcome_columns = ['N_adl4dis', 'N_IADL5', 'N_mob4dis']

# Row-to-row difference of each outcome within a participant; the first row of
# a participant has no predecessor and gets 0
diff_columns = []
for col in outcome_columns:
    diff_name = 'diff_' + col
    df[diff_name] = df.groupby('studyID')[col].diff().fillna(0)
    diff_columns.append(diff_name)

# Sum of the three differences per row
df['combined_diff'] = df[diff_columns].sum(axis=1)

# Guard against NaNs in combined_diff (none are expected after fillna(0))
valid_data = df.dropna(subset=['combined_diff'])

# How often each combined difference occurs, ordered by value
frequency = valid_data['combined_diff'].value_counts().sort_index()

# Range of the x-axis
min_diff = valid_data['combined_diff'].min()
max_diff = valid_data['combined_diff'].max()

# Bar chart of the frequencies
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(frequency.index, frequency.values, color='blue')
ax.set_title('Change Frequency for Combined Outcomes')
ax.set_xlabel('Combined Change in Score')
ax.set_ylabel('Frequency')
ax.set_xticks(range(int(min_diff), int(max_diff) + 1))
fig.tight_layout()
plt.show()


##### Author: Nikhil Kashyap, MSc Information Studies: Data Science, UvA