In [None]:
# Make the workflow's helper-script directory importable; the project-local
# imports that follow (propensity_matching, ukb_data) are resolved through it.
import sys
sys.path.append(snakemake.params.scripts)

from propensity_matching import propensity_score_matching
from ukb_data import load
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import bootstrap
import numpy as np
import pandas as pd

In [None]:
# Pull the analysis table and the column bookkeeping from the project loader.
# How the names are used further down in this notebook:
#   df               - table indexed by subject id (rows are selected with .loc/.index)
#   t1_feature_fids  - column labels of the model input features
#   treatment_col    - column marking treated subjects (used as a boolean mask)
#   match_cols       - passed through to propensity_score_matching
#   mask_col         - passed through to propensity_score_matching
#   tmask_col        - optional (may be None) boolean column; flagged rows are
#                      kept out of the age-model training pool
# t1_feature_names is returned by the loader but not used in this notebook.
(
    df,
    t1_feature_names,
    t1_feature_fids,
    treatment_col,
    match_cols,
    tmask_col,
    mask_col,
) = load(snakemake)


In [None]:
# Z-score every feature column in place. The scaler is fitted on all rows of
# df, i.e. before any train/test/matched-cohort separation happens below.
feature_scaler = StandardScaler()
df[t1_feature_fids] = feature_scaler.fit_transform(df[t1_feature_fids])

In [None]:
# Build the matched case/control cohorts and the arrays used by the age model.
#
# Steps:
#   1. Cap the number of treated subjects handed to propensity matching.
#   2. Run propensity-score matching on rows with a defined treatment flag.
#   3. Split the matched table into controls (cn) and cases (dx).
#   4. Everything that is neither matched nor flagged by `tmask_col` becomes the
#      pool (`df_`, `x`, `y`) from which the age model is trained and tested.

# Upper bound on treated subjects passed to propensity_score_matching.
MAX_PATIENTS = 5000

n_patients = df[df[treatment_col]].shape[0]
print(n_patients)
if n_patients > MAX_PATIENTS:
    to_drop = n_patients - MAX_PATIENTS
    print(f"Too many patients ({n_patients}) for propensity matching, dropping {to_drop} patients")
    # Seeded with the workflow seed so the same surplus subjects are removed on
    # every run of the same (seed, ...) wildcard combination.
    surplus_eids = (
        df[df[treatment_col]]
        .sample(to_drop, random_state=int(snakemake.wildcards.seed))
        .index
    )
    # A plain bool column cannot hold missing values; switch to the nullable
    # boolean dtype explicitly instead of relying on an implicit upcast when
    # writing pd.NA (deprecated in recent pandas).
    df[treatment_col] = df[treatment_col].astype('boolean')
    # NOTE(review): surplus subjects are only un-flagged in `df`; since they are
    # not part of `matching` they remain eligible for the training pool `df_`
    # below - confirm this is intended.
    df.loc[surplus_eids, treatment_col] = pd.NA

# Work on an explicit copy so the dtype cast below never writes through a view.
df_subset = df.dropna(subset=[treatment_col]).copy()
df_subset[treatment_col] = df_subset[treatment_col].astype(bool)

matching, stats = propensity_score_matching(df_subset, treatment_col, match_cols, mask_col)

# Subject ids of matched controls and matched cases.
eids_cn = matching[matching[treatment_col].eq(False)].index.to_list()
eids_dx = matching[matching[treatment_col].eq(True)].index.to_list()

# Matched subjects (and optionally train-masked subjects) must not be used to
# fit or evaluate the age model.
exclude_eids = matching.index.to_list()
if tmask_col is not None:
    exclude_eids += df.loc[df[tmask_col]].index.to_list()

df_ = df.loc[~df.index.isin(exclude_eids)]
x = df_[t1_feature_fids].values
y = df_['age_t2'].values

# Feature matrices / target ages for the matched cohorts.
x_cn = df.loc[eids_cn, t1_feature_fids].values
x_dx = df.loc[eids_dx, t1_feature_fids].values
y_cn = df.loc[eids_cn, 'age_t2'].values
y_dx = df.loc[eids_dx, 'age_t2'].values

# Display the matching diagnostics returned by propensity_score_matching.
stats

In [None]:
# Result table: one row per trimming iteration, filled by the loop further down
# (train/test fit metrics plus case-control effect sizes and their bootstrap SEs).
df_res = pd.DataFrame()

In [None]:
# Positional row indices into x / y (and therefore into df_).
idx = np.arange(x.shape[0])

# Fixed-size split: the training size and the shuffling seed come from the
# workflow wildcards, the held-out test set always has 1000 rows.
idx_train, idx_test = train_test_split(
    idx,
    train_size=int(snakemake.wildcards.ntrain),
    test_size=1000,
    random_state=int(snakemake.wildcards.seed),
)

In [None]:
# Iterative trimming experiment.
#
# Each iteration:
#   1. fits a RidgeCV age model on the current training rows,
#   2. records train/test fit quality,
#   3. computes the residual (age - predicted age) for matched controls and
#      cases, raw and after removing the linear age trend estimated on the
#      training residuals,
#   4. records the standardized case-control difference and its bootstrap SE,
#   5. removes the training row with the largest absolute residual and replaces
#      it with an unused row of the same sex and (if possible) the same age.
#
# Fixes relative to the previous version of this cell:
#   * The replacement pool used to contain rows that were still in the training
#     set or in the test set, and a chosen replacement was never removed from
#     it, so the training set could receive duplicates or test rows (leakage).
#     The pool now holds only rows that are in neither set.
#   * Random choices (replacement draw, bootstrap) are seeded from the workflow
#     seed, so a rerun with the same wildcards reproduces the same table.
#   * The `corr` wildcard is validated before any model is fitted.
#   * The age-trend regression is fitted once per iteration instead of three times.
#   * No bare `except:`; the "no exact age match" case is an explicit branch.

# Number of drop-and-replace iterations.
N_TRIM_STEPS = 2500


def standardized_mean_difference(a, b):
    """Return (mean(a) - mean(b)) divided by sqrt of the mean of both variances.

    Uses np.std with its default ddof (population standard deviation).
    """
    return (np.mean(a) - np.mean(b)) / np.sqrt((np.std(a) ** 2 + np.std(b) ** 2) / 2)


# Which residual drives the trimming: '1' = age-trend-corrected, '0' = raw.
corr_flag = snakemake.wildcards.corr
if corr_flag not in ('0', '1'):
    raise ValueError('corr must be 0 or 1')
trim_on_corrected_bag = corr_flag == '1'

rng = np.random.default_rng(int(snakemake.wildcards.seed))

# Sex per row of df_, aligned with the positional indices used for x / y.
sex = df_['sex'].to_numpy()

# Replacement candidates: rows that are currently in neither train nor test.
idx = np.setdiff1d(idx, np.concatenate([idx_train, idx_test]))

records = []
for n_dropped in range(N_TRIM_STEPS):
    x_train, y_train = x[idx_train], y[idx_train]
    y_test = y[idx_test]

    model = RidgeCV(alphas=np.logspace(-2, 10, 25)).fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x[idx_test])

    # Residuals are defined as chronological age minus predicted age.
    bag_train = y_train - y_train_pred
    bag_cn = y_cn - model.predict(x_cn)
    bag_dx = y_dx - model.predict(x_dx)

    # Linear dependence of the training residual on age, removed from all three
    # residual vectors.
    age_trend = LinearRegression().fit(y_train.reshape(-1, 1), bag_train)
    bag_train_corr = bag_train - age_trend.predict(y_train.reshape(-1, 1))
    bag_cn_corr = bag_cn - age_trend.predict(y_cn.reshape(-1, 1))
    bag_dx_corr = bag_dx - age_trend.predict(y_dx.reshape(-1, 1))

    # NOTE(review): paired=True resamples controls and cases with the same row
    # indices and needs both groups to have equal length - confirm that
    # eids_cn / eids_dx are aligned pair-wise by the matching step.
    record = {
        'r2_train': r2_score(y_train, y_train_pred),
        'mae_train': mean_absolute_error(y_train, y_train_pred),
        'r2_test': r2_score(y_test, y_test_pred),
        'mae_test': mean_absolute_error(y_test, y_test_pred),
        'effect': standardized_mean_difference(bag_cn, bag_dx),
        'sem': bootstrap(
            (bag_cn, bag_dx), standardized_mean_difference,
            paired=True, random_state=rng,
        ).standard_error,
        'effect_corr': standardized_mean_difference(bag_cn_corr, bag_dx_corr),
        'sem_corr': bootstrap(
            (bag_cn_corr, bag_dx_corr), standardized_mean_difference,
            paired=True, random_state=rng,
        ).standard_error,
    }
    records.append(record)
    print(pd.Series(record, name=n_dropped))

    bag_for_trimming = bag_train_corr if trim_on_corrected_bag else bag_train

    # index of sample with largest residual
    idx_drop = np.argmax(np.abs(bag_for_trimming))
    real_idx_drop = idx_train[idx_drop]
    # print some info on the sample
    print(f'largest residual: {bag_for_trimming[idx_drop]}')
    print(f'age: {y[real_idx_drop]}')
    # drop sample with largest residual (it never re-enters the candidate pool)
    idx_train = np.delete(idx_train, idx_drop)

    # add a new sample with same age and sex as the dropped sample
    same_sex = idx[sex[idx] == sex[real_idx_drop]]
    if same_sex.size == 0:
        # Nothing left to draw from: keep the results gathered so far.
        print('no same-sex replacement left in the pool, stopping early')
        break

    same_age = same_sex[y[same_sex] == y[real_idx_drop]]
    if same_age.size > 0:
        idx_replacement = rng.choice(same_age)
    else:
        # same sex and minimal age difference
        print('no perfect match')
        idx_replacement = same_sex[np.argmin(np.abs(y[same_sex] - y[real_idx_drop]))]

    # A row can be used as a replacement only once.
    idx = idx[idx != idx_replacement]
    idx_train = np.append(idx_train, idx_replacement)

    # print some info on the replacement
    print(f'replacement age: {y[idx_replacement]}')
    print(f'new average age in train set: {np.mean(y[idx_train])}')

# One row per completed iteration, indexed by the number of samples dropped so far.
df_res = pd.DataFrame(records)




In [None]:
# Tag every result row with the workflow wildcards that produced it, plus the
# number of matched cases, so tables from different runs can be concatenated.
run_metadata = {
    'mask': snakemake.wildcards.mask,
    'tmask': snakemake.wildcards.trainmask,
    'treatment': snakemake.wildcards.icd_code,
    'matching': snakemake.wildcards.matching,
    'ntrain': int(snakemake.wildcards.ntrain),
    'ndx': len(x_dx),
}
for column_name, column_value in run_metadata.items():
    df_res[column_name] = column_value

In [None]:
# Persist the result table as JSON at the path declared by the workflow rule.
effects_path = snakemake.output.effects
df_res.to_json(effects_path)

In [None]:
import seaborn as sns

# Age-trend-corrected effect size as a function of the iteration number
# (the frame index, exposed as the 'index' column by reset_index()).
effect_corr_curve = df_res.reset_index()
sns.lineplot(data=effect_corr_curve, x='index', y='effect_corr')


In [None]:
# Uncorrected effect size as a function of the iteration number.
effect_curve = df_res.reset_index()
sns.lineplot(data=effect_curve, x='index', y='effect')


In [None]:
# Held-out R^2 of the age model as a function of the iteration number.
r2_test_curve = df_res.reset_index()
sns.lineplot(data=r2_test_curve, x='index', y='r2_test')