In [1]:
import pandas as pd
df = pd.read_csv("../data/volume-data/cohortBeforeMatching.csv")
df = df.drop(columns=['Unnamed: 0', 'followUpHY']).set_index('subjectId')
df

Unnamed: 0_level_0,gen,age,initialHY,group
subjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3001,1,65.1,1,1
3003,2,56.7,2,1
3020,2,74.0,2,1
3024,1,52.7,1,1
3059,1,83.0,2,1
...,...,...,...,...
53060,1,68.1,2,0
53060,1,68.1,2,0
54265,1,75.2,2,0
55875,1,59.0,2,0


In [3]:
df = df[df['initialHY'] > 0]
df = df[df['initialHY'] < 3]
df

Unnamed: 0_level_0,gen,age,initialHY,group
subjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3001,1,65.1,1,1
3003,2,56.7,2,1
3020,2,74.0,2,1
3024,1,52.7,1,1
3059,1,83.0,2,1
...,...,...,...,...
53060,1,68.1,2,0
53060,1,68.1,2,0
54265,1,75.2,2,0
55875,1,59.0,2,0


In [4]:
# normalize columns
all_vars = ['age', 'gen', 'initialHY']
to_normalize = ['age'] 
for var in all_vars:
    if var in to_normalize:
        df[f'{var}_norm'] = (df[var] - df[var].mean())/df[var].std()
    else:
        df[f'{var}_norm'] = df[var]

df_norm = df.drop(columns=all_vars)
df_norm

Unnamed: 0_level_0,group,age_norm,gen_norm,initialHY_norm
subjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3001,1,0.317457,1,1
3003,1,-0.540718,2,2
3020,1,1.226714,2,2
3024,1,-0.949373,1,1
3059,1,2.146187,1,2
...,...,...,...,...
53060,0,0.623948,1,2
53060,0,0.623948,1,2
54265,0,1.349310,1,2
55875,0,-0.305741,1,2


In [5]:
df_stable = df[df['group'] == 0].drop(columns=['group'])
df_progr = df[df['group'] == 1].drop(columns=['group'])

In [6]:
# find index of nearest neighbor of x in df
def nn(x, df):
    df_hy_match = df[df['initialHY'] == x['initialHY']] # get as many subjects with matching HY as possible
    if len(df_hy_match) == 0:
        df_hy_match = df
    df_hy_match['dist'] = sum((df_hy_match[f'{var}_norm']-x[f'{var}_norm'])**2 for var in all_vars)
    df_hy_match.sort_values('dist', inplace=True)
    return df_hy_match.head(1).index[0]  ## there's probably a better way to do it but it should work

In [7]:
# Matching loop

def match(n_samples, df_stable, df_progr):
    ids_stable_matched = []
    ids_progr_matched = []

    df_stable_ = df_stable.copy()
    df_progr_ = df_progr.copy()

    for i in range(n_samples):
        # get a random stable subject without replacement
        if len(df_stable_[df_stable_['initialHY']==1]) > 0: # add as many stable patients with low H&Y as possible
            stable = df_stable_[df_stable_['initialHY']==1].sample()
        else:
            stable = df_stable_.sample()
        stable_index = stable.index[0]
        ids_stable_matched.append(stable_index)
        df_stable_.drop(index=stable_index, inplace=True)

        # get nn in progr set for this subject without replacement
        progr_index = nn(stable.iloc[0], df_progr_)
        ids_progr_matched.append(progr_index)
        df_progr_.drop(index=progr_index, inplace=True)
    
    df_stable_matched = df[df.index.isin(ids_stable_matched)]
    df_progr_matched = df[df.index.isin(ids_progr_matched)]
    return df_stable_matched, df_progr_matched

In [16]:
for n_samples in [72]:
    stable_matched, progr_matched = match(n_samples, df_stable, df_progr)
    
    print(f'--- nsamples: {n_samples}')
    print("\t\t| Progressive group \t| Stable group")
    print(f"Mean age \t| {round(progr_matched['age'].mean(),1)} \t\t\t| {round(stable_matched['age'].mean(),1)}")
    print(f"\nMen \t\t| {len(progr_matched[progr_matched['gen']==1])} \t\t\t| {len(stable_matched[stable_matched['gen']==1])}")
    print(f"Women \t\t| {len(progr_matched[progr_matched['gen']==2])} \t\t\t| {len(stable_matched[stable_matched['gen']==2])}\n")
    for i in range(1,6):
        print(f"H&Y={i} \t\t| {len(progr_matched[progr_matched['initialHY'] == i])} \t\t\t| {len(stable_matched[stable_matched['initialHY'] == i])}")
    print()

--- nsamples: 72
		| Progressive group 	| Stable group
Mean age 	| 63.0 			| 61.0

Men 		| 48 			| 48
Women 		| 26 			| 30

H&Y=1 		| 35 			| 29
H&Y=2 		| 39 			| 49
H&Y=3 		| 0 			| 0
H&Y=4 		| 0 			| 0
H&Y=5 		| 0 			| 0

