In [1]:
import pandas as pd
df = pd.read_csv('final_df.csv')
df = df.drop(columns=['Unnamed: 0', 'followUpHY']).set_index('subjectId')
df

Unnamed: 0_level_0,gen,age,initialHY,group
subjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4037,1,52.831492,1.0,1
3168,2,63.094798,2.0,1
3131,1,71.205479,2.0,0
4024,1,72.292350,2.0,0
4001,1,49.893151,2.0,0
...,...,...,...,...
4116,1,64.500000,0.0,0
4117,2,59.900000,1.0,0
4118,2,68.100000,0.0,0
4123,2,60.300000,2.0,0


In [2]:
df = df[df['initialHY'] > 0]
df

Unnamed: 0_level_0,gen,age,initialHY,group
subjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4037,1,52.831492,1.0,1
3168,2,63.094798,2.0,1
3131,1,71.205479,2.0,0
4024,1,72.292350,2.0,0
4001,1,49.893151,2.0,0
...,...,...,...,...
4112,1,53.500000,1.0,0
4115,1,66.600000,2.0,0
4117,2,59.900000,1.0,0
4123,2,60.300000,2.0,0


In [3]:
# normalize columns
all_vars = ['age', 'gen', 'initialHY']
to_normalize = ['age'] 
for var in all_vars:
    if var in to_normalize:
        df[f'{var}_norm'] = (df[var] - df[var].mean())/df[var].std()
    else:
        df[f'{var}_norm'] = df[var]

df_norm = df.drop(columns=all_vars)
df_norm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{var}_norm'] = (df[var] - df[var].mean())/df[var].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{var}_norm'] = df[var]


Unnamed: 0_level_0,group,age_norm,gen_norm,initialHY_norm
subjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4037,1,-0.841420,1,1.0
3168,1,0.194749,2,2.0
3131,0,1.013592,1,2.0
4024,0,1.123321,1,2.0
4001,0,-1.138071,1,2.0
...,...,...,...,...
4112,0,-0.773928,1,1.0
4115,0,0.548629,1,2.0
4117,0,-0.127793,2,1.0
4123,0,-0.087410,2,2.0


In [4]:
df_stable = df[df['group'] == 0].drop(columns=['group'])
df_progr = df[df['group'] == 1].drop(columns=['group'])

In [5]:
# find index of nearest neighbor of x in df
def nn(x, df):
    df_hy_match = df[df['initialHY'] == x['initialHY']] # get as many subjects with matching HY as possible
    if len(df_hy_match) == 0:
        df_hy_match = df
    df_hy_match['dist'] = sum((df_hy_match[f'{var}_norm']-x[f'{var}_norm'])**2 for var in all_vars)
    df_hy_match.sort_values('dist', inplace=True)
    return df_hy_match.head(1).index[0]  ## there's probably a better way to do it but it should work

In [6]:
# Matching loop

def match(n_samples, df_stable, df_progr):
    ids_stable_matched = []
    ids_progr_matched = []

    df_stable_ = df_stable.copy()
    df_progr_ = df_progr.copy()

    for i in range(n_samples):
        # get a random stable subject without replacement
        if len(df_stable_[df_stable_['initialHY']==1]) > 0: # add as many stable patients with low H&Y as possible
            stable = df_stable_[df_stable_['initialHY']==1].sample()
        else:
            stable = df_stable_.sample()
        stable_index = stable.index[0]
        ids_stable_matched.append(stable_index)
        df_stable_.drop(index=stable_index, inplace=True)

        # get nn in progr set for this subject without replacement
        progr_index = nn(stable.iloc[0], df_progr_)
        ids_progr_matched.append(progr_index)
        df_progr_.drop(index=progr_index, inplace=True)
    
    df_stable_matched = df[df.index.isin(ids_stable_matched)]
    df_progr_matched = df[df.index.isin(ids_progr_matched)]
    return df_stable_matched, df_progr_matched

In [7]:
for n_samples in [50, 60, 72]:
    stable_matched, progr_matched = match(n_samples, df_stable, df_progr)
    
    print(f'--- nsamples: {n_samples}')
    print("\t\t| Progressive group \t| Stable group")
    print(f"Mean age \t| {round(progr_matched['age'].mean(),1)} \t\t\t| {round(stable_matched['age'].mean(),1)}")
    print(f"Men \t\t| {len(progr_matched[progr_matched['gen']==1])} \t\t\t| {len(stable_matched[stable_matched['gen']==1])}")
    print(f"Women \t\t| {len(progr_matched[progr_matched['gen']==2])} \t\t\t| {len(stable_matched[stable_matched['gen']==2])}")
    for i in range(1,6):
        print(f"H&Y={i} \t\t| {len(progr_matched[progr_matched['initialHY'] == i])} \t\t\t| {len(stable_matched[stable_matched['initialHY'] == i])}")
    print()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hy_match['dist'] = sum((df_hy_match[f'{var}_norm']-x[f'{var}_norm'])**2 for var in all_vars)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


--- nsamples: 50
		| Progressive group 	| Stable group
Mean age 	| 58.8 			| 58.2
Men 		| 32 			| 33
Women 		| 18 			| 17
H&Y=1 		| 42 			| 42
H&Y=2 		| 8 			| 8
H&Y=3 		| 0 			| 0
H&Y=4 		| 0 			| 0
H&Y=5 		| 0 			| 0

--- nsamples: 60
		| Progressive group 	| Stable group
Mean age 	| 60.4 			| 58.4
Men 		| 35 			| 37
Women 		| 25 			| 23
H&Y=1 		| 48 			| 42
H&Y=2 		| 12 			| 18
H&Y=3 		| 0 			| 0
H&Y=4 		| 0 			| 0
H&Y=5 		| 0 			| 0

--- nsamples: 72
		| Progressive group 	| Stable group
Mean age 	| 60.2 			| 59.6
Men 		| 42 			| 42
Women 		| 30 			| 30
H&Y=1 		| 60 			| 42
H&Y=2 		| 12 			| 28
H&Y=3 		| 0 			| 2
H&Y=4 		| 0 			| 0
H&Y=5 		| 0 			| 0

