# Data Creation

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the first sheet of the processed workbook; row 0 supplies the column labels.
df = pd.read_excel("stat_data_processed.xlsx", sheet_name=0, header=0)

# Collect each subject number once, in order of first appearance:
# np.unique(..., return_index=True) yields the position of every value's first
# occurrence, and sorting those positions restores the original row order.
subject_column = df['Study_Subject_Number'].values
indexes = np.unique(subject_column, return_index=True)[1]
patient_list = [subject_column[position] for position in sorted(indexes)]

In [4]:
# Number of distinct subject numbers collected in patient_list.
len(patient_list)

466

In [5]:
# Column-name groups; all_vars is their concatenation in the order
# info columns -> feature columns -> target column.

# Target column (kept as a one-element list so it can be concatenated below).
target_var = ['Target_Hb']

# Subject number and 'Order' columns carried alongside the features.
info_vars = ['Study_Subject_Number', 'Order']

# First feature group.
categoryA = [
    "Hemoglobin[Whole blood]",
    "Delta_Hb",
    "RDW[Whole blood]",
    "MCV[Whole blood]",
    "MCH[Whole blood]",
    "MCHC[Whole blood]",
    "Serum Iron[Serum]",
    "Age (yrs)",
    "Sex_M",
    "Sex_F",
    'EPO_Dose',
]

# Second feature group.
categoryB = [
    "URR[Serum]",
    'Dry Weight',
    'Albumin[Serum]',
    'Predialysis Weight',
    'Height (cm)',
]

# Feature columns: group A followed by group B (presumably the training variables).
trn_vars = [*categoryA, *categoryB]

# Every column of interest, in a fixed order.
all_vars = [*info_vars, *trn_vars, *target_var]

# Statistics

In [6]:
# Number of distinct 'Subject No.' values in the sheet.
df['Subject No.'].drop_duplicates().shape[0]

466

In [7]:
# Original row count of the loaded frame.
df.shape[0]

15210

In [8]:
# Merge the two Erythropoietin-alpha column pairs (the plain column and its
# '.1' twin) by element-wise `+`. No fill value is applied, so a missing value
# on either side propagates into the combined column.
alpha_sum = df['Erythropoietin-alpha'] + df['Erythropoietin-alpha.1']
alpha_dose_sum = df['Dose_Erythropoietin-alpha'] + df['Dose_Erythropoietin-alpha.1']

df['Erythropoietin-alpha-concat'] = alpha_sum
df['Dose_Erythropoietin-alpha-concat'] = alpha_dose_sum

In [9]:
# df = df.drop(['Unit_Number'], axis=1)

In [10]:
# Show all column labels of df.
df.columns

Index(['Subject No.', 'Unit_Number', 'Age (yrs)', 'Sex', 'Height (cm)',
       'Weight (kg)', 'BMI (kg/m2)', 'Transfusion', 'Study_Subject_Number',
       'Order', 'Unit Number', 'Date', 'SBP_start(mmHg)', 'SBP_End(mmHg)',
       'Dry Weight', 'Predialysis Weight', 'Post_dialysis Weight',
       'Ultrafilteration Volume', 'Hemoglobin[Whole blood]',
       'Hct[Whole blood]', 'MCV[Whole blood]', 'MCHC[Whole blood]',
       'MCH[Whole blood]', 'RDW[Whole blood]', 'MPV[Whole blood]',
       'PLT Count[Whole blood]', 'WBC COUNT[Whole blood]', 'Calcium[Serum]',
       'Inorganic P[Serum]', 'Glucose[Serum]', 'Total Protein[Serum]',
       'Albumin[Serum]', 'Creatinine[Serum]', 'Uric Acid[Serum]',
       'Serum Iron[Serum]', 'URR[Serum]', 'Darbepoetin-alpha',
       'Dose_Darbepoetin-alpha', 'Erythropoietin-alpha',
       'Dose_Erythropoietin-alpha', 'Epoetin-beta', 'Dose_Epoetin-beta',
       'Erythropoietin-alpha.1', 'Dose_Erythropoietin-alpha.1', 'EPO_Dose',
       'Sex_F', 'Sex_M', 'Date_

# Participants, No.

In [11]:
# Number of distinct patients per hospital: a subject is assigned to a
# hospital when its 'Subject No.' contains that hospital's letter.
hospital_codes = ['A','B','D','E','F','G','H']
subject_numbers = df['Subject No.']
patients_per_hospital = [
    len(np.unique(subject_numbers[subject_numbers.str.contains(code)]))
    for code in hospital_codes
]
print(patients_per_hospital)

[96, 52, 54, 49, 67, 125, 23]


# Follow-up duration, median (IQR), month

In [78]:
# Quartiles (25th / 50th / 75th percentile) of the number of non-missing
# 'Study_Subject_Number' rows recorded per 'Subject No.'.
print('overall')
rows_per_subject = df.groupby(['Subject No.']).count()['Study_Subject_Number']
for quartile in (25, 50, 75):
    print(np.percentile(rows_per_subject, quartile))

overall
15.0
30.0
54.0


In [135]:
# Per-hospital quartiles of the number of rows recorded per subject.
# NOTE(review): hospital membership is matched on the 'Order' column here,
# whereas the participant count matches on 'Subject No.' — confirm both
# columns carry the hospital letter.
for hospital in ['A','B','D','E','F','G','H'] :
    hospital_rows = df[df['Order'].str.contains(hospital)]
    rows_per_subject = hospital_rows.groupby(['Subject No.']).count()['Study_Subject_Number']
    for quartile in (25, 50, 75):
        print(hospital, np.percentile(rows_per_subject, quartile))
    print('')

A 10.75
A 20.5
A 47.25

B 21.75
B 41.0
B 58.0

D 18.5
D 45.0
D 52.0

E 9.0
E 28.0
E 36.0

F 14.0
F 32.0
F 53.5

G 21.0
G 39.0
G 58.0

H 10.0
H 19.0
H 26.0



# Sex, No. (%)

In [100]:
# Distinct subjects per value of the 'Sex' column ('M' / 'F').
print('Overall')
for label, sex_code in (('Men', 'M'), ('Women', 'F')):
    subjects = df.loc[df['Sex'] == sex_code, 'Subject No.'].drop_duplicates()
    print(label, len(subjects))

Overall
Men 248
Women 218


In [116]:
# Distinct subjects per hospital and sex; hospital rows are those whose
# 'Order' value contains the hospital letter.
for hospital in ['A','B','D','E','F','G','H'] :
    in_hospital = df['Order'].str.contains(hospital)
    for label, sex_code in (('MAN', 'M'), ('Female', 'F')):
        subjects = df.loc[in_hospital & (df['Sex'] == sex_code), 'Subject No.'].drop_duplicates()
        print(hospital, label, len(subjects))

A MAN 59
A Female 37
B MAN 30
B Female 22
D MAN 27
D Female 27
E MAN 30
E Female 19
F MAN 29
F Female 38
G MAN 62
G Female 63
H MAN 11
H Female 12


# Get mean (SD)

In [110]:
# Column whose mean and standard deviation are reported by the cells in this section.
# Note: this rebinds target_var from the list defined earlier to a plain string.
target_var = 'Hemoglobin[Whole blood]'

In [111]:
# Mean and standard deviation of df[target_var] over all rows, rounded to one
# decimal. np.std is called with its default ddof (population SD).
print('overall', target_var)
column_values = df[target_var]
for summarize in (np.mean, np.std):
    print(round(summarize(column_values), 1))

overall Hemoglobin[Whole blood]
10.8
1.1


In [112]:
# Per-hospital mean and standard deviation of df[target_var], rounded to one
# decimal; hospital rows are those whose 'Order' value contains the letter.
print(target_var)
for hospital in ['A','B','D','E','F','G','H'] :
    hospital_values = df[df['Order'].str.contains(hospital)][target_var]
    for summarize in (np.mean, np.std):
        print(hospital, round(summarize(hospital_values), 1))
    print('')

Hemoglobin[Whole blood]
A 10.7
A 1.1

B 10.4
B 1.0

D 10.9
D 1.2

E 10.7
E 1.1

F 10.4
F 0.9

G 11.1
G 1.1

H 10.4
H 0.9



# Erythropoietin, No. (%)

In [248]:
# Drug column tabulated by the following cells.
target_var = 'Epoetin-beta'
# Candidate columns for target_var (presumably swapped in by hand and the
# cells below re-run once per drug — TODO confirm):
# Darbepoetin-alpha
# Epoetin-beta
# Erythropoietin-alpha-concat


In [251]:
# Grouping on (subject, target_var) gives one row per distinct pair; count the
# pairs whose target_var value is non-zero.
print('over all', target_var)
pair_values = df.groupby(['Study_Subject_Number', target_var]).count().reset_index()[target_var]
print(sum(1 for value in pair_values if value != 0))

over all Epoetin-beta
22


In [252]:
# Per hospital: number of distinct (subject, target_var) pairs whose
# target_var value is non-zero.
print('get frequency', target_var)
for hospital in ['A','B','D','E','F','G','H'] :
    hospital_rows = df[df['Order'].str.contains(hospital)]
    pair_values = hospital_rows.groupby(['Study_Subject_Number', target_var]).count().reset_index()[target_var]
    print(hospital, sum(1 for value in pair_values if value != 0))

get frequency Epoetin-beta
A 2
B 0
D 2
E 18
F 0
G 0
H 0


# Erythropoietin per month, median (IQR)


In [258]:
# Dose column summarized by the quantile cells that follow.
target_var = 'Dose_Epoetin-beta'
# Candidate columns for target_var (presumably swapped in by hand and the
# cells below re-run once per drug — TODO confirm):
# Dose_Epoetin-beta
# Dose_Erythropoietin-alpha-concat
# Dose_Darbepoetin-alpha

In [260]:
# Quartiles of the non-zero values of df[target_var] over all rows.
print('over all', target_var, end='\n\n')
for quantile in [25, 50, 75]:
    print(quantile, 'quantile', target_var, end='\n\n')
    try:
        # Distinct names for the quantile and the element variable: the
        # original reused `i` for both, which only worked because a
        # comprehension has its own scope.
        nonzero_values = [value for value in df[target_var].values if value != 0]
        print(np.percentile(nonzero_values, quantile))
    except Exception:
        # Best-effort fallback (e.g. nothing non-zero to summarize). Catching
        # Exception rather than using a bare `except:` lets KeyboardInterrupt
        # and SystemExit propagate.
        print('not hospital')
    if quantile == 75:
        print('')

over all Dose_Epoetin-beta

25 quantile Dose_Epoetin-beta

100.0
50 quantile Dose_Epoetin-beta

120.0
75 quantile Dose_Epoetin-beta

120.0



In [232]:
# Per-hospital quartiles of the non-zero values of df[target_var]; hospital
# rows are those whose 'Order' value contains the hospital letter.
for quantile in [25, 50, 75]:
    print(quantile, 'quantile', target_var, end='\n\n')
    for hospital in ['A','B','D','E','F','G','H']:
        try:
            hospital_values = df[df['Order'].str.contains(hospital)][target_var].values
            nonzero_values = [value for value in hospital_values if value != 0]
            print(hospital, np.percentile(nonzero_values, quantile))
        except Exception:
            # Best-effort fallback (e.g. a hospital with no non-zero values).
            # Catching Exception rather than using a bare `except:` lets
            # KeyboardInterrupt and SystemExit propagate.
            print('no hospital')
    if quantile == 75:
        print('')

25 quantile Dose_Darbepoetin-alpha

A 120.0
B 80.0
D 80.0
E 80.0
F 60.0
G 80.0
H 80.0
50 quantile Dose_Darbepoetin-alpha

A 200.0
B 120.0
D 120.0
E 120.0
F 120.0
G 120.0
H 160.0
75 quantile Dose_Darbepoetin-alpha

A 240.0
B 200.0
D 200.0
E 160.0
F 180.0
G 200.0
H 160.0



# p-value

In [319]:
import statsmodels
from statsmodels.formula.api import ols
from scipy import stats

In [320]:
# Column compared across hospitals by the tests in the following cells.
target_var = 'Height (cm)'

In [321]:
# One list per hospital holding the target_var value of every distinct
# (subject, target_var) pair among that hospital's rows.
temp = [
    list(
        df[df['Order'].str.contains(hospital)]
        .groupby(['Study_Subject_Number', target_var])
        .count()
        .reset_index()[target_var]
        .values
    )
    for hospital in ['A','B','D','E','F','G','H']
]

In [322]:
# One-way ANOVA across the seven per-hospital samples held in temp.
s, p = stats.f_oneway(*temp)

In [323]:
# Show the p-value in fixed-point notation with 20 decimal places.
print(format(p, '.20f'))

0.34413590968067880871


In [324]:
# Kruskal-Wallis H-test across the seven per-hospital samples held in temp.
stats.kruskal(*temp)

KruskalResult(statistic=6.58522551754474, pvalue=0.36091219575783373)

# Merged EPO

In [6]:
# Reload df from the other processed workbook (first sheet, row 0 as header).
df = pd.read_excel("data_processed.xlsx", sheet_name=0, header=0)

# Subject numbers in order of first appearance: np.unique reports the position
# of each value's first occurrence, and sorting those positions restores row order.
subject_column = df['Study_Subject_Number'].values
indexes = np.unique(subject_column, return_index=True)[1]
patient_list = [subject_column[position] for position in sorted(indexes)]
len(patient_list)

466

### Including EPO=0

In [15]:
# Quartiles of 'EPO_Dose' with zero doses included: first per hospital (rows
# whose 'Order' value contains the hospital letter), then over all rows.
target_var = 'EPO_Dose'
for quantile in [25, 50, 75]:
    print(quantile, 'quantile', target_var, end='\n')
    for hospital in ['A','B','D','E','F','G','H']:
        try:
            # The values array is passed directly; the original copied it
            # element by element through a comprehension whose variable
            # shadowed the quantile loop variable.
            hospital_values = df[df['Order'].str.contains(hospital)][target_var].values
            print(hospital, np.percentile(hospital_values, quantile))
        except Exception:
            # Best-effort fallback (e.g. a hospital with no rows). Catching
            # Exception rather than using a bare `except:` lets
            # KeyboardInterrupt and SystemExit propagate.
            print('no hospital')
    if quantile == 75:
        print('')

print('All 25/50/75:', np.percentile(df[target_var].values, q=[25,50,75]))

25 quantile EPO_Dose
A 0.0
B 0.0
D 0.0
E 0.0
F 0.0
G 55.0
H 40.0
50 quantile EPO_Dose
A 150.0
B 0.0
D 80.0
E 60.0
F 0.0
G 120.0
H 120.0
75 quantile EPO_Dose
A 240.0
B 120.0
D 160.0
E 160.0
F 100.0
G 225.0
H 160.0

All 25/50/75: [  0.  80. 190.]


### Excluding EPO=0

In [13]:
# Quartiles of 'EPO_Dose' restricted to non-zero doses: first per hospital
# (rows whose 'Order' value contains the hospital letter), then over all rows.
target_var = 'EPO_Dose'
for quantile in [25, 50, 75]:
    print(quantile, 'quantile', target_var, end='\n')
    for hospital in ['A','B','D','E','F','G','H']:
        try:
            hospital_values = df[df['Order'].str.contains(hospital)][target_var].values
            # Distinct element name: the original reused `i` for both the
            # quantile and the comprehension variable.
            nonzero_values = [value for value in hospital_values if value != 0]
            print(hospital, np.percentile(nonzero_values, quantile))
        except Exception:
            # Best-effort fallback (e.g. a hospital with no non-zero doses).
            # Catching Exception rather than using a bare `except:` lets
            # KeyboardInterrupt and SystemExit propagate.
            print('no hospital')
    if quantile == 75:
        print('')

all_nonzero_values = [value for value in df[target_var].values if value != 0]
print('All 25/50/75:', np.percentile(all_nonzero_values, q=[25,50,75]))

25 quantile EPO_Dose
A 120.0
B 80.0
D 80.0
E 90.0
F 60.0
G 80.0
H 80.0
50 quantile EPO_Dose
A 200.0
B 140.0
D 140.0
E 150.0
F 120.0
G 150.0
H 160.0
75 quantile EPO_Dose
A 280.0
B 225.0
D 200.0
E 200.0
F 200.0
G 240.0
H 200.0

All 25/50/75: [ 80. 160. 240.]
