In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from utils import imputer, shuffle_data_by_group, reset_group_id, datetime_to_days_diff, train_test_split_grouped_extrapolation, train_test_split_grouped_interpolation

In [2]:
# Load the two site-level extracts (df1: Patras file, df2: Sheffield file); the 'date' column is parsed to datetime on read.
# NOTE(review): date_entered_study is displayed day-first (e.g. 16/02/2016) further down — confirm that 'date' is parsed with the intended day/month order.
df1 = pd.read_csv('patras_data.csv', parse_dates=['date'])
df2 = pd.read_csv('sheffield_data.csv', parse_dates=['date'])

# Combine dataframes

In [3]:
# Reset Patient ID for both dataframes
df1.groupby(['ID']).ngroup()

0         0
1         0
2         0
3         0
4         0
       ... 
4348    558
4349    558
4350    558
4351    559
4352    559
Length: 4353, dtype: int64

In [4]:
df1['ID'] = df1.groupby(['ID']).ngroup()

In [5]:
df2.groupby(['ID']).ngroup() + df1['ID'].unique().size

0        560
1        560
2        560
3        560
4        560
        ... 
6637    1552
6638    1552
6639    1553
6640    1553
6641    1553
Length: 6642, dtype: int64

In [6]:
# Renumber df2 patients from 0 and shift them past the df1 patient IDs so that IDs stay unique once the two frames are concatenated
n_df1_patients = df1['ID'].unique().size
df2['ID'] = df2.groupby(['ID']).ngroup() + n_df1_patients

In [7]:
data = pd.concat([df1, df2], axis=0, ignore_index=True)

In [8]:
data

Unnamed: 0,ID,site,date_entered_study,control_patient,dob_year,gender,ethnicity,employment,height,smoker,...,neuropathy,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe,AKI,Started_dialysis
0,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
1,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
2,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
3,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
4,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10990,1552,Sheffield,09/06/2019,False,1973.0,Female,Caucasian,,0.0,,...,,,,,,,,,,
10991,1552,Sheffield,09/06/2019,False,1973.0,Female,Caucasian,,0.0,,...,,,,,,,,,False,False
10992,1553,Sheffield,09/06/2019,False,1948.0,Female,Caucasian,,0.0,,...,,,,,,,,,,
10993,1553,Sheffield,09/06/2019,False,1948.0,Female,Caucasian,,0.0,,...,,,,,,,,,False,False


In [9]:
data.describe()

Unnamed: 0,ID,dob_year,height,sampleId,egfr,ckd_stage,slope_allprior,N_allprior,slope_2yprior,N_2yprior,...,micro_albuminuria,hba1c,c_reactive_protein,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe
count,10995.0,8924.0,10985.0,3851.0,10522.0,3847.0,9024.0,10316.0,8916.0,10316.0,...,187.0,286.0,234.0,603.0,599.0,599.0,155.0,154.0,340.0,292.0
mean,704.031924,1959.437584,5.387872,1016.242534,46.771944,1.40863,0.034643,5.451241,-0.054014,4.063494,...,1555.666695,23.944965,2.914231,57.589552,2628.758097,1514.981636,410.993548,9.491558,48.803235,207.782192
std,459.267671,15.180295,27.44879,601.27798,25.192801,1.722691,67.00758,3.621561,68.600263,2.270368,...,2309.599102,26.783405,9.077228,55.430334,3926.904331,18566.007661,235.477892,22.940807,37.259726,125.553761
min,0.0,1926.0,0.0,1.0,0.0,0.0,-1460.0,1.0,-1460.0,1.0,...,0.0,4.6,0.0,7.9,3.7,22.0,50.0,1.7,4.3,3.8
25%,276.0,1948.0,0.0,510.5,27.0,0.0,-5.318589,2.0,-5.903179,2.0,...,58.75,6.4,0.23,11.9,7.29,187.5,265.5,4.025,14.95,54.2
50%,711.0,1957.0,1.56,997.0,42.0,0.0,-1.031236,5.0,-1.170697,4.0,...,545.0,7.75,0.435,14.6,9.82,238.0,379.0,5.55,46.0,254.5
75%,1098.0,1969.0,1.7,1481.5,61.0,3.0,2.282602,8.0,3.055405,5.0,...,2326.5,47.0,1.0,116.0,6305.0,284.0,494.5,8.725,71.0,306.0
max,1553.0,2002.0,188.0,2274.0,189.0,41.0,4015.0,17.0,4015.0,14.0,...,15171.0,137.0,81.0,166.0,16270.0,320000.0,2000.0,284.0,342.0,510.0


In [10]:
# Keep only non-control patients (rows whose control_patient flag equals False) and renumber the row index from 0
is_non_control = data['control_patient'].eq(False)
data = data.loc[is_non_control].reset_index(drop=True)
data

Unnamed: 0,ID,site,date_entered_study,control_patient,dob_year,gender,ethnicity,employment,height,smoker,...,neuropathy,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe,AKI,Started_dialysis
0,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
1,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
2,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
3,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
4,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10814,1552,Sheffield,09/06/2019,False,1973.0,Female,Caucasian,,0.0,,...,,,,,,,,,,
10815,1552,Sheffield,09/06/2019,False,1973.0,Female,Caucasian,,0.0,,...,,,,,,,,,False,False
10816,1553,Sheffield,09/06/2019,False,1948.0,Female,Caucasian,,0.0,,...,,,,,,,,,,
10817,1553,Sheffield,09/06/2019,False,1948.0,Female,Caucasian,,0.0,,...,,,,,,,,,False,False


# Cleaning not involving imputing values

## Initial cleaning

In [11]:
# Keep only the columns used downstream, lower-case two column names,
# and renumber patient IDs so they are consecutive again
relevant_columns = [
    'ID', 'site', 'dob_year', 'gender', 'ethnicity', 'height', 'Weight', 'smoker',
    'kidney_transplant', 'Patient_died', 'disease', 'bp.sys', 'bun', 'date', 'egfr',
]
column_renames = {'Weight': 'weight', 'Patient_died': 'patient_died'}
data2 = data.loc[:, relevant_columns].rename(columns=column_renames)
data2['ID'] = data2.groupby(['ID']).ngroup()
data2

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10814,1421,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10815,1421,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10816,1422,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10817,1422,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


In [12]:
data2['smoker'] = data2['smoker'].replace({'Past Smoker': 'Past-Smoker'})

In [13]:
data2['smoker'].value_counts()

Non-Smoker     2602
Past-Smoker    1567
Smoker          896
Name: smoker, dtype: int64

In [14]:
# For this patient, egfr and ckd_stage appear to have been swapped in the raw data, so egfr is manually set to the correct value in the next cell
data.loc[data['egfr']==3]

Unnamed: 0,ID,site,date_entered_study,control_patient,dob_year,gender,ethnicity,employment,height,smoker,...,neuropathy,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe,AKI,Started_dialysis
3539,373,Patras,17/07/2017 12:39,False,1956.0,Male,Caucasian,Semi-Professional,1.7,Smoker,...,,,,,,,,,False,False


In [15]:
# Overwrite every egfr reading equal to 3 with 41 (only one such row was found in the inspection above).
# NOTE(review): 41 is presumably the value stored in that row's ckd_stage column — verify against the raw record.
data2.loc[data2['egfr']==3, 'egfr'] = 41

In [16]:
# Order observations chronologically within each patient and renumber the row index from 0
data3 = data2.sort_values(by=['ID', 'date']).reset_index(drop=True)

## Regrouping ethnicity

In [17]:
data4 = data3.copy()

In [18]:
data4['ethnicity'].value_counts()

Caucasian                        9209
Asian                             146
Black (afro caribean descent)     108
Others                             56
Asian - Indian                     12
INDIAN                             11
Asian (Pakistan)                   11
Somali                              9
Asian(INDIAN)                       6
Chinese                             6
Asian( PAKISTANI)                   5
Indian                              5
Asian ( BANGLADESHHI)               2
Arab                                1
Name: ethnicity, dtype: int64

In [19]:
# Collapse the free-text ethnicity labels into the coarser groups Asian / Others / Black
# (labels not listed here, e.g. 'Caucasian', are left unchanged)
asian_labels = [
    'Asian - Indian', 'Asian (Pakistan)', 'INDIAN', 'Chinese', 'Asian(INDIAN)',
    'Indian', 'Asian( PAKISTANI)', 'Arab', 'Asian ( BANGLADESHHI)',
]
ethnicity_groups = {label: 'Asian' for label in asian_labels}
ethnicity_groups['Somali'] = 'Others'
ethnicity_groups['Black (afro caribean descent)'] = 'Black'
data4['ethnicity'] = data4['ethnicity'].replace(ethnicity_groups)

In [20]:
data4['ethnicity'].value_counts()

Caucasian    9209
Asian         205
Black         108
Others         65
Name: ethnicity, dtype: int64

## egfr

In [21]:
data4['egfr'].unique()

array([ 69. ,  62. ,  56. ,  50. ,  54. ,  53. ,  46. ,  58. ,  49. ,
        90. , 103. ,  33. ,  31. ,  29. ,  36. ,  35. ,  45. ,  48. ,
        59. ,  72. ,  41. ,  44. ,  64. ,  60. ,  71. ,  47. ,  66. ,
        76. ,  52. ,  39. ,  28. ,  84. ,  89. ,  24. ,  38. ,  27. ,
        21. ,  22. ,  30. ,  23. ,  20. ,  19. ,  26. ,  16. ,  13. ,
        32. ,  34. ,  43. ,  37. ,  25. ,   9. ,  15. ,  10. , 107. ,
        82. ,  93. ,  63. , 106. ,  92. , 118. , 150. , 149. , 101. ,
       125. , 127. , 124. ,  67. ,  55. ,  40. ,  57. ,  68. ,  42. ,
        65. ,  73. ,  80. ,  81. ,  61. ,  18. ,  78. ,  70. ,  14. ,
        17. ,  11. ,  12. ,  85. ,  77. ,  79. , 102. ,  74. ,  51. ,
        97. ,  83. ,  96. , 108. ,  94. ,  86. ,  75. , 109. ,  91. ,
       104. ,  88. ,  87. ,  95. , 114. , 100. , 133. ,  99. , 113. ,
       112. ,   7. , 120. ,  98. , 158. , 128. , 129. , 145. , 122. ,
       111. , 117. , 110. , 131. ,   8. , 123. , 146. , 105. , 136. ,
       119. , 141. ,

In [22]:
# Missing values for egfr
data4['egfr'].isna().sum()

402

In [23]:
# Drop all rows with missing egfr values, then renumber patient IDs consecutively.
# Chaining the non-inplace reset_index produces an independent frame, so the ID
# assignment below no longer raises SettingWithCopyWarning.
data5 = data4.dropna(subset=['egfr']).reset_index(drop=True)
data5['ID'] = data5.groupby(['ID']).ngroup()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data5['ID'] = data5.groupby(['ID']).ngroup()


In [24]:
# Drop row with 0 egfr value (1 row), then renumber patient IDs consecutively.
# The non-inplace reset_index returns an independent frame, which avoids the
# SettingWithCopyWarning triggered by assigning a column on a filtered slice.
data6 = data5.loc[data5['egfr'] != 0].reset_index(drop=True)
data6['ID'] = data6.groupby(['ID']).ngroup()
data6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data6['ID'] = data6.groupby(['ID']).ngroup()


Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10411,1370,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10412,1370,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10413,1371,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10414,1371,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


## Height

In [25]:
# Some issues with height
data6.describe()

Unnamed: 0,ID,dob_year,height,weight,bp.sys,bun,egfr
count,10416.0,8674.0,10406.0,3161.0,10118.0,10399.0,10416.0
mean,604.731663,1959.417454,5.495013,104.621711,106.65339,101.503183,46.569931
std,398.776884,15.168561,27.748251,1201.547418,62.594163,6793.564265,25.039977
min,0.0,1926.0,0.0,1.48,0.0,0.0,4.0
25%,228.0,1948.0,0.0,69.3,100.0,9.0,27.0
50%,616.0,1957.0,1.56,79.7,132.0,18.0,42.0
75%,936.0,1969.0,1.7,90.4,147.0,52.0,61.0
max,1371.0,2002.0,188.0,67125.0,1158.0,692801.0,189.0


In [26]:
data6['height'].unique()

array([  1.7  ,   1.67 ,   1.6  ,   1.68 ,   1.65 ,   1.63 ,   1.66 ,
         1.45 ,   1.64 ,   1.8  ,   1.75 ,   1.73 ,   1.62 ,   1.56 ,
         1.76 ,   1.74 ,   1.57 ,   1.71 ,   1.5  ,   1.78 ,   1.48 ,
         1.72 ,   1.69 ,   1.89 ,   1.52 ,   1.59 ,   1.58 ,   1.87 ,
         1.82 ,   1.77 ,   1.53 ,   1.44 ,   1.61 ,   0.   ,   1.95 ,
         1.54 ,   1.88 ,   1.81 ,   1.51 , 169.   ,   1.55 ,   1.9  ,
         2.   ,   1.42 ,   1.41 ,   1.47 ,   1.85 ,   1.83 ,   1.84 ,
           nan,   1.86 ,   1.79 ,   1.92 ,   1.91 , 158.   ,   1.601,
         1.752,   1.625,   1.98 , 180.   , 173.   ,   1.49 , 174.   ,
         2.02 ,  97.6  , 172.   , 166.   , 167.   , 171.   , 142.   ,
       181.   , 188.   , 176.   , 184.   , 168.   , 178.   , 179.   ,
       155.   , 182.   , 183.   , 165.   , 159.   ])

In [27]:
data6['height'].isna().sum()

10

In [28]:
data6[data6['height'].isna().to_numpy()]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
3858,364,Patras,1954.0,Female,Caucasian,,76.0,Non-Smoker,False,False,GMN,125.0,53.0,2017-12-10,53.0
3859,364,Patras,1954.0,Female,Caucasian,,90.0,Non-Smoker,False,False,GMN,130.0,50.0,2018-05-03,53.0
5636,667,Sheffield,1988.0,Male,Caucasian,,,,False,False,HTN,142.0,9.0,2015-01-30,48.0
5637,667,Sheffield,1988.0,Male,Caucasian,,,,False,False,HTN,,10.0,2015-10-29,51.0
5638,667,Sheffield,1988.0,Male,Caucasian,,,,False,False,HTN,145.0,10.0,2016-05-02,48.0
5639,667,Sheffield,1988.0,Male,Caucasian,,73.6,,False,False,HTN,154.0,11.3,2017-02-17,50.0
10216,1316,Sheffield,1946.0,Female,,,,,False,False,GMN,144.0,9.0,2018-05-24,24.0
10217,1316,Sheffield,1946.0,Female,,,,,False,False,GMN,149.0,11.0,2018-10-18,30.0
10218,1316,Sheffield,1946.0,Female,,,71.0,,False,False,GMN,150.0,10.3,2019-03-28,34.0
10219,1316,Sheffield,1946.0,Female,,,69.2,,False,False,GMN,144.0,14.0,2019-10-10,32.0


In [29]:
# Remove rows with missing (NaN) height values, then renumber patient IDs consecutively.
# The non-inplace reset_index returns an independent frame, which avoids the
# SettingWithCopyWarning triggered by assigning a column on the dropna result.
data7 = data6.dropna(subset=['height']).reset_index(drop=True)
data7['ID'] = data7.groupby(['ID']).ngroup()
data7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data7['ID'] = data7.groupby(['ID']).ngroup()


Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10401,1367,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10402,1367,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10403,1368,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10404,1368,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


In [30]:
# Strange value
data7.loc[data7['height']==97.6]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
5994,711,Sheffield,,Male,Asian,97.6,,,False,False,GMN,0.0,0.0,2010-06-14,108.0
5995,711,Sheffield,,Male,Asian,97.6,,,False,False,GMN,0.0,0.0,2010-10-18,117.0
5996,711,Sheffield,,Male,Asian,97.6,107.8,,False,False,GMN,0.0,21.9,2010-11-10,18.0


In [31]:
# Discard the observations whose recorded height is 97.6 (the rows inspected above), then renumber rows and patient IDs
keep_rows = data7['height'] != 97.6
data8 = data7.loc[keep_rows].reset_index(drop=True)
data8['ID'] = data8.groupby(['ID']).ngroup()
data8

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10398,1366,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10399,1366,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10400,1367,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10401,1367,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


In [32]:
# Express every height in meters: values above 5 are taken to be in centimeters and divided by 100, all others are left as they are
recorded_in_cm = data8['height'] > 5
data8.loc[recorded_in_cm, 'height'] = data8.loc[recorded_in_cm, 'height'] / 100

In [33]:
data8['height'].unique()

array([1.7  , 1.67 , 1.6  , 1.68 , 1.65 , 1.63 , 1.66 , 1.45 , 1.64 ,
       1.8  , 1.75 , 1.73 , 1.62 , 1.56 , 1.76 , 1.74 , 1.57 , 1.71 ,
       1.5  , 1.78 , 1.48 , 1.72 , 1.69 , 1.89 , 1.52 , 1.59 , 1.58 ,
       1.87 , 1.82 , 1.77 , 1.53 , 1.44 , 1.61 , 0.   , 1.95 , 1.54 ,
       1.88 , 1.81 , 1.51 , 1.55 , 1.9  , 2.   , 1.42 , 1.41 , 1.47 ,
       1.85 , 1.83 , 1.84 , 1.86 , 1.79 , 1.92 , 1.91 , 1.601, 1.752,
       1.625, 1.98 , 1.49 , 2.02 ])

In [34]:
# Too many rows with zero values for height to drop from the dataframe
print('Number of rows with zero height values: ',(data8['height']==0).sum())

# Height is constant within every patient (per-patient minimum equals per-patient maximum),
# so a patient recorded with a zero height has no non-zero height in any other observation
height_by_patient = data8.groupby(['ID']).height
print('Do all patients with zero height values not have any previously-recorded height values? ', height_by_patient.min().equals(height_by_patient.max()))

# Check every patient individually: among rows with a non-zero height, each patient must have exactly one distinct value.
# (Comparing the median of per-patient medians with the median of per-patient first values can be True
# even when individual patients' heights vary, so it does not establish this.)
nonzero_height_by_patient = data8.loc[data8['height']!=0].groupby(['ID']).height
print('Do all patients with non-zero height values have the same height values for all of their individual patient-level observations (i.e. their heights do not change over time)? ', bool(nonzero_height_by_patient.nunique().eq(1).all()))

Number of rows with zero height values:  4761
Do all patients with zero height values not have any previously-recorded height values?  True
Do all patients with non-zero height values have the same height values for all of their individual patient-level observations (i.e. their heights do not change over time)?  True


## Age

In [35]:
# Check that the recorded dob_year values are constant within each patient (per-patient minimum equals per-patient maximum).
# Missing dob_year values are still NaN at this point (they are filled with 0 in the next cell).
# NOTE(review): min/max skip NaN, so this shows recorded values never conflict within a patient, but it does not by itself
# show that a patient with a missing dob_year is missing it in every observation.
data8.groupby(['ID']).dob_year.min().equals(data8.groupby(['ID']).dob_year.max())

True

In [36]:
# Work on a copy and mark missing birth years with the placeholder value 0.
# Assigning the result back (instead of calling fillna(inplace=True) on the column selection)
# does not rely on the selected column being a view of data9, which is not guaranteed under Copy-on-Write.
data9 = data8.copy()
data9['dob_year'] = data9['dob_year'].fillna(0)

## bp.sys

In [37]:
data10 = data9.copy()

In [38]:
# NaN, zero values and unusually small and large values
data10['bp.sys'].unique()

array([ 150.,    0.,  145.,  160.,  165.,  140.,  135.,  125.,  115.,
        120.,  180.,  130.,  110.,   nan,  155.,  137.,  122.,  105.,
        170.,  134.,  124.,  133.,  100.,  138.,  148.,  139.,  152.,
        121.,   96.,   95.,   90.,  200.,  136.,  128.,  127.,  132.,
        220.,  178.,  144.,  142.,   97.,  166.,  146.,  151.,  131.,
        117.,  126.,  300.,  123.,  175.,  149.,  190.,  156.,  114.,
         98.,  113.,  184.,  147.,  141.,  107.,   80.,  101.,  185.,
        195.,  154., 1158.,   14.,   85.,  106.,  143.,  164.,  103.,
        112.,  129.,  118.,  111.,  119.,  162.,  108.,  169.,  172.,
         92.,  177.,  153.,  159.,  174.,  116.,  173.,  168.,  183.,
        181.,  167.,  187.,  158.,  176.,  191.,  102.,  171.,  161.,
        163.,  188.,  192.,  157.,  196.,   91.,  199.,   88.,  109.,
        179.,   72.,  104.,  182.,  186.,  189.,  193.,  206.,  197.,
        209.,   17.,  237.,  202.,   99.,   93.,  215.,  225.,  207.,
        203.,  214.,

In [39]:
data10.loc[data10['bp.sys']==1158]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2044,168,Patras,1956.0,Male,Caucasian,1.63,67.0,Past-Smoker,False,False,GMN,1158.0,34.0,2016-12-07,122.0


In [40]:
data10.iloc[2043:2046, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2043,168,Patras,1956.0,Male,Caucasian,1.63,,Past-Smoker,False,False,GMN,120.0,34.0,2016-02-16,122.0
2044,168,Patras,1956.0,Male,Caucasian,1.63,67.0,Past-Smoker,False,False,GMN,1158.0,34.0,2016-12-07,122.0
2045,168,Patras,1956.0,Male,Caucasian,1.63,71.0,Past-Smoker,False,False,GMN,120.0,32.0,2017-01-31,146.0


In [41]:
# Change strange value: every bp.sys reading of 1158 (one row, found above) is replaced with 120,
# the reading at both neighbouring visits of the same patient (rows 2043 and 2045 shown above)
data10.loc[data10['bp.sys']==1158, ['bp.sys']] = 120.0

In [42]:
data10.loc[data10['bp.sys']==300]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
859,69,Patras,1952.0,Male,Caucasian,1.74,,Past-Smoker,True,False,Transplant,300.0,25.0,2015-03-26,72.0


In [43]:
data10.iloc[858:861, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
858,69,Patras,1952.0,Male,Caucasian,1.74,,Past-Smoker,True,False,Transplant,120.0,39.0,2014-09-22,65.0
859,69,Patras,1952.0,Male,Caucasian,1.74,,Past-Smoker,True,False,Transplant,300.0,25.0,2015-03-26,72.0
860,69,Patras,1952.0,Male,Caucasian,1.74,,Past-Smoker,True,False,Transplant,128.0,42.0,2015-05-10,72.0


In [44]:
# Change strange value: every bp.sys reading of 300 (one row, found above) is replaced with 120,
# the reading at the same patient's previous visit (row 858); the following visit (row 860) reads 128
data10.loc[data10['bp.sys']==300, ['bp.sys']] = 120.0

In [45]:
data10.loc[data10['bp.sys']==17]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
5186,612,Sheffield,0.0,Male,Caucasian,1.78,,Non-Smoker,False,False,DN,17.0,17.0,2008-03-10,23.0


In [46]:
data10.iloc[5185:5188, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
5185,612,Sheffield,0.0,Male,Caucasian,1.78,,Non-Smoker,False,False,DN,160.0,17.0,2007-05-18,19.0
5186,612,Sheffield,0.0,Male,Caucasian,1.78,,Non-Smoker,False,False,DN,17.0,17.0,2008-03-10,23.0
5187,612,Sheffield,0.0,Male,Caucasian,1.78,,Non-Smoker,False,False,DN,127.0,15.0,2008-04-01,24.0


In [47]:
# Change strange value: every bp.sys reading of 17 (one row, found above) is replaced with 160,
# the reading at the same patient's previous visit (row 5185); the following visit (row 5187) reads 127.
# Note that bun is also 17 in the affected row.
data10.loc[data10['bp.sys']==17, ['bp.sys']] = 160.0

In [48]:
data10.loc[data10['bp.sys']==14]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2057,169,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,14.0,31.0,2015-09-06,60.0


In [49]:
data10.iloc[2056:2059, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2056,169,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,140.0,42.0,2015-01-27,77.0
2057,169,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,14.0,31.0,2015-09-06,60.0
2058,169,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,110.0,37.0,2015-09-29,60.0


In [50]:
# Change strange value: every bp.sys reading of 14 (one row, found above) is replaced with 140,
# the reading at the same patient's previous visit (row 2056); the following visit (row 2058) reads 110
data10.loc[data10['bp.sys']==14, ['bp.sys']] = 140.0

In [51]:
# Treat zero bp.sys readings as missing: replace 0 with NaN for now.
# The result is assigned back to the column rather than using replace(inplace=True) on the
# column selection, which is not guaranteed to update data10 under Copy-on-Write.
data10['bp.sys'] = data10['bp.sys'].replace(0, np.nan)

## bun

In [52]:
data11 = data10.copy()

In [53]:
# NaN, zero values and unusually large values
# a = data11['bun'].unique()
# np.sort(a)

In [54]:
data11.loc[data11['bun']==692801]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
1851,151,Patras,1972.0,Male,Caucasian,1.71,,Non-Smoker,False,False,DN,90.0,692801.0,2015-07-29,23.0


In [55]:
data11.iloc[1850:1853, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
1850,151,Patras,1972.0,Male,Caucasian,1.71,,Non-Smoker,False,False,DN,,80.0,2015-01-28,29.0
1851,151,Patras,1972.0,Male,Caucasian,1.71,,Non-Smoker,False,False,DN,90.0,692801.0,2015-07-29,23.0
1852,151,Patras,1972.0,Male,Caucasian,1.71,,Non-Smoker,False,False,DN,101.0,72.0,2015-09-09,25.0


In [56]:
# Change strange value: every bun reading of 692801 (one row, found above) is replaced with 80,
# the reading at the same patient's previous visit (row 1850); the following visit (row 1852) reads 72
data11.loc[data11['bun']==692801, ['bun']] = 80.0

In [57]:
data11.loc[data11['bun']==365]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2048,168,Patras,1956.0,Male,Caucasian,1.63,69.0,Past-Smoker,False,False,GMN,125.0,365.0,2018-01-30,122.0


In [58]:
data11.iloc[2047:2050, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2047,168,Patras,1956.0,Male,Caucasian,1.63,72.0,Past-Smoker,False,False,GMN,130.0,36.0,2017-09-26,122.0
2048,168,Patras,1956.0,Male,Caucasian,1.63,69.0,Past-Smoker,False,False,GMN,125.0,365.0,2018-01-30,122.0
2049,168,Patras,1956.0,Male,Caucasian,1.63,70.0,Past-Smoker,False,False,GMN,,42.0,2018-02-10,104.0


In [59]:
# Change strange value: every bun reading of 365 (one row, found above) is replaced with 36,
# the reading at the same patient's previous visit (row 2047); the following visit (row 2049) reads 42
data11.loc[data11['bun']==365, ['bun']] = 36.0

In [60]:
data11.loc[data11['bun']==471]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2107,174,Patras,1985.0,Female,Caucasian,1.57,,Non-Smoker,False,False,GMN,,471.0,2015-01-06,105.0


In [61]:
data11.iloc[2106:2109, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2106,173,Patras,1966.0,Male,Caucasian,1.67,103.0,Past-Smoker,False,False,HTN,135.0,166.0,2019-07-31,11.0
2107,174,Patras,1985.0,Female,Caucasian,1.57,,Non-Smoker,False,False,GMN,,471.0,2015-01-06,105.0
2108,174,Patras,1985.0,Female,Caucasian,1.57,,Non-Smoker,False,False,GMN,,60.0,2015-03-30,105.0


In [62]:
# Change strange value: every bun reading of 471 (one row, found above) is replaced with 60,
# the reading at the same patient's next visit (row 2108). The preceding row shown (2106) belongs to a
# different patient (ID 173), so there is no earlier reading for this patient to compare against.
data11.loc[data11['bun']==471, ['bun']] = 60.0

In [63]:
# Treat zero bun readings as missing: replace 0 with NaN for now.
# The result is assigned back to the column rather than using replace(inplace=True) on the
# column selection, which is not guaranteed to update data11 under Copy-on-Write.
data11['bun'] = data11['bun'].replace(0, np.nan)

## Weight

In [64]:
# No zero values for weight, but many NaN values
print(data11['weight'].isna().sum())
data11.loc[data11['weight']==0]

7248


Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr


In [65]:
# Keep observations whose weight is strictly between 30 and 200, plus those with no recorded weight;
# rows with a weight outside that range are dropped. Patient IDs are then renumbered and rows re-sorted.
plausible_weight = (data11['weight'] > 30) & (data11['weight'] < 200)
missing_weight = data11['weight'].isna()
data12 = pd.concat([data11.loc[plausible_weight], data11.loc[missing_weight]]).reset_index(drop=True)
data12['ID'] = data12.groupby(['ID']).ngroup()
data12 = data12.sort_values(['ID', 'date'], ignore_index=True)

# Cleaning involving imputing values
We process the following data sets **separately** to avoid data leakage:


*   Test (extrapolation): "test_extrapolation"
*   Test (interpolation): "test_interpolation"
*   Full training set: "data_train_full"

For 5-fold cross-validation:
*   Train set 1, Val set 1
*   Train set 2, Val set 2
*   Train set 3, Val set 3
*   Train set 4, Val set 4
*   Train set 5, Val set 5

stored in "train_set_list", "val_set_list"

All values in the test and validation sets are imputed using statistics computed solely from the corresponding training set(s).

In [66]:
# Convert dates to number of days since first observation for each patient
# (the frames displayed in later cells carry a 'times' column that is 0 at a patient's first observation).
# NOTE(review): datetime_to_days_diff comes from utils and emits the SettingWithCopyWarning messages shown below — worth fixing in utils.
data13 = datetime_to_days_diff(data12, 'ID', 'date')

# Shuffle data by patient
# random_state=1 is passed to the helper, presumably to make the shuffle reproducible — confirm in utils
data14 = shuffle_data_by_group(data13, 'ID', random_state=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['times'] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['times'] = pd.to_numeric(group['times'], downcast="integer")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



## Create main train, test extrapolation, test interpolation sets

In [67]:
# Flag patients that have exactly one observation (80 in total) and collect their rows.
# `a` maps patient ID -> True/False and is read again by the next cell.
a = data14.groupby(['ID']).size().eq(1)
has_single_obs = data14['ID'].map(a)
single_obs_rows = data14.loc[has_single_obs]
single_obs_rows

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
94,12,Patras,1944.0,Male,Caucasian,1.69,67.0,Smoker,False,False,Other,115.0,56.0,2017-11-14,45.0,0
296,43,Patras,1981.0,Male,Caucasian,1.74,78.0,Past-Smoker,False,False,GMN,140.0,54.0,2017-08-11,61.0,0
514,68,Patras,1972.0,Female,Caucasian,1.58,60.0,,False,False,GMN,125.0,60.0,2018-06-11,43.0,0
709,97,Patras,1944.0,Male,Caucasian,1.73,78.0,,False,False,Other,,99.0,2016-09-27,16.0,0
897,122,Patras,1954.0,Female,Caucasian,1.56,84.0,Non-Smoker,False,False,HTN,125.0,100.0,2017-11-22,48.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9648,1271,Patras,1930.0,Female,Caucasian,1.65,75.0,Non-Smoker,False,False,GMN,,104.0,2019-11-19,20.0,0
9787,1288,Patras,1947.0,Male,Caucasian,1.75,78.0,,False,False,Other,,46.0,2016-10-20,44.0,0
10048,1324,Patras,1997.0,Female,Caucasian,1.61,62.0,Non-Smoker,False,False,GMN,120.0,33.0,2019-04-16,112.0,0
10049,1325,Patras,1947.0,Male,Caucasian,1.78,75.0,Past-Smoker,False,False,GMN,110.0,78.0,2020-04-28,37.0,0


In [68]:
# Remaining data: rows of patients that have more than one observation
has_multiple_obs = ~data14['ID'].map(a)
data15 = data14.loc[has_multiple_obs]

# Number of remaining groups
len(data15['ID'].unique())

1287

In [69]:
# Split test extrapolation set
data_temp, test_extrapolation = train_test_split_grouped_extrapolation(data15, data15['ID'], test_size=0.2, random_state=1)

In [70]:
test_extrapolation = pd.concat([test_extrapolation, single_obs_rows]).reset_index(drop=True)
test_extrapolation

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,6,Patras,1945.0,Male,Caucasian,1.67,,Non-Smoker,False,False,DN,,88.0,2012-03-05,36.0,0
1,6,Patras,1945.0,Male,Caucasian,1.67,,Non-Smoker,False,False,DN,,107.0,2013-01-29,40.0,330
2,6,Patras,1945.0,Male,Caucasian,1.67,,Non-Smoker,False,False,DN,,94.0,2013-05-27,43.0,448
3,6,Patras,1945.0,Male,Caucasian,1.67,,Non-Smoker,False,False,DN,,79.0,2013-09-20,38.0,564
4,6,Patras,1945.0,Male,Caucasian,1.67,,Non-Smoker,False,False,DN,,45.0,2014-01-10,46.0,676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2230,1271,Patras,1930.0,Female,Caucasian,1.65,75.0,Non-Smoker,False,False,GMN,,104.0,2019-11-19,20.0,0
2231,1288,Patras,1947.0,Male,Caucasian,1.75,78.0,,False,False,Other,,46.0,2016-10-20,44.0,0
2232,1324,Patras,1997.0,Female,Caucasian,1.61,62.0,Non-Smoker,False,False,GMN,120.0,33.0,2019-04-16,112.0,0
2233,1325,Patras,1947.0,Male,Caucasian,1.78,75.0,Past-Smoker,False,False,GMN,110.0,78.0,2020-04-28,37.0,0


In [71]:
# Order observations by patient and time before computing per-patient group sizes.
# sort_values(..., ignore_index=True) already yields a fresh 0..n-1 index, so the separate reset_index is not needed.
# Rebinding the result instead of sorting in place avoids mutating the frame returned by the extrapolation split,
# which is presumably what raised the SettingWithCopyWarning on this cell.
data_temp = data_temp.sort_values(['ID', 'times'], ignore_index=True)

# One entry per patient, in ascending ID order, giving that patient's number of observations
group_sizes = data_temp.groupby(['ID']).size().to_numpy()
data_train_full, test_interpolation, n_samples_chosen_per_group = train_test_split_grouped_interpolation(data_temp, group_sizes, test_size=0.25, random_state=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [72]:
# Renumber rows 0..n-1, then preview the interpolation test set
test_interpolation = test_interpolation.reset_index(drop=True)
test_interpolation

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,0,Sheffield,1955.0,Male,,0.00,,,False,False,GMN,172.0,7.0,2017-02-23,75.0,1091
1,1,Sheffield,1959.0,Female,Caucasian,0.00,70.7,,True,True,Transplant,174.0,10.0,2017-06-14,29.0,1223
2,2,Sheffield,1958.0,Male,Caucasian,0.00,65.2,,True,False,Transplant,139.0,14.9,2019-03-20,26.0,1723
3,3,Sheffield,1972.0,Female,Caucasian,0.00,65.2,,True,False,Transplant,154.0,6.4,2019-06-02,54.0,1458
4,4,Patras,1955.0,Male,Caucasian,1.75,80.0,Smoker,False,False,HTN,150.0,43.0,2016-05-10,72.0,374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2035,1363,Sheffield,0.0,Female,Caucasian,0.00,,,False,False,HTN,119.0,,2010-09-07,19.0,816
2036,1363,Sheffield,0.0,Female,Caucasian,0.00,,,False,False,HTN,142.0,,2010-10-15,16.0,854
2037,1364,Patras,1977.0,Male,Caucasian,1.65,86.0,Smoker,False,False,GMN,120.0,52.0,2017-12-06,60.0,1401
2038,1365,Sheffield,1949.0,Female,Caucasian,0.00,49.2,,False,False,HTN,176.0,7.7,2019-11-07,35.0,1478


In [73]:
# Renumber rows 0..n-1, then preview the full training set
data_train_full = data_train_full.reset_index(drop=True)
data_train_full

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,0,Sheffield,1955.0,Male,,0.00,,,False,False,GMN,,8.0,2014-02-28,73.0,0
1,0,Sheffield,1955.0,Male,,0.00,,,False,False,GMN,143.0,8.0,2014-08-15,62.0,168
2,0,Sheffield,1955.0,Male,,0.00,,,False,False,GMN,150.0,6.0,2014-12-19,74.0,294
3,0,Sheffield,1955.0,Male,,0.00,,,False,False,GMN,138.0,7.0,2015-07-05,77.0,492
4,0,Sheffield,1955.0,Male,,0.00,,,False,False,GMN,144.0,7.0,2015-10-29,70.0,608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6118,1366,Patras,1980.0,Male,Caucasian,1.78,68.0,Non-Smoker,True,False,Transplant,,46.0,2017-03-14,66.0,275
6119,1366,Patras,1980.0,Male,Caucasian,1.78,68.0,Non-Smoker,True,False,Transplant,120.0,63.0,2017-06-06,61.0,359
6120,1366,Patras,1980.0,Male,Caucasian,1.78,68.0,Non-Smoker,True,False,Transplant,,39.0,2017-09-28,66.0,473
6121,1366,Patras,1980.0,Male,Caucasian,1.78,68.0,Non-Smoker,True,False,Transplant,120.0,45.0,2017-11-29,61.0,535


## Create validation sets

In [74]:
# Cross-validation sets: 5 folds grouped by patient ID, so every patient's rows
# stay together on one side of each fold.
group_kfold = GroupKFold(n_splits=5)
fold_indices = list(group_kfold.split(data_train_full, groups=data_train_full['ID']))

# One re-indexed training frame and one re-indexed validation frame per fold
train_set_list = [
    data_train_full.iloc[train_rows].reset_index(drop=True)
    for train_rows, _ in fold_indices
]
val_set_list = [
    data_train_full.iloc[val_rows].reset_index(drop=True)
    for _, val_rows in fold_indices
]

In [75]:
# Training and validation sets: hold out 25% of data_train_full (patient ID passed as
# the grouping key, fixed seed) and renumber the rows of both parts 0..n-1.
data_train, data_val = (
    part.reset_index(drop=True)
    for part in train_test_split_grouped_extrapolation(
        data_train_full,
        data_train_full['ID'],
        test_size=0.25,
        random_state=1,
    )
)

In [76]:
# Preview the validation split produced in the previous cell
data_val

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,2,Sheffield,1958.0,Male,Caucasian,0.0,,,True,False,Transplant,108.0,12.0,2014-07-01,9.0,0
1,2,Sheffield,1958.0,Male,Caucasian,0.0,,,True,False,Transplant,96.0,23.0,2014-12-06,4.0,158
2,2,Sheffield,1958.0,Male,Caucasian,0.0,,,True,False,Transplant,153.0,10.0,2014-12-30,75.0,182
3,2,Sheffield,1958.0,Male,Caucasian,0.0,,,True,False,Transplant,,10.0,2015-07-07,69.0,371
4,2,Sheffield,1958.0,Male,Caucasian,0.0,,,True,False,Transplant,117.0,8.0,2015-12-15,71.0,532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1707,1359,Sheffield,1958.0,Male,Caucasian,0.0,,,True,False,Transplant,130.0,7.0,2016-12-13,68.0,896
1708,1360,Sheffield,1966.0,Male,Caucasian,0.0,,,True,False,Transplant,140.0,16.0,2014-02-07,7.0,0
1709,1360,Sheffield,1966.0,Male,Caucasian,0.0,,,True,False,Transplant,134.0,4.0,2014-05-02,8.0,84
1710,1360,Sheffield,1966.0,Male,Caucasian,0.0,,,True,False,Transplant,134.0,6.0,2015-01-28,53.0,355


## Height

### Impute for data_train_full, test_extrapolation, test_interpolation

In [77]:
# Rows of the full training set whose height is not 0
rows_with_height = data_train_full.loc[data_train_full['height'] != 0]

# Overall fallback: median of the per-patient median heights
median_height = rows_with_height.groupby(['ID'])['height'].median().median()

# Median height per (gender, ethnicity), taking the first non-null value of each
# column per patient so that every patient counts once
first_per_patient = rows_with_height.groupby(['ID']).first()
height_table = first_per_patient.groupby(['gender', 'ethnicity'])['height'].median().reset_index()
height_table

Unnamed: 0,gender,ethnicity,height
0,Female,Asian,1.57
1,Female,Black,1.67
2,Female,Caucasian,1.6
3,Male,Asian,1.725
4,Male,Black,1.76
5,Male,Caucasian,1.73


In [78]:
# Impute height in the full training set and both test sets, using the statistics
# computed from data_train_full only.
data_train_full2, test_extrapolation2, test_interpolation2 = (
    imputer(split, height_table, median_height, 'height')
    for split in (data_train_full, test_extrapolation, test_interpolation)
)

In [79]:
# Sanity check: list the distinct height values left in each split after imputation
for label, split in (
    ('data_train_full', data_train_full2),
    ('test_extrapolation', test_extrapolation2),
    ('test_interpolation', test_interpolation2),
):
    print(label + ': ', split['height'].unique())

data_train_full:  [1.69  1.6   1.73  1.75  1.76  1.66  1.58  1.65  1.7   1.68  1.63  1.72
 1.71  1.59  1.8   1.85  1.79  1.61  1.83  1.77  1.45  1.67  1.62  1.55
 1.53  1.42  1.56  1.57  1.78  1.64  1.81  1.601 1.625 1.92  1.52  1.74
 1.5   1.95  1.82  2.02  1.725 1.49  1.51  1.9   1.98  1.752 1.54  1.86
 1.87  1.88  1.84  2.    1.41  1.91  1.47  1.89 ]
test_extrapolation:  [1.67  1.73  1.75  1.7   1.6   1.58  1.76  1.69  1.65  1.62  1.8   1.61
 1.68  1.47  1.71  1.77  1.56  1.725 1.52  1.74  1.72  1.78  1.91  1.63
 1.66  1.64  1.81  1.59  1.5   1.44  1.79  1.95  1.48  1.85  1.82  1.87
 1.57  1.88  1.53  1.83 ]
test_interpolation:  [1.69  1.6   1.73  1.75  1.76  1.66  1.58  1.65  1.7   1.68  1.63  1.72
 1.71  1.59  1.8   1.85  1.79  1.61  1.83  1.77  1.45  1.67  1.62  1.55
 1.53  1.42  1.56  1.57  1.78  1.64  1.81  1.601 1.625 1.92  1.52  1.74
 1.5   1.95  1.82  2.02  1.725 1.49  1.51  1.9   1.98  1.752 1.54  1.86
 1.87  1.88  1.84  2.    1.41  1.91  1.47  1.89 ]


### Impute for data_train and data_val

In [80]:
# Rows of the training split whose height is not 0
rows_with_height = data_train.loc[data_train['height'] != 0]

# Overall fallback: median of the per-patient median heights
median_height = rows_with_height.groupby(['ID'])['height'].median().median()

# Median height per (gender, ethnicity), taking the first non-null value of each
# column per patient so that every patient counts once
first_per_patient = rows_with_height.groupby(['ID']).first()
height_table = first_per_patient.groupby(['gender', 'ethnicity'])['height'].median().reset_index()
height_table

Unnamed: 0,gender,ethnicity,height
0,Female,Asian,1.575
1,Female,Black,1.63
2,Female,Caucasian,1.6005
3,Male,Asian,1.7
4,Male,Black,1.76
5,Male,Caucasian,1.72


In [81]:
# Impute height in the training and validation splits, using the statistics
# computed from data_train only.
data_train2, data_val2 = (
    imputer(split, height_table, median_height, 'height')
    for split in (data_train, data_val)
)

In [82]:
# Sanity check: list the distinct height values left in each split after imputation
for label, split in (('data_train', data_train2), ('data_val', data_val2)):
    print(label + ': ', split['height'].unique())

data_train:  [1.68   1.6005 1.75   1.72   1.6    1.76   1.65   1.7    1.58   1.59
 1.8    1.85   1.79   1.61   1.66   1.67   1.62   1.53   1.42   1.56
 1.575  1.63   1.78   1.64   1.81   1.601  1.73   1.625  1.92   1.52
 1.55   1.5    1.57   1.77   1.45   1.69   1.95   1.83   1.74   1.71
 1.82   2.02   1.49   1.9    1.98   1.86   1.88   1.84   1.54   2.
 1.87   1.91   1.47   1.51   1.89  ]
data_val:  [1.72   1.6005 1.68   1.66   1.58   1.63   1.6    1.7    1.71   1.83
 1.77   1.45   1.55   1.81   1.67   1.65   1.75   1.74   1.76   1.85
 1.5    1.78   1.62   1.73   1.51   1.53   1.61   1.82   1.752  1.54
 1.69   1.8    1.87   1.57   1.41   1.56  ]


### Impute for CV sets

In [83]:
# Height imputation per CV fold: statistics come from the fold's training part only
# and are then applied to both the training and the validation part.
train_set_list2, val_set_list2 = [], []
for train_set, val_set in zip(train_set_list, val_set_list):
    rows_with_height = train_set.loc[train_set['height'] != 0]

    # Fallback: median of the per-patient median heights
    median_height = rows_with_height.groupby(['ID'])['height'].median().median()

    # Median height per (gender, ethnicity), one value per patient
    height_table = (
        rows_with_height.groupby(['ID']).first()
        .groupby(['gender', 'ethnicity'])['height'].median()
        .reset_index()
    )

    train_set_list2.append(imputer(train_set, height_table, median_height, 'height'))
    val_set_list2.append(imputer(val_set, height_table, median_height, 'height'))

## Weight

### Impute for data_train_full, test_extrapolation, test_interpolation

In [84]:
# Fallback value: median of the per-patient median weights in the training set
# (computed before any imputation)
median_weight = data_train_full2.loc[~data_train_full2['weight'].isna()].groupby(['ID']).weight.median().median()

# Mean weight of each training patient, indexed by patient ID
patient_mean_weight = data_train_full2.groupby('ID')['weight'].mean()

# Impute NaN values using the mean of the same patient in the training set where
# available, then fall back to the overall median.
# The per-patient means are looked up by ID. Passing the row-indexed
# groupby(...).transform('mean') Series of the training frame to fillna() aligns on
# the row index, so test rows were being filled with the mean of whichever training
# patient happened to sit at the same row number.
# Assigning the result back to the column (rather than a chained
# df[col].fillna(..., inplace=True)) also keeps this working under pandas Copy-on-Write.
for split in (data_train_full2, test_extrapolation2, test_interpolation2):
    split['weight'] = (
        split['weight']
        .fillna(split['ID'].map(patient_mean_weight))
        .fillna(median_weight)
    )

# Check that no NaN weights remain
print('data_train_full: ', data_train_full2['weight'].isna().sum())
print('test_extrapolation: ', test_extrapolation2['weight'].isna().sum())
print('test_interpolation: ', test_interpolation2['weight'].isna().sum())

data_train_full:  0
test_extrapolation:  0
test_interpolation:  0


### Impute for data_train and data_val

In [85]:
# Fallback value: median of the per-patient median weights in the training split
# (computed before any imputation)
median_weight = data_train2.loc[~data_train2['weight'].isna()].groupby(['ID']).weight.median().median()

# Mean weight of each training patient, indexed by patient ID
patient_mean_weight = data_train2.groupby('ID')['weight'].mean()

# Impute NaN values using the mean of the same patient in the training split where
# available, then fall back to the overall median.
# The per-patient means are looked up by ID. Passing the row-indexed
# groupby(...).transform('mean') Series of data_train2 to fillna() aligns on the row
# index, so validation rows were being filled with the mean of whichever training
# patient happened to sit at the same row number.
# Assigning the result back to the column (rather than a chained
# df[col].fillna(..., inplace=True)) also keeps this working under pandas Copy-on-Write.
for split in (data_train2, data_val2):
    split['weight'] = (
        split['weight']
        .fillna(split['ID'].map(patient_mean_weight))
        .fillna(median_weight)
    )

# Check that no NaN weights remain
print('data_train: ', data_train2['weight'].isna().sum())
print('data_val: ', data_val2['weight'].isna().sum())

data_train:  0
data_val:  0


### Impute for CV sets

In [86]:
for cv_train, cv_val in zip(train_set_list2, val_set_list2):
    # Fallback value: median of the per-patient median weights in this fold's
    # training part (computed before any imputation)
    median_weight = cv_train.loc[~cv_train['weight'].isna()].groupby(['ID']).weight.median().median()

    # Mean weight of each training patient in this fold, indexed by patient ID
    patient_mean_weight = cv_train.groupby('ID')['weight'].mean()

    # Impute NaN values with the same patient's training mean where available, then
    # with the overall median. The lookup is by ID: passing the row-indexed
    # transform('mean') Series of the training part to fillna() aligns on the row
    # index and filled validation rows from unrelated training patients. IDs absent
    # from the training part simply fall through to the median.
    # Assigning back to the column replaces the chained fillna(..., inplace=True),
    # which does not update the frame under pandas Copy-on-Write.
    for split in (cv_train, cv_val):
        split['weight'] = (
            split['weight']
            .fillna(split['ID'].map(patient_mean_weight))
            .fillna(median_weight)
        )

## Age

### Impute for data_train_full, test_extrapolation, test_interpolation

In [87]:
# Rows of the full training set whose dob_year is not 0
rows_with_dob = data_train_full2.loc[data_train_full2['dob_year'] != 0]

# Overall fallback: median of the per-patient median dob_year
median_dob = rows_with_dob.groupby(['ID'])['dob_year'].median().median()

# Median dob_year per (gender, ethnicity), taking the first non-null value of each
# column per patient so that every patient counts once
first_per_patient = rows_with_dob.groupby(['ID']).first()
dob_table = first_per_patient.groupby(['gender', 'ethnicity'])['dob_year'].median().reset_index()
dob_table

Unnamed: 0,gender,ethnicity,dob_year
0,Female,Asian,1952.0
1,Female,Black,1970.0
2,Female,Caucasian,1962.0
3,Female,Others,1965.0
4,Male,Asian,1974.0
5,Male,Caucasian,1954.0
6,Male,Others,1953.0


In [88]:
# Impute dob_year in the full training set and both test sets, using the statistics
# computed from data_train_full2 only.
data_train_full3, test_extrapolation3, test_interpolation3 = (
    imputer(split, dob_table, median_dob, 'dob_year')
    for split in (data_train_full2, test_extrapolation2, test_interpolation2)
)

In [89]:
# Convert dob_year to age: age = year of the observation date - year of birth.
# The new column goes in at position 3 and the two source columns are dropped.
# NOTE: the frames are modified in place, so this cell cannot be re-run on its own.
for split in (data_train_full3, test_extrapolation3, test_interpolation3):
    age_at_observation = split['date'].dt.year - split['dob_year']
    split.insert(3, 'age', age_at_observation)
    split.drop(columns=['dob_year', 'date'], inplace=True)

### Impute for data_train and data_val

In [90]:
# Rows of the training split whose dob_year is not 0
rows_with_dob = data_train2.loc[data_train2['dob_year'] != 0]

# Overall fallback: median of the per-patient median dob_year
median_dob = rows_with_dob.groupby(['ID'])['dob_year'].median().median()

# Median dob_year per (gender, ethnicity), taking the first non-null value of each
# column per patient so that every patient counts once
first_per_patient = rows_with_dob.groupby(['ID']).first()
dob_table = first_per_patient.groupby(['gender', 'ethnicity'])['dob_year'].median().reset_index()
dob_table

Unnamed: 0,gender,ethnicity,dob_year
0,Female,Asian,1952.0
1,Female,Black,1970.0
2,Female,Caucasian,1964.0
3,Female,Others,1965.0
4,Male,Asian,1965.0
5,Male,Caucasian,1954.0
6,Male,Others,1959.0


In [91]:
# Impute dob_year in the training and validation splits, using the statistics
# computed from data_train2 only.
data_train3, data_val3 = (
    imputer(split, dob_table, median_dob, 'dob_year')
    for split in (data_train2, data_val2)
)

In [92]:
# Convert dob_year to age: age = year of the observation date - year of birth.
# The new column goes in at position 3 and the two source columns are dropped.
# NOTE: the frames are modified in place, so this cell cannot be re-run on its own.
for split in (data_train3, data_val3):
    age_at_observation = split['date'].dt.year - split['dob_year']
    split.insert(3, 'age', age_at_observation)
    split.drop(columns=['dob_year', 'date'], inplace=True)

### Impute for CV sets

In [93]:
# dob_year imputation and age conversion per CV fold: statistics come from the
# fold's training part only and are applied to both parts.
train_set_list3, val_set_list3 = [], []
for train_set, val_set in zip(train_set_list2, val_set_list2):
    rows_with_dob = train_set.loc[train_set['dob_year'] != 0]

    # Fallback: median of the per-patient median dob_year
    median_dob = rows_with_dob.groupby(['ID'])['dob_year'].median().median()

    # Median dob_year per (gender, ethnicity), one value per patient
    dob_table = (
        rows_with_dob.groupby(['ID']).first()
        .groupby(['gender', 'ethnicity'])['dob_year'].median()
        .reset_index()
    )

    train_set3, val_set3 = (
        imputer(part, dob_table, median_dob, 'dob_year')
        for part in (train_set, val_set)
    )

    # age = year of the observation date - year of birth; then drop the source columns
    for imputed in (train_set3, val_set3):
        imputed.insert(3, 'age', imputed['date'].dt.year - imputed['dob_year'])
        imputed.drop(columns=['dob_year', 'date'], inplace=True)

    train_set_list3.append(train_set3)
    val_set_list3.append(val_set3)

## bp.sys

In [94]:
# Fill NaN bp.sys values with the mean of the same patient in the corresponding
# training frame, then fill whatever is still NaN with that training frame's global mean.
# The per-patient means are looked up by ID. Passing the row-indexed
# groupby(...).transform('mean') Series of the training frame to fillna() aligns on
# the row index, so test/validation rows were being filled with the mean of whichever
# training patient happened to sit at the same row number.
for train_split, dependent_splits in (
    (data_train_full3, (test_extrapolation3, test_interpolation3)),
    (data_train3, (data_val3,)),
):
    # Mean bp.sys of each training patient, indexed by patient ID
    patient_mean_bp = train_split.groupby('ID')['bp.sys'].mean()
    train_split['bp.sys'] = train_split['bp.sys'].fillna(train_split['ID'].map(patient_mean_bp))

    # Global mean is taken after the per-patient fill, as in the original order of operations
    global_mean_bp = train_split['bp.sys'].mean()
    train_split['bp.sys'] = train_split['bp.sys'].fillna(global_mean_bp)

    for dependent in dependent_splits:
        dependent['bp.sys'] = (
            dependent['bp.sys']
            .fillna(dependent['ID'].map(patient_mean_bp))
            .fillna(global_mean_bp)
        )

In [95]:
for cv_train, cv_val in zip(train_set_list3, val_set_list3):
    # Mean bp.sys of each training patient in this fold, indexed by patient ID
    patient_mean_bp = cv_train.groupby('ID')['bp.sys'].mean()

    # Training part: same-patient mean first, then the global mean of the training part
    # (taken after the per-patient fill, as in the original order of operations)
    cv_train['bp.sys'] = cv_train['bp.sys'].fillna(cv_train['ID'].map(patient_mean_bp))
    global_mean_bp = cv_train['bp.sys'].mean()
    cv_train['bp.sys'] = cv_train['bp.sys'].fillna(global_mean_bp)

    # Validation part: look the training means up by ID instead of passing the
    # row-indexed transform('mean') Series to fillna(), which aligned on the row index
    # and filled validation rows from unrelated training patients. IDs absent from the
    # training part fall through to the global mean.
    cv_val['bp.sys'] = (
        cv_val['bp.sys']
        .fillna(cv_val['ID'].map(patient_mean_bp))
        .fillna(global_mean_bp)
    )

## bun

In [96]:
# Fill NaN bun values with the mean of the same patient in the corresponding
# training frame, then fill whatever is still NaN with that training frame's global mean.
# The per-patient means are looked up by ID. Passing the row-indexed
# groupby(...).transform('mean') Series of the training frame to fillna() aligns on
# the row index, so test/validation rows were being filled with the mean of whichever
# training patient happened to sit at the same row number.
for train_split, dependent_splits in (
    (data_train_full3, (test_extrapolation3, test_interpolation3)),
    (data_train3, (data_val3,)),
):
    # Mean bun of each training patient, indexed by patient ID
    patient_mean_bun = train_split.groupby('ID')['bun'].mean()
    train_split['bun'] = train_split['bun'].fillna(train_split['ID'].map(patient_mean_bun))

    # Global mean is taken after the per-patient fill, as in the original order of operations
    global_mean_bun = train_split['bun'].mean()
    train_split['bun'] = train_split['bun'].fillna(global_mean_bun)

    for dependent in dependent_splits:
        dependent['bun'] = (
            dependent['bun']
            .fillna(dependent['ID'].map(patient_mean_bun))
            .fillna(global_mean_bun)
        )

In [97]:
for cv_train, cv_val in zip(train_set_list3, val_set_list3):
    # Mean bun of each training patient in this fold, indexed by patient ID
    patient_mean_bun = cv_train.groupby('ID')['bun'].mean()

    # Training part: same-patient mean first, then the global mean of the training part
    # (taken after the per-patient fill, as in the original order of operations)
    cv_train['bun'] = cv_train['bun'].fillna(cv_train['ID'].map(patient_mean_bun))
    global_mean_bun = cv_train['bun'].mean()
    cv_train['bun'] = cv_train['bun'].fillna(global_mean_bun)

    # Validation part: look the training means up by ID instead of passing the
    # row-indexed transform('mean') Series to fillna(), which aligned on the row index
    # and filled validation rows from unrelated training patients. IDs absent from the
    # training part fall through to the global mean.
    cv_val['bun'] = (
        cv_val['bun']
        .fillna(cv_val['ID'].map(patient_mean_bun))
        .fillna(global_mean_bun)
    )

# Save

In [98]:
# Replace every NaN still present (in any column) with the string 'Unknown'.
# fillna returns new frames, so the *3 frames are left untouched.
data_train_full4, test_extrapolation4, test_interpolation4, data_train4, data_val4 = (
    split.fillna('Unknown')
    for split in (data_train_full3, test_extrapolation3, test_interpolation3, data_train3, data_val3)
)

# Same treatment for every CV fold
train_set_list4 = [fold.fillna('Unknown') for fold in train_set_list3]
val_set_list4 = [fold.fillna('Unknown') for fold in val_set_list3]

In [100]:
# Write every split to the data/ directory (which must already exist), without the row index
splits_by_path = {
    'data/data_train_full.csv': data_train_full4,
    'data/test_extrapolation.csv': test_extrapolation4,
    'data/test_interpolation.csv': test_interpolation4,
    'data/data_train.csv': data_train4,
    'data/data_val.csv': data_val4,
}
for csv_path, split in splits_by_path.items():
    split.to_csv(csv_path, index=False)

# CV folds are numbered from 1 in the file names
for fold_number, (cv_train, cv_val) in enumerate(zip(train_set_list4, val_set_list4), start=1):
    cv_train.to_csv('data/data_cv_train_' + str(fold_number) + '.csv', index=False)
    cv_val.to_csv('data/data_cv_val_' + str(fold_number) + '.csv', index=False)

In [101]:
# Persist n_samples_chosen_per_group (returned by the interpolation split) as a
# one-column CSV without the row index
n_samples_chosen_per_group_series = pd.Series(n_samples_chosen_per_group)
n_samples_chosen_per_group_series.to_csv(
    'data/n_samples_chosen_per_group_series.csv',
    index=False,
)