In [3]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, StratifiedKFold
from scipy.stats import linregress

from utils import * 

## In this notebook, we convert the longitudinal dataset to a cross sectional one, by keeping only the first observation for each patient, and by assigning the cluster labels as target values. In addition, we perform individual linear regressions on the eGFR trajectories to create a 'slope' and 'R^2' feature for each patient.

In [6]:
df1 = pd.read_csv('patras_data.csv', parse_dates=['date'])
df2 = pd.read_csv('sheffield_data.csv', parse_dates=['date'])
labels = pd.read_csv('labelled_patients.csv', index_col=[0])

# Combine dataframes

In [12]:
# Reset Patient ID for both dataframes
df1.groupby(['ID']).ngroup()

0         0
1         0
2         0
3         0
4         0
       ... 
4348    558
4349    558
4350    558
4351    559
4352    559
Length: 4353, dtype: int64

In [13]:
df1['ID'] = df1.groupby(['ID']).ngroup()

In [14]:
df2.groupby(['ID']).ngroup() + df1['ID'].unique().size

0        560
1        560
2        560
3        560
4        560
        ... 
6637    1552
6638    1552
6639    1553
6640    1553
6641    1553
Length: 6642, dtype: int64

In [15]:
df2['ID'] = df2.groupby(['ID']).ngroup() + df1['ID'].unique().size

In [16]:
data = pd.concat([df1, df2], axis=0, ignore_index=True)

In [8]:
data

Unnamed: 0,ID,site,date_entered_study,control_patient,dob_year,gender,ethnicity,employment,height,smoker,...,neuropathy,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe,AKI,Started_dialysis
0,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
1,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
2,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
3,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
4,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10990,1552,Sheffield,09/06/2019,False,1973.0,Female,Caucasian,,0.0,,...,,,,,,,,,,
10991,1552,Sheffield,09/06/2019,False,1973.0,Female,Caucasian,,0.0,,...,,,,,,,,,False,False
10992,1553,Sheffield,09/06/2019,False,1948.0,Female,Caucasian,,0.0,,...,,,,,,,,,,
10993,1553,Sheffield,09/06/2019,False,1948.0,Female,Caucasian,,0.0,,...,,,,,,,,,False,False


In [9]:
data.describe()

Unnamed: 0,ID,dob_year,height,sampleId,egfr,ckd_stage,slope_allprior,N_allprior,slope_2yprior,N_2yprior,...,micro_albuminuria,hba1c,c_reactive_protein,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe
count,10995.0,8924.0,10985.0,3851.0,10522.0,3847.0,9024.0,10316.0,8916.0,10316.0,...,187.0,286.0,234.0,603.0,599.0,599.0,155.0,154.0,340.0,292.0
mean,704.031924,1959.437584,5.387872,1016.242534,46.771944,1.40863,0.034643,5.451241,-0.054014,4.063494,...,1555.666695,23.944965,2.914231,57.589552,2628.758097,1514.981636,410.993548,9.491558,48.803235,207.782192
std,459.267671,15.180295,27.44879,601.27798,25.192801,1.722691,67.00758,3.621561,68.600263,2.270368,...,2309.599102,26.783405,9.077228,55.430334,3926.904331,18566.007661,235.477892,22.940807,37.259726,125.553761
min,0.0,1926.0,0.0,1.0,0.0,0.0,-1460.0,1.0,-1460.0,1.0,...,0.0,4.6,0.0,7.9,3.7,22.0,50.0,1.7,4.3,3.8
25%,276.0,1948.0,0.0,510.5,27.0,0.0,-5.318589,2.0,-5.903179,2.0,...,58.75,6.4,0.23,11.9,7.29,187.5,265.5,4.025,14.95,54.2
50%,711.0,1957.0,1.56,997.0,42.0,0.0,-1.031236,5.0,-1.170697,4.0,...,545.0,7.75,0.435,14.6,9.82,238.0,379.0,5.55,46.0,254.5
75%,1098.0,1969.0,1.7,1481.5,61.0,3.0,2.282602,8.0,3.055405,5.0,...,2326.5,47.0,1.0,116.0,6305.0,284.0,494.5,8.725,71.0,306.0
max,1553.0,2002.0,188.0,2274.0,189.0,41.0,4015.0,17.0,4015.0,14.0,...,15171.0,137.0,81.0,166.0,16270.0,320000.0,2000.0,284.0,342.0,510.0


In [17]:
# Select non-control patients
data = data[data['control_patient']==False]
data.reset_index(inplace=True, drop=True)
data

Unnamed: 0,ID,site,date_entered_study,control_patient,dob_year,gender,ethnicity,employment,height,smoker,...,neuropathy,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe,AKI,Started_dialysis
0,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
1,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
2,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
3,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
4,0,Patras,16/02/2016 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10814,1552,Sheffield,09/06/2019,False,1973.0,Female,Caucasian,,0.0,,...,,,,,,,,,,
10815,1552,Sheffield,09/06/2019,False,1973.0,Female,Caucasian,,0.0,,...,,,,,,,,,False,False
10816,1553,Sheffield,09/06/2019,False,1948.0,Female,Caucasian,,0.0,,...,,,,,,,,,,
10817,1553,Sheffield,09/06/2019,False,1948.0,Female,Caucasian,,0.0,,...,,,,,,,,,False,False


# Cleaning not involving imputing values

## Initial cleaning

In [18]:
# Dataframe after selecting relevant columns
data2 = data[['ID', 'site', 'dob_year','gender', 'ethnicity', 'height', 'Weight', 'smoker', 'kidney_transplant', 'Patient_died', 'disease', 'bp.sys', 'bun', 'date', 'egfr']]
data2 = data2.rename(columns={'Weight': 'weight', 'Patient_died': 'patient_died'})
data2['ID'] = data2.groupby(['ID']).ngroup()
data2

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10814,1421,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10815,1421,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10816,1422,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10817,1422,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


In [19]:
data2['smoker'] = data2['smoker'].replace({'Past Smoker': 'Past-Smoker'})

In [20]:
data2['smoker'].value_counts()

Non-Smoker     2602
Past-Smoker    1567
Smoker          896
Name: smoker, dtype: int64

In [21]:
# Notice that for this patient, egfr and ckd_stage appears to have been switched incorrectly, so we manualy set egfr to the correct value
data.loc[data['egfr']==3]

Unnamed: 0,ID,site,date_entered_study,control_patient,dob_year,gender,ethnicity,employment,height,smoker,...,neuropathy,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe,AKI,Started_dialysis
3539,373,Patras,17/07/2017 12:39,False,1956.0,Male,Caucasian,Semi-Professional,1.7,Smoker,...,,,,,,,,,False,False


In [22]:
data2.loc[data2['egfr']==3, 'egfr'] = 41

In [23]:
# Sort values by date
data3 = data2.sort_values(['ID', 'date'], ignore_index=True)

## Regrouping ethnicity

In [24]:
data4 = data3.copy()

In [25]:
data4['ethnicity'].value_counts()

Caucasian                        9209
Asian                             146
Black (afro caribean descent)     108
Others                             56
Asian - Indian                     12
INDIAN                             11
Asian (Pakistan)                   11
Somali                              9
Asian(INDIAN)                       6
Chinese                             6
Asian( PAKISTANI)                   5
Indian                              5
Asian ( BANGLADESHHI)               2
Arab                                1
Name: ethnicity, dtype: int64

In [26]:
# Combine categories
data4['ethnicity'] = data4['ethnicity'].replace(dict.fromkeys(['Asian - Indian', 'Asian (Pakistan)', 'INDIAN', 'Chinese', 'Asian(INDIAN)', 'Indian', 'Asian( PAKISTANI)', 'Arab', 'Asian ( BANGLADESHHI)'], 'Asian'))
data4['ethnicity'] = data4['ethnicity'].replace({'Somali': 'Others'})
data4['ethnicity'] = data4['ethnicity'].replace({'Black (afro caribean descent)': 'Black'})

In [27]:
data4['ethnicity'].value_counts()

Caucasian    9209
Asian         205
Black         108
Others         65
Name: ethnicity, dtype: int64

## egfr

In [28]:
data4['egfr'].unique()

array([ 69. ,  62. ,  56. ,  50. ,  54. ,  53. ,  46. ,  58. ,  49. ,
        90. , 103. ,  33. ,  31. ,  29. ,  36. ,  35. ,  45. ,  48. ,
        59. ,  72. ,  41. ,  44. ,  64. ,  60. ,  71. ,  47. ,  66. ,
        76. ,  52. ,  39. ,  28. ,  84. ,  89. ,  24. ,  38. ,  27. ,
        21. ,  22. ,  30. ,  23. ,  20. ,  19. ,  26. ,  16. ,  13. ,
        32. ,  34. ,  43. ,  37. ,  25. ,   9. ,  15. ,  10. , 107. ,
        82. ,  93. ,  63. , 106. ,  92. , 118. , 150. , 149. , 101. ,
       125. , 127. , 124. ,  67. ,  55. ,  40. ,  57. ,  68. ,  42. ,
        65. ,  73. ,  80. ,  81. ,  61. ,  18. ,  78. ,  70. ,  14. ,
        17. ,  11. ,  12. ,  85. ,  77. ,  79. , 102. ,  74. ,  51. ,
        97. ,  83. ,  96. , 108. ,  94. ,  86. ,  75. , 109. ,  91. ,
       104. ,  88. ,  87. ,  95. , 114. , 100. , 133. ,  99. , 113. ,
       112. ,   7. , 120. ,  98. , 158. , 128. , 129. , 145. , 122. ,
       111. , 117. , 110. , 131. ,   8. , 123. , 146. , 105. , 136. ,
       119. , 141. ,

In [29]:
# Missing values for egfr
data4['egfr'].isna().sum()

402

In [30]:
# Drop all rows with missing egfr values
data5 = data4.dropna(subset=['egfr'])
data5.reset_index(inplace=True, drop=True)
data5['ID'] = data5.groupby(['ID']).ngroup()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data5['ID'] = data5.groupby(['ID']).ngroup()


In [31]:
# Drop row with 0 egfr value (1 row)
data6 = data5.loc[data5['egfr'] != 0]
data6.reset_index(inplace=True, drop=True)
data6['ID'] = data6.groupby(['ID']).ngroup()
data6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data6['ID'] = data6.groupby(['ID']).ngroup()


Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10411,1370,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10412,1370,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10413,1371,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10414,1371,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


## Assign labels from the clustering analysis part

In [104]:
label_dict = {labels['ID'].values[i] : labels['Label'].values[i] for i in range(labels.shape[0])}
data6['Labels'] = np.zeros_like(data6['ID'].values)

for i in range(data6.shape[0]):
    key = data6['ID'].values[i]
    try:
        data6['Labels'].values[i] = label_dict[key]
    except: # assign a random value to patients whose trajectories have not been used in clustering --> Drop these later
        data6['Labels'].values[i] = 1000

data6 = data6[data6['Labels'] <= 2]
# Making sure that everything works as expected
assert np.sum(np.unique(data6['ID'].values) == np.unique(labels['ID'])) == labels['ID'].shape[0] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data6['Labels'] = np.zeros_like(data6['ID'].values)


In [105]:
data6

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0,1
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0,1
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0,1
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0,1
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10409,1369,Sheffield,1954.0,Male,Caucasian,0.0,,,False,False,DN,190.0,11.2,2019-06-09,36.0,0
10410,1369,Sheffield,1954.0,Male,Caucasian,0.0,140.8,,False,False,DN,163.0,9.6,2019-10-18,40.0,0
10413,1371,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0,2
10414,1371,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0,2


## Height

In [106]:
# Some issues with height
data6.describe()

Unnamed: 0,ID,dob_year,height,weight,bp.sys,bun,egfr,Labels
count,10173.0,8463.0,10165.0,3015.0,9896.0,10157.0,10173.0,10173.0
mean,602.816868,1959.423727,5.567724,103.001187,106.932902,102.788245,46.479544,0.388184
std,398.353123,15.156922,27.975053,1221.185929,62.421653,6874.017196,24.994675,0.644714
min,0.0,1926.0,0.0,1.48,0.0,0.0,4.0,0.0
25%,222.0,1948.0,0.0,69.1,102.0,9.0,27.0,0.0
50%,621.0,1957.0,1.56,79.7,132.0,18.0,42.0,0.0
75%,934.0,1969.0,1.7,90.4,147.0,52.0,61.0,1.0
max,1371.0,2002.0,188.0,67125.0,1158.0,692801.0,189.0,2.0


In [26]:
data6['height'].unique()

array([  1.7  ,   1.67 ,   1.6  ,   1.68 ,   1.65 ,   1.63 ,   1.66 ,
         1.45 ,   1.64 ,   1.8  ,   1.75 ,   1.73 ,   1.62 ,   1.56 ,
         1.76 ,   1.74 ,   1.57 ,   1.71 ,   1.5  ,   1.78 ,   1.48 ,
         1.72 ,   1.69 ,   1.89 ,   1.52 ,   1.59 ,   1.58 ,   1.87 ,
         1.82 ,   1.77 ,   1.53 ,   1.44 ,   1.61 ,   0.   ,   1.95 ,
         1.54 ,   1.88 ,   1.81 ,   1.51 , 169.   ,   1.55 ,   1.9  ,
         2.   ,   1.42 ,   1.41 ,   1.47 ,   1.85 ,   1.83 ,   1.84 ,
           nan,   1.86 ,   1.79 ,   1.92 ,   1.91 , 158.   ,   1.601,
         1.752,   1.625,   1.98 , 180.   , 173.   ,   1.49 , 174.   ,
         2.02 ,  97.6  , 172.   , 166.   , 167.   , 171.   , 142.   ,
       181.   , 188.   , 176.   , 184.   , 168.   , 178.   , 179.   ,
       155.   , 182.   , 183.   , 165.   , 159.   ])

In [27]:
data6['height'].isna().sum()

10

In [107]:
data6[data6['height'].isna().to_numpy()]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
5636,667,Sheffield,1988.0,Male,Caucasian,,,,False,False,HTN,142.0,9.0,2015-01-30,48.0,0
5637,667,Sheffield,1988.0,Male,Caucasian,,,,False,False,HTN,,10.0,2015-10-29,51.0,0
5638,667,Sheffield,1988.0,Male,Caucasian,,,,False,False,HTN,145.0,10.0,2016-05-02,48.0,0
5639,667,Sheffield,1988.0,Male,Caucasian,,73.6,,False,False,HTN,154.0,11.3,2017-02-17,50.0,0
10216,1316,Sheffield,1946.0,Female,,,,,False,False,GMN,144.0,9.0,2018-05-24,24.0,0
10217,1316,Sheffield,1946.0,Female,,,,,False,False,GMN,149.0,11.0,2018-10-18,30.0,0
10218,1316,Sheffield,1946.0,Female,,,71.0,,False,False,GMN,150.0,10.3,2019-03-28,34.0,0
10219,1316,Sheffield,1946.0,Female,,,69.2,,False,False,GMN,144.0,14.0,2019-10-10,32.0,0


In [108]:
# Remove NaN values
data7 = data6.dropna(subset=['height'])
data7.reset_index(inplace=True, drop=True)
data7['ID'] = data7.groupby(['ID']).ngroup()
data7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data7['ID'] = data7.groupby(['ID']).ngroup()


Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0,1
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0,1
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0,1
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0,1
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10160,1206,Sheffield,1954.0,Male,Caucasian,0.0,,,False,False,DN,190.0,11.2,2019-06-09,36.0,0
10161,1206,Sheffield,1954.0,Male,Caucasian,0.0,140.8,,False,False,DN,163.0,9.6,2019-10-18,40.0,0
10162,1207,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0,2
10163,1207,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0,2


In [35]:
# Strange value
data7.loc[data7['height']==97.6]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
5994,711,Sheffield,,Male,Asian,97.6,,,False,False,GMN,0.0,0.0,2010-06-14,108.0
5995,711,Sheffield,,Male,Asian,97.6,,,False,False,GMN,0.0,0.0,2010-10-18,117.0
5996,711,Sheffield,,Male,Asian,97.6,107.8,,False,False,GMN,0.0,21.9,2010-11-10,18.0


In [109]:
# Remove strange values
data8 = data7.drop(data7.loc[data7['height']==97.6].index.to_numpy())
data8.reset_index(inplace=True, drop=True)
data8['ID'] = data8.groupby(['ID']).ngroup()
data8

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0,1
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0,1
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0,1
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0,1
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10157,1205,Sheffield,1954.0,Male,Caucasian,0.0,,,False,False,DN,190.0,11.2,2019-06-09,36.0,0
10158,1205,Sheffield,1954.0,Male,Caucasian,0.0,140.8,,False,False,DN,163.0,9.6,2019-10-18,40.0,0
10159,1206,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0,2
10160,1206,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0,2


In [110]:
# Convert all height measurements to meters
data8['height'] = np.where(data8['height'] > 5, data8['height'] / 100, data8['height'])

In [111]:
data8['height'].unique()

array([1.7  , 1.67 , 1.6  , 1.68 , 1.65 , 1.63 , 1.66 , 1.45 , 1.64 ,
       1.8  , 1.73 , 1.75 , 1.62 , 1.56 , 1.76 , 1.74 , 1.57 , 1.71 ,
       1.5  , 1.78 , 1.48 , 1.72 , 1.69 , 1.89 , 1.52 , 1.59 , 1.58 ,
       1.87 , 1.82 , 1.77 , 1.53 , 1.44 , 1.61 , 0.   , 1.95 , 1.54 ,
       1.88 , 1.81 , 1.51 , 1.55 , 1.9  , 2.   , 1.42 , 1.41 , 1.47 ,
       1.85 , 1.83 , 1.86 , 1.79 , 1.92 , 1.91 , 1.601, 1.752, 1.625,
       1.98 , 1.49 , 1.84 , 2.02 ])

In [112]:
# Too many rows with zero values for height to drop from the dataframe
print('Number of rows with zero height values: ',(data8['height']==0).sum())

# All patients with zero height values do not have previously-recorded height values
print('Do all patients with zero height values not have any previously-recorded height values? ', data8.groupby(['ID']).height.min().equals(data8.groupby(['ID']).height.max()))

# All patients with non-zero height values have the same height values for all of their individual patient-level observations (i.e. their heights do not change over time)
print('Do all patients with non-zero height values have the same height values for all of their individual patient-level observations (i.e. their heights do not change over time)? ', data8.loc[data8['height']!=0].groupby(['ID']).height.median().median() == data8.loc[data8['height']!=0].groupby(['ID']).first().height.median())

Number of rows with zero height values:  4669
Do all patients with zero height values not have any previously-recorded height values?  True
Do all patients with non-zero height values have the same height values for all of their individual patient-level observations (i.e. their heights do not change over time)?  True


## Age

In [113]:
# All patients with zero dob_year values do not have previously-recorded values
data8.groupby(['ID']).dob_year.min().equals(data8.groupby(['ID']).dob_year.max())

True

In [114]:
data9 = data8.copy()
data9['dob_year'].fillna(0, inplace=True)

## bp.sys

In [115]:
data10 = data9.copy()

In [42]:
# NaN, zero values and unusually small and large values
data10['bp.sys'].unique()

array([ 150.,    0.,  145.,  160.,  165.,  140.,  135.,  125.,  115.,
        120.,  180.,  130.,  110.,   nan,  155.,  137.,  122.,  105.,
        170.,  134.,  124.,  133.,  100.,  138.,  148.,  139.,  152.,
        121.,   96.,   95.,   90.,  200.,  136.,  128.,  127.,  132.,
        220.,  178.,  144.,  142.,   97.,  166.,  146.,  151.,  131.,
        117.,  126.,  300.,  123.,  175.,  149.,  190.,  156.,  114.,
         98.,  113.,  184.,  147.,  141.,  107.,   80.,  101.,  185.,
        195.,  154., 1158.,   14.,   85.,  106.,  143.,  164.,  103.,
        112.,  129.,  118.,  111.,  119.,  162.,  108.,  169.,  172.,
         92.,  177.,  153.,  159.,  174.,  116.,  173.,  168.,  183.,
        181.,  167.,  187.,  158.,  176.,  191.,  102.,  171.,  161.,
        163.,  188.,  192.,  157.,  196.,   91.,  199.,   88.,  109.,
        179.,   72.,  104.,  182.,  186.,  189.,  193.,  206.,  197.,
        209.,   17.,  237.,  202.,   99.,   93.,  215.,  225.,  207.,
        203.,  214.,

In [116]:
data10.loc[data10['bp.sys']==1158]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
2036,163,Patras,1956.0,Male,Caucasian,1.63,67.0,Past-Smoker,False,False,GMN,1158.0,34.0,2016-12-07,122.0,2


In [117]:
data10.iloc[2043:2046, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
2043,164,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,120.0,55.0,2013-05-14,68.0,2
2044,164,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,140.0,56.0,2013-07-16,68.0,2
2045,164,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,120.0,54.0,2014-07-01,60.0,2


In [118]:
# Change strange value
data10.loc[data10['bp.sys']==1158, ['bp.sys']] = 120.0

In [119]:
data10.loc[data10['bp.sys']==300]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
855,66,Patras,1952.0,Male,Caucasian,1.74,,Past-Smoker,True,False,Transplant,300.0,25.0,2015-03-26,72.0,2


In [120]:
data10.iloc[858:861, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
858,66,Patras,1952.0,Male,Caucasian,1.74,84.0,Past-Smoker,True,False,Transplant,135.0,32.0,2016-11-15,80.0,2
859,66,Patras,1952.0,Male,Caucasian,1.74,83.5,Past-Smoker,True,False,Transplant,0.0,41.0,2017-04-09,80.0,2
860,66,Patras,1952.0,Male,Caucasian,1.74,84.0,Past-Smoker,True,False,Transplant,135.0,28.0,2017-11-20,80.0,2


In [121]:
# Change strange value
data10.loc[data10['bp.sys']==300, ['bp.sys']] = 120.0

In [122]:
data10.loc[data10['bp.sys']==17]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
5038,508,Sheffield,0.0,Male,Caucasian,1.78,,Non-Smoker,False,False,DN,17.0,17.0,2008-03-10,23.0,0


In [123]:
data10.iloc[5185:5188, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
5185,525,Sheffield,0.0,Male,Caucasian,1.73,,,False,False,GMN,140.0,9.0,2011-04-18,35.0,0
5186,526,Sheffield,0.0,Female,Caucasian,1.8,,,False,False,GMN,133.0,0.0,2009-05-11,92.0,2
5187,526,Sheffield,0.0,Female,Caucasian,1.8,,,False,False,GMN,0.0,0.0,2010-01-28,109.0,2


In [124]:
# Change strange value
data10.loc[data10['bp.sys']==17, ['bp.sys']] = 160.0

In [125]:
data10.loc[data10['bp.sys']==14]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
2049,164,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,14.0,31.0,2015-09-06,60.0,2


In [126]:
data10.iloc[2056:2059, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
2056,164,Patras,1954.0,Female,Caucasian,1.58,56.0,Non-Smoker,False,False,GMN,120.0,29.0,2018-09-10,67.0,2
2057,164,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,0.0,24.0,2019-11-06,67.0,2
2058,165,Patras,1953.0,Male,Caucasian,1.78,,Past-Smoker,False,False,DN,0.0,37.0,2013-04-02,81.0,1


In [127]:
# Change strange value
data10.loc[data10['bp.sys']==14, ['bp.sys']] = 140.0

In [128]:
# Replace NaN with zero for now
data10['bp.sys'].replace(0, np.nan, inplace=True)

## bun

In [129]:
data11 = data10.copy()

In [53]:
# NaN, zero values and unusually large values
# a = data11['bun'].unique()
# np.sort(a)

In [130]:
data11.loc[data11['bun']==692801]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
1845,147,Patras,1972.0,Male,Caucasian,1.71,,Non-Smoker,False,False,DN,90.0,692801.0,2015-07-29,23.0,0


In [131]:
data11.iloc[1850:1853, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
1850,147,Patras,1972.0,Male,Caucasian,1.71,69.0,Non-Smoker,False,False,DN,110.0,103.0,2017-01-26,26.0,0
1851,147,Patras,1972.0,Male,Caucasian,1.71,72.0,Non-Smoker,False,False,DN,95.0,127.0,2017-02-11,25.0,0
1852,147,Patras,1972.0,Male,Caucasian,1.71,71.0,Non-Smoker,False,False,DN,100.0,125.0,2017-06-29,23.0,0


In [132]:
# Change strange value
data11.loc[data11['bun']==692801, ['bun']] = 80.0

In [133]:
data11.loc[data11['bun']==365]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
2040,163,Patras,1956.0,Male,Caucasian,1.63,69.0,Past-Smoker,False,False,GMN,125.0,365.0,2018-01-30,122.0,2


In [134]:
data11.iloc[2047:2050, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
2047,164,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,100.0,40.0,2014-09-16,68.0,2
2048,164,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,140.0,42.0,2015-01-27,77.0,2
2049,164,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,140.0,31.0,2015-09-06,60.0,2


In [135]:
# Change strange value
data11.loc[data11['bun']==365, ['bun']] = 36.0

In [136]:
data11.loc[data11['bun']==471]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
2097,168,Patras,1985.0,Female,Caucasian,1.57,,Non-Smoker,False,False,GMN,,471.0,2015-01-06,105.0,0


In [137]:
data11.iloc[2106:2109, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels
2106,168,Patras,1985.0,Female,Caucasian,1.57,,Non-Smoker,False,False,GMN,130.0,28.0,2016-06-21,104.0,0
2107,168,Patras,1985.0,Female,Caucasian,1.57,86.0,Non-Smoker,False,False,GMN,150.0,22.0,2016-07-26,124.0,0
2108,168,Patras,1985.0,Female,Caucasian,1.57,85.0,Non-Smoker,False,False,GMN,145.0,34.0,2016-09-20,104.0,0


In [138]:
# Change strange value
data11.loc[data11['bun']==471, ['bun']] = 60.0

In [139]:
# Replace NaN with zero for now
data11['bun'].replace(0, np.nan, inplace=True)

## Weight

In [140]:
# No zero values for weight, but many NaN values
print(data11['weight'].isna().sum())
data11.loc[data11['weight']==0]

7151


Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels


In [141]:
data12 = pd.concat([data11.loc[(data11['weight'] > 30) & (data11['weight'] < 200)], data11.loc[data11['weight'].isna()]]).reset_index(drop=True)
data12['ID'] = data12.groupby(['ID']).ngroup()
data12.sort_values(['ID', 'date'], ignore_index=True, inplace=True)

# Cleaning involving imputing values

In [142]:
# Convert dates to number of days since first observation for each patient
data13 = datetime_to_days_diff(data12, 'ID', 'date')

# Shuffle data by patient
data14 = shuffle_data_by_group(data13, 'ID', random_state=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['times'] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['times'] = pd.to_numeric(group['times'], downcast="integer")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [157]:
data14

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,Labels,times
0,0,Sheffield,0.0,Male,Caucasian,1.72,,Past-Smoker,False,False,DN,132.0,12.0,2006-01-02,31.0,0,0
1,0,Sheffield,0.0,Male,Caucasian,1.72,,Past-Smoker,False,False,DN,144.0,19.0,2006-06-12,23.0,0,161
2,0,Sheffield,0.0,Male,Caucasian,1.72,,Past-Smoker,False,False,DN,160.0,11.0,2006-07-06,27.0,0,185
3,0,Sheffield,0.0,Male,Caucasian,1.72,,Past-Smoker,False,False,DN,153.0,16.0,2006-11-10,20.0,0,312
4,0,Sheffield,0.0,Male,Caucasian,1.72,,Past-Smoker,False,False,DN,143.0,19.0,2007-07-03,23.0,0,547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10153,1206,Patras,1957.0,Male,Caucasian,1.73,80.0,Non-Smoker,True,False,Transplant,135.0,35.0,2017-04-04,91.0,0,1237
10154,1206,Patras,1957.0,Male,Caucasian,1.73,80.0,Non-Smoker,True,False,Transplant,,53.0,2017-09-25,81.0,0,1411
10155,1206,Patras,1957.0,Male,Caucasian,1.73,80.0,Non-Smoker,True,False,Transplant,130.0,38.0,2017-11-22,81.0,0,1469
10156,1206,Patras,1957.0,Male,Caucasian,1.73,80.0,Non-Smoker,True,False,Transplant,120.0,37.0,2018-02-28,91.0,0,1567


## Height

In [158]:
# Median height among all non-zero rows
median_height = data14.loc[data14['height']!=0].groupby(['ID']).height.median().median()

# Median heights by gender and ethnicity among patients with non-zero height values
height_table = pd.DataFrame({'height': data14.loc[data14['height']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).height.median()}).reset_index()
height_table

Unnamed: 0,gender,ethnicity,height
0,Female,Asian,1.575
1,Female,Black,1.67
2,Female,Caucasian,1.6
3,Male,Asian,1.71
4,Male,Black,1.76
5,Male,Caucasian,1.725


In [159]:
# This is the whole dataset not just the training set!
data_full2 = imputer(data14, height_table, median_height, 'height')

In [161]:
# All looks good
print(data_full2['height'].unique())

[1.72  1.69  1.63  1.75  1.67  1.83  1.6   1.725 1.78  1.81  1.71  1.53
 1.8   1.66  1.73  1.45  1.58  1.68  1.7   1.59  1.74  1.52  1.95  1.64
 1.79  1.65  1.88  1.55  1.47  1.57  1.76  1.61  1.82  1.77  1.575 1.62
 1.9   1.5   1.51  1.87  1.54  1.91  1.84  1.56  1.92  1.85  1.601 1.49
 1.42  1.98  1.625 1.41  1.86  2.    2.02  1.44  1.48  1.89  1.752]


## Weight

In [162]:
# Median weight among all non-NaN rows
median_weight = data_full2.loc[~data_full2['weight'].isna()].groupby(['ID']).weight.median().median()

# Impute NaN values using the mean of each patient in the training set where available
data_full2['weight'] = data_full2['weight'].fillna(data_full2.groupby('ID')['weight'].transform('mean'))

# Fill remaining NaN values with overall median
data_full2['weight'].fillna(median_weight, inplace=True)

# All looks good
print(data_full2['weight'].isna().sum())

0


## Age

In [163]:
# Median dob_year among all non-zero rows
median_dob = data_full2.loc[data_full2['dob_year']!=0].groupby(['ID']).dob_year.median().median()

# Median dob_year by gender and ethnicity among patients with non-zero dob_year values
dob_table = pd.DataFrame({'dob_year': data_full2.loc[data_full2['dob_year']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).dob_year.median()}).reset_index()
dob_table

Unnamed: 0,gender,ethnicity,dob_year
0,Female,Asian,1952.0
1,Female,Black,1979.0
2,Female,Caucasian,1962.0
3,Female,Others,1965.0
4,Male,Asian,1977.5
5,Male,Caucasian,1955.0
6,Male,Others,1959.0


In [164]:
data_full3 = imputer(data_full2, dob_table, median_dob, 'dob_year')

In [165]:
# Convert dob_year to age
data_full3.insert(3, 'age', data_full3['date'].dt.year - data_full3['dob_year'])
data_full3.drop(columns=['dob_year', 'date'], inplace=True)

## bp.sys

In [166]:
# Fill zero values with mean of each patient
data_full3['bp.sys'] = data_full3['bp.sys'].fillna(data_full3.groupby('ID')['bp.sys'].transform('mean'))

# Fill remaining NaN with global mean
data_full3['bp.sys'] = data_full3['bp.sys'].fillna(data_full3['bp.sys'].mean())

## bun

In [167]:
# Fill zero values with mean of each patient
data_full3['bun'] = data_full3['bun'].fillna(data_full3.groupby('ID')['bun'].transform('mean'))

# Fill remaining NaN with global mean
data_full3['bun'] = data_full3['bun'].fillna(data_full3['bun'].mean())

## Calculate linear regression slopes and R^2 coefficients as additional features for each patient

In [168]:
# Fill NaN values with Unknown
data_full4 = data_full3.fillna('Unknown')

In [169]:
def compute_slope(df):
    '''
    Converts a longitudinal dataset to a cross-sectional dataset by taking the slope of the regression line between egfr and times.
    '''
    group_object = df.groupby(['ID'])
    grouped_data = [group_object.get_group(x) for x in group_object.groups]
    df_list = []
    for group in grouped_data:
        # Ignore patients with only 1 observation
        if len(group) == 1:
            continue
        ref = group.iloc[[0]]
        X = group['times'].to_numpy()
        y = group['egfr'].to_numpy()
        result = linregress(X, y)
        ref['slope'] = result.slope
        ref['r2'] = (result.rvalue)**2
        df_list.append(ref)
    df_new = pd.concat(df_list, ignore_index=True)
    df_new.drop(columns=['times'], inplace=True)

    return df_new

In [170]:
data_full5 = compute_slope(data_full4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ref['slope'] = result.slope
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ref['r2'] = (result.rvalue)**2


In [171]:
data_full5

Unnamed: 0,ID,site,age,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,egfr,Labels,slope,r2
0,0,Sheffield,51.0,Male,Caucasian,1.720,80.225000,Past-Smoker,False,False,DN,132.0,12.000000,31.0,0,-0.004200,0.246101
1,1,Sheffield,68.0,Male,Unknown,1.690,80.225000,Unknown,False,False,GMN,172.0,9.000000,45.0,0,-0.036924,0.854992
2,2,Sheffield,50.0,Female,Black,1.630,80.225000,Unknown,False,False,GMN,159.5,8.700000,39.0,1,-0.009580,0.882591
3,3,Sheffield,51.0,Male,Caucasian,1.750,80.225000,Past-Smoker,False,False,Vascular,151.0,10.000000,36.0,0,-0.011684,0.769347
4,4,Patras,61.0,Female,Caucasian,1.670,74.333333,Smoker,False,False,HTN,130.0,88.000000,32.0,0,0.003875,0.124922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202,1202,Patras,77.0,Female,Caucasian,1.590,82.250000,Non-Smoker,False,False,DN,144.0,68.000000,31.0,0,0.002866,0.211318
1203,1203,Sheffield,53.0,Male,Caucasian,1.725,80.225000,Unknown,False,False,HTN,135.0,37.953809,22.0,0,-0.018027,0.723886
1204,1204,Patras,52.0,Female,Caucasian,1.640,71.200000,Smoker,False,False,GMN,132.0,31.000000,62.0,2,0.016968,0.469668
1205,1205,Sheffield,70.0,Male,Caucasian,1.725,80.700000,Unknown,False,False,GMN,152.0,6.000000,59.0,0,-0.013709,0.354751


## Save the dataset

In [178]:
data_full5.to_csv('cross_sectional_data.csv')