In [8]:
# Importing packages
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [9]:
# Summary table for the patients that have associated ECG values, indexed by patient ID
df_ECG = pd.read_csv(
    './database/data_description/ECG_data_summary_table.csv',
    encoding='latin-1',
).set_index('patient ID')

# Summary table meant to describe all patients, indexed by patient ID
# NOTE(review): this reads the same CSV as df_ECG -- confirm whether a separate
# all-patients file was intended here.
df_all = pd.read_csv(
    './database/data_description/ECG_data_summary_table.csv',
    encoding='latin-1',
).set_index('patient ID')

In [10]:
# RUN THIS CELL ONLY ONCE PER FRESHLY LOADED FRAME: the column blocks are removed by
# hard-coded position, so re-running it on an already trimmed frame removes further columns.
def _trim_summary_columns(frame):
    """Return a trimmed copy of a summary table.

    Removes the brain perfusion block, then the MENTAL test block, then every
    column that contains only NaN values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Summary table as loaded from the CSV.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.
    """
    # Column positions 12..936 hold the brain perfusion variables
    trimmed = frame.drop(columns=frame.columns[12:937])
    # Of what remains, the last 69 columns hold the MENTAL test variables
    trimmed = trimmed.drop(columns=trimmed.columns[-69:])
    # Strip columns with all NaN values
    return trimmed.dropna(axis='columns', how='all')


# Each frame is cleaned from itself, so df_ECG keeps its own contents instead of
# becoming a copy of the cleaned df_all.
df_all = _trim_summary_columns(df_all)
df_ECG = _trim_summary_columns(df_ECG)

In [11]:
# Easy way to check the variables present in the database, e.g. run `list(df_all.columns)`

In [12]:
# Column groups used to build the cross-correlation matrices below.
# Variables examined together with the neuropathy symptom column
neuropathy = [
    'Group',
    'Diabetes Duration',
    'age',
    'WBC K/uL',
    'RBC m/uL',
    'Hgb g/dL',
    'GLUCOSE mg/dL',
    'Hb A1C%',
    'Dizziness AUTONOMIC SYMPTOMS',
    'LDL CALCmg/dL',
    'HDL mg/dL',
    'Neuropathy AUTONOMIC SYMPTOMS',
]
# Variables examined together with the retinopathy grading column
retinopathy = [
    'Group',
    'Diabetes Duration',
    'age',
    'BMI',
    'Hb A1C%',
    'CRP (mg/L)',
    'URINE ALBUMIN mg/dL',
    'Hgb g/dL',
    'Retinopathy Grading',
]
# Broader selection covering both outcome columns
general = [
    'Group',
    'Diabetes Duration',
    'age',
    'BMI',
    'Hb A1C%',
    'CRP (mg/L)',
    'Neuropathy AUTONOMIC SYMPTOMS',
    'WBC K/uL',
    'RBC m/uL',
    'Hgb g/dL',
    'GLUCOSE mg/dL',
    'URINE CREAT mg/dL',
    'URINE ALBUMIN mg/dL',
    'CHOLESTmg/dL',
    'LDL CALCmg/dL',
    'Retinopathy Grading',
]
# Preview the first rows of the columns selected in `general`
df_all.loc[:, general].head()

Unnamed: 0_level_0,Group,Diabetes Duration,age,BMI,Hb A1C%,CRP (mg/L),Neuropathy AUTONOMIC SYMPTOMS,WBC K/uL,RBC m/uL,Hgb g/dL,GLUCOSE mg/dL,URINE CREAT mg/dL,URINE ALBUMIN mg/dL,CHOLESTmg/dL,LDL CALCmg/dL,Retinopathy Grading
patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
S0250,DM,7.0,50,35.782279,9.4,0.24,NO,6.9,5.31,15.0,211,65,2.4,135,72,
S0256,DM,23.0,65,29.62963,7.1,0.06,NO,4.4,3.58,10.3,111,69,0.3,194,97,0.0
s0273,DM,16.0,68,26.27671,6.5,0.084,YES,8.5,5.43,16.2,109,185,1.317308,188,121,0.0
s0282,DM,1.0,62,26.762951,5.4,0.14,NO,5.6,4.76,13.6,108,24,1.317308,169,97,
s0283,CONTROL,0.0,83,21.20412,5.9,0.028,YES,4.9,3.68,12.1,78,69,1.4,268,146,


In [13]:
# .corr() skips non-numeric columns, so the yes/no and group labels are encoded as 1/0
# and every selected column is cast to float before the correlation matrices are built.
# (Open question from the original analysis: is a plain binary 1/0 encoding enough?)

# Per-column label -> code maps; labels are compared after lowercasing
categorical_codes = {
    'Dizziness AUTONOMIC SYMPTOMS': {'yes': 1, 'no': 0},
    'Neuropathy AUTONOMIC SYMPTOMS': {'yes': 1, 'no': 0},
    'Group': {'dm': 1, 'control': 0},
}

# Union of the three column groups, first-seen order kept, so each column is converted once
selected_columns = list(dict.fromkeys(neuropathy + retinopathy + general))


def _as_float_column(column):
    """Return `column` as floats, encoding known categorical labels first.

    Values are turned into lowercase strings, labels listed in
    ``categorical_codes`` for this column are swapped for their numeric code,
    and the result is cast to float. Missing values become the string 'nan'
    and are cast back to NaN. Any other non-numeric text raises ValueError
    in the final cast instead of being silently dropped.
    """
    codes = categorical_codes.get(column.name, {})
    lowered = column.astype(str).str.lower()
    # A function (not a dict) is passed to map so unknown values pass through unchanged
    return lowered.map(lambda value: codes.get(value, value)).astype(float)


# Assign the converted columns back in one step rather than calling
# df_all[col].replace(..., inplace=True): that chained in-place form acts on a
# temporary object and does not update df_all when pandas Copy-on-Write is active.
df_all[selected_columns] = df_all[selected_columns].apply(_as_float_column)

In [14]:
# Pairwise correlation matrix of the neuropathy variable set, shown as a heat-mapped table
corr = df_all.loc[:, neuropathy].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Group,Diabetes Duration,age,WBC K/uL,RBC m/uL,Hgb g/dL,GLUCOSE mg/dL,Hb A1C%,Dizziness AUTONOMIC SYMPTOMS,LDL CALCmg/dL,HDL mg/dL,Neuropathy AUTONOMIC SYMPTOMS
Group,1.0,0.215402,-0.140161,0.081962,-0.077123,-0.068261,0.280516,0.330315,0.102138,-0.478919,-0.136478,0.102138
Diabetes Duration,0.215402,1.0,0.017241,0.074163,0.090658,-0.039896,0.251967,0.381544,0.117262,-0.041577,0.038425,0.219319
age,-0.140161,0.017241,1.0,-0.015946,-0.20033,-0.026889,-0.185869,-0.305483,-0.104065,0.134443,0.084752,0.257684
WBC K/uL,0.081962,0.074163,-0.015946,1.0,0.227372,0.260163,0.094491,0.078362,-0.049306,-0.160597,-0.200576,0.247251
RBC m/uL,-0.077123,0.090658,-0.20033,0.227372,1.0,0.817707,0.236402,0.155116,-0.148753,0.129342,-0.355176,-0.059092
Hgb g/dL,-0.068261,-0.039896,-0.026889,0.260163,0.817707,1.0,0.165704,-0.0845,0.021532,-0.052758,-0.487661,0.070338
GLUCOSE mg/dL,0.280516,0.251967,-0.185869,0.094491,0.236402,0.165704,1.0,0.748386,-0.007578,-0.207622,-0.206061,0.107122
Hb A1C%,0.330315,0.381544,-0.305483,0.078362,0.155116,-0.0845,0.748386,1.0,-0.041873,-0.043682,0.08161,0.010372
Dizziness AUTONOMIC SYMPTOMS,0.102138,0.117262,-0.104065,-0.049306,-0.148753,0.021532,-0.007578,-0.041873,1.0,-0.345472,-0.238531,-0.038889
LDL CALCmg/dL,-0.478919,-0.041577,0.134443,-0.160597,0.129342,-0.052758,-0.207622,-0.043682,-0.345472,1.0,0.273241,-0.118329


In [15]:
# Pairwise correlation matrix of the retinopathy variable set, shown as a heat-mapped table
corr = df_all.loc[:, retinopathy].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Group,Diabetes Duration,age,BMI,Hb A1C%,CRP (mg/L),URINE ALBUMIN mg/dL,Hgb g/dL,Retinopathy Grading
Group,1.0,0.215402,-0.140161,0.208393,0.330315,-0.073703,0.123881,-0.068261,0.146556
Diabetes Duration,0.215402,1.0,0.017241,0.030572,0.381544,-0.119392,0.200671,-0.039896,0.061175
age,-0.140161,0.017241,1.0,-0.601394,-0.305483,-0.030066,0.039995,-0.026889,-0.20741
BMI,0.208393,0.030572,-0.601394,1.0,0.322485,0.284163,0.087079,0.102831,0.222785
Hb A1C%,0.330315,0.381544,-0.305483,0.322485,1.0,-0.131841,0.054823,-0.0845,0.184835
CRP (mg/L),-0.073703,-0.119392,-0.030066,0.284163,-0.131841,1.0,0.159446,0.010067,0.006576
URINE ALBUMIN mg/dL,0.123881,0.200671,0.039995,0.087079,0.054823,0.159446,1.0,0.045855,0.133998
Hgb g/dL,-0.068261,-0.039896,-0.026889,0.102831,-0.0845,0.010067,0.045855,1.0,-0.172223
Retinopathy Grading,0.146556,0.061175,-0.20741,0.222785,0.184835,0.006576,0.133998,-0.172223,1.0


In [16]:
# Pairwise correlation matrix of the general variable set, shown as a heat-mapped table
corr = df_all.loc[:, general].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Group,Diabetes Duration,age,BMI,Hb A1C%,CRP (mg/L),Neuropathy AUTONOMIC SYMPTOMS,WBC K/uL,RBC m/uL,Hgb g/dL,GLUCOSE mg/dL,URINE CREAT mg/dL,URINE ALBUMIN mg/dL,CHOLESTmg/dL,LDL CALCmg/dL,Retinopathy Grading
Group,1.0,0.215402,-0.140161,0.208393,0.330315,-0.073703,0.102138,0.081962,-0.077123,-0.068261,0.280516,0.15159,0.123881,-0.364545,-0.478919,0.146556
Diabetes Duration,0.215402,1.0,0.017241,0.030572,0.381544,-0.119392,0.219319,0.074163,0.090658,-0.039896,0.251967,0.254818,0.200671,0.091642,-0.041577,0.061175
age,-0.140161,0.017241,1.0,-0.601394,-0.305483,-0.030066,0.257684,-0.015946,-0.20033,-0.026889,-0.185869,0.148423,0.039995,0.10791,0.134443,-0.20741
BMI,0.208393,0.030572,-0.601394,1.0,0.322485,0.284163,-0.032021,0.258687,0.222318,0.102831,0.196155,-0.049548,0.087079,-0.222966,-0.283876,0.222785
Hb A1C%,0.330315,0.381544,-0.305483,0.322485,1.0,-0.131841,0.010372,0.078362,0.155116,-0.0845,0.748386,0.103318,0.054823,0.034053,-0.043682,0.184835
CRP (mg/L),-0.073703,-0.119392,-0.030066,0.284163,-0.131841,1.0,-0.223337,0.049294,0.14011,0.010067,-0.174169,0.029769,0.159446,-0.056479,0.027148,0.006576
Neuropathy AUTONOMIC SYMPTOMS,0.102138,0.219319,0.257684,-0.032021,0.010372,-0.223337,1.0,0.247251,-0.059092,0.070338,0.107122,0.302343,0.132457,-0.077957,-0.118329,0.104519
WBC K/uL,0.081962,0.074163,-0.015946,0.258687,0.078362,0.049294,0.247251,1.0,0.227372,0.260163,0.094491,0.089701,0.392534,-0.121389,-0.160597,0.076811
RBC m/uL,-0.077123,0.090658,-0.20033,0.222318,0.155116,0.14011,-0.059092,0.227372,1.0,0.817707,0.236402,-0.025541,0.13294,0.03695,0.129342,-0.031268
Hgb g/dL,-0.068261,-0.039896,-0.026889,0.102831,-0.0845,0.010067,0.070338,0.260163,0.817707,1.0,0.165704,-0.092822,0.045855,-0.118588,-0.052758,-0.172223


In [17]:
# Dump the converted 'Diabetes Duration' column for a manual sanity check
print(df_all.loc[:, 'Diabetes Duration'])

patient ID
S0250     7.000000
S0256    23.000000
s0273    16.000000
s0282     1.000000
s0283     0.000000
s0287    18.000000
s0288     2.000000
s0292     5.000000
s0296     9.000000
S0300    10.000000
s0301    21.000000
s0304    21.000000
s0308    10.000000
s0310     7.000000
S0312     6.000000
s0314     2.000000
s0315     7.000000
s0316    12.000000
s0317    17.000000
s0318     4.000000
S0326    37.000000
s0327     5.000000
s0339    28.000000
s0342     3.000000
s0349    17.000000
s0365     8.000000
s0366     7.000000
s0368     0.000000
s0372     3.000000
s0381     3.000000
s0382     2.000000
s0390     2.000000
s0392     3.000000
s0398     1.000000
s0403    12.000000
s0405    20.000000
s0406    10.000000
s0409     4.000000
s0411     0.000000
s0416     8.954545
s0420     2.000000
s0423     8.954545
s0424     8.954545
s0426     8.954545
s0427     8.954545
s0430     3.000000
s0432    10.000000
s0433     9.000000
s0434     7.000000
S0435     8.954545
s0441     8.954545
Name: Diabetes Durat