In [75]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

### Read in Data


In [29]:
# Load the two FHS splits and stack them into a single frame for exploration.
test_data = pd.read_csv('fhs_test.csv')
train_data = pd.read_csv('fhs_train.csv')

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the row labels of both files are kept, so every label the two splits share
# appears twice and label-based lookups such as data.loc[0] return two rows.
data = pd.concat([test_data, train_data], ignore_index=True)

# Column-name groups that the per-variable summaries iterate over.
numeric_data = ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
categorical_data = ['sex', 'education', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes']

In [30]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,674,0,58,1.0,1,20.0,0.0,0,0,0,,126.0,77.0,30.08,78.0,,0
1,4070,0,51,3.0,0,0.0,0.0,0,0,0,264.0,135.0,83.0,26.68,60.0,74.0,0
2,3150,0,44,2.0,1,9.0,0.0,0,1,0,,147.5,96.0,30.57,78.0,,1
3,1695,0,40,2.0,1,20.0,0.0,0,0,0,271.0,138.5,88.0,27.24,80.0,,1
4,2692,1,58,2.0,1,20.0,0.0,0,0,0,207.0,110.0,80.0,23.55,78.0,78.0,0


### Analyze how the numeric and categorical variables relate to CHD

In [77]:
def desc_data(var):
    """Summarise one column separately for each ``TenYearCHD`` group.

    Reads the module-level ``data`` frame, keeps only ``var`` and the
    ``TenYearCHD`` column, and returns ``describe()`` computed per
    ``TenYearCHD`` value (one row per group).
    """
    selected_columns = [var, 'TenYearCHD']
    by_outcome = data[selected_columns].groupby('TenYearCHD')
    return by_outcome.describe()

def find_diff(var, standardize=False):
    """Return the gap in the mean of ``var`` between ``TenYearCHD`` groups.

    Reads the module-level ``data`` frame, averages ``var`` within each
    ``TenYearCHD`` group and returns the last group's mean minus the mean of
    the group before it (groups are ordered by label, so with labels 0 and 1
    this is mean(1) - mean(0)).

    Parameters
    ----------
    var : str
        Name of the column to compare.
    standardize : bool, default False
        When False the gap is returned in the column's own units, exactly as
        before. When True the gap is divided by the standard deviation of the
        whole column, which makes gaps comparable across columns measured on
        different scales.

    Returns
    -------
    float
        The (optionally standardised) gap. NaN when only one group is
        present, or when ``standardize`` is True and the column is constant.
    """
    group_means = data.groupby('TenYearCHD')[var].mean()
    # diff() yields each mean minus the previous one; the last entry is the
    # gap between the two highest-labelled groups.
    gap = group_means.diff().iloc[-1]
    if standardize:
        # Express the gap in units of the column's overall sample std so that
        # the unit of measurement no longer drives the size of the number.
        gap = gap / data[var].std()
    return gap

In [83]:
# Print the per-group summary table of every numeric column.
for var in numeric_data:
    print(desc_data(var))

# Gap in group means (as returned by find_diff) for each numeric column.
diff_numeric_data = {name: find_diff(name) for name in numeric_data}

# Cell output: the gaps ordered from largest to smallest.
dict(sorted(diff_numeric_data.items(), key=lambda pair: -pair[1]))

{'sysBP': 13.281469576271775,
 'totChol': 10.241578347010432,
 'glucose': 8.329166434138813,
 'age': 5.383448828581095,
 'diaBP': 4.814931531929886,
 'cigsPerDay': 1.9153580197774378,
 'BMI': 0.8597923363284323,
 'heartRate': 0.7678126897520485}

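The dictionary above ranks the numeric variables by the raw difference in group means (CHD group minus non-CHD group). Each difference is in that variable's own units, so the ordering reflects measurement scale as well as strength of association and should not be read as a ranking of statistical significance.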

In [87]:
# Print the per-group summary table of every categorical column.
for var in categorical_data:
    print(desc_data(var))

# Gap in group means (as returned by find_diff) for each categorical column.
diff_categorical_data = dict(
    (name, find_diff(name)) for name in categorical_data
)

# Cell output: the gaps ordered from most positive to most negative.
dict(sorted(diff_categorical_data.items(), key=lambda pair: -pair[1]))

               sex                                             
             count      mean       std  min  25%  50%  75%  max
TenYearCHD                                                     
0           3596.0  0.410734  0.492035  0.0  0.0  0.0  1.0  1.0
1            644.0  0.532609  0.499323  0.0  0.0  1.0  1.0  1.0
           education                                             
               count      mean       std  min  25%  50%  75%  max
TenYearCHD                                                       
0             3507.0  2.002851  1.014719  1.0  1.0  2.0  3.0  4.0
1              628.0  1.848726  1.038823  1.0  1.0  1.0  3.0  4.0
            BPMeds                                             
             count      mean       std  min  25%  50%  75%  max
TenYearCHD                                                     
0           3554.0  0.023354  0.151046  0.0  0.0  0.0  0.0  1.0
1            633.0  0.064771  0.246316  0.0  0.0  0.0  0.0  1.0
           prevalentStroke    

{'prevalentHyp': 0.2287963161276505,
 'sex': 0.12187454659766894,
 'diabetes': 0.04292381459040065,
 'BPMeds': 0.041416964708789944,
 'prevalentStroke': 0.013187530658633817,
 'education': -0.154125325327507}

`prevalentHyp` - prevalent hypertension is positively correlated with CHD

`sex` - males more likely to have CHD

`diabetes` - diabetes is positively correlated with CHD

`BPMeds` - blood pressure medications are positively correlated with CHD

`prevalentStroke` - strokes are positively correlated with CHD

`education` - higher education is negatively correlated with CHD (the CHD group has the lower mean education level, 1.85 vs 2.00)