## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

## Import Data

In [3]:
url = 'data/train.csv'

In [4]:
df = pd.read_csv(url)

In [10]:
df.head().T

Unnamed: 0,0,1,2,3,4
patient_id,RDG0550,NMA3851,TUI5807,YYT5016,ZAC5937
age,33,56,19,50,89
sex,Male,Female,Female,Female,Female
cholesterol,200,262,140,163,144
blood_pressure,129/90,159/105,161/109,120/62,153/110
heart_rate,48,46,54,53,92
diabetes,0,1,0,0,1
family_history,1,0,1,1,0
smoking,1,1,0,1,1
obesity,1,0,0,1,0


## Data Description

The features for preparing the predictors:

- `Patient ID`: Unique identifier for each patienttack risk (1: Yes, 0: No)ack risk (1: Yes, 0: No)

- `Age`: Age of the patient

- `Sex`: Gender of the patient (Male/Female)

- `Cholesterol`: Cholesterol levels of the patient

- `Blood Pressure`: Blood pressure of the patient (systolic/diastolic)  

- `Heart Rate`: Heart rate of the patient

- `Diabetes`: Whether the patient has diabetes (Yes/No)

- `Family History`: Family history of heart-related problems (1: Yes, 0: No)

- `Smoking`: Smoking status of the patient (1: Smoker, 0: Non-smoker)

- `Obesity`: Obesity status of the patient (1: Obese, 0: Not obese)

- `Alcohol Consumption`: Patient consumes alcohol (Yes/No)

- `Exercise Hours Per Week`: Number of exercise hours per week

- `Diet`: Dietary habits of the patient (Healthy/Average/Unhealthy)

- `Previous Heart Problems`: Previous heart problems of the patient (1: Yes, 0: No)

- `Medication Use`: Medication usage by the patient (1: Yes, 0: No)

- `Stress Level`: Stress level reported by the patient (1-10)

- `Sedentary Hours Per Day`: Hours of sedentary activity per day

- `Income`: Income level of the patient

- `BMI`: Body Mass Index (BMI) of the patient

- `Triglycerides`: Triglyceride levels of the patient

- `Physical Activity Days Per Week`: Days of physical activity per week

- `Sleep Hours Per Day`: Hours of sleep per day

- `Country`: Country of the patient

- `Continent`: Continent where the patient resides

- `Hemisphere`: Hemisphere where the patient resides

Target Feature:

`Heart Attack Risk`: Presence of heart attack risk (1: Yes, 0: No)

## Data Preparation

- Rename Columns to remove spaces and capital letters
- Lowercase string entries in rows and replace spaces with underscores
- Convert numeric categoric entries to stings

In [9]:
df.columns = df.columns.str.replace(" ", '_').str.lower()

In [12]:
categorical = [x for x in df.columns if df[x].dtype == 'O']

In [13]:
for c in categorical:
    df[c] = df[c].str.replace(" ", '_').str.lower()

In [14]:
df.head().T

Unnamed: 0,0,1,2,3,4
patient_id,rdg0550,nma3851,tui5807,yyt5016,zac5937
age,33,56,19,50,89
sex,male,female,female,female,female
cholesterol,200,262,140,163,144
blood_pressure,129/90,159/105,161/109,120/62,153/110
heart_rate,48,46,54,53,92
diabetes,0,1,0,0,1
family_history,1,0,1,1,0
smoking,1,1,0,1,1
obesity,1,0,0,1,0


In [15]:
df.diabetes.value_counts()

diabetes
1    4576
0    2434
Name: count, dtype: int64

In [16]:
# Diabetes: Whether the patient has diabetes (Yes/No)

diabetes_values = {
    0: 'yes',
    1: 'no'
}
df.diabetes = df.diabetes.map(diabetes_values)
df.diabetes.head()

0    yes
1     no
2    yes
3    yes
4     no
Name: diabetes, dtype: object

In [17]:
df.family_history.value_counts()

family_history
0    3562
1    3448
Name: count, dtype: int64

In [18]:
# Family History: Family history of heart-related problems (1: Yes, 0: No)

family_values = {
    1: 'yes',
    0: 'no'
}
df.family_history = df.family_history.map(family_values)
df.family_history.head()

0    yes
1     no
2    yes
3    yes
4     no
Name: family_history, dtype: object

In [19]:
df.smoking.value_counts()

smoking
1    6283
0     727
Name: count, dtype: int64

In [20]:
# Smoking: Smoking status of the patient (1: Smoker, 0: Non-smoker)

smoking_values = {
    1: 'yes',
    0: 'no'
}
df.smoking = df.smoking.map(smoking_values)
df.smoking.head()

0    yes
1    yes
2     no
3    yes
4    yes
Name: smoking, dtype: object

In [21]:
df.obesity.value_counts()

obesity
0    3506
1    3504
Name: count, dtype: int64

In [22]:
# Obesity: Obesity status of the patient (1: Obese, 0: Not obese)

obesity_values = {
    1: 'yes',
    0: 'no'
}
df.obesity = df.obesity.map(obesity_values)
df.obesity.head()

0    yes
1     no
2     no
3    yes
4     no
Name: obesity, dtype: object

In [23]:
df.alcohol_consumption.value_counts()

alcohol_consumption
1    4177
0    2833
Name: count, dtype: int64

In [24]:
# Alcohol Consumption: Patient consumes alcohol (Yes/No)

alcohol_values = {
    1: 'yes',
    0: 'no'
}

df.alcohol_consumption = df.alcohol_consumption.map(alcohol_values)
df.alcohol_consumption.head()

0    yes
1     no
2    yes
3    yes
4    yes
Name: alcohol_consumption, dtype: object

In [25]:
df.previous_heart_problems.value_counts()

previous_heart_problems
0    3518
1    3492
Name: count, dtype: int64

In [26]:
# Previous Heart Problems: Previous heart problems of the patient (1: Yes, 0: No)

values = {
    1: 'yes',
    0: 'no'
}
df.previous_heart_problems = df.previous_heart_problems.map(values)
df.previous_heart_problems.head()

0     no
1    yes
2    yes
3     no
4    yes
Name: previous_heart_problems, dtype: object

In [27]:
df.medication_use.value_counts()

medication_use
1    3506
0    3504
Name: count, dtype: int64

In [28]:
# Medication Use: Medication usage by the patient (1: Yes, 0: No)

values = {
    1: 'yes',
    0: 'no'
}
df.medication_use = df.medication_use.map(values)
df.medication_use.head()

0    yes
1     no
2    yes
3    yes
4     no
Name: medication_use, dtype: object

In [29]:
df.isnull().sum()

patient_id                         0
age                                0
sex                                0
cholesterol                        0
blood_pressure                     0
heart_rate                         0
diabetes                           0
family_history                     0
smoking                            0
obesity                            0
alcohol_consumption                0
exercise_hours_per_week            0
diet                               0
previous_heart_problems            0
medication_use                     0
stress_level                       0
sedentary_hours_per_day            0
income                             0
bmi                                0
triglycerides                      0
physical_activity_days_per_week    0
sleep_hours_per_day                0
country                            0
continent                          0
hemisphere                         0
heart_attack_risk                  0
dtype: int64

We do not have null values

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7010 entries, 0 to 7009
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   patient_id                       7010 non-null   object 
 1   age                              7010 non-null   int64  
 2   sex                              7010 non-null   object 
 3   cholesterol                      7010 non-null   int64  
 4   blood_pressure                   7010 non-null   object 
 5   heart_rate                       7010 non-null   int64  
 6   diabetes                         7010 non-null   object 
 7   family_history                   7010 non-null   object 
 8   smoking                          7010 non-null   object 
 9   obesity                          7010 non-null   object 
 10  alcohol_consumption              7010 non-null   object 
 11  exercise_hours_per_week          7010 non-null   float64
 12  diet                

We see that `blood_pressure` is of type `object` whereas it should be `numeric`. We can correct it by splitting the `systolic` and `diastolic` to different columns.

In [69]:
df[['systolic_bp', 'diastolic_bp']] = df['blood_pressure'].str.split('/', expand=True).astype(int)

In [70]:
df.head(2)

Unnamed: 0,patient_id,age,sex,cholesterol,blood_pressure,heart_rate,diabetes,family_history,smoking,obesity,...,bmi,triglycerides,physical_activity_days_per_week,sleep_hours_per_day,country,continent,hemisphere,heart_attack_risk,systolic_bp,diastolic_bp
0,rdg0550,33,male,200,129/90,48,yes,yes,yes,yes,...,30.449815,63,6,7,argentina,south_america,southern_hemisphere,1,129,90
1,nma3851,56,female,262,159/105,46,no,no,yes,no,...,34.973685,333,7,8,nigeria,africa,northern_hemisphere,1,159,105


In [71]:
df.drop(columns = 'blood_pressure', inplace=True)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7010 entries, 0 to 7009
Data columns (total 27 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   patient_id                       7010 non-null   object 
 1   age                              7010 non-null   int64  
 2   sex                              7010 non-null   object 
 3   cholesterol                      7010 non-null   int64  
 4   heart_rate                       7010 non-null   int64  
 5   diabetes                         7010 non-null   object 
 6   family_history                   7010 non-null   object 
 7   smoking                          7010 non-null   object 
 8   obesity                          7010 non-null   object 
 9   alcohol_consumption              7010 non-null   object 
 10  exercise_hours_per_week          7010 non-null   float64
 11  diet                             7010 non-null   object 
 12  previous_heart_probl

We can get a fairly good idea about `continent` and `hemisphere` from `country`. So we can drop the two features.

In [76]:
df.drop(columns = ['continent', 'hemisphere'], inplace=True)

We can also rename `sex` to `gender`

In [91]:
df.rename(columns = {'sex': 'gender'}, inplace=True)

In [92]:
categorical = [x for x in df.columns if df[x].dtype == 'O']
numerical = [x for x in df.columns if x not in categorical]

assert len(categorical) + len(numerical) == len(df.columns)

## Setting up Validation Framework

In [93]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=12)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=12)

In [94]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [95]:
y_train = df_train.heart_attack_risk.values
y_val = df_val.heart_attack_risk.values
y_test = df_test.heart_attack_risk.values

In [96]:
del df_train['heart_attack_risk']
del df_val['heart_attack_risk']
del df_test['heart_attack_risk']

In [97]:
len(df_train), len(df_val), len(df_test)

(4206, 1402, 1402)

In [98]:
len(y_train), len(y_val), len(y_test)

(4206, 1402, 1402)

## Exploratory Data Analysis

- Mising Values
- Distribution of Target Variable

In [99]:
df_full_train = df_full_train.reset_index(drop=True)

In [100]:
df_full_train.isnull().sum()

patient_id                         0
age                                0
gender                             0
cholesterol                        0
heart_rate                         0
diabetes                           0
family_history                     0
smoking                            0
obesity                            0
alcohol_consumption                0
exercise_hours_per_week            0
diet                               0
previous_heart_problems            0
medication_use                     0
stress_level                       0
sedentary_hours_per_day            0
income                             0
bmi                                0
triglycerides                      0
physical_activity_days_per_week    0
sleep_hours_per_day                0
country                            0
heart_attack_risk                  0
systolic_bp                        0
diastolic_bp                       0
dtype: int64

In [101]:
df_full_train.heart_attack_risk.value_counts(normalize=True)

heart_attack_risk
0    0.643723
1    0.356277
Name: proportion, dtype: float64

We can calculate the global heart attack risk rate. `Global` here refers to the entire dataset.

In [102]:
global_heart_attack_risk_rate = df_full_train.heart_attack_risk.mean()
round(global_heart_attack_risk_rate, 2)

0.36

In [103]:
categorical.remove('patient_id')
categorical

['gender',
 'diabetes',
 'family_history',
 'smoking',
 'obesity',
 'alcohol_consumption',
 'diet',
 'previous_heart_problems',
 'medication_use',
 'country']

In [104]:
numerical.remove('heart_attack_risk')
numerical

['age',
 'cholesterol',
 'heart_rate',
 'exercise_hours_per_week',
 'stress_level',
 'sedentary_hours_per_day',
 'income',
 'bmi',
 'triglycerides',
 'physical_activity_days_per_week',
 'sleep_hours_per_day',
 'systolic_bp',
 'diastolic_bp']