# **Part 1: Exploratory Data Analysis (EDA)**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer
from scipy import stats
from scipy.stats import chi2_contingency

## **Load and inspect dataset structure**

In [2]:
# Load the raw CSV into a DataFrame; the path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("../data/raw/coronary_disease.csv")

In [3]:
# Preview the first 10 rows to sanity-check the parsed columns and values
df.head(10)

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,M,39,4.0,No,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,F,46,2.0,No,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,M,48,1.0,Yes,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,F,61,3.0,Yes,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,F,46,3.0,Yes,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0
5,F,43,2.0,No,0.0,0.0,0,1,0,228.0,180.0,110.0,30.3,77.0,99.0,0
6,F,63,1.0,No,0.0,0.0,0,0,0,205.0,138.0,71.0,33.11,60.0,85.0,1
7,F,45,2.0,Yes,20.0,0.0,0,0,0,313.0,100.0,71.0,21.68,79.0,78.0,0
8,M,52,1.0,No,0.0,0.0,0,1,0,260.0,141.5,89.0,26.36,76.0,79.0,0
9,M,43,1.0,Yes,30.0,0.0,0,1,0,225.0,162.0,107.0,23.61,93.0,88.0,0


In [4]:
# Report the frame's (rows, columns) shape, then the row count on its own:
# each row of the frame is one observation.
print("Dataset shape:", df.shape)
print("Number of observations:", len(df.index))

Dataset shape: (4238, 16)
Number of observations: 4238


In [5]:
# Show the column labels as a plain Python list
list(df.columns)

['sex',
 'age',
 'education',
 'currentSmoker',
 'cigsPerDay',
 'BPMeds',
 'prevalentStroke',
 'prevalentHyp',
 'diabetes',
 'totChol',
 'sysBP',
 'diaBP',
 'BMI',
 'heartRate',
 'glucose',
 'TenYearCHD']

In [6]:
# Give the columns uniform snake_case names.
# Columns that are not listed below (sex, age, diabetes, glucose) keep their
# original labels.
df = df.rename(
    {
        # demographics / lifestyle
        "education": "education_level",
        "currentSmoker": "current_smoker",
        "cigsPerDay": "cigarettes_per_day",
        # medical history
        "BPMeds": "bp_medication",
        "prevalentStroke": "previous_stroke",
        "prevalentHyp": "hypertension",
        # clinical measurements
        "totChol": "total_cholesterol",
        "sysBP": "systolic_bp",
        "diaBP": "diastolic_bp",
        "BMI": "bmi",
        "heartRate": "heart_rate",
        # target variable
        "TenYearCHD": "ten_year_chd",
    },
    axis="columns",
)

In [7]:
# Check variable types and per-column non-null counts
# (df.info() prints its summary directly rather than returning a value to display)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   sex                 4238 non-null   object 
 1   age                 4238 non-null   int64  
 2   education_level     4133 non-null   float64
 3   current_smoker      4238 non-null   object 
 4   cigarettes_per_day  4209 non-null   float64
 5   bp_medication       4185 non-null   float64
 6   previous_stroke     4238 non-null   int64  
 7   hypertension        4238 non-null   int64  
 8   diabetes            4238 non-null   int64  
 9   total_cholesterol   4188 non-null   float64
 10  systolic_bp         4238 non-null   float64
 11  diastolic_bp        4238 non-null   float64
 12  bmi                 4219 non-null   float64
 13  heart_rate          4237 non-null   float64
 14  glucose             3850 non-null   float64
 15  ten_year_chd        4238 non-null   int64  
dtypes: float64(9), int64(5), object(2)

In [8]:
# Change variable types

# education_level has missing values, so it is cast to pandas' nullable Int64
# dtype (a plain NumPy integer column cannot hold missing entries).
df["education_level"] = df["education_level"].astype("Int64")

# Binary categorical variables
# Text labels are encoded as 0/1. The encoding is skipped when a column is
# already integer-typed so that re-running this cell is harmless: mapping the
# already-encoded 0/1 values through the label dict would produce NaN for every
# row and make the integer cast raise.
binary_label_maps = {
    "sex": {"M": 1, "F": 0},
    "current_smoker": {"Yes": 1, "No": 0},
}
for column, label_map in binary_label_maps.items():
    if not pd.api.types.is_integer_dtype(df[column]):
        # Labels absent from the dict become NaN, so the cast fails loudly
        # instead of silently mis-encoding an unexpected value.
        df[column] = df[column].map(label_map).astype("int")

# bp_medication also contains missing values -> nullable Int64
df["bp_medication"] = df["bp_medication"].astype("Int64")

# Remaining 0/1 flags: cast to plain integers to make their type explicit
df["previous_stroke"] = df["previous_stroke"].astype("int")
df["hypertension"] = df["hypertension"].astype("int")
df["diabetes"] = df["diabetes"].astype("int")