# Data Cleaning

In [26]:
import pandas as pd
import numpy as np

In [27]:
# Load the raw heart-attack dataset from the CSV file in the Data directory.
csv_path = "../Data/heart_attack_china.csv"
df = pd.read_csv(csv_path)

In [28]:
# Quick look at the first five rows (plain-text rendering).
first_rows = df.head(n=5)
print(first_rows)


   Patient_ID  Age  Gender Smoking_Status Hypertension Diabetes Obesity  \
0           1   55    Male     Non-Smoker           No       No     Yes   
1           2   66  Female         Smoker          Yes       No      No   
2           3   69  Female         Smoker           No       No      No   
3           4   45  Female         Smoker           No      Yes      No   
4           5   39  Female         Smoker           No       No      No   

  Cholesterol_Level Air_Pollution_Exposure Physical_Activity  ...  \
0            Normal                   High              High  ...   
1               Low                 Medium              High  ...   
2               Low                 Medium              High  ...   
3            Normal                 Medium               Low  ...   
4            Normal                 Medium            Medium  ...   

  Hospital_Availability TCM_Use Employment_Status Education_Level  \
0                   Low     Yes        Unemployed         Primary

In [29]:
# Summary statistics for the numeric columns.
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()
# Overall dimensions, column labels and per-column dtypes.
print(df.shape)
print(df.columns)
print(df.dtypes)

          Patient_ID            Age  Blood_Pressure  CVD_Risk_Score
count  239266.000000  239266.000000   239266.000000   239266.000000
mean   119633.500000      59.466284      134.513378       54.485581
std     69070.289091      17.348308       25.991827       25.987743
min         1.000000      30.000000       90.000000       10.000000
25%     59817.250000      44.000000      112.000000       32.000000
50%    119633.500000      59.000000      135.000000       54.000000
75%    179449.750000      75.000000      157.000000       77.000000
max    239266.000000      89.000000      179.000000       99.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239266 entries, 0 to 239265
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Patient_ID              239266 non-null  int64 
 1   Age                     239266 non-null  int64 
 2   Gender                  239266 non-null  object
 3   Smoking

In [30]:
# First, count the missing values in every column of the dataset.
missing_per_column = df.isnull().sum()
print(missing_per_column)

Patient_ID                    0
Age                           0
Gender                        0
Smoking_Status                0
Hypertension                  0
Diabetes                      0
Obesity                       0
Cholesterol_Level             0
Air_Pollution_Exposure        0
Physical_Activity             0
Diet_Score                    0
Stress_Level                  0
Alcohol_Consumption           0
Family_History_CVD            0
Healthcare_Access             0
Rural_or_Urban                0
Region                        0
Province                      0
Hospital_Availability         0
TCM_Use                       0
Employment_Status             0
Education_Level           59617
Income_Level                  0
Blood_Pressure                0
Chronic_Kidney_Disease        0
Previous_Heart_Attack         0
CVD_Risk_Score                0
Heart_Attack                  0
dtype: int64


In [31]:
# Education_Level is the only column with nulls (59617 of them).
# Show how the non-null values are distributed across its categories.
education_counts = df['Education_Level'].value_counts()
print(education_counts)
# Education_Level is categorical, so the nulls are mapped to an explicit
# category of their own instead of being dropped or imputed.
# NOTE(review): the column description lists "None" as a valid level; the CSV
# reader's default NA parsing may have turned that string into NaN - confirm.
education_filled = df['Education_Level'].fillna('Unknown or None')
df['Education_Level'] = education_filled

Education_Level
Higher       59942
Primary      59901
Secondary    59806
Name: count, dtype: int64


In [32]:
# Verify that the fill worked: every per-column null count should now be zero.
remaining_missing = df.isnull().sum()
remaining_missing

Patient_ID                0
Age                       0
Gender                    0
Smoking_Status            0
Hypertension              0
Diabetes                  0
Obesity                   0
Cholesterol_Level         0
Air_Pollution_Exposure    0
Physical_Activity         0
Diet_Score                0
Stress_Level              0
Alcohol_Consumption       0
Family_History_CVD        0
Healthcare_Access         0
Rural_or_Urban            0
Region                    0
Province                  0
Hospital_Availability     0
TCM_Use                   0
Employment_Status         0
Education_Level           0
Income_Level              0
Blood_Pressure            0
Chronic_Kidney_Disease    0
Previous_Heart_Attack     0
CVD_Risk_Score            0
Heart_Attack              0
dtype: int64

In [33]:
# Now we can continue the cleaning process by checking for outliers in the dataset
import seaborn as sns
import matplotlib.pyplot as plt


In [34]:
# Function to detect outliers using IQR
def detect_outliers(df, iqr_multiplier=1.5):
    """Return the rows of ``df`` that are IQR outliers in any numeric column.

    For every numeric column the first and third quartiles (Q1, Q3) are
    computed, and a value is flagged when it lies below
    ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``,
    where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Only columns selected by ``np.number`` are tested;
        every numeric column is included, identifier-like ones too.
    iqr_multiplier : float, default 1.5
        Width of the accepted band in units of the IQR.

    Returns
    -------
    pandas.DataFrame
        The flagged rows with all original columns, dtypes and index labels.
        Each row appears exactly once, in its original order, even when it is
        out of range in several columns. The result is empty when nothing is
        flagged or when ``df`` has no numeric columns.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        # Nothing to test: return an empty frame that keeps the column layout.
        return df.iloc[0:0]

    # Per-column quartiles, computed once for all numeric columns.
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr

    # Compare every column against its own bounds (bounds align on column
    # labels), then collapse to one flag per row so a row that is extreme in
    # several columns is not reported more than once.
    below = numeric.lt(lower_bound, axis="columns")
    above = numeric.gt(upper_bound, axis="columns")
    is_outlier_row = (below | above).any(axis=1)
    return df[is_outlier_row]

# Run the IQR check on the full frame and report what it flagged.
foundOutliers = detect_outliers(df)
outlier_row_count = foundOutliers.shape[0]
print("Number of outliers detected:", outlier_row_count)
print(foundOutliers)

Number of outliers detected: 0
Empty DataFrame
Columns: [Patient_ID, Age, Gender, Smoking_Status, Hypertension, Diabetes, Obesity, Cholesterol_Level, Air_Pollution_Exposure, Physical_Activity, Diet_Score, Stress_Level, Alcohol_Consumption, Family_History_CVD, Healthcare_Access, Rural_or_Urban, Region, Province, Hospital_Availability, TCM_Use, Employment_Status, Education_Level, Income_Level, Blood_Pressure, Chronic_Kidney_Disease, Previous_Heart_Attack, CVD_Risk_Score, Heart_Attack]
Index: []

[0 rows x 28 columns]


In [35]:
# No outliers were found, so the next check is for fully duplicated rows.
duplicate_row_count = df.duplicated().sum()
print("Number of duplicates in the dataset:", duplicate_row_count)

Number of duplicates in the dataset: 0


In [36]:
# One-hot encode the Gender column; drop_first=False keeps an indicator
# column for every gender value rather than dropping the first one.
columns_to_encode = ['Gender']
df = pd.get_dummies(df, columns=columns_to_encode, drop_first=False)



In [37]:
# Check the column dtypes after encoding, then display the first five rows.
column_dtypes = df.dtypes
print(column_dtypes)
df.head()

Patient_ID                 int64
Age                        int64
Smoking_Status            object
Hypertension              object
Diabetes                  object
Obesity                   object
Cholesterol_Level         object
Air_Pollution_Exposure    object
Physical_Activity         object
Diet_Score                object
Stress_Level              object
Alcohol_Consumption       object
Family_History_CVD        object
Healthcare_Access         object
Rural_or_Urban            object
Region                    object
Province                  object
Hospital_Availability     object
TCM_Use                   object
Employment_Status         object
Education_Level           object
Income_Level              object
Blood_Pressure             int64
Chronic_Kidney_Disease    object
Previous_Heart_Attack     object
CVD_Risk_Score             int64
Heart_Attack              object
Gender_Female               bool
Gender_Male                 bool
dtype: object


Unnamed: 0,Patient_ID,Age,Smoking_Status,Hypertension,Diabetes,Obesity,Cholesterol_Level,Air_Pollution_Exposure,Physical_Activity,Diet_Score,...,Employment_Status,Education_Level,Income_Level,Blood_Pressure,Chronic_Kidney_Disease,Previous_Heart_Attack,CVD_Risk_Score,Heart_Attack,Gender_Female,Gender_Male
0,1,55,Non-Smoker,No,No,Yes,Normal,High,High,Moderate,...,Unemployed,Primary,Low,104,Yes,No,78,No,False,True
1,2,66,Smoker,Yes,No,No,Low,Medium,High,Healthy,...,Unemployed,Secondary,Middle,142,No,No,49,No,True,False
2,3,69,Smoker,No,No,No,Low,Medium,High,Moderate,...,Unemployed,Primary,High,176,No,No,31,No,True,False
3,4,45,Smoker,No,Yes,No,Normal,Medium,Low,Healthy,...,Employed,Primary,Low,178,No,Yes,23,No,True,False
4,5,39,Smoker,No,No,No,Normal,Medium,Medium,Healthy,...,Retired,Higher,Middle,146,Yes,No,79,No,True,False


<!--- Hypertension (Yes/No)
Diabetes (Yes/No)
Obesity (Yes/No)
Cholesterol_Level (High/Normal/Low)
Air_Pollution_Exposure (Low/Medium/High)
Physical_Activity (Low/Medium/High)
Diet_Score (Healthy/Moderate/Poor)
Stress_Level (Low/Medium/High)
Alcohol_Consumption (Yes/No)
Family_History_CVD (Yes/No)
Healthcare_Access (Good/Moderate/Poor)
Rural_or_Urban (Rural/Urban)
Region (Eastern/Western/Northern/Southern/Central)
Province (e.g., Beijing, Shanghai, Gansu, etc.)
Hospital_Availability (High/Medium/Low)
TCM_Use (Yes/No)
Employment_Status (Employed/Unemployed/Retired)
Education_Level (None/Primary/Secondary/Higher)
Income_Level (Low/Middle/High)
Blood_Pressure (Numerical)
Chronic_Kidney_Disease (Yes/No)
Previous_Heart_Attack (Yes/No)
CVD_Risk_Score (0-100)
Heart_Attack (Yes/No - Target Variable)
-->

In [None]:
# To use the data for categorical prediction, the text (object) columns are
# converted to the pandas 'category' dtype.
# Columns converted in this step, starting with Smoking_Status (Smoker/Non-Smoker):
categorical_columns = [
    'Smoking_Status',
    'Hypertension',
    'Diabetes',
    'Obesity',
    'Cholesterol_Level',
    'Air_Pollution_Exposure',
    'Physical_Activity',
]

# Convert one column at a time, in the listed order.
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype('category')

