In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("heart_2020_cleaned.csv" , sep=",")
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [4]:
df.drop([ 'AlcoholDrinking',
        'MentalHealth', 'DiffWalking',
       'Race',  'PhysicalActivity',
       'Asthma', 'KidneyDisease', 'SkinCancer'],axis=1,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   HeartDisease    319795 non-null  object 
 1   BMI             319795 non-null  float64
 2   Smoking         319795 non-null  object 
 3   Stroke          319795 non-null  object 
 4   PhysicalHealth  319795 non-null  float64
 5   Sex             319795 non-null  object 
 6   AgeCategory     319795 non-null  object 
 7   Diabetic        319795 non-null  object 
 8   GenHealth       319795 non-null  object 
 9   SleepTime       319795 non-null  float64
dtypes: float64(3), object(7)
memory usage: 24.4+ MB


In [5]:
numerical=[]
for i in df.columns:
    if df[i].dtype == float:
        numerical.append(i)
print(numerical)

['BMI', 'PhysicalHealth', 'SleepTime']


In [6]:
categorical=[]
for i in df.columns:
    if df[i].dtype == object:
        categorical.append(i)
print(categorical)

['HeartDisease', 'Smoking', 'Stroke', 'Sex', 'AgeCategory', 'Diabetic', 'GenHealth']


In [7]:
df.duplicated().sum()

69204

In [8]:
df=df.drop_duplicates()

In [9]:
df_BMI = df[df['BMI']>=80]
indexBMI = list(df_BMI.index.values)
df['BMI'] = df['BMI'].drop(indexBMI)


In [10]:
bins=[0,18.5,25,30,35,np.inf]
names = ['Underweight', 'Normal weight', 'Overweight', 'Obese', 'Extremly Obese']
df['Bmi'] = pd.cut(df['BMI'] , bins , labels=names)
df=df.drop('BMI', axis=1)

In [11]:
le =LabelEncoder()

cat_list = df.select_dtypes('object').columns
cat_list2 = df.select_dtypes('category').columns
for i in cat_list:
    df[i]=le.fit_transform(df[i])
for a in cat_list2:
    df[a]=le.fit_transform(df[a])
df


Unnamed: 0,HeartDisease,Smoking,Stroke,PhysicalHealth,Sex,AgeCategory,Diabetic,GenHealth,SleepTime,Bmi
0,0,1,0,3.0,0,7,2,4,5.0,4
1,0,0,1,0.0,0,12,0,4,7.0,1
2,0,1,0,20.0,1,9,2,1,8.0,3
3,0,0,0,0.0,0,11,0,2,6.0,1
4,0,0,0,28.0,0,4,0,4,8.0,1
...,...,...,...,...,...,...,...,...,...,...
319789,0,0,0,0.0,0,0,0,0,8.0,1
319790,1,1,0,7.0,1,8,2,1,6.0,3
319792,0,0,0,0.0,0,5,0,2,6.0,1
319793,0,0,0,0.0,0,1,0,2,12.0,2


In [12]:
df.isnull().sum()
# df=df.dropna()
# df.isnull().sum()

HeartDisease      0
Smoking           0
Stroke            0
PhysicalHealth    0
Sex               0
AgeCategory       0
Diabetic          0
GenHealth         0
SleepTime         0
Bmi               0
dtype: int64

In [12]:
sc= StandardScaler()


x=df.drop(['HeartDisease'] , axis=1).values
y = df['HeartDisease'].values
x= sc.fit_transform(x)
x_train ,x_test , y_train, y_test = train_test_split(x,y ,test_size=0.3, random_state=10)



In [13]:
oversample=RandomOverSampler(sampling_strategy='minority')
x_resamled, y_resampled = oversample.fit_resample(x, y)

In [14]:
data =df.copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250591 entries, 0 to 319794
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   HeartDisease    250591 non-null  int32  
 1   Smoking         250591 non-null  int32  
 2   Stroke          250591 non-null  int32  
 3   PhysicalHealth  250591 non-null  float64
 4   Sex             250591 non-null  int32  
 5   AgeCategory     250591 non-null  int32  
 6   Diabetic        250591 non-null  int32  
 7   GenHealth       250591 non-null  int32  
 8   SleepTime       250591 non-null  float64
 9   Bmi             250591 non-null  int32  
dtypes: float64(2), int32(8)
memory usage: 21.4 MB


In [15]:
# Convert the arrays to a DataFrame
cols =[ 'HeartDisease']
data = pd.concat([pd.DataFrame(x_resamled), pd.Series(y_resampled, name='HeartDisease')], axis=1)

print(data.shape)

(447646, 10)


In [16]:
y=data['HeartDisease']
X=data.drop(['HeartDisease'], axis=1)



In [17]:
X_train, X_test, y_train, y_test = train_test_split(x_resamled,y_resampled,test_size=0.2,random_state=10)
X_train.shape, X_test.shape

((358116, 9), (89530, 9))

In [18]:
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the model: {:.2f}%".format(accuracy * 100))
joblib.dump(random_forest_model, 'model.pkl')

Accuracy of the model: 83.33%


['model.pkl']

In [19]:
classification_rep = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(classification_rep)

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.78      0.82     44813
           1       0.80      0.89      0.84     44717

    accuracy                           0.83     89530
   macro avg       0.84      0.83      0.83     89530
weighted avg       0.84      0.83      0.83     89530

