<a href="https://colab.research.google.com/github/mmassonn/heart_attack_prediction/blob/main/heart_attack_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Projet : Heart attack prediction


##I. Définir l'objectif

Objectif : Prédiction du risque de développer une cardiopathie à partir des données cliniques

Métrique : F1-score

##II. Importer les bibliothèques/framework

In [132]:
#import pre-processing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [185]:
#import evaluation packages
from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve

In [134]:
# Mount Google Drive at /content/drive. This import only exists on Google Colab.
# The dataset is later read from a path starting with 'drive/MyDrive/...'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##III. Load data

In [167]:
# Load the dataset from the mounted Drive.
# The path is relative, so it presumably relies on the working directory being
# /content (where Drive is mounted) — TODO confirm when running outside Colab.
# NOTE(review): df.head() shows an 'Unnamed: 0' column, which looks like a saved
# row index; the pre-processing functions below never select it as a feature.
df = pd.read_csv('drive/MyDrive/Projet_2022/heart_attack_prediction/dataset.csv')

In [136]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


#IV. Pre-processing

In [168]:
# Remove the Cholesterol column from the working frame.
# errors='ignore' keeps this cell idempotent: because it reassigns `df`,
# re-running it after the column is already gone would otherwise raise KeyError.
df = df.drop(columns='Cholesterol', errors='ignore')

##1.Diviser la base de données

In [169]:
# Split the data into a training set and a test set.
# test_size=0.2 holds out 20% of the rows; random_state=0 fixes the shuffle so
# the split is reproducible. No `stratify` argument is passed, so the class
# proportions of the two sets are not forced to match.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=0)

##2.Répartition des différentes variables

###HeartDisease

In [139]:
train_set['HeartDisease'].value_counts(normalize = True)

1    0.546322
0    0.453678
Name: HeartDisease, dtype: float64

In [140]:
test_set['HeartDisease'].value_counts(normalize = True)

1    0.581522
0    0.418478
Name: HeartDisease, dtype: float64

###Sex

In [141]:
train_set['Sex'].value_counts(normalize = True)

M    0.788828
F    0.211172
Name: Sex, dtype: float64

In [142]:
test_set['Sex'].value_counts(normalize = True)

M    0.793478
F    0.206522
Name: Sex, dtype: float64

###ChestPainType

In [143]:
train_set['ChestPainType'].value_counts(normalize = True)

ASY    0.532698
NAP    0.220708
ATA    0.197548
TA     0.049046
Name: ChestPainType, dtype: float64

In [144]:
test_set['ChestPainType'].value_counts(normalize = True)

ASY    0.570652
NAP    0.222826
ATA    0.152174
TA     0.054348
Name: ChestPainType, dtype: float64

###FastingBS

In [145]:
train_set['FastingBS'].value_counts(normalize = True)

0    0.782016
1    0.217984
Name: FastingBS, dtype: float64

In [146]:
test_set['FastingBS'].value_counts(normalize = True)

0    0.706522
1    0.293478
Name: FastingBS, dtype: float64

###RestingECG

In [147]:
train_set['RestingECG'].value_counts(normalize = True)

Normal    0.606267
ST        0.197548
LVH       0.196185
Name: RestingECG, dtype: float64

In [148]:
test_set['RestingECG'].value_counts(normalize = True)

Normal    0.581522
LVH       0.239130
ST        0.179348
Name: RestingECG, dtype: float64

###ExerciseAngina

In [149]:
train_set['ExerciseAngina'].value_counts(normalize = True)

N    0.589918
Y    0.410082
Name: ExerciseAngina, dtype: float64

In [150]:
test_set['ExerciseAngina'].value_counts(normalize = True)

N    0.619565
Y    0.380435
Name: ExerciseAngina, dtype: float64

###ST_Slope

In [151]:
train_set['ST_Slope'].value_counts(normalize = True)

Flat    0.495913
Up      0.433243
Down    0.070845
Name: ST_Slope, dtype: float64

In [152]:
test_set['ST_Slope'].value_counts(normalize = True)

Flat    0.521739
Up      0.418478
Down    0.059783
Name: ST_Slope, dtype: float64

**Conclusion :** La répartition des différentes variables est similaire entre le jeu d'entraînement et le jeu de test.

##3.Normalisation des variables

###a.Normalisation standard des variables quantitatives

In [170]:
# standardization (z-score) of the quantitative variables
def make_standard_normal(df, mean=None, stdev=None):
    """Standardize the quantitative columns of ``df`` (subtract mean, divide by std).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Age', 'RestingBP', 'MaxHR' and 'Oldpeak'.
        Any other column is ignored.
    mean, stdev : pandas.Series, optional
        Per-column statistics (indexed by column name) to standardize with.
        When omitted, they are computed from ``df`` itself, which is the
        historical behaviour. Pass the statistics of the training set when
        transforming a test set so that both sets share the same scale.

    Returns
    -------
    pandas.DataFrame
        The four quantitative columns, standardized, with the index of ``df``.
    """
    quantitative = df[['Age', 'RestingBP', 'MaxHR', 'Oldpeak']]
    if mean is None:
        mean = quantitative.mean(axis=0)
    if stdev is None:
        stdev = quantitative.std(axis=0)
    # A constant column has a standard deviation of 0; dividing by it would
    # produce NaN/inf values, so such columns are scaled by 1 instead
    # (they end up as all zeros).
    stdev = stdev.replace(0, 1)
    return (quantitative - mean) / stdev

###b.Encodage des variables qualitatives

In [171]:
# one-hot encoding of the qualitative variables
def encodage_function(df):
    """Return the one-hot encoding of the qualitative columns of ``df``.

    Only 'Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina'
    and 'ST_Slope' are kept; each one is expanded by ``pd.get_dummies`` into
    one indicator column per value present in ``df``.
    """
    qualitative_cols = ['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
    return pd.get_dummies(df[qualitative_cols], columns=qualitative_cols)

###c.Fonction pré-processing

In [172]:
# pre-processing entry point
def preprocessing(df):
    """Build the feature matrix and the target from a raw data frame.

    Returns a tuple ``(X, Y)``: ``X`` is the column-wise concatenation of the
    standardized quantitative columns and the one-hot encoded qualitative
    columns; ``Y`` is a one-column DataFrame holding 'HeartDisease'.
    """
    target = df[['HeartDisease']]
    features = pd.concat(
        [make_standard_normal(df), encodage_function(df)],
        axis=1,
    )
    return features, target

In [173]:
# Apply the pre-processing to both splits.
# NOTE(review): preprocessing() is run independently on each split, so the test
# set is standardized with its own mean/std rather than the training ones.
X_train, y_train = preprocessing(train_set)
X_test, y_test = preprocessing(test_set)

# Each split is one-hot encoded on its own, so a category absent from one of
# them would yield a different set of columns. Align the test features on the
# training columns (same names, same order); a dummy column missing from the
# test split is filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

#V. Modelling

###1.Voting classifier

In [174]:
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [181]:
# Base learners (model_1..model_3 are reused by the stacking cell further down).
model_1 = SGDClassifier(random_state=0)
model_2 = DecisionTreeClassifier(random_state=0)
model_3 = KNeighborsClassifier(n_neighbors=2)

# Hard-voting ensemble built from the three base learners.
base_learners = [('SGD', model_1), ('Tree', model_2), ('KNN', model_3)]
model_4 = VotingClassifier(base_learners, voting='hard')

# Fit every model on the training set and report its F1 score on the test set.
y_train_flat = y_train.values.ravel()
for model in (model_1, model_2, model_3, model_4):
    model.fit(X_train, y_train_flat)
    y_preds = model.predict(X_test)

    print(type(model).__name__, f1_score(y_test, y_preds))

SGDClassifier 0.8061224489795918
DecisionTreeClassifier 0.7653061224489796
KNeighborsClassifier 0.8121827411167512
VotingClassifier 0.8316831683168316


###2.Bagging

In [159]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [180]:
# Bagging ensemble of 100 k-nearest-neighbours classifiers.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form works with both. random_state=0 makes the bootstrap
# sampling, and therefore the reported score, reproducible.
model = BaggingClassifier(KNeighborsClassifier(), n_estimators=100, random_state=0)

model.fit(X_train, y_train.values.ravel())
y_preds = model.predict(X_test)

print('F1 Score - BaggingClassifier:', f1_score(y_test, y_preds))

F1 Score - BaggingClassifier: 0.8744186046511628


In [182]:
# Random forest of 100 trees. random_state=0 (the seed used by the other cells
# of this notebook) makes the reported score reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=0)

model.fit(X_train, y_train.values.ravel())
y_preds = model.predict(X_test)

print('F1 Score - RandomForestClassifier:', f1_score(y_test, y_preds))

F1 Score - RandomForestClassifier: 0.8744186046511628


###3.Boosting

In [163]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [183]:
# AdaBoost with 100 boosting rounds. random_state=0 (the seed used by the other
# cells of this notebook) makes the reported score reproducible across runs.
model = AdaBoostClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train.values.ravel())
y_preds = model.predict(X_test)

print('F1 Score - AdaBoostClassifier:', f1_score(y_test, y_preds))

F1 Score - AdaBoostClassifier: 0.8380952380952381


###4.Stacking

In [164]:
from sklearn.ensemble import StackingClassifier

In [184]:
# Stacking ensemble: the three base learners defined in the voting-classifier
# cell feed a k-nearest-neighbours final estimator.
stacked_learners = [('SGD', model_1), ('Tree', model_2), ('KNN', model_3)]
model = StackingClassifier(stacked_learners, final_estimator=KNeighborsClassifier())

# Fit on the training set and report the F1 score on the test set.
model.fit(X_train, y_train.values.ravel())
y_preds = model.predict(X_test)

print('F1 Score - StackingClassifier:', f1_score(y_test, y_preds))

F1 Score - StackingClassifier: 0.863849765258216
