<a href="https://colab.research.google.com/github/mmassonn/heart_attack_prediction/blob/main/heart_attack_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Projet : Heart attack prediction


##I. Définir l'objectif

Objectif : Prédiction du risque d’infarctus à partir des données cliniques

Métrique : c-index

##II. Importer les bibliothèques/framework

In [None]:
#import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
#import model packages
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import  StandardScaler

In [None]:
#import evaluation packages
from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve

In [None]:
# Mount Google Drive at /content/drive.
# Requires a Google Colab runtime: the import below comes from the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##III. Load data

In [None]:
# Load the dataset CSV into a DataFrame.
# NOTE(review): the path is relative, so it only points into the mounted Drive
# when the working directory is /content (presumably the Colab default - confirm).
df = pd.read_csv('drive/MyDrive/Projet_2022/heart_attack_prediction/dataset.csv')

#IV. Pre-processing

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible from one run to the next.
trainset, testset = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
# Show the class balance of both splits. The counts are printed explicitly
# because a notebook cell only echoes the value of its last expression.
print(trainset['target'].value_counts())
print(testset['target'].value_counts())

In [None]:
#defined encodage, feature_engineering and imputation tools

def make_standard_normal(df):
    """Log-transform the input, then standardize it.

    The mean and standard deviation are computed from the log-transformed
    input itself (along axis 0), so every call is scaled with its own
    statistics.

    Args:
        df: pandas object exposing ``mean``/``std`` (in this file it is called
            once per column through ``DataFrame.apply``). Values are
            presumably strictly positive, since the log is taken first.

    Returns:
        The log-transformed values, centered on their mean and divided by
        their standard deviation.
    """
    # Taking the log first reduces the skew of the raw values.
    logged = np.log(df)

    center = logged.mean(axis=0)
    spread = logged.std(axis=0)

    return (logged - center) / spread


def imputation(df):
    """Return *df* without the rows that contain at least one missing value.

    Despite the name, nothing is imputed: incomplete rows are dropped.
    """
    return df.dropna(axis="index", how="any")

# Pre-processing entry point
def preprocessing(df):
    """Standardize the continuous columns, drop incomplete rows and split X / y.

    Args:
        df: DataFrame holding the columns 'age', 'trestbps', 'chol',
            'thalach' and 'target'. It is left unmodified.

    Returns:
        A tuple ``(X, y)``: ``X`` is every column except 'target' and ``y`` is
        the 'target' column, both restricted to rows without missing values.

    Side effects:
        Prints the class counts of ``y``.
    """
    col = ['age', 'trestbps', 'chol', 'thalach']

    # Work on a copy: assigning the transformed columns directly would modify
    # the caller's DataFrame in place (and triggers pandas chained-assignment
    # warnings when that frame is itself a subset of another one).
    df = df.copy()
    df[col] = df[col].apply(make_standard_normal)
    df = imputation(df)

    X = df.drop('target', axis=1)
    y = df['target']

    print(y.value_counts())

    return X, y

In [None]:
# Apply the pre-processing to each split separately.
# NOTE(review): because preprocessing() is called on the test set on its own,
# the test features are standardized with the mean/std of the test set rather
# than those of the training set - confirm this is intended.
X_train, y_train = preprocessing(trainset)
X_test, y_test = preprocessing(testset)

#V. Modelling

In [None]:
# Candidate classifiers. The SVM and KNN models are wrapped in a pipeline that
# standardizes the features first; SVC is built with probability=True so that
# it exposes predict_proba like the other models.
RandomForest = RandomForestClassifier(random_state=0)
AdaBoost = AdaBoostClassifier(random_state=0)
SVM = make_pipeline(StandardScaler(), SVC(probability=True))
KNN = make_pipeline(StandardScaler(), KNeighborsClassifier())

#VI. Evaluation process

##defined evaluation


In [None]:
# Evaluate the model using the C-index
def cindex(y_true, scores):
    """Compute the concordance index (C-index) of risk scores for binary outcomes.

    Every unordered pair of patients with different outcomes is "permissible".
    A permissible pair is concordant when the patient with outcome 1 has the
    strictly higher score, and a tie when both scores are equal.

        c_index = (concordant + 0.5 * ties) / permissible

    Args:
        y_true: sequence of outcomes, 0 (no disease) or 1 (disease).
        scores: sequence of risk scores, aligned with ``y_true``.

    Returns:
        The C-index as a float in [0, 1], or NaN when there is no permissible
        pair (empty input, a single patient, or a single outcome class), in
        which case the index is undefined.

    Raises:
        ValueError: if ``y_true`` and ``scores`` have different lengths.
    """
    from itertools import combinations

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(y_true) != len(scores):
        raise ValueError(
            "y_true and scores must have the same length "
            f"(got {len(y_true)} and {len(scores)})"
        )

    concordant = 0
    permissible = 0
    ties = 0

    # Iterating over zipped values (instead of indexing with [i]) keeps the
    # pairing positional even for inputs whose [] operator is label-based.
    for (y_i, s_i), (y_j, s_j) in combinations(zip(y_true, scores), 2):
        # Only pairs with different outcomes can be ranked.
        if y_i == y_j:
            continue
        permissible += 1

        if s_i == s_j:
            # Tied scores count for half a concordant pair.
            ties += 1
        elif y_i == 0 and y_j == 1:
            # The second patient gets the disease: it must score higher.
            if s_i < s_j:
                concordant += 1
        elif y_i == 1 and y_j == 0:
            # The first patient gets the disease: it must score higher.
            if s_i > s_j:
                concordant += 1

    # Without any permissible pair the ratio is 0 / 0: report it as undefined
    # instead of failing with a ZeroDivisionError.
    if permissible == 0:
        return float("nan")

    return (concordant + 0.5 * ties) / permissible

##defined evaluation model

In [None]:
# Fit a model, report its C-index and plot its learning curve
def evaluation(model):
    """Fit *model*, print its train/test C-index and plot a learning curve.

    Relies on the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` produced by the pre-processing step.

    Args:
        model: estimator exposing ``fit`` and ``predict_proba``.

    Side effects:
        Fits ``model`` in place, prints two scores and draws a new
        matplotlib figure.
    """
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the risk score
    # (presumably the probability of the positive class - confirm the label order).
    y_train_preds = model.predict_proba(X_train)[:, 1]
    print('train c-index:', cindex(y_train.values, y_train_preds))

    y_test_preds = model.predict_proba(X_test)[:, 1]
    print('test c-index:', cindex(y_test.values, y_test_preds))

    # The learning curve is scored with ROC AUC, which for binary labels is the
    # same quantity as the C-index printed above, so the plot and the printed
    # numbers follow one metric.
    N, train_score, val_score = learning_curve(
        model, X_train, y_train, cv=4, scoring='roc_auc'
    )

    plt.figure(figsize=(12, 8))
    plt.plot(N, train_score.mean(axis=1), label='train score')
    plt.plot(N, val_score.mean(axis=1), label='val score')
    plt.xlabel('training set size')
    plt.ylabel('ROC AUC (C-index)')
    plt.legend()

In [None]:
# Run the evaluation on every candidate model, printing its name first
dict_of_models = {
    'RandomForest': RandomForest,
    'AdaBoost': AdaBoost,
    'SVM': SVM,
    'KNN': KNN,
}

for name, model in dict_of_models.items():
    print(name)
    evaluation(model)