In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Modelling Helpers
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import Normalizer , scale
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
# Render matplotlib figures directly in the notebook output area.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet first, then seaborn's 'white' style.
# NOTE(review): seaborn's set_style is applied last, so it presumably overrides
# any settings the two styles share — confirm this ordering is intended.
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
# Default figure size as a (width, height) pair.
pylab.rcParams[ 'figure.figsize' ] = 8 , 6
# Quick confirmation that the whole setup cell ran without raising.
print("imports ok")

imports ok


In [None]:
"""
Python classification
There are 3 types of input features:
    Objective: factual information;
    Examination: results of medical examination;
    Subjective: information given by the patient.
Features:
    Age | Objective Feature | age | int (days) |
    
    Height | Objective Feature | height | int (cm) |
    
    Weight | Objective Feature | weight | float (kg) |
    
    Gender | Objective Feature | gender | categorical code |
    
    Systolic blood pressure | Examination Feature | ap_hi | int |
    
    Diastolic blood pressure | Examination Feature | ap_lo | int |
    
    Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal|
    
    Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
    
    Smoking | Subjective Feature | smoke | binary |
    
    Alcohol intake | Subjective Feature | alco | binary |
    
    Physical activity | Subjective Feature | active | binary |
    
    Presence or absence of cardiovascular disease | Target Variable | cardio | binary |
    """

In [6]:
# Read the cardiovascular dataset with pandas; the file is semicolon-separated.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0 (passing it there raises a TypeError).
# Malformed rows are still skipped rather than aborting the load.
# NOTE(review): the name `csv` shadows the standard-library `csv` module; it is
# kept unchanged because the following cells reference it.
csv = pd.read_csv(r"./data/cardio_train.csv", sep=";", on_bad_lines="skip")
csv.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [9]:
# The raw `age` values are hard to read as-is (the feature notes list them in
# days), so add a column expressed in years for visualisation.
# Vectorised division + Series.round replaces the per-element Python lambda:
# same values for integer day counts, without a Python call per row.
csv["age_years"] = (csv["age"] / 365.25).round(2)

In [19]:
# BMI (IMC in French) is a measure used to assess a person's physical condition.
# The helper is written as a named function for clarity; a lambda would have
# worked just as well for a quick one-off.
def calcimc(line):
    """Return the body-mass index for one record, rounded to one decimal.

    `line` is any object supporting `line["weight"]` and `line["height"]`
    lookups (e.g. a DataFrame row). The height is divided by 100 before being
    squared, and the weight is divided by that squared value.
    """
    weight = line["weight"]
    height_scaled = line["height"] / 100
    bmi = weight / (height_scaled ** 2)
    return round(bmi, 1)

In [20]:
# calcimc only performs element-wise arithmetic and round() on the "weight" and
# "height" lookups, so it can be given the whole frame and evaluated on the
# columns in a single vectorised pass, instead of being invoked once per row
# through DataFrame.apply(axis=1).
csv["BMI"] = calcimc(csv)
csv.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50.36,22.0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55.38,34.9
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51.63,23.5
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48.25,28.7
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47.84,23.0
