In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

### importation de données

In [5]:
data = pd.read_csv("processed-cleaned-data.csv", index_col=0)
data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,1,1,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,0
1,52.659873,74.361618,1.02,4.0,0.0,1,1,0,0,148.036517,...,38.0,6000.0,4.707435,0,0,0,0,0,0,0
2,62.0,80.0,1.01,2.0,0.099594,1,1,0,0,124.336042,...,31.0,7500.0,4.707435,0,1,0,1,0,1,0
3,48.0,70.0,1.017629,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,0
4,51.0,80.0,1.01,2.0,0.0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,0


## Reduction de dimension

#### normalisation des données

In [42]:
sc = StandardScaler()
X = sc.fit_transform(X)

#### réduction de la dimensionnalité à l'aide de l'ACP

In [43]:
pca = PCA(svd_solver="full")
pc = pca.fit_transform(X)

#####  explained_variance_ratio_ représente le pourcentage de variance expliquée par chacune des composantes principales

In [44]:
pca.explained_variance_ratio_*100

array([28.58607888,  6.87985239,  5.27220675,  4.87850842,  4.79365143,
        4.42803579,  4.06737064,  3.98370969,  3.7794601 ,  3.68642013,
        3.35424964,  3.23342901,  2.84752856,  2.818274  ,  2.69794693,
        2.52144064,  2.19950572,  2.0550645 ,  1.69329577,  1.49188225,
        1.43865192,  1.35707758,  1.22742894,  0.70893032])

#### On voit clairement que :
#### la première composante principale (PC1) représente 28,58 % de la variance totale (information).
#### le reste des composantes principales représente 71,42 % de la variance totale.
#### la question est maintenant le nombre de composants principaux à retenir ?
#### Selon la règle de Kaiser, il est recommandé de conserver toutes les composantes dont les valeurs propres sont supérieures à 1.
#### on peut aussi utiliser le critère de coude
#### mais comme il s'agit d'un problème de classification alors le nombre de composantes principales à retenir doit être en fonction de la performances de la classification.

# Classification

## Logistic Regression

In [57]:
# Define a pipeline Gridsearch to search for the best combination of PCA n_components
# and the logistic regression classifier regularization C.

pca = PCA()
logistic_regression = LogisticRegression(max_iter=10000)
pipe = Pipeline(steps=[("pca", pca), ("logistic", logistic_regression)])

hyperparameters = {
    "pca__n_components": list(range(1,25)),
    "logistic__C": np.logspace(-3, 3, 7)
}
cv = GridSearchCV(pipe, hyperparameters, cv=10, n_jobs=2)
cv.fit(X, y)
print(cv.best_params_)

{'logistic__C': 10.0, 'pca__n_components': 18}


In [63]:
pca = PCA(n_components=18)
pc = pca.fit_transform(X)

92.0827332249762

In [66]:
X_train, X_test, y_train, y_test = train_test_split(pc, y, test_size=0.3, random_state=1024)

In [67]:
logistic_regression = LogisticRegression(C = 10.0)
logistic_regression.fit(X_train, y_train)

LogisticRegression(C=10.0)

In [69]:
y_pred_lr = logistic_regression.predict(X_test)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        74
           1       1.00      1.00      1.00        46

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120



## KNN

In [82]:
pca = PCA()
knn = KNeighborsClassifier()
pipe = Pipeline(steps=[("pca", pca), ("knn", knn)])

hyperparameters = {
    "pca__n_components": list(range(1,25)),
    'knn__leaf_size' : list(range(1,30)),
    'knn__n_neighbors' : list(range(1,25)),
    'knn__p' : [1,2]
}
cv = GridSearchCV(pipe, hyperparameters, cv=10, n_jobs=-1)
cv.fit(X, y)
print(cv.best_params_)

{'knn__leaf_size': 1, 'knn__n_neighbors': 6, 'knn__p': 1, 'pca__n_components': 5}


In [83]:
pca = PCA(n_components=5)
pc = pca.fit_transform(X)

In [84]:
X_train, X_test, y_train, y_test = train_test_split(pc, y, test_size=0.3, random_state=1024)

In [85]:
knn = KNeighborsClassifier(leaf_size=1, p=1, n_neighbors=6)
knn.fit(X_train, y_train)

KNeighborsClassifier(leaf_size=1, n_neighbors=6, p=1)

In [86]:
y_pred_knn = knn.predict(X_test)
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99        74
           1       0.98      1.00      0.99        46

    accuracy                           0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120



## SVM

In [78]:
pca = PCA()
svm = SVC()
pipe = Pipeline(steps=[("pca", pca), ("svm", svm)])

hyperparameters = {
    "pca__n_components": list(range(1,25)),
    'svm__C': [1, 10, 100, 1000],
    'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'svm__kernel': ['linear', 'poly', 'sigmoid', 'rbf']
}

cv = GridSearchCV(pipe, hyperparameters, cv=10, n_jobs=2)
cv.fit(X, y)
print(cv.best_params_)

{'pca__n_components': 19, 'svm__C': 10, 'svm__gamma': 0.01, 'svm__kernel': 'sigmoid'}


In [79]:
pca = PCA(n_components=19)
pc = pca.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(pc, y, test_size=0.3, random_state=1024)

In [80]:
svm = SVC(C=10, gamma=0.01, kernel='sigmoid')
svm.fit(X_train, y_train)

SVC(C=10, gamma=0.01, kernel='sigmoid')

In [81]:
y_pred_svm = svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99        74
           1       0.98      1.00      0.99        46

    accuracy                           0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120



## Decision Tree

In [91]:
pca = PCA()
decision_tree = DecisionTreeClassifier()
pipe = Pipeline(steps=[("pca", pca), ("dct", decision_tree)])

hyperparameters = {
    "pca__n_components": list(range(1,25)),
    'dct__max_features': ['sqrt', 'log2'],
    'dct__ccp_alpha': [0.1, 0.01, 0.001],
    'dct__max_depth' : [5, 6, 7, 8, 9],
    'dct__criterion' :['gini', 'entropy']
}

cv = GridSearchCV(pipe, hyperparameters, cv=10, n_jobs=-1)
cv.fit(X, y)
print(cv.best_params_)

{'dct__ccp_alpha': 0.001, 'dct__criterion': 'entropy', 'dct__max_depth': 5, 'dct__max_features': 'sqrt', 'pca__n_components': 2}


In [92]:
pca = PCA(n_components=2)
pc = pca.fit_transform(X)

In [93]:
X_train, X_test, y_train, y_test = train_test_split(pc, y, test_size=0.3, random_state=1024)

In [94]:
decision_tree = DecisionTreeClassifier(ccp_alpha=0.001, max_depth=5, max_features='sqrt', criterion="entropy")
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=5,
                       max_features='sqrt')

In [95]:
y_pred_dt = decision_tree.predict(X_test)
print(classification_report(y_pred_dt, y_test))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97        74
           1       0.96      0.96      0.96        46

    accuracy                           0.97       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.97      0.97      0.97       120



#### Nous voyons clairement que la réduction de la dimensionnalité à l'aide de l'ACP donne vraiment de très bons résultats pour différents algorithmes de classification atteignant 100 % de précision dans la régression logistique.