## TP Linear Model : Binary Classification

In [None]:
# builtin
import os, warnings

# data
import pandas as pd
import numpy as np

# preprocessing
from sklearn.preprocessing import LabelEncoder
# metrics
from sklearn.metrics import roc_curve, auc, confusion_matrix
# estimators
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# visualisation
import matplotlib.pyplot as plt

In [None]:
# warnings.filterwarnings('ignore')
warnings.filterwarnings(action="once")

In [None]:
# NOTE(review): placeholder for the train/test split. As written, unpacking an
# empty DataFrame into four names raises ValueError; replace the right-hand side
# with the real split (presumably train_test_split(X, y, ...) — TODO confirm).
X_train, X_test, y_train, y_test = pd.DataFrame({})

### 3. Modelisation

#### 3.1 Dummy classifier

In [None]:
estimator = DummyClassifier(strategy="most_frequent")
estimator.fit(X_train, y_train)

In [None]:
y_pred = estimator.predict(X_test)
y_pred

In [None]:
# Wrap the NumPy array in a pandas Series so that value_counts() is available.
pd.Series(y_pred).value_counts()

In [None]:
y_test.value_counts(normalize=True)

In [None]:
y_test.values

In [None]:
def score(estimator):
    """
    Print the estimator's score on the train set and on the test set.

    Both values are rounded to 4 decimals. The data is read from the
    notebook-level variables X_train, y_train, X_test and y_test.

    :param estimator: fitted object exposing ``score(X, y)``
    :return: None (the scores are printed, not returned)
    """
    # Built-in round() works whether score() hands back a NumPy scalar or a
    # plain Python float; the latter has no .round() method.
    tr_score = round(estimator.score(X_train, y_train), 4)
    te_score = round(estimator.score(X_test, y_test), 4)

    print(f"score train : {tr_score} score test : {te_score} ")

In [None]:
score(estimator)

In [None]:
def confusion(y_test, y_pred):
    """
    Build a labelled confusion matrix as a DataFrame.

    Rows are labelled ``test_<i>`` and columns ``pred_<i>``, where ``<i>`` is the
    position of the row/column in the array returned by ``confusion_matrix``.

    :param y_test: first argument forwarded to ``confusion_matrix`` (true labels)
    :param y_pred: second argument forwarded to ``confusion_matrix`` (predictions)
    :return: pandas DataFrame wrapping the confusion matrix
    """
    counts = confusion_matrix(y_test, y_pred)  # NumPy array
    n_rows, n_cols = counts.shape

    return pd.DataFrame(
        counts,
        index=[f"test_{row}" for row in range(n_rows)],
        columns=[f"pred_{col}" for col in range(n_cols)],
    )

In [None]:
# Ground truth first, predictions second, matching confusion(y_test, y_pred).
confusion(y_test, y_pred)

In [None]:
# ROC computed from y_pred, i.e. the hard labels returned by predict(), not from continuous scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
# Area under that curve; left as the last expression so the cell displays it.
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
# Draw the ROC curve on an explicit Axes object instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Receiver Operating Characteristic")
ax.plot(false_positive_rate, true_positive_rate, color="red", label="AUC = %0.2f" % roc_auc)
ax.legend(loc="lower right")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal from (0, 0) to (1, 1)
ax.axis("tight")
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
plt.show()

#### 3.2 Simple Logistic regression & Linear SVM

##### 3.2.1 Logistic Regression


In [None]:
estimator = LogisticRegression(solver="liblinear")
estimator.fit(X_train, y_train)

In [None]:
y_pred = estimator.predict(X_test)
y_pred

In [None]:
y_prob = estimator.predict_proba(X_test).round(2)
y_prob # confidence / probability of belonging to each class

In [None]:
score(estimator)
confusion(y_test, y_pred)

In [None]:
# ROC needs continuous scores: feed roc_curve the predicted probability of the
# second class (estimator.classes_[1]) instead of the hard labels in y_pred,
# which would reduce the curve to a single threshold.
y_score = estimator.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
# Built-in round() works for NumPy scalars and plain Python floats alike.
roc_auc = round(auc(false_positive_rate, true_positive_rate), 2)
print(roc_auc)

In [None]:
# Draw the ROC curve on an explicit Axes object instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Receiver Operating Characteristic")
ax.plot(false_positive_rate, true_positive_rate, color="red", label="AUC = %0.2f" % roc_auc)
ax.legend(loc="lower right")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal from (0, 0) to (1, 1)
ax.axis("tight")
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
plt.show()

##### 3.2.2 SVM

In [None]:
estimator = LinearSVC()
estimator.fit(X_train, y_train)

In [None]:
y_pred = estimator.predict(X_test)
y_pred

In [4]:
# LinearSVC has no predict_proba method (use decision_function to get continuous scores)

In [None]:
score(estimator)
confusion(y_test, y_pred)

In [None]:
# LinearSVC exposes no predict_proba; decision_function returns a continuous
# confidence score per sample, which roc_curve accepts in place of the hard
# labels in y_pred (those would reduce the curve to a single threshold).
y_score = estimator.decision_function(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
# Built-in round() works for NumPy scalars and plain Python floats alike.
roc_auc = round(auc(false_positive_rate, true_positive_rate), 2)
print(roc_auc)

In [None]:
# Draw the ROC curve on an explicit Axes object instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Receiver Operating Characteristic")
ax.plot(false_positive_rate, true_positive_rate, color="red", label="AUC = %0.2f" % roc_auc)
ax.legend(loc="lower right")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal from (0, 0) to (1, 1)
ax.axis("tight")
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
plt.show()

#### 3.3 Finding Hyper-parameters

##### 3.3.1 Logistic Regression

In [None]:
estimator = LogisticRegression()

# Among these solvers only 'liblinear' supports the 'l1' penalty, so the grid is
# split into two sub-grids: a single dict would make the search try (and fail
# on) the invalid 'l1' + 'newton-cg' / 'lbfgs' combinations.
params = [
    {
        'C': np.logspace(-3, 3, 7),
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    },
    {
        'C': np.logspace(-3, 3, 7),
        'penalty': ['l1'],
        'solver': ['liblinear'],
    },
]

In [None]:
# cv=10: 10 folds, so every candidate is trained on 10 different train splits;
#        the mean and standard deviation over the folds give an idea of the variance.
# verbose: amount of information reported by the search.
# params: string keys, each mapped to an iterable (e.g. a list) of values to try.
grid = GridSearchCV(
    estimator,
    param_grid=params,
    cv=10,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

grid.fit(X_train, y_train)

In [None]:
best_params = grid.best_params_
best_params

In [None]:
res = grid.cv_results_
res = pd.DataFrame(res)
res

In [None]:
cols = [i for i in res.columns if "split" not in i]
cols # so that we will have an average of the scores

In [None]:
res = res[cols]
res = res.sort_values("rank_test_score")
res.head(10)

In [None]:
def rezultize(grid):
    """
    Summarise ``grid.cv_results_`` as a DataFrame ordered by test rank.

    Per-fold columns (those whose name starts with ``split``, e.g.
    ``split0_test_score``) are dropped so that only the aggregated statistics
    and the parameter columns remain.

    :param grid: fitted search object exposing ``cv_results_``
    :return: DataFrame sorted by ascending ``rank_test_score``
    """
    res = pd.DataFrame(grid.cv_results_)
    # Only drop columns that *start* with "split": a substring test would also
    # discard parameter columns such as "param_min_samples_split".
    cols = [col for col in res.columns if not str(col).startswith("split")]

    return res[cols].sort_values("rank_test_score")

In [None]:
rezultize(grid).head(10)

Retrain the estimator with the best parameters found by the grid search

In [None]:
# OR automatic operation
estimator = LogisticRegression(**best_params)

In [None]:
# Build a fresh model from the selected hyper-parameters and fit it once on X_train.
estimator = LogisticRegression(**best_params)
estimator.fit(X_train, y_train) # no folds here
# Hard class predictions on the test set; displayed as the cell output.
y_pred = estimator.predict(X_test)
y_pred

In [None]:
score(estimator)
confusion(y_test, y_pred)

In [None]:
# ROC needs continuous scores: feed roc_curve the predicted probability of the
# second class (estimator.classes_[1]) instead of the hard labels in y_pred,
# which would reduce the curve to a single threshold.
y_score = estimator.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
# Built-in round() works for NumPy scalars and plain Python floats alike.
roc_auc = round(auc(false_positive_rate, true_positive_rate), 2)
print(roc_auc)

In [None]:
# Draw the ROC curve on an explicit Axes object instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Receiver Operating Characteristic")
ax.plot(false_positive_rate, true_positive_rate, color="red", label="AUC = %0.2f" % roc_auc)
ax.legend(loc="lower right")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal from (0, 0) to (1, 1)
ax.axis("tight")
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
plt.show()

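We can have more confidence in this model, since it was trained several times (cross-validation).

Is a gain of 1 point real progress? Before tuning, the error margin was 5%.
A gain of 1 point out of 5 is 1/5 = 20% of that error,
so the tuned model reduces the error by 20%.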

##### 3.3.2 SVM

In [None]:
estimator = LinearSVC()
# Seven values of C, log-spaced from 1e-3 to 1e3.
params = {'C': np.logspace(-3, 3, 7)}
# 7 values of C x 10 CV folds = 70 fits (assumes cv=10 for this search — TODO confirm)

Next step: try this project ;) — https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/6774321-entrainez-vous-a-classer-automatiquement-des-feuilles-d-arbres