In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch import Generator
from torch.utils.data.dataset import random_split
import transformers as ppb
import joblib
from time import time
from datetime import datetime
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import train_test_split, KFold
## evaluation des classifications 
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
## validation croisée
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
## differents modèles à tester 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
import lightgbm as lgb
## keras
#from sklearn.model_selection import RepeatedKFold
#from keras.models import Sequential
#from keras.layers import Dense
import warnings
warnings.filterwarnings('ignore')

### Chargement des données vectorisées via BERT 

In [2]:
## read vectorized data 
open_file = open("data_text_vectorized.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
## read all vectorized data 
#open_file = open("data_text_vectorized_bert_all.pkl", "rb")
#textes_td = pickle.load(open_file)
#open_file.close()
## train dataset
train_features = loaded_list[0]
train_labels = loaded_list[1]
## val dataset
val_features = loaded_list[2]
val_labels = loaded_list[3]
label_names =  ['+', '-', '0','i', 'j', 'f', 's', 'p', 'm', 'a', 't']

print("train_features.shape", train_features.shape)
print("train_labels.shape", train_labels.shape)
print("val_features.shape", val_features.shape)
print("val_labels.shape", val_labels.shape)

train_features.shape (1318, 768)
train_labels.shape (1318, 11)
val_features.shape (330, 768)
val_labels.shape (330, 11)


### Le jeu de données entier pour le modèle final 

In [3]:
all_features = np.vstack((train_features, val_features))
all_labels = np.vstack((train_labels, val_labels))
print("all_features.shape", all_features.shape)
print("all_labels.shape", all_labels.shape)

all_features.shape (1648, 768)
all_labels.shape (1648, 11)


###  Test de deux règles de classification au hasard et Bayes naïf

In [4]:
dummy_model = DummyClassifier(strategy="uniform")
dummy_model.fit(train_features, train_labels)
val_pred_dummy = dummy_model.predict(val_features)
#print('classification au hasard \n',classification_report(val_labels, val_pred_dummy,target_names=label_names))

nb_model = OneVsRestClassifier(GaussianNB())
nb_model.fit(train_features, train_labels)
val_pred = nb_model.predict(val_features)
print('classification par NB \n',classification_report(val_labels, val_pred,target_names=label_names))

classification par NB 
               precision    recall  f1-score   support

           +       0.49      0.73      0.59        82
           -       0.33      0.65      0.44        63
           0       0.23      0.62      0.34        47
           i       0.66      0.75      0.70       146
           j       0.89      0.69      0.78       272
           f       0.38      0.80      0.51        45
           s       0.47      0.77      0.59        57
           p       0.64      0.84      0.73       124
           m       0.60      0.78      0.68       123
           a       0.39      0.61      0.48        72
           t       0.09      0.43      0.14        23

   micro avg       0.51      0.72      0.60      1054
   macro avg       0.47      0.70      0.54      1054
weighted avg       0.61      0.72      0.64      1054
 samples avg       0.58      0.70      0.60      1054



### Classification par régression logistique pénalisée par elasticnet

In [5]:
# grille de recherche : !!! issue de plusieurs essais
nb_folds = 5
cv = KFold(n_splits=nb_folds, random_state=109, shuffle=True)
model_to_set = OneVsRestClassifier(LogisticRegression(solver='saga', penalty='elasticnet'))
parameters = [{'estimator__C': [8, 7.5, 7, 6.5, 6, 5.5, 5],
               'estimator__l1_ratio' : [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}] # The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1. For l1_ratio = 0 the penalty is an L2 penalty. For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
enet_model = GridSearchCV(model_to_set, 
                          param_grid=parameters,
                          cv = cv,
                          verbose=3, 
                          n_jobs=100)
# record current time
start = time()
enet_model.fit(train_features, train_labels)
# record current time
end = time()
# report execution time
result = end - start
print('%.3f seconds' % result)
print('Evaluation of OneVsRestClassifier/LogisticRegression :')
print('best parameters: ', enet_model.best_params_)
print('best score: ', enet_model.best_score_)

Fitting 5 folds for each of 77 candidates, totalling 385 fits




[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.266 total time=   8.9s




[CV 1/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.250 total time=   9.5s
[CV 3/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.265 total time=   9.5s




[CV 5/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.285 total time=   9.6s
[CV 2/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.254 total time=   9.7s
[CV 3/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.261 total time=   9.5s




[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.246 total time=   9.8s




[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.250 total time=  10.0s




[CV 5/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.285 total time=  10.3s




[CV 4/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.266 total time=  11.0s




[CV 4/5] END estimator__C=8, estimator__l1_ratio=0.6;, score=0.262 total time=  12.2s




[CV 5/5] END estimator__C=8, estimator__l1_ratio=0.3;, score=0.289 total time=  12.6s
[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=0.5;, score=0.246 total time=  12.6s




[CV 2/5] END estimator__C=8, estimator__l1_ratio=0.9;, score=0.242 total time=  12.7s
[CV 3/5] END estimator__C=7.5, estimator__l1_ratio=0.7;, score=0.254 total time=  12.7s
[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=0.2;, score=0.266 total time=  12.8s
[CV 4/5] END estimator__C=8, estimator__l1_ratio=0.8;, score=0.262 total time=  12.9s




[CV 3/5] END estimator__C=7.5, estimator__l1_ratio=0.8;, score=0.258 total time=  12.7s
[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=0.8;, score=0.242 total time=  12.7s
[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0.3;, score=0.239 total time=  12.9s
[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=0.3;, score=0.262 total time=  12.9s
[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=0.4;, score=0.262 total time=  12.9s
[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0.7;, score=0.250 total time=  12.8s
[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=0.3;, score=0.250 total time=  13.1s
[CV 3/5] END estimator__C=8, estimator__l1_ratio=0.8;, score=0.261 total time=  13.0s
[CV 1/5] END estimator__C=8, estimator__l1_ratio=0.5;, score=0.239 total time=  13.1s
[CV 4/5] END estimator__C=8, estimator__l1_ratio=0.2;, score=0.266 total time=  13.1s
[CV 2/5] END estimator__C=8, estimator__l1_ratio=1;, score=0.250 total time=  13.0s
[CV 1/5] END estimator__C=8, estimator__l1



[CV 2/5] END estimator__C=8, estimator__l1_ratio=0.6;, score=0.246 total time=  13.2s
[CV 2/5] END estimator__C=8, estimator__l1_ratio=0.7;, score=0.246 total time=  13.2s
[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0.5;, score=0.239 total time=  13.1s
[CV 1/5] END estimator__C=8, estimator__l1_ratio=0.2;, score=0.239 total time=  13.2s
[CV 2/5] END estimator__C=8, estimator__l1_ratio=0.4;, score=0.250 total time=  13.2s
[CV 3/5] END estimator__C=8, estimator__l1_ratio=0.5;, score=0.258 total time=  13.3s
[CV 1/5] END estimator__C=8, estimator__l1_ratio=1;, score=0.246 total time=  13.2s
[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=0.6;, score=0.262 total time=  13.1s
[CV 5/5] END estimator__C=7.5, estimator__l1_ratio=0.4;, score=0.289 total time=  13.2s
[CV 2/5] END estimator__C=8, estimator__l1_ratio=0.8;, score=0.246 total time=  13.2s
[CV 5/5] END estimator__C=7.5, estimator__l1_ratio=0.8;, score=0.278 total time=  12.7s
[CV 5/5] END estimator__C=8, estimator__l1_ratio



[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0.8;, score=0.250 total time=  13.1s
[CV 3/5] END estimator__C=8, estimator__l1_ratio=0.7;, score=0.261 total time=  13.4s
[CV 5/5] END estimator__C=8, estimator__l1_ratio=0.9;, score=0.281 total time=  13.4s
[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=0.6;, score=0.246 total time=  13.3s
[CV 4/5] END estimator__C=8, estimator__l1_ratio=0.7;, score=0.270 total time=  13.4s
[CV 4/5] END estimator__C=8, estimator__l1_ratio=0.9;, score=0.259 total time=  13.4s
[CV 3/5] END estimator__C=7.5, estimator__l1_ratio=0.4;, score=0.254 total time=  13.3s
[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0.6;, score=0.239 total time=  13.4s
[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=0.2;, score=0.250 total time=  13.4s
[CV 3/5] END estimator__C=7.5, estimator__l1_ratio=0.6;, score=0.254 total time=  13.2s
[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0.1;, score=0.246 total time=  13.6s
[CV 5/5] END estimator__C=7.5, estimator



[CV 5/5] END estimator__C=7.5, estimator__l1_ratio=0.1;, score=0.285 total time=  13.5s
[CV 3/5] END estimator__C=8, estimator__l1_ratio=1;, score=0.258 total time=  13.6s
[CV 5/5] END estimator__C=8, estimator__l1_ratio=0.2;, score=0.289 total time=  13.8s
[CV 5/5] END estimator__C=7.5, estimator__l1_ratio=0.7;, score=0.281 total time=  13.4s
[CV 4/5] END estimator__C=8, estimator__l1_ratio=1;, score=0.262 total time=  13.7s
[CV 5/5] END estimator__C=8, estimator__l1_ratio=0.5;, score=0.285 total time=  13.7s
[CV 4/5] END estimator__C=8, estimator__l1_ratio=0.4;, score=0.266 total time=  13.7s
[CV 3/5] END estimator__C=8, estimator__l1_ratio=0.6;, score=0.254 total time=  13.8s
[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0.2;, score=0.242 total time=  13.6s




[CV 5/5] END estimator__C=8, estimator__l1_ratio=0.8;, score=0.281 total time=  13.7s
[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=0.4;, score=0.250 total time=  13.7s
[CV 3/5] END estimator__C=8, estimator__l1_ratio=0.1;, score=0.261 total time=  13.9s
[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=0.8;, score=0.255 total time=  13.4s
[CV 2/5] END estimator__C=8, estimator__l1_ratio=0.1;, score=0.254 total time=  14.0s
[CV 5/5] END estimator__C=8, estimator__l1_ratio=1;, score=0.278 total time=  14.0s
[CV 1/5] END estimator__C=8, estimator__l1_ratio=0.8;, score=0.250 total time=  14.0s




[CV 4/5] END estimator__C=8, estimator__l1_ratio=0.1;, score=0.266 total time=  14.2s
[CV 1/5] END estimator__C=8, estimator__l1_ratio=0.1;, score=0.250 total time=  14.2s
[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=0.1;, score=0.266 total time=  14.2s




[CV 1/5] END estimator__C=8, estimator__l1_ratio=0.7;, score=0.250 total time=  14.4s
[CV 2/5] END estimator__C=8, estimator__l1_ratio=0.2;, score=0.250 total time=  14.5s




[CV 3/5] END estimator__C=8, estimator__l1_ratio=0.4;, score=0.258 total time=  15.5s




[CV 1/5] END estimator__C=7, estimator__l1_ratio=0;, score=0.250 total time=   9.2s
[CV 3/5] END estimator__C=7, estimator__l1_ratio=0;, score=0.265 total time=   8.7s




[CV 4/5] END estimator__C=7, estimator__l1_ratio=0;, score=0.266 total time=   8.7s
[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0.9;, score=0.246 total time=  12.6s
[CV 2/5] END estimator__C=7, estimator__l1_ratio=0;, score=0.246 total time=   9.1s




[CV 5/5] END estimator__C=7, estimator__l1_ratio=0;, score=0.285 total time=   8.9s
[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=0.9;, score=0.239 total time=  12.7s




[CV 3/5] END estimator__C=7.5, estimator__l1_ratio=0.9;, score=0.254 total time=  12.7s
[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0;, score=0.246 total time=   8.7s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0;, score=0.261 total time=   8.7s
[CV 5/5] END estimator__C=7.5, estimator__l1_ratio=0.9;, score=0.278 total time=  12.7s
[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0;, score=0.246 total time=   8.8s




[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0;, score=0.262 total time=   8.8s
[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=1;, score=0.246 total time=  12.8s
[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=0.9;, score=0.259 total time=  12.9s




[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=1;, score=0.239 total time=  12.7s
[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0;, score=0.289 total time=   9.1s




[CV 3/5] END estimator__C=7.5, estimator__l1_ratio=1;, score=0.254 total time=  12.7s




[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=1;, score=0.266 total time=  12.7s




[CV 5/5] END estimator__C=7.5, estimator__l1_ratio=1;, score=0.278 total time=  12.8s




[CV 1/5] END estimator__C=7, estimator__l1_ratio=0.5;, score=0.235 total time=  12.4s
[CV 3/5] END estimator__C=7, estimator__l1_ratio=0.4;, score=0.254 total time=  12.5s
[CV 1/5] END estimator__C=7, estimator__l1_ratio=0.4;, score=0.239 total time=  12.5s
[CV 5/5] END estimator__C=7, estimator__l1_ratio=0.2;, score=0.285 total time=  12.6s
[CV 5/5] END estimator__C=7, estimator__l1_ratio=0.4;, score=0.289 total time=  12.5s
[CV 5/5] END estimator__C=7, estimator__l1_ratio=0.3;, score=0.289 total time=  12.6s
[CV 5/5] END estimator__C=7, estimator__l1_ratio=0.5;, score=0.285 total time=  12.5s
[CV 1/5] END estimator__C=7, estimator__l1_ratio=0.2;, score=0.239 total time=  12.7s
[CV 1/5] END estimator__C=7, estimator__l1_ratio=0.6;, score=0.242 total time=  12.5s
[CV 2/5] END estimator__C=7, estimator__l1_ratio=0.6;, score=0.246 total time=  12.5s
[CV 2/5] END estimator__C=7, estimator__l1_ratio=0.2;, score=0.246 total time=  12.7s
[CV 4/5] END estimator__C=7, estimator__l1_ratio=0.4;,



[CV 4/5] END estimator__C=7, estimator__l1_ratio=0.7;, score=0.255 total time=  12.5s
[CV 5/5] END estimator__C=7, estimator__l1_ratio=0.1;, score=0.285 total time=  12.9s
[CV 4/5] END estimator__C=7, estimator__l1_ratio=0.6;, score=0.259 total time=  12.6s
[CV 2/5] END estimator__C=7, estimator__l1_ratio=0.5;, score=0.246 total time=  12.7s
[CV 4/5] END estimator__C=7, estimator__l1_ratio=0.1;, score=0.262 total time=  12.9s
[CV 2/5] END estimator__C=7, estimator__l1_ratio=0.4;, score=0.250 total time=  12.8s
[CV 1/5] END estimator__C=7, estimator__l1_ratio=0.7;, score=0.250 total time=  12.7s
[CV 3/5] END estimator__C=7, estimator__l1_ratio=0.8;, score=0.254 total time=  12.7s
[CV 2/5] END estimator__C=7, estimator__l1_ratio=0.9;, score=0.239 total time=  12.6s
[CV 5/5] END estimator__C=7, estimator__l1_ratio=0.9;, score=0.281 total time=  12.6s




[CV 4/5] END estimator__C=7, estimator__l1_ratio=0.8;, score=0.259 total time=  12.7s
[CV 1/5] END estimator__C=7, estimator__l1_ratio=1;, score=0.246 total time=  12.7s
[CV 1/5] END estimator__C=7, estimator__l1_ratio=0.9;, score=0.246 total time=  12.7s
[CV 3/5] END estimator__C=7, estimator__l1_ratio=0.9;, score=0.254 total time=  12.7s
[CV 3/5] END estimator__C=7, estimator__l1_ratio=1;, score=0.258 total time=  12.7s
[CV 5/5] END estimator__C=7, estimator__l1_ratio=1;, score=0.281 total time=  12.7s
[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0.3;, score=0.235 total time=  12.5s
[CV 3/5] END estimator__C=7, estimator__l1_ratio=0.3;, score=0.258 total time=  13.0s
[CV 5/5] END estimator__C=7, estimator__l1_ratio=0.8;, score=0.281 total time=  12.8s
[CV 2/5] END estimator__C=7, estimator__l1_ratio=1;, score=0.239 total time=  12.7s
[CV 4/5] END estimator__C=7, estimator__l1_ratio=0.9;, score=0.262 total time=  12.7s
[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0.2;, sco



[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0.2;, score=0.242 total time=  12.6s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0.2;, score=0.261 total time=  12.6s
[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0.2;, score=0.239 total time=  12.7s
[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0.3;, score=0.246 total time=  12.6s
[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0.3;, score=0.262 total time=  12.5s
[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0.4;, score=0.235 total time=  12.5s
[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0.4;, score=0.242 total time=  12.5s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0.4;, score=0.254 total time=  12.5s
[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0.5;, score=0.231 total time=  12.4s
[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0.1;, score=0.246 total time=  12.8s
[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0.4;, score=0.281 total time=  12.5s




[CV 4/5] END estimator__C=7, estimator__l1_ratio=1;, score=0.255 total time=  13.0s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0.1;, score=0.261 total time=  12.9s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0.5;, score=0.254 total time=  12.5s
[CV 2/5] END estimator__C=7, estimator__l1_ratio=0.7;, score=0.242 total time=  13.2s
[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0.5;, score=0.259 total time=  12.6s
[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0.1;, score=0.266 total time=  13.0s
[CV 3/5] END estimator__C=7, estimator__l1_ratio=0.2;, score=0.258 total time=  13.6s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0.3;, score=0.258 total time=  12.9s




[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0.5;, score=0.242 total time=  12.7s
[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0.6;, score=0.242 total time=  12.4s
[CV 5/5] END estimator__C=7, estimator__l1_ratio=0.7;, score=0.278 total time=  13.4s
[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0.5;, score=0.278 total time=  12.6s
[CV 4/5] END estimator__C=7, estimator__l1_ratio=0.3;, score=0.262 total time=  13.7s
[CV 1/5] END estimator__C=7, estimator__l1_ratio=0.8;, score=0.254 total time=  13.5s
[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0.6;, score=0.242 total time=  12.6s
[CV 1/5] END estimator__C=7, estimator__l1_ratio=0.1;, score=0.242 total time=  13.9s




[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0.6;, score=0.250 total time=  12.5s
[CV 3/5] END estimator__C=7, estimator__l1_ratio=0.7;, score=0.250 total time=  13.6s
[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0.2;, score=0.281 total time=  13.4s
[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0.3;, score=0.281 total time=  13.3s




[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0.1;, score=0.285 total time=  13.6s
[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0.4;, score=0.255 total time=  13.4s




[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0.1;, score=0.239 total time=  14.0s
[CV 2/5] END estimator__C=7, estimator__l1_ratio=0.8;, score=0.239 total time=  14.4s




[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0.6;, score=0.259 total time=  13.8s




[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0.6;, score=0.281 total time=  13.3s




[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0.7;, score=0.242 total time=  12.5s
[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0.7;, score=0.239 total time=  12.5s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0.7;, score=0.246 total time=  12.5s




[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0.7;, score=0.255 total time=  12.5s
[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0.7;, score=0.278 total time=  12.5s




[CV 1/5] END estimator__C=6, estimator__l1_ratio=0;, score=0.242 total time=   8.7s
[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0.8;, score=0.250 total time=  12.6s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=0;, score=0.262 total time=   8.7s
[CV 5/5] END estimator__C=6, estimator__l1_ratio=0;, score=0.281 total time=   8.7s
[CV 3/5] END estimator__C=6, estimator__l1_ratio=0;, score=0.258 total time=   8.8s




[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0.8;, score=0.239 total time=  12.6s
[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0.8;, score=0.255 total time=  12.5s
[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0.8;, score=0.281 total time=  12.5s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0.8;, score=0.258 total time=  12.7s




[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=0.9;, score=0.242 total time=  12.7s
[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=0.9;, score=0.255 total time=  12.7s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=0.9;, score=0.258 total time=  12.8s




[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0;, score=0.254 total time=   8.7s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=0;, score=0.246 total time=   9.6s
[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0;, score=0.246 total time=   8.8s
[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0;, score=0.242 total time=   8.8s
[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0;, score=0.285 total time=   8.8s
[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=1;, score=0.239 total time=  12.8s
[CV 3/5] END estimator__C=6.5, estimator__l1_ratio=1;, score=0.258 total time=  12.6s




[CV 4/5] END estimator__C=6.5, estimator__l1_ratio=1;, score=0.255 total time=  12.6s
[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=0.9;, score=0.278 total time=  13.3s




[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0;, score=0.259 total time=   9.4s
[CV 1/5] END estimator__C=6.5, estimator__l1_ratio=1;, score=0.239 total time=  13.5s




[CV 2/5] END estimator__C=6.5, estimator__l1_ratio=0.9;, score=0.239 total time=  15.3s




[CV 5/5] END estimator__C=6.5, estimator__l1_ratio=1;, score=0.278 total time=  14.2s




[CV 4/5] END estimator__C=6, estimator__l1_ratio=0.3;, score=0.259 total time=  12.4s
[CV 1/5] END estimator__C=6, estimator__l1_ratio=0.4;, score=0.231 total time=  12.4s
[CV 1/5] END estimator__C=6, estimator__l1_ratio=0.3;, score=0.239 total time=  12.5s
[CV 5/5] END estimator__C=6, estimator__l1_ratio=0.3;, score=0.285 total time=  12.4s
[CV 3/5] END estimator__C=6, estimator__l1_ratio=0.4;, score=0.254 total time=  12.5s
[CV 3/5] END estimator__C=6, estimator__l1_ratio=0.2;, score=0.254 total time=  12.6s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=0.2;, score=0.242 total time=  12.6s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=0.2;, score=0.262 total time=  12.6s




[CV 3/5] END estimator__C=6, estimator__l1_ratio=0.5;, score=0.250 total time=  12.4s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=0.4;, score=0.242 total time=  12.6s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=0.4;, score=0.255 total time=  12.5s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=0.5;, score=0.259 total time=  12.4s
[CV 1/5] END estimator__C=6, estimator__l1_ratio=0.5;, score=0.231 total time=  12.5s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=0.1;, score=0.246 total time=  12.8s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=0.1;, score=0.262 total time=  12.8s
[CV 3/5] END estimator__C=6, estimator__l1_ratio=0.6;, score=0.258 total time=  12.4s
[CV 5/5] END estimator__C=6, estimator__l1_ratio=0.5;, score=0.274 total time=  12.5s
[CV 5/5] END estimator__C=6, estimator__l1_ratio=0.6;, score=0.278 total time=  12.4s
[CV 1/5] END estimator__C=6, estimator__l1_ratio=0.1;, score=0.242 total time=  12.9s
[CV 1/5] END estimator__C=6, estimator__l1_ratio=0.7;,



[CV 5/5] END estimator__C=6, estimator__l1_ratio=0.4;, score=0.278 total time=  12.6s
[CV 3/5] END estimator__C=6, estimator__l1_ratio=0.1;, score=0.258 total time=  12.9s
[CV 1/5] END estimator__C=6, estimator__l1_ratio=0.8;, score=0.242 total time=  12.5s
[CV 1/5] END estimator__C=6, estimator__l1_ratio=0.6;, score=0.231 total time=  12.7s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=0.8;, score=0.239 total time=  12.6s
[CV 1/5] END estimator__C=6, estimator__l1_ratio=0.2;, score=0.239 total time=  13.0s
[CV 3/5] END estimator__C=6, estimator__l1_ratio=0.8;, score=0.258 total time=  12.6s




[CV 3/5] END estimator__C=6, estimator__l1_ratio=0.3;, score=0.254 total time=  13.1s
[CV 5/5] END estimator__C=6, estimator__l1_ratio=0.8;, score=0.278 total time=  12.6s
[CV 5/5] END estimator__C=6, estimator__l1_ratio=0.7;, score=0.274 total time=  12.8s
[CV 1/5] END estimator__C=6, estimator__l1_ratio=0.9;, score=0.239 total time=  12.7s
[CV 3/5] END estimator__C=6, estimator__l1_ratio=0.9;, score=0.258 total time=  12.7s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=1;, score=0.239 total time=  12.6s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=0.9;, score=0.255 total time=  12.6s
[CV 5/5] END estimator__C=6, estimator__l1_ratio=0.9;, score=0.281 total time=  12.7s




[CV 1/5] END estimator__C=6, estimator__l1_ratio=1;, score=0.235 total time=  12.7s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=1;, score=0.255 total time=  12.6s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=0.3;, score=0.239 total time=  13.3s
[CV 5/5] END estimator__C=6, estimator__l1_ratio=1;, score=0.278 total time=  12.7s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=0.8;, score=0.255 total time=  13.0s




[CV 5/5] END estimator__C=6, estimator__l1_ratio=0.2;, score=0.285 total time=  13.5s
[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0.2;, score=0.285 total time=  12.5s
[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0.3;, score=0.235 total time=  12.4s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=0.5;, score=0.239 total time=  13.3s
[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0.3;, score=0.239 total time=  12.4s
[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0.2;, score=0.239 total time=  12.6s
[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0.2;, score=0.259 total time=  12.6s
[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0.1;, score=0.262 total time=  12.7s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=0.6;, score=0.259 total time=  13.3s
[CV 5/5] END estimator__C=6, estimator__l1_ratio=0.1;, score=0.281 total time=  13.7s
[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0.1;, score=0.242 total time=  12.8s




[CV 2/5] END estimator__C=6, estimator__l1_ratio=0.9;, score=0.239 total time=  13.2s
[CV 3/5] END estimator__C=6, estimator__l1_ratio=0.7;, score=0.254 total time=  13.4s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=0.7;, score=0.239 total time=  13.4s
[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0.1;, score=0.254 total time=  12.9s
[CV 4/5] END estimator__C=6, estimator__l1_ratio=0.7;, score=0.255 total time=  13.5s
[CV 2/5] END estimator__C=6, estimator__l1_ratio=0.6;, score=0.239 total time=  13.6s
[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0.3;, score=0.255 total time=  12.6s




[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0.4;, score=0.227 total time=  12.5s
[CV 3/5] END estimator__C=6, estimator__l1_ratio=1;, score=0.258 total time=  13.5s




[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0.3;, score=0.254 total time=  13.0s
[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0.3;, score=0.281 total time=  12.8s
[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0.1;, score=0.285 total time=  13.3s




[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0.2;, score=0.239 total time=  13.5s
[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0.2;, score=0.254 total time=  13.5s
[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0.1;, score=0.246 total time=  13.7s
[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0.4;, score=0.254 total time=  12.7s




[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0.4;, score=0.235 total time=  13.4s




[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0.4;, score=0.274 total time=  12.5s




[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0.4;, score=0.259 total time=  13.8s




[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0.5;, score=0.227 total time=  12.2s
[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0.5;, score=0.235 total time=  12.2s
[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0.5;, score=0.258 total time=  12.2s
[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0.5;, score=0.259 total time=  12.2s




[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0.5;, score=0.274 total time=  12.2s
[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0.6;, score=0.227 total time=  12.2s




[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0.6;, score=0.235 total time=  12.2s
[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0.6;, score=0.281 total time=  12.2s
[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0.6;, score=0.255 total time=  12.2s
[CV 1/5] END estimator__C=5, estimator__l1_ratio=0;, score=0.242 total time=   8.5s
[CV 3/5] END estimator__C=5, estimator__l1_ratio=0;, score=0.254 total time=   8.5s
[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0.6;, score=0.254 total time=  12.3s
[CV 4/5] END estimator__C=5, estimator__l1_ratio=0;, score=0.262 total time=   8.5s




[CV 5/5] END estimator__C=5, estimator__l1_ratio=0;, score=0.281 total time=   8.5s
[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0.7;, score=0.239 total time=  12.2s




[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0.7;, score=0.255 total time=  12.2s
[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0.7;, score=0.278 total time=  12.2s




[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0.8;, score=0.235 total time=  12.2s
[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0.7;, score=0.254 total time=  12.6s
[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0.8;, score=0.281 total time=  12.2s




[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0.7;, score=0.223 total time=  12.9s
[CV 2/5] END estimator__C=5, estimator__l1_ratio=0;, score=0.246 total time=   9.3s
[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0.8;, score=0.239 total time=  12.4s
[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0.8;, score=0.254 total time=  12.4s
[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=0.9;, score=0.258 total time=  12.2s
[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=0.9;, score=0.235 total time=  12.4s




[CV 1/5] END estimator__C=5.5, estimator__l1_ratio=1;, score=0.223 total time=  12.2s
[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0.9;, score=0.251 total time=  12.7s




[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=1;, score=0.239 total time=  12.2s
[CV 3/5] END estimator__C=5.5, estimator__l1_ratio=1;, score=0.254 total time=  12.1s
[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=0.8;, score=0.251 total time=  13.0s




[CV 2/5] END estimator__C=5.5, estimator__l1_ratio=0.9;, score=0.235 total time=  13.3s




[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=0.9;, score=0.278 total time=  13.3s




[CV 4/5] END estimator__C=5.5, estimator__l1_ratio=1;, score=0.243 total time=  11.9s




[CV 5/5] END estimator__C=5.5, estimator__l1_ratio=1;, score=0.274 total time=  11.9s




[CV 1/5] END estimator__C=5, estimator__l1_ratio=0.2;, score=0.239 total time=  11.7s
[CV 2/5] END estimator__C=5, estimator__l1_ratio=0.2;, score=0.235 total time=  11.7s
[CV 3/5] END estimator__C=5, estimator__l1_ratio=0.2;, score=0.258 total time=  11.7s
[CV 1/5] END estimator__C=5, estimator__l1_ratio=0.3;, score=0.231 total time=  11.6s
[CV 3/5] END estimator__C=5, estimator__l1_ratio=0.3;, score=0.254 total time=  11.6s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=0.2;, score=0.281 total time=  11.7s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=0.3;, score=0.278 total time=  11.7s
[CV 1/5] END estimator__C=5, estimator__l1_ratio=0.4;, score=0.227 total time=  11.7s
[CV 3/5] END estimator__C=5, estimator__l1_ratio=0.4;, score=0.254 total time=  11.6s




[CV 1/5] END estimator__C=5, estimator__l1_ratio=0.1;, score=0.239 total time=  12.0s
[CV 4/5] END estimator__C=5, estimator__l1_ratio=0.1;, score=0.262 total time=  12.0s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=0.4;, score=0.274 total time=  11.7s
[CV 4/5] END estimator__C=5, estimator__l1_ratio=0.5;, score=0.259 total time=  11.6s




[CV 4/5] END estimator__C=5, estimator__l1_ratio=0.6;, score=0.255 total time=  11.6s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=0.6;, score=0.274 total time=  11.6s
[CV 3/5] END estimator__C=5, estimator__l1_ratio=0.1;, score=0.258 total time=  12.2s
[CV 1/5] END estimator__C=5, estimator__l1_ratio=0.7;, score=0.220 total time=  11.6s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=0.5;, score=0.278 total time=  11.7s
[CV 2/5] END estimator__C=5, estimator__l1_ratio=0.7;, score=0.239 total time=  11.7s
[CV 3/5] END estimator__C=5, estimator__l1_ratio=0.7;, score=0.254 total time=  11.7s




[CV 4/5] END estimator__C=5, estimator__l1_ratio=0.2;, score=0.259 total time=  12.3s
[CV 4/5] END estimator__C=5, estimator__l1_ratio=0.7;, score=0.247 total time=  11.6s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=0.7;, score=0.278 total time=  11.7s
[CV 1/5] END estimator__C=5, estimator__l1_ratio=0.8;, score=0.223 total time=  11.6s
[CV 1/5] END estimator__C=5, estimator__l1_ratio=0.5;, score=0.220 total time=  12.2s
[CV 4/5] END estimator__C=5, estimator__l1_ratio=0.4;, score=0.259 total time=  12.2s
[CV 2/5] END estimator__C=5, estimator__l1_ratio=0.8;, score=0.239 total time=  11.7s
[CV 3/5] END estimator__C=5, estimator__l1_ratio=0.8;, score=0.254 total time=  11.7s




[CV 2/5] END estimator__C=5, estimator__l1_ratio=0.5;, score=0.235 total time=  12.2s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=0.8;, score=0.274 total time=  11.6s
[CV 1/5] END estimator__C=5, estimator__l1_ratio=0.9;, score=0.223 total time=  11.6s
[CV 2/5] END estimator__C=5, estimator__l1_ratio=0.9;, score=0.231 total time=  11.7s
[CV 4/5] END estimator__C=5, estimator__l1_ratio=0.9;, score=0.243 total time=  11.7s
[CV 1/5] END estimator__C=5, estimator__l1_ratio=0.6;, score=0.220 total time=  12.2s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=0.9;, score=0.270 total time=  11.7s




[CV 2/5] END estimator__C=5, estimator__l1_ratio=0.3;, score=0.235 total time=  12.7s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=1;, score=0.270 total time=  11.7s
[CV 4/5] END estimator__C=5, estimator__l1_ratio=0.3;, score=0.259 total time=  12.7s
[CV 2/5] END estimator__C=5, estimator__l1_ratio=1;, score=0.235 total time=  11.8s
[CV 2/5] END estimator__C=5, estimator__l1_ratio=0.1;, score=0.239 total time=  13.0s
[CV 5/5] END estimator__C=5, estimator__l1_ratio=0.1;, score=0.281 total time=  12.9s
[CV 2/5] END estimator__C=5, estimator__l1_ratio=0.6;, score=0.239 total time=  12.4s




[CV 3/5] END estimator__C=5, estimator__l1_ratio=0.5;, score=0.258 total time=  12.8s
[CV 3/5] END estimator__C=5, estimator__l1_ratio=0.6;, score=0.254 total time=  12.8s
[CV 4/5] END estimator__C=5, estimator__l1_ratio=0.8;, score=0.247 total time=  12.5s




[CV 3/5] END estimator__C=5, estimator__l1_ratio=0.9;, score=0.250 total time=  12.4s




[CV 3/5] END estimator__C=5, estimator__l1_ratio=1;, score=0.250 total time=  12.6s
[CV 1/5] END estimator__C=5, estimator__l1_ratio=1;, score=0.227 total time=  12.7s
[CV 4/5] END estimator__C=5, estimator__l1_ratio=1;, score=0.240 total time=  12.7s




[CV 2/5] END estimator__C=5, estimator__l1_ratio=0.4;, score=0.235 total time=  14.6s
71.538 seconds
Evaluation of OneVsRestClassifier/LogisticRegression :
best parameters:  {'estimator__C': 8, 'estimator__l1_ratio': 0.1}
best score:  0.2640569189998848


In [6]:
val_pred_enet = enet_model.predict(val_features)
print('classification par elasticnet \n',classification_report(val_labels, val_pred_enet,target_names=label_names))

classification par elasticnet 
               precision    recall  f1-score   support

           +       0.65      0.55      0.60        82
           -       0.75      0.38      0.51        63
           0       0.25      0.09      0.13        47
           i       0.77      0.76      0.76       146
           j       0.88      0.96      0.91       272
           f       0.87      0.60      0.71        45
           s       0.87      0.70      0.78        57
           p       0.87      0.76      0.81       124
           m       0.84      0.83      0.83       123
           a       0.78      0.64      0.70        72
           t       0.71      0.22      0.33        23

   micro avg       0.81      0.72      0.76      1054
   macro avg       0.75      0.59      0.64      1054
weighted avg       0.79      0.72      0.74      1054
 samples avg       0.82      0.72      0.75      1054



### modèle final sur l'ensemble du jeu de données

In [7]:
#print('best parameters: ', enet_model.best_params_)
#enet_all = OneVsRestClassifier(LogisticRegression(solver='saga', 
#                                                  penalty='elasticnet',
#                                                  C =  enet_model.best_params_['estimator__C'],
#                                                  l1_ratio =  enet_model.best_params_['estimator__l1_ratio']))  
#enet_all.fit(all_features, all_labels)

### Classification par gradient boosting (light-gbm)

In [8]:
parameters = [{'estimator__learning_rate': [0.05, 0.1, 0.25],
               'estimator__n_estimators': [600, 700, 800]}]
model_to_set = OneVsRestClassifier(lgb.LGBMClassifier(boosting_type='gbdt',  
                                                      objective='binary'))
lgb_model = GridSearchCV(model_to_set, param_grid=parameters, cv = cv, verbose=3, n_jobs=150)
model_to_set.get_params().keys() #pour connaitre la liste des hyper-paramètres 

dict_keys(['estimator__boosting_type', 'estimator__class_weight', 'estimator__colsample_bytree', 'estimator__importance_type', 'estimator__learning_rate', 'estimator__max_depth', 'estimator__min_child_samples', 'estimator__min_child_weight', 'estimator__min_split_gain', 'estimator__n_estimators', 'estimator__n_jobs', 'estimator__num_leaves', 'estimator__objective', 'estimator__random_state', 'estimator__reg_alpha', 'estimator__reg_lambda', 'estimator__silent', 'estimator__subsample', 'estimator__subsample_for_bin', 'estimator__subsample_freq', 'estimator', 'n_jobs'])

In [9]:
# record current time
start = time()
lgb_model.fit(train_features, train_labels)
# record current time
end = time()
# report execution time
result = end - start
print('%.3f seconds' % result)
print('Evaluation of OneVsRestClassifier/lightgbm :')
print('best parameters: ', lgb_model.best_params_)
print('best score: ', lgb_model.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 2/5] END estimator__learning_rate=0.25, estimator__n_estimators=600;, score=0.227 total time= 1.2min
[CV 5/5] END estimator__learning_rate=0.25, estimator__n_estimators=600;, score=0.190 total time= 1.2min
[CV 4/5] END estimator__learning_rate=0.25, estimator__n_estimators=600;, score=0.198 total time= 1.3min
[CV 1/5] END estimator__learning_rate=0.25, estimator__n_estimators=700;, score=0.239 total time= 1.3min
[CV 5/5] END estimator__learning_rate=0.25, estimator__n_estimators=800;, score=0.190 total time= 1.3min
[CV 2/5] END estimator__learning_rate=0.25, estimator__n_estimators=800;, score=0.227 total time= 1.3min
[CV 1/5] END estimator__learning_rate=0.25, estimator__n_estimators=800;, score=0.239 total time= 1.3min
[CV 3/5] END estimator__learning_rate=0.25, estimator__n_estimators=700;, score=0.258 total time= 1.3min
[CV 3/5] END estimator__learning_rate=0.25, estimator__n_estimators=800;, score=0.258 total time= 1.

In [10]:
val_pred_lgb = lgb_model.predict(val_features)
print('classification par lgb \n',classification_report(val_labels, 
                                                        val_pred_lgb,
                                                        target_names=label_names))

classification par lgb 
               precision    recall  f1-score   support

           +       0.72      0.44      0.55        82
           -       0.79      0.17      0.29        63
           0       0.25      0.02      0.04        47
           i       0.74      0.77      0.75       146
           j       0.86      0.96      0.91       272
           f       0.94      0.38      0.54        45
           s       0.87      0.47      0.61        57
           p       0.86      0.68      0.76       124
           m       0.79      0.70      0.74       123
           a       0.76      0.47      0.58        72
           t       0.00      0.00      0.00        23

   micro avg       0.81      0.64      0.71      1054
   macro avg       0.69      0.46      0.52      1054
weighted avg       0.77      0.64      0.67      1054
 samples avg       0.82      0.64      0.70      1054



### modèle final sur l'ensemble du jeu de données

In [11]:
#print('best parameters: ', lgb_model.best_params_)
#lgb_all = OneVsRestClassifier(lgb.LGBMClassifier(boosting_type='gbdt',  
#                                                 objective='binary',
#                                                 learning_rate = lgb_model.best_params_['estimator__learning_rate'],
#                                                 n_estimators = lgb_model.best_params_['estimator__n_estimators']))
#lgb_all.fit(all_features, all_labels)

### Classification par random forest 

In [12]:
# Grid search for random forest 
parameters = [{#'estimator__max_features': [int(x) for x in np.linspace(start = 50, stop = 750, num = 10)],
               'estimator__max_features': [200],
               'estimator__n_estimators': [500],
               'estimator__bootstrap' : [False]}]
model_to_set = OneVsRestClassifier(RandomForestClassifier(n_jobs=150))
rf_model = GridSearchCV(model_to_set, param_grid=parameters, cv = cv, verbose=3)
# record current time
start = time()
rf_model.fit(train_features, train_labels)
# record current time
end = time()
# report execution time
result = end - start
print('%.3f seconds' % result)
print('Evaluation of OneVsRestClassifier/Gradient Boosting :')
print('best parameters: ', rf_model.best_params_)
print('best score: ', rf_model.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.159 total time=  37.0s
[CV 2/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.186 total time=  32.4s
[CV 3/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.170 total time=  32.6s
[CV 4/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.163 total time=  32.9s
[CV 5/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.179 total time=  31.9s
198.791 seconds
Evaluation of OneVsRestClassifier/Gradient Boosting :
best parameters:  {'estimator__bootstrap': False, 'estimator__max_features': 200, 'estimator__n_estimators': 500}
best score:  0.17147136766908627


In [13]:
val_pred_rf = rf_model.predict(val_features)
print('classification par RF \n',classification_report(val_labels, 
                                                       val_pred_rf,
                                                       target_names=label_names))

classification par RF 
               precision    recall  f1-score   support

           +       0.81      0.37      0.50        82
           -       0.62      0.08      0.14        63
           0       0.00      0.00      0.00        47
           i       0.74      0.73      0.74       146
           j       0.85      0.97      0.90       272
           f       1.00      0.22      0.36        45
           s       0.90      0.32      0.47        57
           p       0.86      0.51      0.64       124
           m       0.82      0.59      0.69       123
           a       0.71      0.33      0.45        72
           t       0.00      0.00      0.00        23

   micro avg       0.81      0.56      0.66      1054
   macro avg       0.66      0.37      0.45      1054
weighted avg       0.76      0.56      0.61      1054
 samples avg       0.81      0.57      0.65      1054



### modèle final sur l'ensemble du jeu de données

In [14]:
#print('best parameters: ', rf_model.best_params_)
#rf_all = OneVsRestClassifier(RandomForestClassifier(n_jobs=-1,
#                                                    bootstrap = rf_model.best_params_['estimator__bootstrap'],
#                                                    max_features = rf_model.best_params_['estimator__max_features'],
#                                                    n_estimators = rf_model.best_params_['estimator__n_estimators']))
#rf_all.fit(all_features, all_labels)

### Classification par SVC

In [15]:
# Grid search for random forest 
parameters = [{'estimator__C': [0.1,1, 10, 100], 
               'estimator__gamma': [1,0.1,0.01,0.001],
               'estimator__kernel': ['rbf', 'poly', 'sigmoid']}]
model_to_set = OneVsRestClassifier(SVC())
svc_model = GridSearchCV(model_to_set, param_grid=parameters, cv=cv, verbose=3, n_jobs=150)
# record current time
start = time()
svc_model.fit(train_features, train_labels)
# record current time
end = time()
# report execution time
result = end - start
print('%.3f seconds' % result)
print('Evaluation of OneVsRestClassifier/Gradient Boosting :')
print('best parameters: ', svc_model.best_params_)
print('best score: ', svc_model.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 3/5] END estimator__C=0.1, estimator__gamma=0.001, estimator__kernel=poly;, score=0.000 total time=   3.0s
[CV 1/5] END estimator__C=1, estimator__gamma=1, estimator__kernel=poly;, score=0.231 total time=   3.2s
[CV 3/5] END estimator__C=1, estimator__gamma=0.01, estimator__kernel=poly;, score=0.000 total time=   3.2s
[CV 4/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=poly;, score=0.103 total time=   3.7s
[CV 1/5] END estimator__C=1, estimator__gamma=1, estimator__kernel=sigmoid;, score=0.000 total time=   3.8s
[CV 1/5] END estimator__C=1, estimator__gamma=0.1, estimator__kernel=poly;, score=0.246 total time=   3.9s
[CV 4/5] END estimator__C=1, estimator__gamma=1, estimator__kernel=poly;, score=0.198 total time=   4.2s
[CV 2/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=sigmoid;, score=0.000 total time=   4.5s
[CV 4/5] END estimator__C=10, estimator__gamma=0.1, estimator__kernel=pol

In [16]:
val_pred_svc = svc_model.predict(val_features)
print('classification par SVC \n',classification_report(val_labels, 
                                                        val_pred_svc,
                                                        target_names=label_names))

classification par SVC 
               precision    recall  f1-score   support

           +       0.61      0.61      0.61        82
           -       0.65      0.48      0.55        63
           0       0.22      0.13      0.16        47
           i       0.75      0.73      0.74       146
           j       0.89      0.94      0.91       272
           f       0.86      0.67      0.75        45
           s       0.87      0.79      0.83        57
           p       0.86      0.81      0.84       124
           m       0.81      0.82      0.81       123
           a       0.72      0.69      0.71        72
           t       0.62      0.43      0.51        23

   micro avg       0.79      0.74      0.76      1054
   macro avg       0.71      0.65      0.68      1054
weighted avg       0.77      0.74      0.76      1054
 samples avg       0.80      0.74      0.75      1054



### modèle final sur l'ensemble du jeu de données

In [17]:
#print('best parameters: ', svc_model.best_params_)
#svc_all = OneVsRestClassifier(SVC(C = svc_model.best_params_['estimator__C'],
#                                  gamma = svc_model.best_params_['estimator__gamma'],
#                                  kernel = svc_model.best_params_['estimator__kernel']))
#svc_all.fit(all_features, all_labels)

In [18]:
## enregistrer les différents modèles
# elasticnet
enet_file = 'enet_model.sav'
joblib.dump(enet_model, enet_file)
# light gbm
lgb_file = 'lgb_model.sav'
joblib.dump(lgb_model, lgb_file)
# random forest
rf_file = 'rf_model.sav'
joblib.dump(rf_model, rf_file)
# support vector machine
svc_file = 'svc_model.sav'
joblib.dump(svc_model, svc_file)
 

['svc_model.sav']