In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch import Generator
from torch.utils.data.dataset import random_split
import transformers as ppb
import joblib
from time import time
from datetime import datetime
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import train_test_split, KFold
## evaluation des classifications 
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
## validation croisée
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
## differents modèles à tester 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
import lightgbm as lgb
## keras
#from sklearn.model_selection import RepeatedKFold
#from keras.models import Sequential
#from keras.layers import Dense
import warnings
warnings.filterwarnings('ignore')

In [2]:
# General settings
allow_cuda = True
warnings.filterwarnings('ignore')
# Check whether cuda is available, and select processor or GPU
if torch.cuda.is_available() and allow_cuda:
    print('GPU (Cuda) power !')
    hardware = torch.device('cuda:0')
    print(hardware)
    torch.cuda.set_device(0)
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    torch.cuda.empty_cache()
else:
    print('CPU running')
    hardware = torch.device('cpu')
    torch.set_default_tensor_type('torch.FloatTensor')

labels_list = ['+', '-', '0', 'i', 'j', 'f', 's', 'p', 'm', 'a', 't']
labels_nb = len(labels_list)

# Fonctions communes
def print_cuda_memory_usage():
    if hardware == torch.device('cuda:0'):
        t = torch.cuda.get_device_properties(0).total_memory / 1073741824
        r = torch.cuda.memory_reserved(0) / 1048576
        a = torch.cuda.memory_allocated(0) / 1048576
        print('cuda total=', t, 'reserved=', r, 'allocated=', a, 'free=', r - a, '...quelle unité ?')

# Log a string, and store to globals the log filename
def log(line, log_file='logs/default.txt'):
    print(line)
    if not ('logfile' in globals()):
        global logfile
        logfile = log_file
        print('set logfile to ', logfile)
    with open(logfile, "a") as f:
        print(line, file=f)

GPU (Cuda) power !
cuda:0


In [3]:
# Stockage des donnees dans un TensorDataset, permet aussi de produire les minibatches
# dataframe pandas en entrée au lieu d'un chemin vers un fichier
class Textes_TensorDataset(TensorDataset):
    def __init__(self, textes_df, tokenizer):
        super(TensorDataset, self).__init__()
        self.df = textes_df
        self.tokenizer = tokenizer
        # get the CLS token to check it
        cls_sep_tokens = tokenizer.encode('', add_special_tokens=True)
        tokenized = self.df['phrases'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
        max_len = 0
        for i in tokenized.values:
            if len(i) > max_len:
                max_len = len(i)
        padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])
        

        self.input_ids = padded
        print('input_ids.shape', self.input_ids.shape)
        self.attention_mask = np.where(padded != 0, 1, 0)
        self.max_len = self.input_ids.shape[1]
        self.nb_textes = self.input_ids.shape[0]

        # Préparation des labels à partir des colonnes (troisième colonne)
        self.raw_labels_names = self.df.columns[2:]
        for i, row in self.df.iterrows():
            for key in labels_list:
                self.df.at[i, key] = max([row[c] for c in self.raw_labels_names if key in c])
        print('Textes_TensorDataset/__init()__ loaded', self.nb_textes, 'texts', self.max_len, 'of chars length')
    # Au démarrage on n'a pas encore calculé les hidden states
        self.last_hidden_states = None 

    # Quand on a calculé les hidden states
    def store_last_hidden_states(self, lhd):
        self.last_hidden_states = lhd
        print('Textes_TensorDataset/store_last_hidden_states()')
    # renvoie un item : input_id, attention_mask, labels, last_hidden_states (si disponibles)
    def __getitem__(self, index):
        if(self.last_hidden_states is None):
            return self.input_ids[index], self.attention_mask[index], self.df.loc[index, labels_list].to_numpy(dtype=np.float32)
        else:
            return self.input_ids[index], self.attention_mask[index], self.df.loc[index, labels_list].to_numpy(dtype=np.float32), self.last_hidden_states[index]

    def __len__(self):
        return self.nb_textes

#
# Class CamemBERT avec couche linear on-top
#
class CustomBertModel(torch.nn.Module):
    def __init__(self, bert_model, weights, layer_sizes):
        super(CustomBertModel, self).__init__()
        self.layer_sizes = layer_sizes

        self.bert = bert_model.from_pretrained(weights)
        self.dropout = torch.nn.Dropout(.05)
        self.linear1 = torch.nn.Linear(1 * 768, self.layer_sizes[0])
        self.nonlinear1 = torch.nn.Tanh()
        self.linear2 = torch.nn.Linear(self.layer_sizes[0], self.layer_sizes[1])
        self.nonlinear2 = torch.nn.Tanh() # attention Tanh donne de meilleurs résultats que Sigmoid
        self.linear3 = torch.nn.Linear(self.layer_sizes[1], self.layer_sizes[2])
        print('CustomBertModel/__init__ with layers on top', layer_sizes)

    # forwarde le modele, mode = 0 ne sort que la sortie par defaut, mode = 1 sort la sortie de bert aussi
    def forward(self, input_ids, attention_mask, mode = 0):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        # Ne recupere le last_hidden_state que du CLS token
        lhd = bert_output.last_hidden_state
        lhd_cls_token = lhd[:, 0, :]
        
        # Linear layer on top of BERT
        x = self.dropout(lhd_cls_token)
        x = self.nonlinear1(self.linear1(x))
        x = self.nonlinear2(self.linear2(x))
        x = self.linear3(x)
        
        if mode == 0:
            return x
        elif mode == 1:
            return x, lhd_cls_token

    # Bloque ou permet l'apprentissage sur la partie bert du modele
    def bert_training(self, authorisation, tune_depth):
        for name, p in self.named_parameters():
            p.requires_grad = False
            if ("linear1" in name) or ("linear2" in name) or ("linear3" in name): 
                p.requires_grad = authorisation
            if(tune_depth < 12) and (tune_depth >= 0) :
                for depth in np.arange(tune_depth, 12):
                    if ("bert.encoder.layer."+ str(depth) in name):
                        p.requires_grad = authorisation
                if ("bert.pooler.dense" in name) :
                    p.requires_grad = authorisation

    # Test on a batch, return loss
    def evaluate(self, x, attention_mask, expected, criterion):
        #print('EVALUATE :')
        self.eval()
        with torch.no_grad():
            output = self.forward(x, attention_mask=attention_mask)
            loss_train = criterion(output, expected)
            xx = np.rint(100 * expected[:, :].cpu().numpy())
            yy = np.rint(100 * torch.sigmoid(output[:, :]).cpu().numpy())
        return loss_train.item(), xx, yy
    # Train on a batch, return loss
    def learn(self, x, attention_mask, expected, criterion, optimizer):
        self.train()
        optimizer.zero_grad()
        output = self.forward(x, attention_mask=attention_mask)
        loss_train = criterion(output, expected)
        loss_train.backward()  # Compute the back propagation gradients
        optimizer.step()  # Setup coefficients
        return loss_train.item()

    def save(self, fn):
        torch.save(self.state_dict(), fn)

    def load(self, fn):
        self.load_state_dict(torch.load(fn))
        self.eval()

In [4]:
date_start = datetime.now().strftime("%Y%m%d_%Hh%M")
model_name = f'{date_start}'

# load model, tokenizer and weights
camembert, tokenizer, weights = (ppb.CamembertModel, ppb.CamembertTokenizer, 'camembert-base')
print('Camembert', camembert)
print('Tokenizer', tokenizer)
# Load pretrained model/tokenizer
tokenizer = tokenizer.from_pretrained(weights)
model = camembert.from_pretrained(weights)

# mettre de côté un jeu de données test avec une graine du générateur 
# charger le jeu de données en TensorDataset
data_df = pd.read_csv('DonneesPedoPsy/labeled_data.csv', sep=";", encoding='cp1252')
data_df.drop('num', axis=1, inplace=True)
textes_td = Textes_TensorDataset(data_df, tokenizer)
print(textes_td.nb_textes)

Camembert <class 'transformers.models.camembert.modeling_camembert.CamembertModel'>
Tokenizer <class 'transformers.models.camembert.tokenization_camembert.CamembertTokenizer'>


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


input_ids.shape (1648, 345)
Textes_TensorDataset/__init()__ loaded 1648 texts 345 of chars length
1648


In [5]:
# Forward text data into BERT model in order to get last_hidden_layer state
last_hidden_states = None
with torch.no_grad():
    for input_ids, attention_masks, labels in DataLoader(dataset=textes_td, batch_size=64, shuffle=False):
        output = model(input_ids, attention_mask=attention_masks)
        lhd = output.last_hidden_state.cpu().numpy() # shape(1318, 345, 768)
        lhd = np.squeeze(lhd[:, 0, :]) # shape(1318, 768) Extract the last hidden state of the token `[CLS]` for classification task
        if last_hidden_states is None: # bout de code moche mal optimisé !
            last_hidden_states = lhd
        else:
            last_hidden_states = np.vstack([last_hidden_states, lhd])
        print(last_hidden_states.shape)
    textes_td.store_last_hidden_states(last_hidden_states)

(64, 768)
(128, 768)
(192, 768)
(256, 768)
(320, 768)
(384, 768)
(448, 768)
(512, 768)
(576, 768)
(640, 768)
(704, 768)
(768, 768)
(832, 768)
(896, 768)
(960, 768)
(1024, 768)
(1088, 768)
(1152, 768)
(1216, 768)
(1280, 768)
(1344, 768)
(1408, 768)
(1472, 768)
(1536, 768)
(1600, 768)
(1648, 768)
Textes_TensorDataset/store_last_hidden_states()


In [6]:
# Construit des datasets d'entrainement et de test
text_td_train_nb = int(textes_td.nb_textes*.8)
text_td_val_nb =textes_td.nb_textes-text_td_train_nb
text_td_train, text_td_val = random_split(textes_td, [text_td_train_nb, text_td_val_nb], generator=Generator(device=hardware).manual_seed(11))


# Récupère les données du dataset d'entrainement
_,_, train_labels, train_features = next(iter(DataLoader(text_td_train, batch_size=text_td_train.__len__())))
train_features, train_labels = train_features.cpu().numpy(), train_labels.cpu().numpy()
print("train_features.shape", train_features.shape)
print("train_labels.shape", train_labels.shape)

# Récupère les données du dataset de test
_,_, val_labels, val_features = next(iter(DataLoader(text_td_val, batch_size=text_td_val.__len__())))
val_features, val_labels = val_features.cpu().numpy(), val_labels.cpu().numpy()
print("val_features.shape", val_features.shape)
print("val_labels.shape", val_labels.shape)

train_features.shape (1318, 768)
train_labels.shape (1318, 11)
val_features.shape (330, 768)
val_labels.shape (330, 11)


###  Test de deux règles de classification au hasard et Bayes naïf

In [7]:
dummy_model = DummyClassifier(strategy="uniform")
dummy_model.fit(train_features, train_labels)
val_pred_dummy = dummy_model.predict(val_features)
#print('classification au hasard \n',classification_report(val_labels, val_pred_dummy,target_names=label_names))

nb_model = OneVsRestClassifier(GaussianNB())
nb_model.fit(train_features, train_labels)
val_pred = nb_model.predict(val_features)
print('classification par NB \n',classification_report(val_labels, val_pred,target_names=labels_list))

classification par NB 
               precision    recall  f1-score   support

           +       0.46      0.76      0.58        72
           -       0.33      0.65      0.44        63
           0       0.23      0.62      0.34        47
           i       0.66      0.75      0.70       146
           j       0.87      0.67      0.76       263
           f       0.31      0.81      0.45        36
           s       0.47      0.77      0.59        57
           p       0.64      0.83      0.72       124
           m       0.60      0.78      0.68       123
           a       0.39      0.61      0.48        72
           t       0.09      0.43      0.14        23

   micro avg       0.50      0.72      0.59      1026
   macro avg       0.46      0.70      0.53      1026
weighted avg       0.59      0.72      0.63      1026
 samples avg       0.57      0.68      0.58      1026



### Classification par régression logistique pénalisée par elasticnet

In [8]:
# grille de recherche : !!! issue de plusieurs essais
nb_folds = 5
cv = KFold(n_splits=nb_folds, random_state=109, shuffle=True)
model_to_set = OneVsRestClassifier(LogisticRegression(solver='saga', penalty='elasticnet', max_iter = 10000))
parameters = [{'estimator__C': [8, 7.5, 7, 6.5, 6, 5.5, 5],
               'estimator__l1_ratio' : [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}] # The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1. For l1_ratio = 0 the penalty is an L2 penalty. For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
enet_model = GridSearchCV(model_to_set, 
                          param_grid=parameters,
                          cv = cv,
                          verbose=3, 
                          n_jobs=100)
# record current time
start = time()
enet_model.fit(train_features, train_labels)
# record current time
end = time()
# report execution time
result = end - start
print('%.3f seconds' % result)
print('Evaluation of OneVsRestClassifier/LogisticRegression :')
print('best parameters: ', enet_model.best_params_)
print('best score: ', enet_model.best_score_)

Fitting 5 folds for each of 77 candidates, totalling 385 fits
[CV 3/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.254 total time= 1.7min
[CV 1/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.261 total time= 1.7min
[CV 4/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.266 total time= 1.7min
[CV 1/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.265 total time= 1.7min
[CV 3/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.250 total time= 1.8min
[CV 4/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.262 total time= 1.8min
[CV 2/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.254 total time= 1.8min
[CV 2/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.254 total time= 1.8min
[CV 5/5] END estimator__C=7.5, estimator__l1_ratio=0;, score=0.259 total time= 1.9min
[CV 5/5] END estimator__C=8, estimator__l1_ratio=0;, score=0.266 total time= 1.9min
[CV 3/5] END estimator__C=7.5, estimator__l1_ratio=0.6;, score=0.265 total time= 2.1min


In [9]:
val_pred_enet = enet_model.predict(val_features)
print('classification par elasticnet \n',classification_report(val_labels, val_pred_enet,target_names=labels_list))

classification par elasticnet 
               precision    recall  f1-score   support

           +       0.65      0.56      0.60        72
           -       0.71      0.40      0.51        63
           0       0.28      0.11      0.15        47
           i       0.74      0.73      0.74       146
           j       0.85      0.94      0.89       263
           f       0.74      0.39      0.51        36
           s       0.89      0.68      0.77        57
           p       0.90      0.76      0.82       124
           m       0.83      0.82      0.83       123
           a       0.77      0.67      0.72        72
           t       0.67      0.26      0.38        23

   micro avg       0.80      0.71      0.75      1026
   macro avg       0.73      0.57      0.63      1026
weighted avg       0.78      0.71      0.73      1026
 samples avg       0.80      0.69      0.72      1026



### Classification par gradient boosting (light-gbm)

In [10]:
parameters = [{'estimator__learning_rate': [0.05, 0.1, 0.25],
               'estimator__n_estimators': [600, 700, 800]}]
model_to_set = OneVsRestClassifier(lgb.LGBMClassifier(boosting_type='gbdt',  
                                                      objective='binary'))
lgb_model = GridSearchCV(model_to_set, param_grid=parameters, cv = cv, verbose=3, n_jobs=150)
model_to_set.get_params().keys() #pour connaitre la liste des hyper-paramètres 

dict_keys(['estimator__boosting_type', 'estimator__class_weight', 'estimator__colsample_bytree', 'estimator__importance_type', 'estimator__learning_rate', 'estimator__max_depth', 'estimator__min_child_samples', 'estimator__min_child_weight', 'estimator__min_split_gain', 'estimator__n_estimators', 'estimator__n_jobs', 'estimator__num_leaves', 'estimator__objective', 'estimator__random_state', 'estimator__reg_alpha', 'estimator__reg_lambda', 'estimator__silent', 'estimator__subsample', 'estimator__subsample_for_bin', 'estimator__subsample_freq', 'estimator', 'n_jobs'])

In [11]:
# record current time
start = time()
lgb_model.fit(train_features, train_labels)
# record current time
end = time()
# report execution time
result = end - start
print('%.3f seconds' % result)
print('Evaluation of OneVsRestClassifier/lightgbm :')
print('best parameters: ', lgb_model.best_params_)
print('best score: ', lgb_model.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 2/5] END estimator__learning_rate=0.25, estimator__n_estimators=600;, score=0.205 total time= 1.2min
[CV 1/5] END estimator__learning_rate=0.25, estimator__n_estimators=600;, score=0.235 total time= 1.3min
[CV 5/5] END estimator__learning_rate=0.25, estimator__n_estimators=700;, score=0.217 total time= 1.3min
[CV 3/5] END estimator__learning_rate=0.25, estimator__n_estimators=600;, score=0.269 total time= 1.3min
[CV 1/5] END estimator__learning_rate=0.25, estimator__n_estimators=800;, score=0.235 total time= 1.3min
[CV 4/5] END estimator__learning_rate=0.25, estimator__n_estimators=800;, score=0.194 total time= 1.3min
[CV 5/5] END estimator__learning_rate=0.25, estimator__n_estimators=600;, score=0.217 total time= 1.4min
[CV 4/5] END estimator__learning_rate=0.25, estimator__n_estimators=600;, score=0.194 total time= 1.4min
[CV 4/5] END estimator__learning_rate=0.25, estimator__n_estimators=700;, score=0.194 total time= 1.

In [12]:
val_pred_lgb = lgb_model.predict(val_features)
print('classification par lgb \n',classification_report(val_labels, 
                                                        val_pred_lgb,
                                                        target_names=labels_list))

classification par lgb 
               precision    recall  f1-score   support

           +       0.73      0.49      0.58        72
           -       0.88      0.22      0.35        63
           0       0.25      0.02      0.04        47
           i       0.75      0.75      0.75       146
           j       0.84      0.97      0.90       263
           f       0.75      0.17      0.27        36
           s       0.88      0.49      0.63        57
           p       0.84      0.70      0.77       124
           m       0.79      0.69      0.74       123
           a       0.83      0.47      0.60        72
           t       0.00      0.00      0.00        23

   micro avg       0.81      0.64      0.71      1026
   macro avg       0.68      0.45      0.51      1026
weighted avg       0.77      0.64      0.67      1026
 samples avg       0.81      0.63      0.68      1026



### Classification par random forest 

In [13]:
# Grid search for random forest 
parameters = [{#'estimator__max_features': [int(x) for x in np.linspace(start = 50, stop = 750, num = 10)],
               'estimator__max_features': [200],
               'estimator__n_estimators': [500],
               'estimator__bootstrap' : [False]}]
model_to_set = OneVsRestClassifier(RandomForestClassifier(n_jobs=150))
rf_model = GridSearchCV(model_to_set, param_grid=parameters, cv = cv, verbose=3)
# record current time
start = time()
rf_model.fit(train_features, train_labels)
# record current time
end = time()
# report execution time
result = end - start
print('%.3f seconds' % result)
print('Evaluation of OneVsRestClassifier/Gradient Boosting :')
print('best parameters: ', rf_model.best_params_)
print('best score: ', rf_model.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.155 total time=  43.1s
[CV 2/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.186 total time=  31.5s
[CV 3/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.186 total time=  33.3s
[CV 4/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.160 total time=  32.2s
[CV 5/5] END estimator__bootstrap=False, estimator__max_features=200, estimator__n_estimators=500;, score=0.163 total time=  32.5s
204.874 seconds
Evaluation of OneVsRestClassifier/Gradient Boosting :
best parameters:  {'estimator__bootstrap': False, 'estimator__max_features': 200, 'estimator__n_estimators': 500}
best score:  0.1699418135729923


In [14]:
val_pred_rf = rf_model.predict(val_features)
print('classification par RF \n',classification_report(val_labels, 
                                                       val_pred_rf,
                                                       target_names=labels_list))

classification par RF 
               precision    recall  f1-score   support

           +       0.82      0.38      0.51        72
           -       0.71      0.08      0.14        63
           0       0.00      0.00      0.00        47
           i       0.71      0.75      0.73       146
           j       0.82      0.96      0.89       263
           f       0.00      0.00      0.00        36
           s       0.89      0.30      0.45        57
           p       0.86      0.52      0.65       124
           m       0.76      0.60      0.67       123
           a       0.73      0.33      0.46        72
           t       0.00      0.00      0.00        23

   micro avg       0.79      0.56      0.65      1026
   macro avg       0.57      0.36      0.41      1026
weighted avg       0.71      0.56      0.59      1026
 samples avg       0.78      0.56      0.63      1026



### Classification par SVC

In [15]:
# Grid search for random forest 
parameters = [{'estimator__C': [0.1,1, 10, 100], 
               'estimator__gamma': [1,0.1,0.01,0.001],
               'estimator__kernel': ['rbf', 'poly', 'sigmoid']}]
model_to_set = OneVsRestClassifier(SVC())
svc_model = GridSearchCV(model_to_set, param_grid=parameters, cv=cv, verbose=3, n_jobs=150)
# record current time
start = time()
svc_model.fit(train_features, train_labels)
# record current time
end = time()
# report execution time
result = end - start
print('%.3f seconds' % result)
print('Evaluation of OneVsRestClassifier/Gradient Boosting :')
print('best parameters: ', svc_model.best_params_)
print('best score: ', svc_model.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END estimator__C=0.1, estimator__gamma=0.001, estimator__kernel=poly;, score=0.000 total time=   2.2s
[CV 2/5] END estimator__C=1, estimator__gamma=0.001, estimator__kernel=poly;, score=0.000 total time=   2.5s
[CV 2/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=sigmoid;, score=0.000 total time=   3.0s
[CV 5/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=sigmoid;, score=0.000 total time=   3.0s
[CV 2/5] END estimator__C=10, estimator__gamma=0.1, estimator__kernel=poly;, score=0.288 total time=   3.4s
[CV 2/5] END estimator__C=1, estimator__gamma=0.01, estimator__kernel=poly;, score=0.000 total time=   4.1s
[CV 2/5] END estimator__C=0.1, estimator__gamma=0.001, estimator__kernel=poly;, score=0.000 total time=   4.7s
[CV 4/5] END estimator__C=0.1, estimator__gamma=0.001, estimator__kernel=poly;, score=0.004 total time=   5.1s
[CV 3/5] END estimator__C=0.1, estimator__gamma=0.1, est

In [16]:
val_pred_svc = svc_model.predict(val_features)
print('classification par SVC \n',classification_report(val_labels, 
                                                        val_pred_svc,
                                                        target_names=labels_list))

classification par SVC 
               precision    recall  f1-score   support

           +       0.68      0.57      0.62        72
           -       0.79      0.30      0.44        63
           0       0.00      0.00      0.00        47
           i       0.76      0.77      0.77       146
           j       0.85      0.94      0.90       263
           f       0.68      0.42      0.52        36
           s       0.91      0.72      0.80        57
           p       0.90      0.77      0.83       124
           m       0.83      0.81      0.82       123
           a       0.82      0.64      0.72        72
           t       0.50      0.09      0.15        23

   micro avg       0.82      0.70      0.76      1026
   macro avg       0.70      0.55      0.60      1026
weighted avg       0.77      0.70      0.72      1026
 samples avg       0.81      0.69      0.72      1026



In [17]:
## enregistrer les différents modèles
# elasticnet
#enet_file = 'enet_model.sav'
#joblib.dump(enet_model, enet_file)
# light gbm
#lgb_file = 'lgb_model.sav'
#joblib.dump(lgb_model, lgb_file)
# random forest
#rf_file = 'rf_model.sav'
#joblib.dump(rf_model, rf_file)
# support vector machine
#svc_file = 'svc_model.sav'
#joblib.dump(svc_model, svc_file)
 