In [3]:
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import math
import pandas as pd
import pickle
import joblib
import itertools
import torch
from torch.utils.data import TensorDataset, DataLoader, Subset, SubsetRandomSampler
from torch import Generator
from torch.utils.data.dataset import random_split
# pip install transformers et pip instance sentencepiece
import transformers as ppb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import warnings
# visualisation :
import matplotlib.pyplot as plt
from datetime import datetime
import time
# General settings
allow_cuda = True
warnings.filterwarnings('ignore')
# Check whether cuda is available, and select processor or GPU
if torch.cuda.is_available() and allow_cuda:
    print('GPU (Cuda) power !')
    hardware = torch.device('cuda:0')
    print(hardware)
    torch.cuda.set_device(0)
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    torch.cuda.empty_cache()
else:
    print('CPU running')
    hardware = torch.device('cpu')
    torch.set_default_tensor_type('torch.FloatTensor')

labels_list = ['+', '-', '0', 'i', 'j', 'f', 's', 'p', 'm', 'a', 't']
labels_nb = len(labels_list)

# Fonctions communes
def print_cuda_memory_usage():
    if hardware == torch.device('cuda:0'):
        t = torch.cuda.get_device_properties(0).total_memory / 1073741824
        r = torch.cuda.memory_reserved(0) / 1048576
        a = torch.cuda.memory_allocated(0) / 1048576
        print('cuda total=', t, 'reserved=', r, 'allocated=', a, 'free=', r - a, '...quelle unité ?')

# Log a string, and store to globals the log filename
def log(line, log_file='logs/default.txt'):
    print(line)
    if not ('logfile' in globals()):
        global logfile
        logfile = log_file
        print('set logfile to ', logfile)
    with open(logfile, "a") as f:
        print(line, file=f)
        

GPU (Cuda) power !
cuda:0


In [5]:
# Stockage des donnees dans un TensorDataset, permet aussi de produire les minibatches
# dataframe pandas en entrée au lieu d'un chemin vers un fichier
class Textes_TensorDataset(TensorDataset):
    def __init__(self, textes_df, tokenizer):
        super(TensorDataset, self).__init__()
        self.df = textes_df
        self.tokenizer = tokenizer
        # get the CLS token to check it
        cls_sep_tokens = tokenizer.encode('', add_special_tokens=True)
        tokenized = self.df['phrases'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
        max_len = 0
        for i in tokenized.values:
            if len(i) > max_len:
                max_len = len(i)
        padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])
        

        self.input_ids = padded
        print('input_ids.shape', self.input_ids.shape)
        self.attention_mask = np.where(padded != 0, 1, 0)
        self.max_len = self.input_ids.shape[1]
        self.nb_textes = self.input_ids.shape[0]

        # Préparation des labels à partir des colonnes (troisième colonne)
        self.raw_labels_names = self.df.columns[2:]
        for i, row in self.df.iterrows():
            for key in labels_list:
                self.df.at[i, key] = max([row[c] for c in self.raw_labels_names if key in c])
        print('Textes_TensorDataset/__init()__ loaded', self.nb_textes, 'texts', self.max_len, 'of chars length')

    # renvoie un item : input_id, attention_mask, labels, last_hidden_states (si disponibles)
    def __getitem__(self, index):
        return self.input_ids[index], self.attention_mask[index], self.df.loc[index, labels_list].to_numpy(dtype=np.float32)

    def __len__(self):
        return self.nb_textes

#
# Class CamemBERT avec couche linear on-top
#
class CustomBertModel(torch.nn.Module):
    def __init__(self, bert_model, weights, layer_sizes):
        super(CustomBertModel, self).__init__()
        self.layer_sizes = layer_sizes

        self.bert = bert_model.from_pretrained(weights)
        self.dropout = torch.nn.Dropout(.05)
        self.linear1 = torch.nn.Linear(1 * 768, self.layer_sizes[0])
        self.nonlinear1 = torch.nn.Tanh()
        self.linear2 = torch.nn.Linear(self.layer_sizes[0], self.layer_sizes[1])
        self.nonlinear2 = torch.nn.Tanh() # attention Tanh donne de meilleurs résultats que Sigmoid
        self.linear3 = torch.nn.Linear(self.layer_sizes[1], self.layer_sizes[2])
        print('CustomBertModel/__init__ with layers on top', layer_sizes)

    # forwarde le modele, mode = 0 ne sort que la sortie par defaut, mode = 1 sort la sortie de bert aussi
    def forward(self, input_ids, attention_mask, mode = 0):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        # Ne recupere le last_hidden_state que du CLS token
        lhd = bert_output.last_hidden_state
        lhd_cls_token = lhd[:, 0, :]
        
        # Linear layer on top of BERT
        x = self.dropout(lhd_cls_token)
        x = self.nonlinear1(self.linear1(x))
        x = self.nonlinear2(self.linear2(x))
        x = self.linear3(x)
        
        if mode == 0:
            return x
        elif mode == 1:
            return x, lhd_cls_token

    # Bloque ou permet l'apprentissage sur la partie bert du modele
    def bert_training(self, authorisation, tune_depth):
        for name, p in self.named_parameters():
            p.requires_grad = False
            if ("linear1" in name) or ("linear2" in name) or ("linear3" in name): 
                p.requires_grad = authorisation
            if(tune_depth < 12) and (tune_depth >= 0) :
                for depth in np.arange(tune_depth, 12):
                    if ("bert.encoder.layer."+ str(depth) in name):
                        p.requires_grad = authorisation
                if ("bert.pooler.dense" in name) :
                    p.requires_grad = authorisation

    # Test on a batch, return loss
    def evaluate(self, x, attention_mask, expected, criterion):
        #print('EVALUATE :')
        self.eval()
        with torch.no_grad():
            output = self.forward(x, attention_mask=attention_mask)
            loss_train = criterion(output, expected)
            xx = np.rint(100 * expected[:, :].cpu().numpy())
            yy = np.rint(100 * torch.sigmoid(output[:, :]).cpu().numpy())
        return loss_train.item(), xx, yy
    # Train on a batch, return loss
    def learn(self, x, attention_mask, expected, criterion, optimizer):
        self.train()
        optimizer.zero_grad()
        output = self.forward(x, attention_mask=attention_mask)
        loss_train = criterion(output, expected)
        loss_train.backward()  # Compute the back propagation gradients
        optimizer.step()  # Setup coefficients
        return loss_train.item()

    def save(self, fn):
        torch.save(self.state_dict(), fn)

    def load(self, fn):
        self.load_state_dict(torch.load(fn))
        self.eval()



In [6]:
date_start = datetime.now().strftime("%Y%m%d_%Hh%M")
model_name = f'{date_start}'

# load model, tokenizer and weights
camembert, tokenizer, weights = (ppb.CamembertModel, ppb.CamembertTokenizer, 'camembert-base')
print('Camembert', camembert)
print('Tokenizer', tokenizer)
# Load pretrained model/tokenizer
tokenizer = tokenizer.from_pretrained(weights)

# mettre de côté un jeu de données test avec une graine du générateur 
# charger le jeu de données en TensorDataset
data_df = pd.read_csv('DonneesPedoPsy/labeled_data.csv', sep=";", encoding='cp1252')
data_df.drop('num', axis=1, inplace=True)
textes_td = Textes_TensorDataset(data_df, tokenizer)
print(textes_td.nb_textes)
text_td_train_nb = int(textes_td.nb_textes * .8)
text_td_test_nb = textes_td.nb_textes - text_td_train_nb
#train_df, test_df = train_test_split(data_df, train_size=text_td_train_nb,
#                                     test_size=text_td_test_nb, random_state=11)
# Construit des datasets d'entrainement et de test
#train_df.reset_index(inplace=True)
# ATTENTION modifier pour rendre disjoints les jeux de validation !!!!
textes_td_train, textes_td_test = random_split(textes_td, [text_td_train_nb, text_td_test_nb],
                                               generator=torch.Generator(device=hardware).manual_seed(11))
print(len(textes_td_train.indices))
print(len(textes_td_test.indices))
joblib.dump(textes_td_train.indices, 'train_indices.sav')    
joblib.dump(textes_td_test.indices, 'test_indices.sav')

Camembert <class 'transformers.models.camembert.modeling_camembert.CamembertModel'>
Tokenizer <class 'transformers.models.camembert.tokenization_camembert.CamembertTokenizer'>
input_ids.shape (1648, 345)
Textes_TensorDataset/__init()__ loaded 1648 texts 345 of chars length
1648


In [37]:
def fit_model(current_train, current_val, hyperp):
    current_train_nb = len(current_train)
    print(current_train_nb)
    current_val_nb = len(current_val)
    print(current_val_nb)
    batch_size_train = int(hyperp['bs_train'])
    batch_size_val = int(hyperp['bs_val'])
    tune_depth = int(hyperp['tune_depth'])

    # model = camembert.from_pretrained(weights)
    model = CustomBertModel(camembert, weights=weights, layer_sizes=[int(hyperp['layer_1']), int(hyperp['layer_2']), labels_nb])

    # Define optimized parameters
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum', pos_weight=torch.ones([len(labels_list)]))
    optim = torch.optim.Adam(model.parameters(), lr=hyperp['lr'].item())
    scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=1, gamma=1) # gamma décroissance de "lr"
    model.bert_training(True, tune_depth)
    # containers pour l'entrainement
    record_train_step = list()
    record_train_loss = list()
    record_val_loss = list()
    record_learning_rate = list()

    #time.sleep(1)
    # Parametres du early stopping mechanism
    epoch_min = 0
    early_stopping_patience = int(hyperp['esp'])  # Nombre de fois avant que le early stopping s'actionne
    early_stopping_count = 0
    loss_val_min, loss_train_min = 1000000, 1000000

    for epoch in range(int(hyperp['nb_epoch'])):
        record_train_step.append(epoch)
        # Entraine le modele
        loss_train = 0
        sample_done = 0
        for input_ids, attention_masks, labels in DataLoader(dataset=current_train, batch_size=batch_size_train,
                                                             shuffle=True, generator=Generator(device=hardware)):
            loss_train += model.learn(input_ids, attention_masks, labels, loss_fn, optim)
            sample_done += batch_size_train
       
    
        loss_train /= current_train_nb
        record_train_loss.append(loss_train)
        record_learning_rate.append(scheduler.get_last_lr())
        scheduler.step()
        #Valide sur les donnees de validation x, attention_mask, expected, criterion, optimizer
        loss_val = 0
        sample_done = 0
        sklearn_ytrue, sklearn_ypred = [], []
        for input_ids, attention_masks, labels in DataLoader(dataset=current_val, batch_size=batch_size_val,
                                                             shuffle=False):
            loss_val_batch, xx, yy = model.evaluate(input_ids, attention_masks, labels, loss_fn)
            loss_val += loss_val_batch
            sample_done += batch_size_val
            # To compute Precision/recall msklearn metrics
            sklearn_ytrue.append(labels.cpu().numpy())
            sklearn_ypred.append(yy > 50)
            
        loss_val /= current_val_nb
        record_val_loss.append(loss_val)
        #
        sklearn_ytrue = np.vstack(sklearn_ytrue)
        sklearn_ypred = np.vstack(sklearn_ypred)
        if loss_train_min > loss_train:
            loss_train_min = loss_train
        # Early stopping
        if loss_val_min < loss_val:
            early_stopping_count += 1
            if early_stopping_count >= early_stopping_patience:
                break
        else:
            early_stopping_count = 0
            epoch_min = epoch
            loss_val_min = loss_val
    model.save('Models/adopsy_fine_tuning_train_data.m3')  
    return model

In [38]:
unpickled_df = pd.read_pickle("repeated_5folds_cv.pkl")
m = np.zeros(len(unpickled_df[0]["cv_error"]))
for i in range(len(unpickled_df)) : 
     m=m + unpickled_df[i]["cv_error"]
mymin =  np.argmin(m/len(unpickled_df)) 
hyper_param_optim = unpickled_df[0].iloc[[mymin]]
#print(hyper_param_optim['lr'].item())
model = fit_model(current_train=textes_td_train, current_val=textes_td_test, hyperp=hyper_param_optim)

1318
330


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CustomBertModel/__init__ with layers on top [200, 110, 11]


In [39]:
def predict(datatest, model, batch_size=64):
    batch_size = len(datatest)
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum', pos_weight=torch.ones([len(labels_list)]))
    ytrue, ypred = [], []
    for input_ids, attention_masks, labels in DataLoader(dataset=datatest, batch_size=batch_size,
                                                             shuffle=False):
        loss_val_batch, xx, yy = model.evaluate(input_ids, attention_masks, labels, loss_fn)
        #loss_val += loss_val_batch
        #sample_done += batch_size_val
        # To compute Precision/recall msklearn metrics
        ytrue.append(labels.cpu().numpy())
        ypred.append(yy > 50)
    return np.array(ytrue)[0], np.array(ypred).astype(float)[0]

In [51]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
ytrue, ypred = predict(textes_td_test, model, 128)

print('classification par fine tuning \n',classification_report(ytrue, ypred,target_names=labels_list))
f1_score(ytrue, ypred, average="weighted")  

classification par fine tuning 
               precision    recall  f1-score   support

           +       0.67      0.82      0.74        72
           -       0.85      0.37      0.51        63
           0       0.40      0.38      0.39        47
           i       0.78      0.81      0.79       146
           j       0.86      0.97      0.91       263
           f       0.91      0.89      0.90        36
           s       0.97      0.98      0.97        57
           p       0.97      0.93      0.95       124
           m       0.94      0.93      0.93       123
           a       0.91      0.86      0.89        72
           t       0.78      0.61      0.68        23

   micro avg       0.84      0.84      0.84      1026
   macro avg       0.82      0.78      0.79      1026
weighted avg       0.85      0.84      0.84      1026
 samples avg       0.83      0.81      0.81      1026



0.8376288298503156