# Training with BERT and pytorch

In [10]:
# Google Colab only: mount Google Drive and make the notebook folder the working directory,
# so that the relative paths used further down (e.g. "data/quora-question-pairs") resolve.
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/ColabNotebooks

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ColabNotebooks


In [2]:
# !pip install transformers
# For pyyaml issues on google colab
# !pip install pyyaml==5.4.1

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 6.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 28.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 77.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transforme

In [15]:
import torch
import numpy as np
import pandas as pd
import os
import statistics
import random
from datetime import timedelta
import time
from transformers import BertTokenizer, BertForPreTraining, BertModel
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch.nn as nn
import plotly
import plotly.express as px
from transformers.file_utils import is_torch_available
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import KFold

# Set up GPU for training

Go to Runtime > Change runtime type and select GPU

In [2]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [3]:
# Show which device was selected in the previous cell
print(device)

cpu


# Utility functions

In [4]:
def fix_dataset(dataset):
    """Drop the rows whose 'question1' or 'question2' value is not a string.

    The rows that get removed are printed first.

    Args:
        dataset (pd.DataFrame): frame with the columns 'question1' and 'question2'.

    Returns:
        pd.DataFrame: the input frame itself when every row is valid, otherwise a
        new frame without the offending rows.
    """
    # True where the cell is NOT a str (e.g. NaN)
    q1_invalid = np.array([not isinstance(q, str) for q in dataset['question1'].tolist()])
    q2_invalid = np.array([not isinstance(q, str) for q in dataset['question2'].tolist()])
    bad_rows = dataset[q1_invalid | q2_invalid].index

    # Nothing to remove: hand the frame back untouched
    if len(bad_rows) == 0:
        print("All rows are corrects")
        return dataset

    print("Removing the following lines: ")
    print(dataset.loc[bad_rows])
    return dataset.drop(bad_rows)

def set_seed(seed: int):
    """
    Seed the ``random``, ``numpy`` and (when torch is available) ``torch`` generators for reproducible behavior.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    np.random.seed(seed)
    random.seed(seed)

    if not is_torch_available():
        return

    # Seed the default torch generator, then the generators of all CUDA devices
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Fix the seed once for the rest of the notebook
set_seed(1024)

## Import data

In [5]:
# Paths & file names (relative to the current working directory)

data_path = "data/quora-question-pairs"
train_file = "train.csv"
test_pos_file = "test.csv"
label_file = "sample_submission.csv"

# Read the three CSV files, using their first column as the index
train = pd.read_csv(os.path.join(data_path, train_file), index_col = 0)
test_pos = pd.read_csv(os.path.join(data_path, test_pos_file), index_col = 0)
y_label = pd.read_csv(os.path.join(data_path, label_file), index_col = 0)

# Drop the rows where question1 or question2 is not a string (NaN values)
train = fix_dataset(train)
test_pos = fix_dataset(test_pos)

# Attach the labels to the test questions (left join on 'test_id')
test_pos = test_pos.join(y_label, on = 'test_id', how = 'left')

# The test set contains only positive labels: shuffle question1 to create negative examples
test_neg = test_pos.copy()
test_neg['question1'] = np.random.permutation(test_neg['question1'])
test_neg['is_duplicate'] = 0

# Final test set: the positive pairs first, then the shuffled (negative) pairs
test = pd.concat([test_pos, test_neg], ignore_index = True)

# Reset indexes so that both frames are indexed 0..n-1 (the Dataset classes look rows up by that index)
train.reset_index(drop = True, inplace = True)
test.reset_index(drop = True, inplace = True)


Removing the following lines: 
          qid1    qid2                         question1  \
id                                                         
105780  174363  174364    How can I develop android app?   
201841  303951  174364  How can I create an Android app?   
363362  493340  493341                               NaN   

                                                question2  is_duplicate  
id                                                                       
105780                                                NaN             0  
201841                                                NaN             0  
363362  My Chinese name is Haichao Yu. What English na...             0  
Removing the following lines: 
                                           question1  \
test_id                                                
379205      How I can learn android app development?   
817520   How real can learn android app development?   
943911                          How app dev

## Tokenization 

See (https://paperswithcode.com/method/wordpiece)

In [8]:
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# Ids of the special tokens used by the tokenizer
print("Token id for [CLS]: " + str(tokenizer.cls_token_id))
print("Token id for [SEP]: " + str(tokenizer.sep_token_id))
print("Token id for [PAD]: " + str(tokenizer.pad_token_id))
print("Token id for [UNK]: " + str(tokenizer.unk_token_id))
print("Token id for [MASK]: " + str(tokenizer.mask_token_id))

# Round trip (encode then decode) on the first training question
print("Original sentense: " + train.loc[0, 'question1'])
print("Encoded sentense: ")
enc = tokenizer.encode(train.loc[0, 'question1'])
print(enc)
print("Decoded sentense: ")
dec = tokenizer.decode(enc)
print(dec)

# Token count of every training question taken on its own (question1 and question2 together)
list_len = []
all_s = train['question1'].tolist() + train['question2'].tolist()
for s in all_s:
    tks = tokenizer.encode(s)
    list_len.append(len(tks))

max_len = max(list_len)

print(f"La taille maximale de tokens est {max_len} (avec les [CLS] et [SEP])")
# Percentage of single questions that fit in 64 tokens
lw_64 = round((sum([l <= 64 for l in list_len])/len(list_len)) * 100, 2) 
print(f"Il y a {lw_64}% des phrases tokenised qui sont <= 64. C'est suffisant, on supprimera celle plus grande du dataset")

# Token count of every (question1, question2) pair encoded as a single sequence
list_len2 = []
for i in range(train.shape[0]):
    tks = tokenizer.encode(train['question1'][i], train['question2'][i])
    list_len2.append(len(tks))

max_len2 = max(list_len2)
print(f"La taille maximale de tokens avec paires mergée est {max_len2} (avec les [CLS] et [SEP])")
# Percentage of merged pairs that fit in 64 and in 128 tokens
lw_64 = round((sum([l <= 64 for l in list_len2])/len(list_len2)) * 100, 2)
lw_128 = round((sum([l <= 128 for l in list_len2])/len(list_len2)) * 100, 2)
print(f"Il y a {lw_64}% des paires de phrases mergées tokenised qui sont <= 64. C'est suffisant, on supprimera celle plus grande du dataset")
# This message reports the 128-token threshold (it previously said "<= 64" while printing lw_128)
print(f"Il y a {lw_128}% des paires de  phrases mergées tokenised qui sont <= 128. C'est suffisant, on supprimera celle plus grande du dataset")


Token id for [CLS]: 101
Token id for [SEP]: 102
Token id for [PAD]: 0
Token id for [UNK]: 100
Token id for [MASK]: 103
Original sentense: What is the step by step guide to invest in share market in india?
Encoded sentense: 
[101, 2054, 2003, 1996, 3357, 2011, 3357, 5009, 2000, 15697, 1999, 3745, 3006, 1999, 2634, 1029, 102]
Decoded sentense: 
[CLS] what is the step by step guide to invest in share market in india? [SEP]
La taille maximale de tokens est 286 (avec les [CLS] et [SEP])
Il y a 99.88% des phrases tokenised qui sont <= 64. C'est suffisant, on supprimera celle plus grande du dataset
La taille maximale de tokens avec paires mergée est 330 (avec les [CLS] et [SEP])
Il y a 97.91% des paires de phrases mergées tokenised qui sont <= 64. C'est suffisant, on supprimera celle plus grande du dataset
Il y a 99.97% des paires de  phrases mergées tokenised qui sont <= 64. C'est suffisant, on supprimera celle plus grande du dataset


# Data loading

See https://pytorch.org/tutorials/beginner/basics/data_tutorial.html for documentation about the Dataset and Dataloader creation.

In [7]:
class SiameseNetWorkSentenceDataset(Dataset):
    """
    Dataset that yields the two questions of a pair tokenized separately (one encoding per question).

    Args:
        data (pd.DataFrame): dataframe with the columns 'question1' and 'question2' along with the label 'is_duplicate'.
        tokenizer: the BERT tokenizer, such as: BertTokenizer.from_pretrained('bert-base-uncased').
        max_length (int): length of every token vector (default 64). Shorter vectors are padded to max_length
            with the [PAD] token and longer ones are truncated. The size includes the start [CLS] and end [SEP] tokens.
    """

    # Entries of the tokenizer output that carry a leading batch dimension of size 1
    _TENSOR_KEYS = ("input_ids", "token_type_ids", "attention_mask")

    def __init__(self, data, tokenizer, max_length = 64):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of question pairs (rows of the dataframe)."""
        return self.data.shape[0]

    @classmethod
    def _drop_batch_dim(cls, tks):
        """Remove, in place, the leading batch dimension of the tensors held by a tokenizer output.

        With return_tensors = 'pt' the tokenizer returns 2-D tensors because it can handle a list of
        sentences. Only one sentence is tokenized at a time here, so that extra dimension is removed;
        the DataLoader adds the batch dimension back when it stacks the samples.
        Only dim 0 is squeezed, so the sequence dimension is kept whatever max_length is.

        Args:
            tks: tokenizer output exposing its tensors through the `data` dict.

        Returns:
            The same object, for convenience.
        """
        for key in cls._TENSOR_KEYS:
            tks.data[key] = tks.data[key].squeeze(0)
        return tks

    def _encode(self, sentence):
        """Tokenize one sentence, padded / truncated to max_length."""
        tokens = self.tokenizer(text = sentence, max_length = self.max_length, padding = 'max_length', truncation = True, return_tensors = 'pt')
        return self._drop_batch_dim(tokens)

    def __getitem__(self, index):
        """Return (tokens of question1, tokens of question2, label) for the index-th row.

        Rows are addressed by position (iloc), so the dataset also works when the dataframe
        index is not 0..n-1 (e.g. after filtering without reset_index).
        """
        s1 = self.data['question1'].iloc[index]
        s2 = self.data['question2'].iloc[index]
        label = torch.tensor(self.data['is_duplicate'].iloc[index])

        tokens1 = self._encode(s1)
        tokens2 = self._encode(s2)

        return tokens1, tokens2, label


class BERTSentencesClassificationDataset(Dataset):
    """
    Dataset that yields each pair of questions tokenized together as ONE sequence.

    Args:
        data (pd.DataFrame): dataframe with the columns 'question1' and 'question2' along with the label 'is_duplicate'.
        tokenizer: the BERT tokenizer, such as: BertTokenizer.from_pretrained('bert-base-uncased').
        max_length (int): length of the token vector of the merged pair (default 64). Shorter vectors are
            padded to max_length and longer ones are truncated.
    """

    # Entries of the tokenizer output that carry a leading batch dimension of size 1
    _TENSOR_KEYS = ("input_ids", "token_type_ids", "attention_mask")

    def __init__(self, data, tokenizer, max_length = 64):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of question pairs (rows of the dataframe)."""
        return self.data.shape[0]

    @classmethod
    def _drop_batch_dim(cls, tks):
        """Remove, in place, the leading batch dimension of the tensors held by a tokenizer output.

        With return_tensors = 'pt' the tokenizer returns 2-D tensors because it can handle a list of
        inputs. Only one pair is tokenized at a time here, so that extra dimension is removed;
        the DataLoader adds the batch dimension back when it stacks the samples.
        Only dim 0 is squeezed, so the sequence dimension is kept whatever max_length is.

        Args:
            tks: tokenizer output exposing its tensors through the `data` dict.

        Returns:
            The same object, for convenience.
        """
        for key in cls._TENSOR_KEYS:
            tks.data[key] = tks.data[key].squeeze(0)
        return tks

    def __getitem__(self, index):
        """Return (tokens of the merged pair, label) for the index-th row.

        Rows are addressed by position (iloc), so the dataset also works when the dataframe
        index is not 0..n-1 (e.g. after filtering without reset_index).
        """
        s1 = self.data['question1'].iloc[index]
        s2 = self.data['question2'].iloc[index]

        # Passing both questions makes the tokenizer build a single sequence out of the pair
        t = self.tokenizer(s1, s2, max_length = self.max_length, padding = 'max_length', truncation = True, return_tensors = 'pt')
        self._drop_batch_dim(t)

        label = torch.tensor(self.data['is_duplicate'].iloc[index])

        return t, label


In [8]:
# Build both datasets on the training frame and inspect one sample (index 10) and the length of each.
# NOTE: the tokenizer is loaded again here, so this cell does not depend on the Tokenization section.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = SiameseNetWorkSentenceDataset(data = train, tokenizer = tokenizer, max_length = 64)
print(train_dataset[10])
print(len(train_dataset))

# Same check for the dataset that merges the two questions into one sequence
train_dataset_2 = BERTSentencesClassificationDataset(data = train, tokenizer = tokenizer, max_length = 64)
print(train_dataset_2[10])
print(len(train_dataset_2))

({'input_ids': tensor([  101,  4118,  2000,  2424,  8745,  1997, 29199,  2478, 10424,  2229,
        11877, 12170, 18098,  2964,  1029,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}, {'inpu

In [9]:
# Shuffled loaders that group the samples of each dataset into batches of 8
dataloader = DataLoader(train_dataset, batch_size = 8, shuffle = True, num_workers = 0)
dataloader_2 = DataLoader(train_dataset_2, batch_size = 8, shuffle = True, num_workers = 0)

# Dataset details

Each input in the training/test dataset is composed of question1, question2, and a label.
The Dataset is shuffled and divided into batches of size *batch_size* by the DataLoader.

## SiameseNetWorkSentenceDataset

For the siamese network BERT model, a batch is a tuple of 3 elements: (*batch_question_1*, *batch_question_2*, *label*).

For the *batch_size* pair of questions in the created batch: 
- *batch_question_1*: contains the BERT-tokenizer's output of each question1
- *batch_question_2*: contains the BERT-tokenizer's output of each question2
- *label* indicates if the two questions are duplicated 

*batch_question_1* and *batch_question_2* have the same structure: each one is a dictionary with 3 entries:

- "input_ids": torch.LongTensor of shape (*batch_size*, *sequence_length*) which contains the indices of the question tokens in the vocabulary, with [CLS] and [SEP] at the beginning and end of the sentence and the [PAD] token for the remaining padding (up to *max_length*).

- "token_type_ids": torch.LongTensor of shape (*batch_size*, *sequence_length*) which contains the segment token indices indicating the first and second portions of the inputs. Indices are selected in [0, 1]. Always 0 here as the two questions are treated separately in the siamese network.

- "attention_mask": torch.LongTensor of shape (*batch_size*, *sequence_length*) which contains the mask used to avoid performing attention on padding token indices. Mask values are selected in [0, 1].

When calling the model SiameseBERTNet, *batch_question_1* is fed in the 'left' part of the siamese and *batch_question_2* is fed in the right part of the siamese. In each siamese, we compute for each question the average of the last hidden layer along each 'real' token of the question, so that we don't average with [PAD], [CLS] and [SEP] tokens. To avoid processing data in the model, we directly pass to it only the tensors it needs: model(Q1_input_ids, Q1_attention_mask, Q2_input_ids, Q2_attention_mask)
At the end, for each batch of *batch_size* pairs of questions sent to the siamese network, it returns a tuple with the two sets of averaged vectors for question1 and question2, noted (OUT1, OUT2), each of shape (*batch_size*, dmodel). dmodel of BERT is 768. The i-th row of OUT1 corresponds to the averaged output hidden layer for the i-th question1 in *batch_question_1*.

In the loss function, we want to minimize the distance between the 2 averaged vectors OUT1[i, ] and OUT2[i, ] if the questions are duplicates and maximize the distance if they are not.

## BERTSentencesClassification

For BERTSentence classification we choose to directly use the [CLS] token as a predictor like it is done for the NextSentencePrediction task.
Each batch is a tuple of 2 elements (*batch_pair_of_questions*, *label*).

*batch_pair_of_questions* is a dictionary like *batch_question_1* or *batch_question_2*, with each element (input_ids, token_type_ids, attention_mask) of shape (*batch_size*, *sequence_length*), except that here the two questions have been concatenated into one sequence, such as: [CLS] [... TOKEN Q1 ...] [SEP] [... TOKEN Q2 ...] [SEP]. In this case the *token_type_ids* matrix is important as we have to distinguish the two sentences in the embedding. The advantage of this approach is that the attention of each token is computed over all the tokens of the sequence, including those of the other question, while the siamese network computes the averaged output token independently for each question. We call the model with model(Qpair_input_ids, Qpair_tokens_type_ids, Qpair_attention_mask)

After being fed into the BERT model, the output vector corresponding to the [CLS] token, namely pooler_output, is sent into a dropout, a linear layer with two output dimensions and finally a softmax. The goal is to predict whether the two concatenated questions are duplicates or not. 

On pourrait aussi tester de mettre un linear layer avec 1 dim en sortie suivi d'une activation de type sigmoïde pour prédire entre 0 et 1 (une tanh renverrait des valeurs entre -1 et 1).


# Model

In [10]:
class SiameseBERTNet(nn.Module):
    """Siamese network: the same BERT encoder embeds both questions of a pair.

    Each question is represented by the mean of its last-hidden-state vectors,
    optionally leaving the [CLS] and [SEP] positions out of the average.

    Args:
        noCLSpooling (bool): exclude the first position ([CLS]) from the average.
        noSEPpooling (bool): exclude the positions holding the [SEP] token from the average.
        freeze_embedding (bool): disable gradients for the BERT embeddings.
        freeze_encoder_layer (int or False): number of leading encoder layers whose parameters are frozen
            (used as a slice bound, so passing True freezes the first layer only). False freezes none.
        freeze_cls_pooler (bool): disable gradients for the BERT pooler.
        sep_token_id (int): vocabulary id of the [SEP] token (102 for 'bert-base-uncased').
    """

    def __init__(self, noCLSpooling = True, noSEPpooling = True, freeze_embedding = False, freeze_encoder_layer = False, freeze_cls_pooler = False, sep_token_id = 102):
        super(SiameseBERTNet, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.HS = self.bert.config.hidden_size
        self.noCLSpooling = noCLSpooling
        self.noSEPpooling = noSEPpooling
        self.sep_token_id = sep_token_id

        if freeze_embedding:
            self.bert.embeddings.requires_grad_(False)

        if freeze_encoder_layer:
            # Set the requires_grad attribute of the parameters of the first 'freeze_encoder_layer' layers to False
            for layer in self.bert.encoder.layer[:freeze_encoder_layer]:
                layer.requires_grad_(False)

        if freeze_cls_pooler:
            self.bert.pooler.requires_grad_(False)

    def forward_siamese(self, input_ids, attention_mask):
        """Embed one batch of questions by mean pooling the last hidden layer.

        Args:
            input_ids (tensor): token ids, shape (batch_size, sequence_length).
            attention_mask (tensor): 1 for real tokens and 0 for padding, same shape. It is not modified.

        Returns:
            avg (tensor): mean of the last hidden layer vectors over the pooled tokens, one row per question.
            A question left with no token to pool (e.g. an empty question once [CLS] and [SEP] are
            excluded) gets a zero vector instead of NaN.
        """
        # Apply BERT and extract last_hidden_state
        out = self.bert(input_ids = input_ids, attention_mask = attention_mask)
        last_hidden_state = out.last_hidden_state

        # Work on a copy, because positions may be removed from the pooling below
        pooling_mask = attention_mask.clone()

        # If the CLS output vector should not participate in average pooling
        if self.noCLSpooling:
            pooling_mask[:, 0] = 0

        # If the SEP output vector should not participate in average pooling
        if self.noSEPpooling:
            pooling_mask = pooling_mask.masked_fill(input_ids == self.sep_token_id, 0)

        # (batch, seq, 1) mask in the dtype of the hidden states; broadcasting covers the hidden dimension
        weights = pooling_mask.unsqueeze(-1).to(last_hidden_state.dtype)

        # Sum the vectors of the pooled tokens only
        sum_by_tks = torch.sum(last_hidden_state * weights, dim = 1)

        # Number of pooled tokens per question; clamped to 1 so that an empty selection does not divide by zero
        norm = torch.clamp(torch.sum(weights, dim = 1), min = 1)

        # Compute average
        avg = torch.div(sum_by_tks, norm)

        return avg

    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2):
        """Embed both questions of each pair with the shared encoder.

        The Dataset class is left unchanged: the DataLoader output is unpacked by the caller,
        which passes only the tensors needed here.

        Returns:
            (out1, out2): the pooled embeddings of the first and of the second questions.
        """
        out1 = self.forward_siamese(input_ids_1, attention_mask_1)
        out2 = self.forward_siamese(input_ids_2, attention_mask_2)

        return out1, out2

class BERTSentencesClassification(nn.Module):
    """BERT encoder followed by a small classification head on the pooled [CLS] output.

    Args:
        n_outputs (int): number of outputs of the head (default 1).
            - 1: the head ends with a sigmoid and returns one probability per pair, shape (batch_size, 1).
              A softmax over a single logit is always exactly 1.0, so it cannot be used here.
            - 2 or more: the head ends with a softmax over the outputs, shape (batch_size, n_outputs).
    """

    def __init__(self, n_outputs = 1):
        super(BERTSentencesClassification, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.HS = self.bert.config.hidden_size
        self.out = n_outputs

        # Final activation: sigmoid for a single output, softmax across the outputs otherwise
        activation = nn.Sigmoid() if self.out == 1 else nn.Softmax(dim = 1)

        self.classifier = nn.Sequential(
            nn.Dropout(p = 0.2),
            nn.Linear(in_features = self.HS, out_features = self.out, bias = True),
            activation
        )

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Classify a batch of merged question pairs.

        Args:
            input_ids, token_type_ids, attention_mask (tensor): tokenizer outputs for the merged pairs;
                token_type_ids is passed to BERT because each sequence holds two sentences.

        Returns:
            (classification, cls_token): the output of the classification head and the BERT
            pooler_output it was computed from.
        """
        out = self.bert(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
        cls_token = out.pooler_output
        classification = self.classifier(cls_token)

        return classification, cls_token


In [11]:
def check_params(model):
    """Print a table of the model parameters followed by the trainable / non-trainable totals.

    One line is printed per named parameter: its name, its number of elements and
    whether it requires a gradient.

    Args:
        model (nn.Module): the model to inspect.
    """
    # Number of elements, keyed by the requires_grad flag of the parameters
    totals = {True: 0, False: 0}

    for name, param in model.named_parameters():
        size = param.numel()
        trainable = param.requires_grad
        print(f"{name:>60} | {size:>9} | {str(trainable):>6}")
        totals[trainable] += size

    print("Total number of trainaible parameters: " + str(totals[True]))
    print("Total number of non-trainable parameters: " + str(totals[False]))

also see https://skimai.com/fine-tuning-bert-for-sentiment-analysis/ for tips

In [12]:
# Instantiate both architectures; each one loads the pretrained 'bert-base-uncased' encoder
model = SiameseBERTNet()
model2 = BERTSentencesClassification()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predicti

Pour plus de détails sur le WARNING "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel:", see: https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/model#transformers.PreTrainedModel.from_pretrained et autres stackoverflow

Instantiate a pretrained pytorch model from a pre-trained model configuration.

The model is set in evaluation mode by default using model.eval() (Dropout modules are deactivated). To train the model, you should first set it back in training mode with model.train().

The warning Weights from XXX not initialized from pretrained model means that the weights of XXX do not come pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning task. Ici c'est quand par exemple je monte un modèle avec une head de classification à la suite de mon token [CLS], mais que lorsque j'appelle la fonction from_pretrained("bert-uncased") celle-ci n'ayant pas ce layer, elle ne peut pas me l'initialiser avec des poids du modèle pré-trainded, les poids fixés sont alors random

The warning Weights from XXX not used in YYY means that the layer XXX is not used by YYY, therefore those weights are discarded. Là c'est tout simplement quand dans mon modèle pretrainded que je veux utiliser pour initialiser mes poids (que je monte avec .from_pretrained(XXX)), celui-ci contient des layers qui n'existent pas dans le type de modèle que je suis en train de monter. Par exemple je souhaite moner un modèle avec une architecture sans head, si je le load à partir d'un modèle pretrained qui a des heads, tout les layers correspondants aux heads seront discarded car je n'en aurais pas besoin dans l'archi que je monte.


Donc pour notre warning: Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'. En fait cela nous informe que certains layers notamment ceux associés à la head de tranformation/classification du token [CLS] de "bert-base-uncased'" sont discarded et ne seront pas utiliser pour initialiser les poids de notre modèle, qui est un BertModel, car on a tout simplement pas ces layers.

For details about the last 'real' layer BertPooler see:  https://github.com/huggingface/transformers/issues/782 and https://github.com/google-research/bert/issues/43. Il s'agit d'une tranformation appliquée uniquement au token [CLS]. L'article ne détaille pas tout mais en réalité il y a donc un layer linéaire de transformation appliquée sur le token CLS avant de l'envoyé dans un linear layer de classification/softmax.

In [11]:
# Smoke test: push one batch through each model.
# Siamese batch = (tokens of question1, tokens of question2, label): only ids and attention masks are passed
s1 = next(iter(dataloader))
r = model(s1[0]["input_ids"], s1[0]["attention_mask"], s1[1]["input_ids"], s1[1]["attention_mask"])
# Classification batch = (tokens of the merged pair, label): token_type_ids is passed as well
s2 = next(iter(dataloader_2))
r_2 = model2(s2[0]["input_ids"], s2[0]["token_type_ids"], s2[0]["attention_mask"])

# Test BERT embedding without fine-tuning

In [14]:
# Extract a set of sample sentences from the test set:
# the first 10 rows come from the positive pairs, the last 10 from the shuffled (negative) pairs

positives_s = test.head(n = 10)
negatives_s = test.tail(n = 10)

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
all_s = pd.concat([positives_s, negatives_s], ignore_index = True)
all_s.to_csv("data/sample.csv", index = False)
d = SiameseNetWorkSentenceDataset(data = all_s, tokenizer = tokenizer, max_length = 64)
l = DataLoader(d, batch_size = 20, shuffle = False, num_workers = 0)
sample = next(iter(l))

# Inference only: no gradient is needed, so do not build the autograd graph
with torch.no_grad():
    res = model(sample[0]["input_ids"], sample[0]["attention_mask"], sample[1]["input_ids"], sample[1]["attention_mask"])
# Stack the embeddings: rows 0-19 are the question1 vectors, rows 20-39 the question2 vectors
res = torch.cat([res[0], res[1]], axis = 0)

# Compute PCA (1) and project the embeddings on the first two components
U, S, V = torch.pca_lowrank(res, niter = 50)
proj = torch.matmul(res, V[:, :2])

plot_data = pd.DataFrame(proj.detach().numpy())
# Same order as the rows of res: all the question1 first, then all the question2
allQ = (all_s["question1"].tolist() + all_s["question2"].tolist())
plot_data["question"] = allQ
plot_data = plot_data.rename({0: 'X', 1: 'Y'}, axis=1)

#TODO give each 'duplicated' pair of questions its own colour (10 colours for the 10 positive pairs) and draw all the non-duplicated ones in one other colour (e.g. black)

fig = px.scatter(plot_data, x = "X", y = "Y", hover_data=['question'])
fig.show()

# Compute the matrix of Euclidean distances between all the embeddings:
D = torch.cdist(res.double(), res.double(), p = 2)
D = D.detach().numpy().round(5)
# Get the closest sentence for the positive examples:
for i in range(10):
    # Position 0 of the sorted distances is expected to be the sentence itself (distance 0), so take position 1
    closest = np.argsort(D[i])[1]
    # allQ[i + 20] is the question2 that is paired with question1 number i
    print(allQ[i]  + " -> closest: " + allQ[closest] + ". Real label is: " + allQ[i + 20])

df = pd.DataFrame(D)

df.to_csv("data/NormEucDistanceMatrix_1.csv", index = False, header = False)


# For the second model:
d_2 = BERTSentencesClassificationDataset(data = all_s, tokenizer = tokenizer, max_length = 64)
l_2 = DataLoader(d_2, batch_size = 20, shuffle = False, num_workers = 0)
sample_2 = next(iter(l_2))

with torch.no_grad():
    res_2 = model2(sample_2[0]["input_ids"], sample_2[0]["token_type_ids"], sample_2[0]["attention_mask"])

# Compute PCA (2) on the pooled [CLS] vectors (second element returned by the model)
U2, S2, V2 = torch.pca_lowrank(res_2[1], niter = 50)
proj_2 = torch.matmul(res_2[1], V2[:, :2])

plot_data_2 = pd.DataFrame(proj_2.detach().numpy())
plot_data_2["question"] = "Q1: " + all_s["question1"] + "\nQ2: " + all_s["question2"]
plot_data_2["label"] = all_s["is_duplicate"]
plot_data_2 = plot_data_2.rename({0: 'X', 1: 'Y'}, axis=1)
# Labels as strings so that plotly colours them as categories
plot_data_2 = plot_data_2.astype(dtype = {'label': 'str'}, copy = True)

fig_2 = px.scatter(plot_data_2, x = "X", y = "Y", color = "label", hover_data=['question'])
fig_2.show()

How does the Surface Pro himself 4 compare with iPad Pro? -> closest: What are the best Android games of all effectively?. Real label is: Why did Microsoft choose core m3 and not core i3 home Surface Pro 4?
Should I have a hair transplant at age 24? How much would it cost? -> closest: How much cost does hair transplant require?. Real label is: How much cost does hair transplant require?
What but is the best way to send money from China to the US? -> closest: What you send money to China?. Real label is: What you send money to China?
Which food not emulsifiers? -> closest: What foods fibre?. Real label is: What foods fibre?
How "aberystwyth" start reading? -> closest: What are the best and worst things examination public transit in Visakhapatnam, Andhra Pradesh, India? How could it be improved?. Real label is: How their can I start reading?
How are the two wheeler insurance from Bharti Axa insurance? -> closest: By scrapping the 500 and 1000 rupee notes, how is RBI planning to fight aga

# Loss functions

In [13]:
class ConstrastiveLoss(nn.Module):
    """Contrastive loss on the pairwise distance between two batches of embeddings.

    Pairs labelled 1 are penalised by their squared distance; pairs labelled 0 are
    penalised only while their distance is smaller than the margin.

    Args:
        m (float): the margin applied to the pairs labelled 0.
        p (float): the norm degree of the pairwise distance.
    """

    def __init__(self, m = 4, p = 2):
        super(ConstrastiveLoss, self).__init__()
        self.m = m
        self.p = p
        self.pdist = nn.PairwiseDistance(p = self.p)

    def forward(self, outQ1, outQ2, y):
        """Return the loss averaged over the batch; y holds 1 for duplicated pairs and 0 otherwise."""
        dist = self.pdist(outQ1, outQ2)

        # Duplicated pairs: pull the two embeddings together
        pull_term = y * 1/2 * torch.pow(dist, 2)

        # Non-duplicated pairs: push them apart until the margin is reached
        margin_gap = torch.clamp(self.m - dist, min = 0)
        push_term = (1 - y) * 1/2 * torch.pow(margin_gap, 2)

        return torch.mean(pull_term + push_term)

# How autograd works
https://towardsdatascience.com/pytorch-autograd-understanding-the-heart-of-pytorchs-magic-2686cd94ec95
https://pytorch.org/docs/stable/autograd.html
https://www.youtube.com/watch?v=MswxJw-8PvE
https://github.com/pytorch/pytorch/blob/master/docs/source/notes/autograd.rst
https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#differentiation-in-autograd
https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/autograd_tutorial.py
https://pytorch.org/docs/1.9.1/generated/torch.Tensor.backward.html

# Training 
scheduler warmup info: https://datascience.stackexchange.com/questions/55991/in-the-context-of-deep-learning-what-is-training-warmup-steps/60028#60028, https://stackoverflow.com/questions/60120043/optimizer-and-scheduler-for-bert-fine-tuning, https://huggingface.co/docs/transformers/main_classes/optimizer_schedules

Others helping resources: https://skimai.com/fine-tuning-bert-for-sentiment-analysis/, https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html, https://mccormickml.com/2019/07/22/BERT-fine-tuning/#43-training-loop

# Cross validation
https://www.machinecurve.com/index.php/2021/02/03/how-to-use-k-fold-cross-validation-with-pytorch/#model-imports
https://datascience.stackexchange.com/questions/52632/cross-validation-vs-train-validate-test/52643
https://scikit-learn.org/stable/modules/cross_validation.html
https://www.machinecurve.com/index.php/2021/02/03/how-to-use-k-fold-cross-validation-with-pytorch/#model-imports


Au niveau du training_logs, on fait une évaluation initiale à t0 (sans aucun training), puis une tous les n batchs et à la fin de chaque epoch.

Attention, la training loss est calculée à chaque *step* sur les n derniers batchs -> la loss est donc calculée sur une sous-partie du training dataset. L'eval.loss est en revanche calculée à chaque fois sur tout le set d'évaluation. Donc, la training loss renvoyée à la fin de chaque epoch est la moyenne des training loss récupérées au cours du training (tous les n batchs), mais l'eval.loss est simplement recalculée à partir du modèle qui vient d'être entraîné sur l'epoch et n'est donc pas une moyenne des eval.loss précédentes.

In [14]:
# 1) Initialization:

def init_model(model, dataloader, nepochs):
    """Initialize optimzer and scheduler for training

    Args:
        model (nn.Module): the model to train
        dataloader (torch.utils.data.dataloader.DataLoader): the training dataLoader
        nepochs (int): number of epochs for training 

    Returns:
        [transformers.AdamW, torch.optim.lr_scheduler.LambdaLR]: the parametrized optimizer and scheduler
    """
    
    # Create the optimizer
    optimizer = AdamW(model.parameters(),
        lr = 5e-5,    # Default learning rate
        eps = 1e-8    # Default epsilon value
        )
    
    # Get total number of steps
    nbatchs = len(dataloader)
    total_nb_steps = nbatchs * nepochs

    # Create the scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer = optimizer,
        num_warmup_steps = 0, # Default value so direct training without warmup
        num_training_steps = total_nb_steps)
    
    return optimizer, scheduler


def evalute_Siamese(validation_loader, model, loss_fn, threshold, device):
    """Evaluate the performance of the SiameseBERTNet model on the validation dataset.
    This function is specific to the SiameseBERTNet class.

    The loss, F1-score and accuracy are computed independently on each batch and the
    returned values are the unweighted means of these per-batch values.
    The model is switched to eval mode for the evaluation and put back in training mode before returning.

    Args:
        validation_loader (torch.utils.data.dataloader.DataLoader): the validation dataloader. Each batch is indexed as
            batch[0] / batch[1] (dicts with 'input_ids' and 'attention_mask' for each question) and batch[2] (the labels).
        model (SiameseBERTNet): the SiameseBERT model to eval
        loss_fn (ConstrastiveLoss): the initialized constrastive loss function. Its 'pdist' attribute is used to compute the distance between the two outputs.
        threshold (int or float): the threshold on the distance between 2 sentences. If D < th: duplicated (1), else non_duplicated (0)
        device (torch.device): the device used by torch cpu or gpu.

    Returns:
        [float, float, float]: return the averaged loss, F1-score and accuracy (in %) on the validation dataset
    """
    # Put model in test mode
    model.eval()

    # Avoid divisions by zero in precision / recall / F1
    epsilon = 1e-7

    v_loss = []
    v_f1 = []
    v_accuracy = []

    # No gradient is needed for evaluation: disabling autograd avoids building the graph and saves memory
    with torch.no_grad():

        # Iterate over validation batches
        for batch in validation_loader:

            # Get batch data
            v_input_ids_Q1 = batch[0]['input_ids'].to(device)
            v_attention_mask_Q1 = batch[0]['attention_mask'].to(device)
            v_input_ids_Q2 = batch[1]['input_ids'].to(device)
            v_attention_mask_Q2 = batch[1]['attention_mask'].to(device)
            v_y = batch[2].to(device)

            # Apply model
            v_outQ1, v_outQ2 = model(v_input_ids_Q1, v_attention_mask_Q1, v_input_ids_Q2, v_attention_mask_Q2)

            # Compute Loss
            loss = loss_fn(v_outQ1, v_outQ2, v_y)
            v_loss.append(loss.item())

            # Predict 1 (duplicated) when the distance is below the threshold, else 0
            pwdist = loss_fn.pdist(v_outQ1, v_outQ2)
            pred = torch.where(pwdist < threshold, 1, 0)

            # Compute F1 score on the batch
            tp = (pred * v_y).cpu().numpy().sum()
            precision = tp / (pred.cpu().numpy().sum() + epsilon)
            recall = tp / (v_y.cpu().numpy().sum() + epsilon)
            f1 = 2 * (precision * recall) / (precision + recall + epsilon)
            v_f1.append(f1)

            # Compute accuracy (in %) on the batch
            accuracy = (pred == v_y).cpu().numpy().mean() * 100
            v_accuracy.append(accuracy)

    # Average the per-batch measures
    avg_loss = np.mean(v_loss)
    avg_f1 = np.mean(v_f1)
    avg_accuracy = np.mean(v_accuracy)

    # return back the model in training mode
    model.train()

    return avg_loss, avg_f1, avg_accuracy


def train_loop_Siamese(model, dataloader, validation, optimizer, scheduler, loss_fn, eval_threshold, nepochs, device, out_dir, log_every = 25):
    """The training loop for the SiameseBERTNet model. The function exports training logs to evaluate the model performances and overfitting during training.
    The model is trained by minimizing the loss obtained with loss_fn.
    If a validation dataloader is provided the model is evaluated on it using evalute_Siamese with 'loss_fn' and 'eval_threshold': once before any training, then every 'log_every' batches in each epoch (and on the last batch of the epoch), and finally at the end of each epoch. The results are saved in the returned training_logs dataframe.
    At the end of each epoch, if the validation loss is lower than the best one seen so far (the initial, untrained evaluation included), the parameters of the model are saved in the output directory (best-model.pt) and the epoch number is written in epoch.log. If no epoch improves on the initial validation loss, no best-model.pt is written.
    The best model can then be load using:
    model = SiameseBERTNet(**params)
    model.load_state_dict(torch.load("path/to/best-model.pt"))
    If no validation dataloader is provided (None, or an empty loader) only the average training loss is reported and the parameters obtained at the end of training are exported in last-model.pt (not necessarily the best).
    This model can be load using the same method as described above.

    Args:
        model (SiameseBERTNet): the SiameseBERTNet model to train
        dataloader (torch.utils.data.dataloader.DataLoader): the training dataLoader
        validation (torch.utils.data.dataloader.DataLoader): the validation dataloader, or None
        optimizer (transformers.AdamW): the parametrized optimizer from init_model
        scheduler (torch.optim.lr_scheduler.LambdaLR): the parametrized scheduler from init_model
        loss_fn (ConstrastiveLoss): the initialized constrastive loss function
        eval_threshold (int): the threshold to use for evaluation
        nepochs (int): the number of training epochs (must be same as in the scheduler)
        device (torch.device): the device used by torch cpu or gpu.
        out_dir (str): the output directory. It is created if it does not exist.
        log_every (int, optional): number of batches between two intermediate reports. Defaults to 25.

    Raises:
        ValueError: if log_every is lower than 1

    Returns:
        [pd.DataFrame]: the training_logs dataframe. One row per report, with the 'Type' column set to 'init' for the initial evaluation, False for an intermediate report and True for an end-of-epoch report. Reports the average training loss and, when a validation dataloader is used, the evaluation loss, F1-score and accuracy.
    """

    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    # torch.save does not create missing directories
    os.makedirs(out_dir, exist_ok = True)

    # Same truthiness test as 'if validation:' but evaluated only once
    has_validation = bool(validation)

    # Init errors, F1-score and accuracy vector to store for all epochs
    _train_errors = []
    _eval_errors = []
    _eval_F1 = []
    _eval_acc = []
    _type = []

    # Init evaluation measure for choosing the best model
    best_model_eval_loss = np.inf

    # Evaluation without training
    if has_validation:
        avg_validation_loss, avg_f1_validation, avg_acc_validation = evalute_Siamese(validation, model, loss_fn, eval_threshold, device)
        _eval_errors.append(avg_validation_loss)
        _eval_F1.append(avg_f1_validation)
        _eval_acc.append(avg_acc_validation)
        _train_errors.append(np.nan)
        _type.append('init')

        # set the best model measure to avg_validation_loss
        best_model_eval_loss = avg_validation_loss
        print("Initial validation loss: " + str(best_model_eval_loss))

    # Training on epochs
    for i_epoch in range(nepochs):
        print("-----------------------------------------------------------------------------------------------")
        # Init
        total_step_in_dataloader = len(dataloader)
        epoch_time, batch_time = time.time(), time.time()
        total_loss, batch_loss, batch_count = 0, 0, 0

        # Put model in train mode (important if a run on the validation in eval mode have been done previously)
        model.train()

        # iterate over batches:
        for step, batch in enumerate(dataloader):

            batch_count += 1

            # Get batch data
            input_ids_Q1 = batch[0]['input_ids'].to(device)
            attention_mask_Q1 = batch[0]['attention_mask'].to(device)
            input_ids_Q2 = batch[1]['input_ids'].to(device)
            attention_mask_Q2 = batch[1]['attention_mask'].to(device)
            y = batch[2].to(device)

            # Reset the gradients of model parameters. Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration.
            model.zero_grad()

            # Apply model
            outQ1, outQ2 = model(input_ids_Q1, attention_mask_Q1, input_ids_Q2, attention_mask_Q2)

            # Compute Constrastive loss
            loss = loss_fn(outQ1, outQ2, y)

            # Update batch loss and total loss
            total_loss += loss.item()
            batch_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Report every 'log_every' batches and on the last batch of the epoch
            if (step + 1) % log_every == 0 or step == (total_step_in_dataloader - 1):
                time_elapsed = str(timedelta(seconds = (time.time() - batch_time)))
                batch_avg_loss = batch_loss/batch_count
                _train_errors.append(batch_avg_loss)

                if has_validation:
                    avg_validation_loss, avg_f1_validation, avg_acc_validation = evalute_Siamese(validation, model, loss_fn, eval_threshold, device)
                    _eval_errors.append(avg_validation_loss)
                    _eval_F1.append(avg_f1_validation)
                    _eval_acc.append(avg_acc_validation)

                    # Print summary
                    print(f"batch {step+1:>6d} / {total_step_in_dataloader:>4d} | Elapsed {time_elapsed} | Average loss on the previous {batch_count:>4d} batchs : {batch_avg_loss:5.2f} | Average validation loss: {avg_validation_loss:6.2f} | Average F1-score: {avg_f1_validation:6.2f} | Average Accuracy: {avg_acc_validation:6.2f} %")
                else:
                    print(f"batch {step+1:>6d} / {total_step_in_dataloader:>4d} | Elapsed {time_elapsed} | Average loss on the previous {batch_count:>4d} batchs : {batch_avg_loss:5.2f} | ")

                _type.append(False)

                # Reset batch_count, batch_loss and batch_time
                batch_loss, batch_count = 0, 0
                batch_time = time.time()

        # Average of the loss over all the batches of the epoch
        avg_train_loss_epoch = total_loss/total_step_in_dataloader
        _train_errors.append(avg_train_loss_epoch)

        time_elapsed_epoch = str(timedelta(seconds = (time.time() - epoch_time)))
        print("-----------------------------------------------------------------------------------------------")

        # Test current model (at epoch i) on validation
        if has_validation:
            avg_validation_loss, avg_f1_validation, avg_acc_validation = evalute_Siamese(validation, model, loss_fn, eval_threshold, device)
            _eval_errors.append(avg_validation_loss)
            _eval_F1.append(avg_f1_validation)
            _eval_acc.append(avg_acc_validation)

            # At the end of each epoch, check if the model has a better average evaluation loss than the previous best model:
            if avg_validation_loss < best_model_eval_loss:

                # Backslashes are escaped explicitly; the printed text is unchanged
                print("\n/!\\ save new best model /!\\ \n")
                torch.save(model.state_dict(), os.path.join(out_dir, "best-model.pt"))

                # Save best epoch
                with open(os.path.join(out_dir, "epoch.log"), 'w') as f:
                    f.write("best model at epoch: " + str(i_epoch+1))

                best_model_eval_loss = avg_validation_loss

            print(f"Epoch {i_epoch+1:>6d} / {nepochs:>4d} | Elapsed {time_elapsed_epoch} | Average loss on epoch: {avg_train_loss_epoch:18.2f} | Average validation loss: {avg_validation_loss:6.2f} | Average F1-score: {avg_f1_validation:6.2f} | Average Accuracy: {avg_acc_validation:6.2f} %")
        else:
            print(f"Epoch {i_epoch+1:>6d} / {nepochs:>4d} | Elapsed {time_elapsed_epoch} | Average loss on epoch: {avg_train_loss_epoch:18.2f} |")

        _type.append(True)

    if has_validation:
        training_logs = pd.DataFrame({"Type": _type, "Training.loss": _train_errors, "Eval.loss": _eval_errors, "Eval.F1": _eval_F1, "Eval.Acc": _eval_acc})

    else:
        training_logs = pd.DataFrame({"Type": _type, "Training.loss": _train_errors})

        # if no validation was used, we just save the model at the end of the training
        print("\nsave last model")
        torch.save(model.state_dict(), os.path.join(out_dir, "last-model.pt"))

    return training_logs


def cross_validation(model_params, dataset, k, loss_fn, eval_threshold, device, out_dir, batch_size = 8, nepochs = 4, random_state = None):
    """The function performs a cross-validation (CV) loop of the training set.
    The training dataset is first split into k folds. For instance setting k to 5 means dividing the training set into 5 distinct parts (index attribution is random) and then in each fold one part is used as the validation set and the remaining ones as the training set.
    In each fold a new SiameseBERTNet model is created, trained with train_loop_Siamese, and the training_logs dataframe is exported as a csv file in the sub-directory 'f<fold>' of the output directory.

    Args:
        model_params (dict): a python dict containing the parameters to initialize a SiameseBERTNet model.
        dataset (torch.utils.data.Dataset): the training dataset that will be used for CV
        k (int): the number of folds
        loss_fn (ConstrastiveLoss): the initialized constrastive loss function
        eval_threshold (int): the threshold to use for evaluation
        device (torch.device): the torch device cpu or gpu
        out_dir (str): the output directory
        batch_size (int, optional): the batch size
        nepochs (int, optional): the number of training epochs (must be same as in the scheduler)
        random_state (int, optional): seed given to KFold to make the attribution of the samples to the folds reproducible. It only controls the split: the order of the batches and the training itself are not seeded here. Defaults to None (a different split at each call).
    """

    # Init kfold: split the dataset into k folds. shuffle = True indicates that the samples of each fold are chosen randomly and are not necessarily contiguous
    kfold = KFold(n_splits = k, shuffle = True, random_state = random_state)

    # The split only needs the sample positions, so it is done on an explicit index array
    sample_ids = np.arange(len(dataset))

    # Loop over folds: at each step, (k - 1) folds are used as the training set and the remaining fold is used as the validation set.
    for fold, (train_ids, validation_ids) in enumerate(kfold.split(sample_ids)):

        # Create the output directory of the fold if needed
        fold_out_dir = os.path.join(out_dir, "f" + str(fold + 1))
        os.makedirs(fold_out_dir, exist_ok = True)

        print(" --- fold: " + str(fold + 1) + " --- ")
        # Samplers restricted to the indexes of the (k - 1) training folds and of the validation fold
        train_subsampler = SubsetRandomSampler(train_ids)
        validation_subsampler = SubsetRandomSampler(validation_ids)

        # Two data loaders over the same dataset: one iterating over the train_ids and one over the validation_ids
        # shuffle is not needed in the DataLoader as the selection of the ids is done by the samplers
        train_loader = DataLoader(dataset, batch_size, sampler = train_subsampler)
        validation_loader = DataLoader(dataset, batch_size, sampler = validation_subsampler)

        # A fresh model is created for each fold
        model = SiameseBERTNet(**model_params)

        # Check the parameters to be fine-tune
        # check_params(model)

        # Move the model to the requested device
        model.to(device)
        optimizer, scheduler = init_model(model, train_loader, nepochs)
        k_training_logs = train_loop_Siamese(model, train_loader, validation_loader, optimizer, scheduler, loss_fn, eval_threshold, nepochs, device, fold_out_dir)
        k_training_logs.to_csv(os.path.join(fold_out_dir, "k_" + str(fold + 1) + "_training_logs.csv"), index = True, header = True)

        # Release the model of this fold before the next one is created, so that two models are not held at the same time
        del model, optimizer, scheduler
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        



In [None]:
# Small cross-validation run on 100 rows drawn from a shuffled copy of the training data
_train = train.sample(frac = 1).reset_index(drop = True).head(100)
_train_dataset = SiameseNetWorkSentenceDataset(
    data = _train,
    tokenizer = tokenizer,
    max_length = 64,
)

# Loss, evaluation threshold and model configuration
loss = ConstrastiveLoss(m = 10)
eval_threshold = 5
model_params = {'freeze_embedding': True, 'freeze_encoder_layer': 8, 'freeze_cls_pooler': True}

# NOTE(review): the directory name ends with "bs32" while batch_size below is 20 - confirm which one is intended
out_dir = "data/SiameseBERT/Trainingm10_th5_bs32"

cross_validation(
    model_params = model_params,
    dataset = _train_dataset,
    k = 5,
    loss_fn = loss,
    eval_threshold = eval_threshold,
    device = device,
    out_dir = out_dir,
    batch_size = 20,
    nepochs = 5,
)

# TODO Run tests on the threshold
# TODO To assess the score properly, an AUC should be computed!
# TODO: the train dataset should be shuffled BEFORE taking the first 10,000 rows! Run the CV tests on only 10k rows, pick the best configuration (this is already a 3h run ...) and then run on the full dataset!
# TODO test batch: 64 & freeze 10 layers ?

# For tests
# NOTE(review): the commented call below does not match the current train_loop_Siamese signature (validation, eval_threshold, device and out_dir are missing)
# model = SiameseBERTNet()

# test_train_dataloader = DataLoader(test_train_dataset, batch_size = 8, shuffle = True, num_workers = 0)

# nepochs = 4
#
# optimizer, scheduler = init_model(model, test_train_dataloader, nepochs)
# train_loop_Siamese(model, test_train_dataloader, optimizer, scheduler, loss, nepochs)

Pour le modèle final, ce qui semble le mieux est de l'entraîner également en CV, en utilisant toutes les données cette fois-ci, afin de pouvoir repérer a posteriori le moment où celui-ci commence à overfitter. Néanmoins, afin de conserver une version du modèle d'avant l'overfit, on propose de sauvegarder les paramètres du modèle après chaque epoch, de telle sorte qu'après le training et l'analyse des loss Train/Eval on puisse revenir à une version du modèle qui n'avait pas encore overfitté. Il faut suivre la démarche décrite dans :

https://pytorch.org/tutorials/beginner/saving_loading_models.html

"If you only plan to keep the best performing model (according to the acquired validation loss), don’t forget that best_model_state = model.state_dict() returns a reference to the state and not its copy! You must serialize best_model_state or use best_model_state = deepcopy(model.state_dict()) otherwise your best best_model_state will keep getting updated by the subsequent training iterations. As a result, the final model state will be the state of the overfitted model."

ATTENTION : lorsque l'on entraînera le modèle final, je pense que l'on ne fera qu'un seul fold : séparer notre training set en 90 % train / 10 % validation et faire la CV là-dessus, mais sans faire plusieurs folds ! Les folds multiples ne servent que pour l'optimisation des paramètres !