# Rendu du TP de NLP

## 1) Chargement et visualisation du dataset

In [1]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import random
import yaml
import json

In [2]:
# Load every SNLI split at once. `load_dataset` is already imported by the
# imports cell at the top of the notebook, so it is not re-imported here.
# NOTE(review): the cells visible in this notebook never read `snli` (each
# split is loaded individually afterwards) — confirm it is still needed.
snli = load_dataset("snli")

In [3]:
# Load the three SNLI splits as separate datasets.
train_dataset = load_dataset("snli", split="train")
test_dataset = load_dataset("snli", split="test")
validation_dataset = load_dataset("snli", split="validation")

# Read the project configuration. The context manager guarantees the file
# handle is closed, which the bare `open(...)` call did not.
with open("project/config.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

tokenizer_config = config["tokenizer"]
training_config = config["training_config"]
model_name = training_config["model_name"]

# By default the tokenizer is loaded under the same identifier as the model.
tok_model_name = model_name
if "project/results" in model_name:
    # `model_name` points inside project/results: take the tokenizer name from
    # the "_name_or_path" entry of the config.json stored in that directory.
    with open(model_name + "/config.json", "r") as checkpoint_config_file:
        json_config = json.load(checkpoint_config_file)
    tok_model_name = json_config["_name_or_path"]

## 2) Preprocessing

### On filtre notre dataset en retirant les exemples dont le label vaut -1

In [4]:
from project.data_utils import PreProcessor, Filter

# NOTE(review): the name `filter` shadows Python's builtin `filter`; it is
# kept because other cells may refer to it.
filter = Filter()

# The preprocessor is configured with the maximum sequence length from the
# tokenizer config and the tokenizer name resolved in the configuration cell.
preprocessor = PreProcessor(
    max_length=tokenizer_config["max_length"],
    model_name=tok_model_name,
)

# Apply the same filter to each split, in train / test / validation order.
# Per the section heading this drops the examples labelled -1 — see
# project/data_utils for the exact rule.
train_dataset_filtered, test_dataset_filtered, validation_dataset_filtered = (
    filter.transform(raw_split)
    for raw_split in (train_dataset, test_dataset, validation_dataset)
)

### On tokenise nos données avec le tokenizer du modèle Hugging Face que nous avons choisi : all-MiniLM-L6-v2

In [8]:
# Fit the preprocessor on the training split first, then reuse it on the
# held-out splits. Each call returns a (processed dataset, labels) pair.
train_dataset_processed, labels_train = preprocessor.fit_transform(train_dataset_filtered)
(test_dataset_processed, labels_test), (validation_dataset_processed, labels_val) = (
    preprocessor.transform(held_out_split)
    for held_out_split in (test_dataset_filtered, validation_dataset_filtered)
)

# Sanity check: show the first processed training example, then decode its
# token ids back into text.
print(train_dataset_processed[0])
first_input_ids = train_dataset_processed[0]["input_ids"]
untransformed_sentence = preprocessor.decode_sentence(first_input_ids)
print(untransformed_sentence)


{'input_ids': tensor([  101,  1037,  2711,  2006,  1037,  3586, 14523,  2058,  1037,  3714,
         2091, 13297,  1012,   102,  1037,  2711,  2003,  2731,  2010,  3586,
         2005,  1037,  2971,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]), 'atte

AttributeError: 'PreProcessor' object has no attribute 'inverse_transform'

In [10]:
# Decode the token ids of the first processed training example back into text
# (special tokens such as [CLS], [SEP] and [PAD] appear in the printed result).
first_input_ids = train_dataset_processed[0]["input_ids"]
untransformed_sentence = preprocessor.decode_sentence(first_input_ids)
print(untransformed_sentence)


[CLS] a person on a horse jumps over a broken down airplane. [SEP] a person is training his horse for a competition. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


## 3) Training / Testing
This step trains or evaluates the model, depending on the configuration in `project/config.yml`.

In [7]:
from project.model import allMiniLMModel

# Model and output settings, read from the "training_config" section.
num_labels = training_config["num_labels"]
output_dir = training_config["output_dir"]

# Optimisation hyper-parameters.
batch_size = training_config["batch_size"]
epochs = training_config["epochs"]
learning_rate = training_config["learning_rate"]
seed = training_config["seed"]
# Warm-up steps: large learning-rate variations at the start of training.
warmup_steps = training_config["warmup_steps"]
# Weight of the L2 regularisation.
weight_decay = training_config["weight_decay"]

# Weights & Biases settings.
config_wandb = config["wandb_config"]
wandb_project_name = config_wandb["project"]
wandb_entity = config_wandb["entity"]

# NOTE(review): the recorded cell output (W&B summary, evaluation progress bar,
# "Test accuracy") suggests that constructing the model already launches the
# run — confirm in project/model.py.
# No W&B API key is passed explicitly (wandb_api_key=None).
model = allMiniLMModel(
    model_name,
    num_labels,
    output_dir,
    train_dataset_processed,
    validation_dataset_processed,
    test_dataset_processed,
    batch_size,
    epochs,
    learning_rate,
    seed,
    warmup_steps,
    weight_decay,
    wandb_project_name=wandb_project_name,
    wandb_entity=wandb_entity,
    wandb_api_key=None,
)




0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁
eval_loss,▁
eval_runtime,▁
eval_samples_per_second,▁
eval_steps_per_second,▁

0,1
eval/accuracy,0.88925
eval/loss,0.31724
eval/runtime,3.9012
eval/samples_per_second,2518.231
eval/steps_per_second,19.738
eval_accuracy,0.88925
eval_loss,0.31724
eval_runtime,3.9012
eval_samples_per_second,2518.231
eval_steps_per_second,19.738


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668498283312752, max=1.0…

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/77 [00:00<?, ?it/s]

Test accuracy:  0.8892508143322475
