In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers==2.8.0
!pip install sentencepiece

Collecting transformers==2.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 25.9MB/s eta 0:00:01[K     |█▏                              | 20kB 29.4MB/s eta 0:00:01[K     |█▊                              | 30kB 20.5MB/s eta 0:00:01[K     |██▎                             | 40kB 23.6MB/s eta 0:00:01[K     |███                             | 51kB 26.2MB/s eta 0:00:01[K     |███▌                            | 61kB 28.5MB/s eta 0:00:01[K     |████                            | 71kB 29.8MB/s eta 0:00:01[K     |████▋                           | 81kB 25.0MB/s eta 0:00:01[K     |█████▎                          | 92kB 25.4MB/s eta 0:00:01[K     |█████▉                          | 102kB 26.7MB/s eta 0:00:01[K     |██████▍                         | 112kB 26.7MB/s eta 0:00:01[K     |███████                    

In [2]:
import os
import json
import time
import torch
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertForSequenceClassification, CamembertTokenizer, AdamW, get_linear_schedule_with_warmup

# Functions : preprocess() (create dataloaders from raw data) 
# load_models() (load tokenizers and models) training() (loop of one training step) evaluate()

In [3]:
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu"  
device = torch.device(dev)  
torch.cuda.set_device(0)

In [4]:
dataset = pd.read_csv("/content/max-dataset.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9174 entries, 0 to 9173
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Text        9174 non-null   object
 1   Difficulty  9174 non-null   object
dtypes: object(2)
memory usage: 143.5+ KB


In [5]:
difficulties = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
for index, difficulty in zip(range(len(difficulties)), difficulties):
    dataset['Difficulty'] = dataset['Difficulty'].replace([difficulty], index)

In [6]:
# Split train-validation
a1_X = dataset[dataset.Difficulty == 0]['Text']
a2_X = dataset[dataset.Difficulty == 1]['Text']
b1_X = dataset[dataset.Difficulty == 2]['Text']
b2_X = dataset[dataset.Difficulty == 3]['Text']
c1_X = dataset[dataset.Difficulty == 4]['Text']
c2_X = dataset[dataset.Difficulty == 5]['Text']

a1_y = dataset[dataset.Difficulty == 0]['Difficulty']
a2_y = dataset[dataset.Difficulty == 1]['Difficulty']
b1_y = dataset[dataset.Difficulty == 2]['Difficulty']
b2_y = dataset[dataset.Difficulty == 3]['Difficulty']
c1_y = dataset[dataset.Difficulty == 4]['Difficulty']
c2_y = dataset[dataset.Difficulty == 5]['Difficulty']

In [7]:
X_train_a1, X_test_a1, y_train_a1, y_test_a1 = train_test_split(a1_X, a1_y, test_size=0.1)
X_train_a2, X_test_a2, y_train_a2, y_test_a2 = train_test_split(a2_X, a2_y, test_size=0.1)
X_train_b1, X_test_b1, y_train_b1, y_test_b1 = train_test_split(b1_X, b1_y, test_size=0.1)
X_train_b2, X_test_b2, y_train_b2, y_test_b2 = train_test_split(b2_X, b2_y, test_size=0.1)
X_train_c1, X_test_c1, y_train_c1, y_test_c1 = train_test_split(c1_X, c1_y, test_size=0.1)
X_train_c2, X_test_c2, y_train_c2, y_test_c2 = train_test_split(c2_X, c2_y, test_size=0.1)

In [8]:
texts_train = X_train_a1.append([X_train_a2, X_train_b1, X_train_b2, X_train_c1, X_train_c2], ignore_index=True).values.tolist()
texts_validation = X_test_a1.append([X_test_a2, X_test_b1, X_test_b2, X_test_c1, X_test_c2], ignore_index=True).values.tolist()
labels_train = y_train_a1.append([y_train_a2, y_train_b1, y_train_b2, y_train_c1, y_train_c2], ignore_index=True).values.tolist()
labels_validation = y_test_a1.append([y_test_a2, y_test_b1, y_test_b2, y_test_c1, y_test_c2], ignore_index=True).values.tolist()

In [9]:
TOKENIZER = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)

In [10]:
def preprocess(raw_texts, labels=None):
    """
    Cette fonction prends de la donnée brute en argument et retourne un 'dataloader' pytorch

    Args
        raw_texts (array-like) : Une liste de texts sous forme de 'str'
        
        labels : Une liste 'labels' (0 = negatif, 1 = positif) de la meme taille que
                     'raw_review'
    
    Returns
        inputs_ids, attention_masks, labels(optionel) : Objet  de PyTorch qui contient 
                    les versions tokenisees et encodees des donnees brutes
    """

    

    encoded_batch = TOKENIZER.batch_encode_plus(raw_texts,
                                                add_special_tokens=True,
                                                pad_to_max_length=True,
                                                return_attention_mask=True,
                                                return_tensors = 'pt')
    if labels:
        labels = torch.tensor(labels)
        return encoded_batch['input_ids'], encoded_batch['attention_mask'], labels
    return encoded_batch['input_ids'], encoded_batch['attention_mask']

In [11]:
input_ids, attention_mask, labels_train = preprocess(texts_train, labels_train)
# Combine the training inputs into a TensorDataset
train_dataset = TensorDataset(
    input_ids,
    attention_mask,
    labels_train)

input_ids, attention_mask, labels_validation = preprocess(texts_validation, labels_validation)
# Combine the validation inputs into a TensorDataset
validation_dataset = TensorDataset(
    input_ids,
    attention_mask,
    labels_validation)

In [12]:
# size of 16 or 32.
batch_size = 16

# Create the DataLoaders
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size)

validation_dataloader = DataLoader(
            validation_dataset,
            sampler = SequentialSampler(validation_dataset),
            batch_size = batch_size)

In [13]:
try:
    state_dict = torch.load("/content/drive/MyDrive/Colab Notebooks/labelsfull.pt")
    print("Loading trained model...")
    model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',
    state_dict=state_dict,
    num_labels = 6)
    print("Trained model loaded!")
except Exception as e:
    print("Enable to load trained model.")
    print(e)
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base',
        num_labels = 6)

Loading trained model...
Trained model loaded!


In [14]:
def predict(texts, model=model):
    with torch.no_grad():
        model.eval()
        input_ids, attention_mask = preprocess(texts)
        retour = model(input_ids, attention_mask=attention_mask)
        return torch.argmax(retour[0], dim=1)

In [15]:
def evaluate(texts, labels, metric='report'):
    predictions = predict(texts)
    if metric == 'report':
        return metrics.classification_report(labels, predictions, zero_division=0)
    elif metric == 'matrix':
        return metrics.confusion_matrix(labels, predictions)

In [16]:
def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [17]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # Learning Rate - Default is 5e-5
                  eps = 1e-8 # Adam Epsilon  - Default is 1e-8.
                )

In [18]:
import gc 
gc.collect()
torch.cuda.empty_cache()

In [None]:
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu"  
device = torch.device(dev)  
torch.cuda.set_device(0)

model.to(device)

# Training loop
training_stats = []
                                                                                
# Measure the total training time for the whole run.
total_t0 = time.time()

epochs = 20

# Total number of training steps is [number of batches] x [number of epochs]
# (Note that this is not the same as the number of training samples)
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# This variable will evaluate the convergence on the training
consecutive_epochs_with_no_improve = 0

# Training
for epoch in range(0, epochs):
    
    print("")
    print(f'########## Epoch {epoch} / {epochs} ##########')
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode
    model.train()

    # For each batch of training data
    for step, batch in enumerate(train_dataloader):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = time.time() - t0
            
            # Report progress
            print(f'  Batch {step}  of  {len(train_dataloader)}    Elapsed: {format_time(elapsed)}.')

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the 'device' using the 'to' method
        #
        # 'batch' contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: skills 
        input_id = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Clear any previously calculated gradients before performing a backward pass
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch)
        # the loss (because we provided skills) and the "logits"--the model
        # outputs prior to activation
        loss, logits = model(input_id, 
                             token_type_ids=None, 
                             attention_mask=attention_mask, 
                             labels=labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. 'loss' is a Tensor containing a
        # single value; the '.item()' function just returns the Python value 
        # from the tensor
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches
    avg_train_loss = total_train_loss / len(train_dataloader)   

    if epoch > 0:
        if min([stat['Training Loss'] for stat in training_stats]) <= avg_train_loss:
            # i.e. If there is not improvement
            consecutive_epochs_with_no_improve += 1
        else:
            # If there is improvement
            consecutive_epochs_with_no_improve = 0
            print("Model saved!")
            torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/labelsfull.pt")
            torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/labelsfull.pth")
    
    # Measure how long this epoch took
    training_time = time.time() - t0

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )
    if consecutive_epochs_with_no_improve == 2:
        print("Stop training : The loss has not changed since 2 epochs!")
        break

    gc.collect()
    torch.cuda.empty_cache()

print("Model saved!")
with open('/content/drive/MyDrive/Colab Notebooks/metricsfull.json', 'w+') as outfile:
    json.dump(training_stats, outfile)
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/labelsfull.pt")
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/labelsfull.pth")


########## Epoch 0 / 20 ##########
Training...
  Batch 50  of  516    Elapsed: 0:01:37.
  Batch 100  of  516    Elapsed: 0:03:15.
  Batch 150  of  516    Elapsed: 0:04:54.
  Batch 200  of  516    Elapsed: 0:06:32.
  Batch 250  of  516    Elapsed: 0:08:10.
  Batch 300  of  516    Elapsed: 0:09:49.
  Batch 350  of  516    Elapsed: 0:11:27.
  Batch 400  of  516    Elapsed: 0:13:06.
  Batch 450  of  516    Elapsed: 0:14:44.
  Batch 500  of  516    Elapsed: 0:16:22.

  Average training loss: 1.38
  Training epoch took: 1013.6667642593384

########## Epoch 1 / 20 ##########
Training...
  Batch 50  of  516    Elapsed: 0:01:38.
  Batch 100  of  516    Elapsed: 0:03:16.
  Batch 150  of  516    Elapsed: 0:04:54.
  Batch 200  of  516    Elapsed: 0:06:31.
  Batch 250  of  516    Elapsed: 0:08:08.
  Batch 300  of  516    Elapsed: 0:09:46.
  Batch 350  of  516    Elapsed: 0:11:23.
  Batch 400  of  516    Elapsed: 0:13:00.
  Batch 450  of  516    Elapsed: 0:14:37.
  Batch 500  of  516    Elapsed: 0:

In [None]:
device = torch.device('cpu') 
model.to(device)

# Evaluation with the confusion matrix
import seaborn
confusion_matrix = evaluate(texts_validation, labels_validation, 'matrix')
report = evaluate(texts_validation, labels_validation, 'report')
print(report)
seaborn.heatmap(confusion_matrix)

920

In [53]:
device = torch.device('cpu') 
model.to(device)

predictions = []
for sentence in texts_validation:
    predictions.append(predict([sentence]))

In [40]:
temp_predictions

[tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0])]

In [36]:
device = torch.device('cpu') 
model.to(device)

predictions = predict(texts_validation[600:])

In [54]:
print(metrics.classification_report(predictions, labels_validation))

              precision    recall  f1-score   support

           0       0.99      0.93      0.96       150
           1       0.93      0.94      0.93       175
           2       0.93      0.93      0.93       153
           3       0.95      0.95      0.95       157
           4       0.93      0.97      0.95       154
           5       0.97      0.97      0.97       131

    accuracy                           0.95       920
   macro avg       0.95      0.95      0.95       920
weighted avg       0.95      0.95      0.95       920



In [55]:
metrics.confusion_matrix(predictions, labels_validation)

array([[140,   8,   2,   0,   0,   0],
       [  2, 165,   7,   1,   0,   0],
       [  0,   3, 142,   6,   2,   0],
       [  0,   2,   1, 149,   5,   0],
       [  0,   0,   0,   1, 149,   4],
       [  0,   0,   0,   0,   4, 127]])

In [20]:
device = torch.device('cpu') 
model.to(device)

predict(["Dans un premier temps, nous nous demanderons si le travail n’est qu’une activité imposée par l’extérieur contre la volonté de l’Homme, puis dans un deuxième temps nous nous interrogerons sur le fait que le travail est une activité que l’être humain s’impose librement à lui-même."])

tensor([5])

In [21]:
device = torch.device('cpu') 
model.to(device)

predict("Dans un premier temps, nous nous demanderons si le travail n’est qu’une activité imposée par l’extérieur contre la volonté de l’Homme, puis dans un deuxième temps nous nous interrogerons sur le fait que le travail est une activité que l’être humain s’impose librement à lui-même.".split(' '))

tensor([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 1])

# Full training

In [None]:
texts = dataset.Text.values.tolist()
labels = dataset.Difficulty.values.tolist()

In [None]:
input_ids, attention_mask, labels = preprocess(texts, labels)
# Combine the training inputs into a TensorDataset
full_train_dataset = TensorDataset(
    input_ids,
    attention_mask,
    labels)

In [None]:
# size of 16 or 32.
batch_size = 4

# Create the DataLoaders
train_dataloader = DataLoader(
            full_train_dataset,
            sampler = RandomSampler(full_train_dataset),
            batch_size = batch_size)

In [None]:
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu"  
device = torch.device(dev)  
torch.cuda.set_device(0)

model.to(device)

# Training loop
training_stats = []
                                                                                
# Measure the total training time for the whole run.
total_t0 = time.time()

epochs = 20

# Total number of training steps is [number of batches] x [number of epochs]
# (Note that this is not the same as the number of training samples)
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# This variable will evaluate the convergence on the training
consecutive_epochs_with_no_improve = 0

# Training
for epoch in range(0, epochs):
    
    print("")
    print(f'########## Epoch {epoch} / {epochs} ##########')
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode
    model.train()

    # For each batch of training data
    for step, batch in enumerate(train_dataloader):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = time.time() - t0
            
            # Report progress
            print(f'  Batch {step}  of  {len(train_dataloader)}    Elapsed: {format_time(elapsed)}.')

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the 'device' using the 'to' method
        #
        # 'batch' contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: skills 
        input_id = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Clear any previously calculated gradients before performing a backward pass
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch)
        # the loss (because we provided skills) and the "logits"--the model
        # outputs prior to activation
        loss, logits = model(input_id, 
                             token_type_ids=None, 
                             attention_mask=attention_mask, 
                             labels=labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. 'loss' is a Tensor containing a
        # single value; the '.item()' function just returns the Python value 
        # from the tensor
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches
    avg_train_loss = total_train_loss / len(train_dataloader)   

    if epoch > 0:
        if min([stat['Training Loss'] for stat in training_stats]) <= avg_train_loss:
            # i.e. If there is not improvement
            consecutive_epochs_with_no_improve += 1
        else:
            # If there is improvement
            consecutive_epochs_with_no_improve = 0
            print("Model saved!")
            torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/labels.pt")
            torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/labels.pth")
    
    # Measure how long this epoch took
    training_time = time.time() - t0

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )
    if consecutive_epochs_with_no_improve == 2:
        print("Stop training : The loss has not changed since 2 epochs!")
        break

print("Model saved!")
with open('/content/drive/MyDrive/Colab Notebooks/metrics.json', 'w+') as outfile:
    json.dump(training_stats, outfile)
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/labels.pt")


########## Epoch 0 / 20 ##########
Training...
  Batch 50  of  255    Elapsed: 0:00:06.
  Batch 100  of  255    Elapsed: 0:00:12.
  Batch 150  of  255    Elapsed: 0:00:18.
  Batch 200  of  255    Elapsed: 0:00:25.
  Batch 250  of  255    Elapsed: 0:00:31.

  Average training loss: 1.57
  Training epoch took: 31.357661962509155

########## Epoch 1 / 20 ##########
Training...
  Batch 50  of  255    Elapsed: 0:00:06.
  Batch 100  of  255    Elapsed: 0:00:12.
  Batch 150  of  255    Elapsed: 0:00:18.
  Batch 200  of  255    Elapsed: 0:00:24.
  Batch 250  of  255    Elapsed: 0:00:31.
Model saved!

  Average training loss: 1.14
  Training epoch took: 35.01443815231323

########## Epoch 2 / 20 ##########
Training...
  Batch 50  of  255    Elapsed: 0:00:06.
  Batch 100  of  255    Elapsed: 0:00:12.
  Batch 150  of  255    Elapsed: 0:00:18.
  Batch 200  of  255    Elapsed: 0:00:24.
  Batch 250  of  255    Elapsed: 0:00:31.
Model saved!

  Average training loss: 0.72
  Training epoch took: 34.7

# Full test set

In [None]:
test_dataset = pd.read_csv('/content/test-dataset-header.csv')

difficulties = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
for index, difficulty in zip(range(len(difficulties)), difficulties):
    test_dataset['Difficulty'] = test_dataset['Difficulty'].replace([difficulty], index)

In [None]:
texts = test_dataset.Text.values.tolist()
labels = test_dataset.Difficulty.values.tolist()

In [None]:
try:
    state_dict = torch.load("/content/drive/MyDrive/Colab Notebooks/labels.pt")
    print("Loading trained model...")
    model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',
    state_dict=state_dict,
    num_labels = 6)
    print("Trained model loaded!")
except Exception as e:
    print("Enable to load trained model.")
    print(e)
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base',
        num_labels = 6)

Loading trained model...
Trained model loaded!


In [None]:
int(len(texts)/12)

679

In [None]:
device = torch.device('cpu') 
model.to(device)

half = int(len(texts)/12)
predictions = predict(texts[6000:7000])

In [None]:
print(metrics.classification_report(predictions, labels[6000:7000]))

              precision    recall  f1-score   support

           0       0.13      0.43      0.20        35
           1       0.62      0.30      0.41       410
           2       0.15      0.23      0.18       151
           3       0.56      0.52      0.54       246
           4       0.32      0.50      0.39       131
           5       0.00      0.00      0.00        27

    accuracy                           0.37      1000
   macro avg       0.30      0.33      0.29      1000
weighted avg       0.46      0.37      0.39      1000



In [None]:
print(metrics.confusion_matrix(predictions, labels[6000:7000]))

[[ 15  12   7   1   0   0]
 [ 75 124 144  50  13   4]
 [ 17  41  34  26  29   4]
 [  3  16  18 128  71  10]
 [  4   6  17  22  65  17]
 [  0   1   0   1  25   0]]


# Test

In [56]:
data_apple = pd.read_csv('/content/data_apple.csv')

In [57]:
data_apple

Unnamed: 0,Text,Difficulty,Difficulty Annotator 1,Difficulty Annotator 2,Difficulty Annotator 3
0,Franck est français.,A1,A1,A1,A1
1,Il est employé,A1,A1,A1,A1
2,Il habite en Italie avec sa femme et ses enfants.,A1,A1,A1,A1
3,Il travaille à Rome,A1,A1,A1,A1
4,Il parle italien et anglais,A1,A1,A1,A1
...,...,...,...,...,...
1114,Si la connaissance est convoquée une troisième...,C2,C2,C2,C2
1115,« Parce que les révisions aident à faire retro...,C2,C2,C2,C2
1116,Un enfant peut avoir parfois l’impression d’av...,C2,C2,C2,C2
1117,Un simple indice donné durant les vacances va ...,C2,C2,C2,C2


In [58]:
difficulties = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
for index, difficulty in zip(range(len(difficulties)), difficulties):
    data_apple['Difficulty'] = data_apple['Difficulty'].replace([difficulty], index)

In [59]:
data_apple

Unnamed: 0,Text,Difficulty,Difficulty Annotator 1,Difficulty Annotator 2,Difficulty Annotator 3
0,Franck est français.,0,A1,A1,A1
1,Il est employé,0,A1,A1,A1
2,Il habite en Italie avec sa femme et ses enfants.,0,A1,A1,A1
3,Il travaille à Rome,0,A1,A1,A1
4,Il parle italien et anglais,0,A1,A1,A1
...,...,...,...,...,...
1114,Si la connaissance est convoquée une troisième...,5,C2,C2,C2
1115,« Parce que les révisions aident à faire retro...,5,C2,C2,C2
1116,Un enfant peut avoir parfois l’impression d’av...,5,C2,C2,C2
1117,Un simple indice donné durant les vacances va ...,5,C2,C2,C2


In [61]:
device = torch.device('cpu') 
model.to(device)

predictions = []
for sentence in data_apple['Text']:
    predictions.append(predict([sentence]))

In [104]:
print(metrics.classification_report(data_apple['Difficulty'], predictions))

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       192
           1       0.53      0.90      0.67       205
           2       0.53      0.90      0.67       174
           3       0.41      0.31      0.35       167
           4       0.56      0.33      0.41       199
           5       1.00      0.30      0.46       182

    accuracy                           0.58      1119
   macro avg       0.64      0.58      0.55      1119
weighted avg       0.64      0.58      0.56      1119



In [103]:
print(metrics.confusion_matrix(data_apple['Difficulty'], predictions))

[[144  41   7   0   0   0]
 [ 12 184   5   3   1   0]
 [  5  13 156   0   0   0]
 [ 11  45  53  51   7   0]
 [ 13  58  39  24  65   0]
 [  1   5  33  46  43  54]]


In [106]:
for x, y, z in zip(predictions, data_apple.Difficulty, data_apple.Text):
    # print(int(x), y, z)
    if int(x) == 1:
        if y == 5:
            print(z, x, y)

Les valorisations boursières des sociétés Internet comme AMAZON tensor([1]) 5
"Certes, un rêve de beignet, c’est un rêve, pas un beignet tensor([1]) 5
Mais à qui la faute ? À l'histoire, d'abord tensor([1]) 5
Je travaille tous les jours en anglais, et je suis souvent publié par des journaux américains tensor([1]) 5
Voici un extrait de sa conférence à Lannion, le 28 mai 1988 tensor([1]) 5
