In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers==2.8.0
!pip install sentencepiece

In [None]:
import os
import json
import time
import torch
import string
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertForSequenceClassification, CamembertTokenizer, AdamW, get_linear_schedule_with_warmup
punctuation = string.punctuation
# Credits to Olivier. (2021, January 5). Analyse de sentiments avec CamemBERT. Le Data Scientist. https://ledatascientist.com/analyse-de-sentiments-avec-camembert/

In [None]:
dataset = pd.read_csv("/content/max-dataset.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9174 entries, 0 to 9173
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Text        9174 non-null   object
 1   Difficulty  9174 non-null   object
dtypes: object(2)
memory usage: 143.5+ KB


In [None]:
# Transpore A1-C2 scale into 0 to 5 
difficulties = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
for index, difficulty in zip(range(len(difficulties)), difficulties):
    dataset['Difficulty'] = dataset['Difficulty'].replace([difficulty], index)

In [None]:
# text preprocessing - remove punctuation and lowercase
punctuation = punctuation.replace('-', '')
punctuation = punctuation.replace('~', '')

def preprocess_text(text: string):
    sentence = ''.join([ word for word in text if word not in punctuation ])
    return (sentence).lower()

dataset.Text = dataset.Text.apply(lambda x: preprocess_text(x))

In [None]:
# Filter by target variable in order to create a balanced train and test set.
a1_X = dataset[dataset.Difficulty == 0]['Text']
a2_X = dataset[dataset.Difficulty == 1]['Text']
b1_X = dataset[dataset.Difficulty == 2]['Text']
b2_X = dataset[dataset.Difficulty == 3]['Text']
c1_X = dataset[dataset.Difficulty == 4]['Text']
c2_X = dataset[dataset.Difficulty == 5]['Text']

a1_y = dataset[dataset.Difficulty == 0]['Difficulty']
a2_y = dataset[dataset.Difficulty == 1]['Difficulty']
b1_y = dataset[dataset.Difficulty == 2]['Difficulty']
b2_y = dataset[dataset.Difficulty == 3]['Difficulty']
c1_y = dataset[dataset.Difficulty == 4]['Difficulty']
c2_y = dataset[dataset.Difficulty == 5]['Difficulty']

In [None]:
# Train_test_split for each target variable
X_train_a1, X_test_a1, y_train_a1, y_test_a1 = train_test_split(a1_X, a1_y, test_size=0.1, random_state=707)
X_train_a2, X_test_a2, y_train_a2, y_test_a2 = train_test_split(a2_X, a2_y, test_size=0.1, random_state=707)
X_train_b1, X_test_b1, y_train_b1, y_test_b1 = train_test_split(b1_X, b1_y, test_size=0.1, random_state=707)
X_train_b2, X_test_b2, y_train_b2, y_test_b2 = train_test_split(b2_X, b2_y, test_size=0.1, random_state=707)
X_train_c1, X_test_c1, y_train_c1, y_test_c1 = train_test_split(c1_X, c1_y, test_size=0.1, random_state=707)
X_train_c2, X_test_c2, y_train_c2, y_test_c2 = train_test_split(c2_X, c2_y, test_size=0.1, random_state=707)

In [None]:
# Group X, y
texts_train = X_train_a1.append([X_train_a2, X_train_b1, X_train_b2, X_train_c1, X_train_c2], ignore_index=True).values.tolist()
texts_test = X_test_a1.append([X_test_a2, X_test_b1, X_test_b2, X_test_c1, X_test_c2], ignore_index=True).values.tolist()
labels_train = y_train_a1.append([y_train_a2, y_train_b1, y_train_b2, y_train_c1, y_train_c2], ignore_index=True).values.tolist()
labels_test = y_test_a1.append([y_test_a2, y_test_b1, y_test_b2, y_test_c1, y_test_c2], ignore_index=True).values.tolist()

In [None]:
TOKENIZER = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=810912.0, style=ProgressStyle(descripti…




In [None]:
def preprocess(raw_texts, labels=None):
    """
    Takes raw data as argument and returns a pytorch dataloader.

    Args
        raw_texts (array-like) : A list of texts in the form of 'str'
        
        labels : a labels list from 0 to 5
    
    Returns
        inputs_ids, attention_masks, labels(optionel) : PyTorch object that contains tokenized and encoded versions of raw data
    """

    

    encoded_batch = TOKENIZER.batch_encode_plus(raw_texts,
                                                add_special_tokens=True,
                                                pad_to_max_length=True,
                                                return_attention_mask=True,
                                                return_tensors = 'pt')
    if labels:
        labels = torch.tensor(labels)
        return encoded_batch['input_ids'], encoded_batch['attention_mask'], labels
    return encoded_batch['input_ids'], encoded_batch['attention_mask']

In [None]:
input_ids, attention_mask, labels_train = preprocess(texts_train, labels_train)
# Combine the training inputs into a TensorDataset
train_dataset = TensorDataset(
    input_ids,
    attention_mask,
    labels_train)

input_ids, attention_mask, labels_test = preprocess(texts_test, labels_test)
# Combine the validation inputs into a TensorDataset
validation_dataset = TensorDataset(
    input_ids,
    attention_mask,
    labels_test)

In [None]:
# size of 16 or 32.
batch_size = 16

# Create the DataLoaders
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size)

validation_dataloader = DataLoader(
            validation_dataset,
            sampler = SequentialSampler(validation_dataset),
            batch_size = batch_size)

In [None]:
try:
    state_dict = torch.load("/content/drive/MyDrive/Colab Notebooks/lingorank-v3.pt")
    print("Loading trained model...")
    model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',
    state_dict=state_dict,
    num_labels = 6)
    print("Trained model loaded!")
except Exception as e:
    print("Enable to load trained model.")
    print(e)
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base',
        num_labels = 6)

Loading trained model...
Trained model loaded!


In [None]:
def predict(texts, model=model):
    with torch.no_grad():
        model.eval()
        input_ids, attention_mask = preprocess(texts)
        retour = model(input_ids, attention_mask=attention_mask)
        return torch.argmax(retour[0], dim=1)

In [None]:
def evaluate(texts, labels, metric='report'):
    predictions = predict(texts)
    if metric == 'report':
        return metrics.classification_report(labels, predictions, zero_division=0)
    elif metric == 'matrix':
        return metrics.confusion_matrix(labels, predictions)

In [None]:
def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # Learning Rate - Default is 5e-5
                  eps = 1e-8 # Adam Epsilon  - Default is 1e-8.
                )

In [None]:
import gc 
gc.collect()
torch.cuda.empty_cache()

In [None]:
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu"  
device = torch.device(dev)  
torch.cuda.set_device(0)

model.to(device)

# Training loop
training_stats = []
                                                                                
# Measure the total training time for the whole run.
total_t0 = time.time()

epochs = 10

# Total number of training steps is [number of batches] x [number of epochs]
# (Note that this is not the same as the number of training samples)
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# This variable will evaluate the convergence on the training
consecutive_epochs_with_no_improve = 0

# Training
for epoch in range(0, epochs):
    
    print("")
    print(f'########## Epoch {epoch} / {epochs} ##########')
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode
    model.train()

    # For each batch of training data
    for step, batch in enumerate(train_dataloader):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = time.time() - t0
            
            # Report progress
            print(f'  Batch {step}  of  {len(train_dataloader)}    Elapsed: {format_time(elapsed)}.')

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the 'device' using the 'to' method
        #
        # 'batch' contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: skills 
        input_id = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Clear any previously calculated gradients before performing a backward pass
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch)
        # the loss (because we provided skills) and the "logits"--the model
        # outputs prior to activation
        loss, logits = model(input_id, 
                             token_type_ids=None, 
                             attention_mask=attention_mask, 
                             labels=labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. 'loss' is a Tensor containing a
        # single value; the '.item()' function just returns the Python value 
        # from the tensor
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches
    avg_train_loss = total_train_loss / len(train_dataloader)   

    if epoch > 0:
        if min([stat['Training Loss'] for stat in training_stats]) <= avg_train_loss:
            # i.e. If there is not improvement
            consecutive_epochs_with_no_improve += 1
        else:
            # If there is improvement
            consecutive_epochs_with_no_improve = 0
            print("Model saved!")
            torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/lingorank-v3.pt")
    
    # Measure how long this epoch took
    training_time = time.time() - t0

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )
    if consecutive_epochs_with_no_improve == 2:
        print("Stop training : The loss has not changed since 2 epochs!")
        break

    gc.collect()
    torch.cuda.empty_cache()

print("Model saved!")
with open('/content/drive/MyDrive/Colab Notebooks/metrics-v3.json', 'w+') as outfile:
    json.dump(training_stats, outfile)
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/lingorank-v3.pt")


########## Epoch 0 / 10 ##########
Training...


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


  Batch 50  of  516    Elapsed: 0:00:44.
  Batch 100  of  516    Elapsed: 0:01:30.
  Batch 150  of  516    Elapsed: 0:02:17.
  Batch 200  of  516    Elapsed: 0:03:05.
  Batch 250  of  516    Elapsed: 0:03:54.
  Batch 300  of  516    Elapsed: 0:04:42.
  Batch 350  of  516    Elapsed: 0:05:31.
  Batch 400  of  516    Elapsed: 0:06:20.
  Batch 450  of  516    Elapsed: 0:07:10.
  Batch 500  of  516    Elapsed: 0:07:59.

  Average training loss: 0.06
  Training epoch took: 494.74662923812866

########## Epoch 1 / 10 ##########
Training...
  Batch 50  of  516    Elapsed: 0:00:49.
  Batch 100  of  516    Elapsed: 0:01:39.
  Batch 150  of  516    Elapsed: 0:02:28.
  Batch 200  of  516    Elapsed: 0:03:17.
  Batch 250  of  516    Elapsed: 0:04:07.
  Batch 300  of  516    Elapsed: 0:04:56.
  Batch 350  of  516    Elapsed: 0:05:45.
  Batch 400  of  516    Elapsed: 0:06:35.
  Batch 450  of  516    Elapsed: 0:07:24.
  Batch 500  of  516    Elapsed: 0:08:13.

  Average training loss: 0.06
  Training

In [None]:
device = torch.device('cpu') 
model.to(device)

# Make predictions on the test dataset
predictions = []
for sentence in texts_test:
    predictions.append(predict([sentence]))

In [None]:
print(metrics.classification_report(predictions, labels_test))

              precision    recall  f1-score   support

           0       0.88      0.80      0.84       156
           1       0.72      0.72      0.72       179
           2       0.63      0.71      0.67       136
           3       0.78      0.61      0.68       201
           4       0.66      0.66      0.66       160
           5       0.60      0.89      0.71        88

    accuracy                           0.71       920
   macro avg       0.71      0.73      0.71       920
weighted avg       0.72      0.71      0.71       920



In [None]:
metrics.confusion_matrix(predictions, labels_test)

array([[125,  25,   4,   1,   1,   0],
       [ 17, 128,  23,   8,   1,   2],
       [  0,  21,  96,  10,   4,   5],
       [  0,   4,  24, 122,  38,  13],
       [  0,   0,   5,  16, 106,  33],
       [  0,   0,   0,   0,  10,  78]])

In [None]:
# Loop for checking specific wrongs predictions 
for x, y, z in zip(predictions, labels_test, texts_test):
    if int(x) == 0:
        if y == 1:
            print(z)

Ainsi, Pierre a le privilège d'admirer chaque jour l'un des monuments les plus visités au monde !
Je vais ensuite prendre ma douche dans ma salle-de-bain.
À bientôt!
Nous avons déménagé en France, parce qu'elle a toujours aimé la culture de ce pays.
Je propose des spécialités de la région lyonnaise.
J'imagine que les week-ends doivent être bien remplis !


In [None]:
device = torch.device('cpu') 
model.to(device)

# Predicts each word perceived difficulty in a sentence.
predict("Dans un premier temps, nous nous demanderons si le travail n’est qu’une activité imposée par l’extérieur contre la volonté de l’Homme, puis dans un deuxième temps nous nous interrogerons sur le fait que le travail est une activité que l’être humain s’impose librement à lui-même.".split(' '))

tensor([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 1])