In [30]:
# Mount Google Drive at /content/drive; later cells read and write the model
# checkpoint and the training metrics under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers==2.8.0
!pip install sentencepiece

In [2]:
import os
import json
import time
import torch
import string
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertForSequenceClassification, CamembertTokenizer, AdamW, get_linear_schedule_with_warmup
punctuation = string.punctuation
# Credits to Olivier. (2021, January 5). Analyse de sentiments avec CamemBERT. Le Data Scientist. https://ledatascientist.com/analyse-de-sentiments-avec-camembert/

In [34]:
# Read the training set, the test set and the submission template in one go.
dataset, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/train.csv",
        "/content/test.csv",
        "/content/sample_submission.csv",
    )
)
# Quick overview of the training columns and their non-null counts.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sentence    4800 non-null   object
 1   difficulty  4800 non-null   object
dtypes: object(2)
memory usage: 75.1+ KB


In [11]:
# Transpose the A1-C2 scale into the integers 0 to 5
difficulties = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
# One replace call with a level -> position mapping ('A1' -> 0, ..., 'C2' -> 5).
dataset['difficulty'] = dataset['difficulty'].replace(
    {level: position for position, level in enumerate(difficulties)}
)

In [7]:
# Text preprocessing - remove punctuation and lowercase.
# The set of characters to strip is rebuilt from string.punctuation (keeping
# '-' and '~') so this cell gives the same result however often it is re-run
# and does not depend on an earlier value of `punctuation`.
punctuation = string.punctuation.replace('-', '').replace('~', '')


def preprocess_text(text: str) -> str:
    """
    Remove punctuation from a sentence and lowercase it.

    Args
        text (str) : the raw sentence

    Returns
        str : `text` without any character listed in the module-level
        `punctuation` string ('-' and '~' are kept), converted to lowercase
    """
    # The translation table is built from the current value of `punctuation`
    # so later changes to that global are still honoured.
    return text.translate(str.maketrans('', '', punctuation)).lower()

# Overwrite the 'sentence' column with its cleaned version
# (preprocess_text is applied to every row).
dataset['sentence'] = dataset['sentence'].apply(preprocess_text)

AttributeError: ignored

In [19]:
# Pull the sentences and their difficulty labels out of the frame as plain lists.
texts_train, labels_train = (
    dataset[column].values.tolist() for column in ('sentence', 'difficulty')
)

In [13]:
# Tokenizer of the pretrained 'camembert-base' checkpoint; `preprocess` uses it
# to encode every batch of sentences.
TOKENIZER = CamembertTokenizer.from_pretrained(
    'camembert-base',
    do_lower_case=True,
)

In [20]:
def preprocess(raw_texts, labels=None):
    """
    Tokenize raw sentences with TOKENIZER and return them as PyTorch tensors.

    Args
        raw_texts (array-like) : A list of texts in the form of 'str'

        labels (array-like, optional) : the labels, from 0 to 5, one per text.
            A list, a NumPy array or a tensor is accepted.

    Returns
        (input_ids, attention_mask) when `labels` is None, otherwise
        (input_ids, attention_mask, labels) where `labels` is a tensor
    """
    encoded_batch = TOKENIZER.batch_encode_plus(raw_texts,
                                                add_special_tokens=True,
                                                pad_to_max_length=True,
                                                return_attention_mask=True,
                                                return_tensors = 'pt')
    input_ids = encoded_batch['input_ids']
    attention_mask = encoded_batch['attention_mask']

    # Test against None instead of truthiness: `if labels:` raises for a tensor
    # or array holding more than one element, which happens as soon as the
    # caller passes labels that were already converted.
    if labels is None:
        return input_ids, attention_mask

    # as_tensor accepts lists and arrays and returns an existing tensor as is.
    return input_ids, attention_mask, torch.as_tensor(labels)

In [21]:
# Encode the training sentences; note that `labels_train` is rebound here from
# the list of labels to the tensor returned by `preprocess`.
input_ids, attention_mask, labels_train = preprocess(texts_train, labels=labels_train)

# Bundle token ids, attention masks and labels so that each dataset item is an
# (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(input_ids, attention_mask, labels_train)

In [22]:
# Mini-batch size (16 or 32).
batch_size = 16

# Training loader: the RandomSampler draws the samples in random order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

In [23]:
# Resume from the fine-tuned checkpoint stored on Drive when it can be loaded,
# otherwise start from the pretrained 'camembert-base' weights.
try:
    print("Loading trained model...")
    # map_location="cpu": the checkpoint is written by the training cell, which
    # moves the model to the GPU when one is available. Without map_location,
    # loading such a checkpoint on a CPU-only runtime fails and the except
    # branch below would silently discard the trained weights.
    state_dict = torch.load(
        "/content/drive/MyDrive/Colab Notebooks/aicrowd-v1-15e.pt",
        map_location="cpu")
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base',
        state_dict=state_dict,
        num_labels = 6)
    print("Trained model loaded!")
except Exception as e:
    # Deliberate best-effort fallback: any failure (missing file, unreadable or
    # incompatible checkpoint) is reported and training starts from the base model.
    print("Unable to load trained model.")
    print(e)
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base',
        num_labels = 6)

Enable to load trained model.
[Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/aicrowd-v1-15e.pt'


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=508.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445032417.0, style=ProgressStyle(descri…




In [24]:
def predict(texts, model=None):
    """
    Predict the class id of each text.

    Args
        texts (array-like) : A list of texts in the form of 'str'

        model (optional) : the model to run. When omitted, the notebook-level
            `model` is looked up at call time, so re-creating `model` in
            another cell does not leave this function bound to a stale object.

    Returns
        A 1-D CPU tensor holding, for each text, the index of the largest
        logit returned by the model.
    """
    if model is None:
        model = globals()['model']

    # Run on whatever device the model currently lives on; the tensors coming
    # out of `preprocess` are moved there so that a model sitting on the GPU
    # does not fail with a device mismatch.
    device = next(model.parameters()).device

    model.eval()
    with torch.no_grad():
        input_ids, attention_mask = preprocess(texts)
        outputs = model(input_ids.to(device),
                        attention_mask=attention_mask.to(device))
        # outputs[0] holds the logits; bring the result back to the CPU so it
        # can be consumed by code that expects host data.
        return torch.argmax(outputs[0], dim=1).cpu()

In [25]:
def evaluate(texts, labels, metric='report'):
    """
    Score the predictions made for `texts` against the reference `labels`.

    Args
        texts (array-like) : A list of texts in the form of 'str'

        labels (array-like) : the reference class id of each text

        metric (str) : 'report' for sklearn's classification report,
            'matrix' for the confusion matrix

    Returns
        The classification report or the confusion matrix.

    Raises
        ValueError : if `metric` is neither 'report' nor 'matrix'
    """
    # Reject unknown metrics up front instead of silently returning None after
    # the predictions have been computed.
    if metric not in ('report', 'matrix'):
        raise ValueError(f"Unknown metric {metric!r}: expected 'report' or 'matrix'.")

    predictions = predict(texts)
    # Hand sklearn a plain host array, whatever device the tensor is on.
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.detach().cpu().numpy()

    if metric == 'report':
        return metrics.classification_report(labels, predictions, zero_division=0)
    return metrics.confusion_matrix(labels, predictions)

In [26]:
def format_time(elapsed):
    """
    Render a duration given in seconds as an 'h:mm:ss' string.

    The duration is rounded to a whole number of seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [27]:
# AdamW over every parameter of the model.
#   lr  : learning rate, 2e-5 here (the original note quotes 5e-5 as the default)
#   eps : Adam epsilon, 1e-8 here (the original note quotes 1e-8 as the default)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [28]:
# Run a garbage-collection pass, then ask PyTorch to release the CUDA memory it
# is caching but no longer using, before the training cell runs.
import gc 
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Select the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
device = torch.device(dev)
# Only touch the CUDA runtime when a GPU is actually present: an unconditional
# set_device(0) fails on a CPU-only runtime and defeats the fallback above.
if device.type == "cuda":
    torch.cuda.set_device(device)

model.to(device)

# Where the fine-tuned weights and the per-epoch statistics are written.
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/aicrowd-v1-15e.pt"
metrics_path = '/content/drive/MyDrive/Colab Notebooks/metrics-aicrowd-v1.json'

# Training loop
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

epochs = 5

# Total number of training steps is [number of batches] x [number of epochs]
# (Note that this is not the same as the number of training samples)
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Number of epochs in a row whose average training loss did not beat the best
# one seen so far; used as the convergence / early-stopping criterion.
consecutive_epochs_with_no_improve = 0

# Lowest average training loss seen so far. Starting at +inf makes the first
# epoch count as an improvement, so a checkpoint exists as soon as it ends.
best_train_loss = float('inf')

# Training
for epoch in range(0, epochs):

    print("")
    # 1-based epoch number, consistent with the 'epoch' field of training_stats.
    print(f'########## Epoch {epoch + 1} / {epochs} ##########')
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode
    model.train()

    # For each batch of training data
    for step, batch in enumerate(train_dataloader):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            # Elapsed time since the start of the epoch, in seconds.
            elapsed = time.time() - t0

            # Report progress
            print(f'  Batch {step}  of  {len(train_dataloader)}    Elapsed: {format_time(elapsed)}.')

        # Unpack this training batch from our dataloader and copy each tensor
        # to the 'device' using the 'to' method.
        #
        # 'batch' contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        input_id = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Clear any previously calculated gradients before performing a backward pass
        model.zero_grad()

        # Forward pass. Because labels are provided, the model returns the loss
        # first and the "logits" (outputs prior to activation) second. The
        # result is indexed rather than tuple-unpacked so that this line does
        # not depend on the container type the model returns.
        outputs = model(input_id,
                        token_type_ids=None,
                        attention_mask=attention_mask,
                        labels=labels)
        loss, logits = outputs[0], outputs[1]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. 'loss' is a Tensor containing a
        # single value; '.item()' returns it as a Python number.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches
    avg_train_loss = total_train_loss / len(train_dataloader)

    if avg_train_loss < best_train_loss:
        # Improvement: remember the loss, reset the counter and checkpoint.
        best_train_loss = avg_train_loss
        consecutive_epochs_with_no_improve = 0
        torch.save(model.state_dict(), checkpoint_path)
        print("Model saved!")
    else:
        # No improvement over the best epoch so far.
        consecutive_epochs_with_no_improve += 1

    # Measure how long this epoch took
    training_time = time.time() - t0

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    # Same h:mm:ss rendering as the per-batch progress lines.
    print("  Training epoch took: {:}".format(format_time(training_time)))

    # Record all statistics from this epoch (the time is kept in raw seconds).
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )
    if consecutive_epochs_with_no_improve >= 2:
        print("Stop training : The loss has not changed since 2 epochs!")
        break

    gc.collect()
    torch.cuda.empty_cache()

with open(metrics_path, 'w') as outfile:
    json.dump(training_stats, outfile)

# NOTE(review): this final save always stores the weights of the LAST epoch, so
# after an early stop it overwrites the best-loss checkpoint written inside the
# loop. Kept as in the original so the file on disk matches the in-memory model;
# confirm whether the best checkpoint should be preserved instead.
torch.save(model.state_dict(), checkpoint_path)
print("Model saved!")


########## Epoch 0 / 5 ##########
Training...
  Batch 50  of  300    Elapsed: 0:01:38.
  Batch 100  of  300    Elapsed: 0:03:17.
  Batch 150  of  300    Elapsed: 0:04:56.


In [35]:
# Sentences of the test set as a plain Python list, ready to be fed to `predict`.
texts_test = test['sentence'].values.tolist()

In [36]:
# Move the model to the CPU for inference.
device = torch.device('cpu')
model.to(device)

# Make predictions on the test dataset, one sentence per call: every entry of
# `predictions` is the tensor returned by `predict` for a single sentence.
predictions = [predict([sentence]) for sentence in texts_test]

In [47]:
# Transpose 0-5 back to the A1-C2 scale
# Lookup table from class id (0-5) back to the A1-C2 scale.
level_by_class = dict(enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2')))
for index, prediction in enumerate(predictions):
    level = level_by_class.get(int(prediction))
    # A class id outside 0-5 leaves the row untouched.
    if level is not None:
        sample_submission.loc[index, 'difficulty'] = level

In [48]:
# Display the submission table after the predicted levels have been written to it.
sample_submission

Unnamed: 0,difficulty
0,C1
1,C1
2,A2
3,C2
4,A2
...,...
1195,C2
1196,B2
1197,B2
1198,C2


In [51]:
# Write the submission to a CSV file without the DataFrame index.
sample_submission.to_csv('camembert-v1-e15.csv', index=False)