In [None]:
!pip install datasets
!pip install accelerate -U

In [None]:
import glob
import os
import zipfile

import pandas as pd
import torch
from datasets import Dataset
from google.colab import drive
from google.colab import files
from sklearn.model_selection import train_test_split
# `TF_AutoModelForSequenceClassification` is not a name exported by transformers
# (the import failed); the PyTorch Trainer needs AutoModelForSequenceClassification.
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

In [None]:
# Instantiate the tokenizer that belongs to the French model checkpoint
tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine")


def tokenize_function(examples, tokenizer=tokenizer, max_length=128):
    """
    Tokenize a batch of examples to a fixed sequence length.

    examples : data to tokenize, raw strings are read from the 'text' key ; dict
    tokenizer : tokenizer to call on the text ; defaults to the module-level tokenizer
    max_length : length every sequence is padded / truncated to ; int (default 128)

    Returns the output of the tokenizer call for the batch.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

In [None]:
def _encode_split(comments, labels, tokenizer):
    """Build a tokenized, torch-formatted Hugging Face Dataset from parallel lists."""
    dataset = Dataset.from_dict({"text": comments, "label": labels})

    # Hand the caller's tokenizer to tokenize_function explicitly; without
    # fn_kwargs its default argument is used and the tokenizer passed to
    # model() would be silently ignored.
    dataset = dataset.map(tokenize_function, batched=True,
                          fn_kwargs={"tokenizer": tokenizer})

    # The raw text is not needed any more once it has been tokenized
    dataset = dataset.remove_columns(["text"])

    # Return PyTorch tensors when the Trainer indexes the dataset
    dataset.set_format("torch")
    return dataset


def model(train_comments, train_labels,
          val_comments, val_labels,
          batch_size_train, batch_size_val,
          epochs, tokenizer=tokenizer):
    """
    Fine-tune the "tblard/tf-allocine" sequence classifier with the Hugging Face Trainer.

    train_comments : comments for training ; lst of str
    train_labels : labels for training ; lst of int
    val_comments : comments for validation ; lst of str
    val_labels : labels for validation ; lst of int
    batch_size_train : batch size for training ; int
    batch_size_val : batch size for validation ; int
    epochs : number of epochs ; int
    tokenizer : tokenizer used to encode the comments and handed to the Trainer

    Returns the fine-tuned model and the tokenizer ; tuple

    NOTE(review): the label values must be valid class indices for the
    checkpoint's classification head. The notebook counts labels 0, 1 and 2,
    so confirm that the head of "tblard/tf-allocine" has three outputs.
    """
    # Set the device to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Trainer, `.to(device)` and the "torch" dataset format all require a
    # PyTorch model, so load the PyTorch class. from_tf=True converts
    # TensorFlow weights on load.
    # NOTE(review): presumably the checkpoint only ships TensorFlow weights
    # (the original code used a TF model class) — confirm.
    classifier = AutoModelForSequenceClassification.from_pretrained(
        "tblard/tf-allocine", from_tf=True
    )
    classifier.to(device)

    # Tokenized Hugging Face datasets for both splits
    train_dataset = _encode_split(train_comments, train_labels, tokenizer)
    val_dataset = _encode_split(val_comments, val_labels, tokenizer)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=epochs,              # total number of training epochs
        per_device_train_batch_size=batch_size_train,  # batch size for training
        per_device_eval_batch_size=batch_size_val,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.001,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        evaluation_strategy="steps",     # Evaluate every `eval_steps`
        eval_steps=10,                   # Number of steps between evaluations
        save_steps=10,                   # Save the model every `save_steps`
        save_total_limit=2,              # A checkpoint every 10 steps fills the disk quickly; keep only the newest ones (the best one is retained)
        load_best_model_at_end=True,     # Load the best model at the end of training
        learning_rate=1e-4,              # Set the learning rate
        metric_for_best_model="eval_loss", # Use evaluation loss to check how good our model is performing
        greater_is_better=False,         # lower evaluation loss is better
    )

    # Trainer
    trainer = Trainer(
        model=classifier,                    # model
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,            # evaluation dataset
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Early Stopping for Overfitting
    )

    # Train the model
    trainer.train()

    return classifier, tokenizer



In [None]:
def save_model(model, tokenizer, path):
    """
    Persist a model together with its tokenizer.
    model : object to save, must provide save_pretrained ;
    tokenizer : object to save, must provide save_pretrained ;
    path : directory both are written to ; str
    """
    # The model is written first, then the tokenizer, both to the same directory
    for artifact in (model, tokenizer):
        artifact.save_pretrained(path)

In [None]:
# Open the Colab file upload dialog and select all files to upload.
# The data files are expected to be contained within a zip file.
# If the zip file was already uploaded, just press 'Cancel Upload'.
uploaded = files.upload()

In [None]:
# Base directory of the data. Other cells build file paths by plain string
# concatenation (path + '...'), so keep the trailing slash.
# - On a local machine use a relative path, for example
#   path = 'NLP labelled data preview/english set/'
# - On Google Colab use
#   '/content/'
path = '/content/'

In [None]:
# Unzip the uploaded archive to the place where the CSV files are looked up.
# The loading cell globs path + 'french_data_finetuning/french_data_finetuning/*.csv',
# so the archive has to be extracted into path + 'french_data_finetuning'
# (extracting 'english_data.zip' into a relative 'english_data' folder never
# produced that directory).
# NOTE(review): assumes the uploaded archive is named 'french_data_finetuning.zip'
# and contains a top-level 'french_data_finetuning' folder — confirm.
with zipfile.ZipFile(path + 'french_data_finetuning.zip', 'r') as zip_ref:
    zip_ref.extractall(path + 'french_data_finetuning')

In [None]:
# Load the dataset
all_comments_with_labels = glob.glob(path + 'french_data_finetuning/french_data_finetuning/*.csv')

In [None]:
# Read in the data
all_comments_with_labels = pd.concat([pd.read_csv(f) for f in all_comments_with_labels], ignore_index = True)

In [None]:
# Separate the 'Comment' and 'label' columns of the dataframe into two plain lists
comments, labels = (
    all_comments_with_labels[column].tolist() for column in ('Comment', 'label')
)


In [None]:
# Normalise the element types: every label becomes an int ...
labels = list(map(int, labels))
# ... and every comment becomes a str
comments = list(map(str, comments))


In [None]:
# Assess how many comments we have
print("Number of comments: ", len(comments))
# Assess how many negative (0), neutral (1) and positive (2) comments we have
for class_id, class_name in enumerate(("negative", "neutral", "positive")):
    print(f"Number of {class_name} comments: ", labels.count(class_id))


In [None]:
# Stratified 80/20 split into training and validation data with a fixed seed
split_parts = train_test_split(
    comments,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
train_comments, val_comments, train_labels, val_labels = split_parts

In [None]:
# Size of the training and of the validation set
for split_name, split_comments in (("training", train_comments), ("validation", val_comments)):
    print(f"Number of comments in the {split_name} set: ", len(split_comments))
# Label distribution (0 = negative, 1 = neutral, 2 = positive) within each set
for split_name, split_labels in (("training", train_labels), ("validation", val_labels)):
    for class_id, class_name in enumerate(("negative", "neutral", "positive")):
        print(f"Number of {class_name} comments in the {split_name} set: ", split_labels.count(class_id))


In [None]:
# Fine-tune on the training split, evaluating on the validation split
model_trained, tokenizer_trained = model(
    train_comments=train_comments,
    train_labels=train_labels,
    val_comments=val_comments,
    val_labels=val_labels,
    batch_size_train=16,
    batch_size_val=16,
    epochs=5,
    tokenizer=tokenizer,
)


In [None]:
# Write the fine-tuned model and its tokenizer into the 'french_model' folder below `path`
french_model_dir = path + 'french_model'
save_model(model_trained, tokenizer_trained, french_model_dir)


In [None]:
# Copy the fine-tuned model to Google Drive so it can be downloaded from there

# Mount Google Drive under /content/drive
drive.mount('/content/drive')

# The source directory is hard-coded; it matches the save cell only while path == '/content/'
!cp -r /content/french_model /content/drive/MyDrive/

# Now download the 'french_model' folder from your Google Drive account