In [None]:
!pip install datasets
!pip install accelerate -U

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import regex as re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from google.colab import files

In [None]:
# Fetch the NLTK stopword lists that P_data_cleaning reads via stopwords.words(language)
nltk.download('stopwords')

In [None]:
def remove_emoji(comment):
    """Strip emoji and other symbol characters from a string.

    The character class is the one published at
    https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    and is reproduced here unchanged.

    comment : text to clean ; str
    returns : comment with every run of matching characters removed ; str

    NOTE(review): the class is far broader than "emoji". The range
    U+24C2..U+1F251 contains most of the Basic Multilingual Plane above
    U+24C2 (CJK ideographs, Hangul, ...) and U+10000..U+10FFFF contains every
    supplementary-plane character, so text in those scripts is removed too.
    """
    # Order and content match the source gist; several ranges overlap.
    stripped_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows (gist calls it "chinese char")
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # duplicate of the previous range, kept from the gist
        "\U000024C2-\U0001F251",  # very broad: see NOTE in the docstring
        "\U0001f926-\U0001f937",  # face palm .. shrug
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",          # female sign .. male sign
        "\u2600-\u2B55",          # misc symbols .. heavy large circle
        "\u200d",                 # zero width joiner
        "\u23cf",                 # eject symbol
        "\u23e9",                 # fast-forward symbol
        "\u231a",                 # watch
        "\ufe0f",                 # variation selector-16
        "\u3030",                 # wavy dash
    )
    matcher = re.compile("[" + "".join(stripped_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', comment)

In [None]:
def P_data_cleaning(data, language, labelling):
    """Function to clean our data.

    data : comments to clean ; pd.Series of str
    language : language of the comments, used as the NLTK stopword list name
               (input in lowercase, e.g. 'english') ; str
    labelling : True if the text is meant for manual labelling, in which case
                punctuation & stopwords are kept; False strips both ; bool
    returns : cleaned comments ; pd.Series. NaN entries and comments longer
              than 350 characters are dropped, so the result can have fewer
              rows than `data`; surviving rows keep their original index.
    """
    strip_noise = not labelling

    # REMOVE NAN ENTRIES
    data = data.dropna()

    # REMOVE COMMENTS THAT EXCEED CERTAIN LENGTH (350 for now)
    data = data[data.str.len() <= 350]

    # FOR GERMAN DATA : Change ö , ä , ü to oe, ae, ue
    # NOTE(review): only the lowercase umlauts are transliterated; uppercase
    # Ö/Ä/Ü and ß are left as they are at this point.
    data = (
        data.str.replace("ö", "oe", regex=False)
            .str.replace("ä", "ae", regex=False)
            .str.replace("ü", "ue", regex=False)
    )

    # REMOVE NAMES FROM ANSWERS (in youtube comments scraper answers stored by @@)
    data = data.str.replace(r'@@\w+', '', regex=True)

    # REMOVING PUNCTUATION
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave all punctuation in place.
    if strip_noise:
        data = data.str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)

    # REMOVING EMOJIS
    data = data.apply(remove_emoji)

    # LOWERCASE
    data = data.str.lower()

    # REMOVING STOPWORDS
    if strip_noise:
        # Load the stopword list once (as a set) instead of once per word.
        stop_words = set(stopwords.words(language))
        data = data.apply(
            lambda text: ' '.join(word for word in text.split() if word not in stop_words)
        )

    return data

In [None]:
# Instantiate the tokenizer once at module level; it is the default `tokenizer`
# argument of tokenize_function and DistilBertModel defined in the following cells.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples, tokenizer=tokenizer):
    """
    Tokenize the 'text' entry of a batch of examples.
    examples : batch holding the raw comments under the key 'text' ; dict
    tokenizer : tokenizer to use (defaults to the module-level one) ; DistilBertTokenizer
    returns : result of the tokenizer call, requested with padding to the
              maximum length and truncation at 128 tokens
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(examples['text'], **tokenizer_options)

In [None]:
def _build_torch_dataset(comments, labels, tokenizer):
    """Turn parallel lists of comments and labels into a tokenized Dataset in torch format."""
    dataset = Dataset.from_dict({"text": comments, "label": labels})

    # Hand the tokenizer over explicitly; without fn_kwargs tokenize_function
    # silently falls back to its module-level default tokenizer.
    dataset = dataset.map(tokenize_function, batched=True, fn_kwargs={"tokenizer": tokenizer})

    # The raw text is not needed for training once it has been tokenized
    dataset = dataset.remove_columns(["text"])

    # Set the format of the dataset to PyTorch tensors
    dataset.set_format("torch")
    return dataset


def DistilBertModel(train_comments, train_labels,
                    val_comments, val_labels,
                    batch_size_train, batch_size_val,
                    epochs, num_labels, tokenizer=tokenizer):
    """
    Function to train a DistilBert model on the data.
    train_comments : comments for training ; lst of str
    train_labels : labels for training ; lst of int
    val_comments : comments for validation ; lst of str
    val_labels : labels for validation ; lst of int
    batch_size_train : batch size for training ; int
    batch_size_val : batch size for validation ; int
    epochs : number of epochs ; int
    num_labels : number of labels (for denoiser 2, for classification 3) ; int
    tokenizer : tokenizer to use for both datasets and the Trainer ; DistilBertTokenizer
    returns : (model, tokenizer) - the fine-tuned model and the tokenizer that was passed in
    """

    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

    # Setup the Hugging Face datasets, tokenized with the tokenizer given by the caller
    train_dataset = _build_torch_dataset(train_comments, train_labels, tokenizer)
    val_dataset = _build_torch_dataset(val_comments, val_labels, tokenizer)

    # Training arguments
    # NOTE(review): warmup_steps is fixed at 500; on a small dataset the run may
    # have fewer optimizer steps than that - confirm this is intended.
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=epochs,              # total number of training epochs
        per_device_train_batch_size=batch_size_train,  # batch size for training
        per_device_eval_batch_size=batch_size_val,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
    )

    # Trainer
    # NOTE(review): eval_dataset is handed over, but this function never calls
    # trainer.evaluate() and sets no evaluation option in TrainingArguments.
    trainer = Trainer(
        model=model,                         # model
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,            # evaluation dataset
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    return model, tokenizer

In [None]:
def save_model(model, tokenizer, path):
    """
    Write the model and its tokenizer to the same location.
    model : model to save ; DistilBertForSequenceClassification
    tokenizer : tokenizer to save ; DistilBertTokenizer
    path : path both are saved to ; str
    """
    # Model first, then tokenizer - both via their save_pretrained method
    for artifact in (model, tokenizer):
        artifact.save_pretrained(path)

In [None]:
# Open a file upload dialog (google.colab `files.upload()`).
# Select all files to upload here!
# If the files are already uploaded, just press 'Cancel Upload'.
uploaded = files.upload()

In [None]:
# Set the path to the data. Keep the trailing slash: the value is concatenated
# directly with the file names when the CSV files are loaded.
# On a local machine use the relative path, for example
# path = 'NLP labelled data preview/english set/'
path = '/content/'

In [None]:
# Load the datasets
# Note: the file labelled on a Mac is separated with ',' (the read_csv default);
# for the files labelled on Windows we have to specify ';' as the separator.
english_test_dataset_labelled = pd.read_csv(path +'Giuseppe.csv')
english_test_dataset_labelled_2 = pd.read_csv(path + 'andrea.csv', sep= ';')
english_test_dataset_labelled_3 = pd.read_csv(path + 'giovanni.csv', sep= ';')


In [None]:
# Concatenate the three datasets into one frame; ignore_index=True gives the rows a fresh 0..n-1 index
english_test_dataset_labelled = pd.concat([english_test_dataset_labelled, english_test_dataset_labelled_2, english_test_dataset_labelled_3], ignore_index= True)


In [None]:
# Remove comments that are not labelled (rows with a NaN value in column 'Label')
english_test_dataset_labelled = english_test_dataset_labelled.dropna(subset=['Label'])

In [None]:
# Do preprocessing steps
# P_data_cleaning drops rows (NaN comments and comments longer than 350 characters),
# so its result can have fewer rows than the frame. Assigning it back aligns on the
# index and leaves NaN in 'Comment' for the dropped rows; remove those rows so that
# no NaN comment (with a valid label) reaches the training lists.
cleaned_comments = P_data_cleaning(english_test_dataset_labelled['Comment'], 'english', False)
english_test_dataset_labelled = (
    english_test_dataset_labelled
    .assign(Comment=cleaned_comments)
    .dropna(subset=['Comment'])
)


In [None]:
# Separate the two columns of the dataframe, 'Comment' and 'Label', into two parallel lists
english_test_dataset_labelled_comments = english_test_dataset_labelled['Comment'].tolist()
english_test_dataset_labelled_labels = english_test_dataset_labelled['Label'].tolist()

In [None]:
# Now separate into noise and no noise.
# We will refer to no noise as 0 and noise as 1: every 'N' label becomes 1, every other label 0.
# The integer labels are derived from the DataFrame column instead of rewriting the
# list from itself, so re-running this cell gives the same result (rewriting the
# list in place would turn every label into 0 on a second run).
english_test_dataset_labelled_labels = [
    1 if label == 'N' else 0
    for label in english_test_dataset_labelled['Label']
]

In [None]:
# Check the amount of noisy (label 1) and non-noisy (label 0) comments
print('Amount of noisy comments:', english_test_dataset_labelled_labels.count(1))
print('Amount of non-noisy comments:', english_test_dataset_labelled_labels.count(0))

In [None]:
# For easy testing purposes, I'll just take as many non-noisy comments as noisy comments.
# If we label more, we could think about stuff like data augmentation (synonym replacement, etc.)
# Else, oversampling the noisy comments would be a good idea. But care for overfitting.

# Comments and labels are parallel lists; a length mismatch would pair them up wrongly
assert len(english_test_dataset_labelled_comments) == len(english_test_dataset_labelled_labels)

# Separate noisy and non-noisy comments
english_test_dataset_labelled_comments_no_noise = [
    comment
    for comment, label in zip(english_test_dataset_labelled_comments, english_test_dataset_labelled_labels)
    if label == 0
]
english_test_dataset_labelled_comments_noise = [
    comment
    for comment, label in zip(english_test_dataset_labelled_comments, english_test_dataset_labelled_labels)
    if label != 0
]

# Keep the first n comments of each class, where n is the size of the smaller class.
# Truncating both lists keeps the classes balanced even if the noisy comments are the majority.
balanced_class_size = min(
    len(english_test_dataset_labelled_comments_no_noise),
    len(english_test_dataset_labelled_comments_noise),
)
english_test_dataset_labelled_comments_no_noise = english_test_dataset_labelled_comments_no_noise[:balanced_class_size]
english_test_dataset_labelled_comments_noise = english_test_dataset_labelled_comments_noise[:balanced_class_size]

In [None]:
# Check if the amount of noisy and non-noisy comments is the same after balancing
print('Amount of noisy comments:', len(english_test_dataset_labelled_comments_noise))
print('Amount of non-noisy comments:', len(english_test_dataset_labelled_comments_no_noise))

In [None]:
# Combine both classes into one comments list and one parallel labels list:
# all non-noisy comments (label 0) first, followed by all noisy comments (label 1)
comments = english_test_dataset_labelled_comments_no_noise + english_test_dataset_labelled_comments_noise
labels = [0]*len(english_test_dataset_labelled_comments_no_noise) + [1]*len(english_test_dataset_labelled_comments_noise)

In [None]:
# Split the data into training (80%) and validation (20%) sets, stratified on the labels;
# random_state is fixed so the split is the same on every run
train_comments, val_comments, train_labels, val_labels = train_test_split(
    comments, labels, test_size=0.2, random_state=42, stratify=labels)

In [None]:
# Report how many comments ended up in each split
n_train_comments = len(train_comments)
n_val_comments = len(val_comments)
print("We have {} training comments".format(n_train_comments))
print("We have {} validation comments".format(n_val_comments))

# Report the class counts per split to confirm both classes are evenly distributed
for split_header, split_labels in (("Training set:", train_labels), ("Validation set:", val_labels)):
    print(split_header)
    print("Noisy comments:", split_labels.count(1))
    print("Non noisy comments:", split_labels.count(0))

In [None]:
# Using our evened out dataset, fine-tune the binary (num_labels = 2) denoising model for one epoch,
# then save the model together with its tokenizer under the given directory name
model_trained, tokenizer_trained = DistilBertModel(train_comments, train_labels, val_comments, val_labels, batch_size_train = 16, batch_size_val = 16, num_labels = 2, epochs = 1, tokenizer = tokenizer)
save_model(model_trained, tokenizer_trained, "denoising_model_fine_tuned_distilbert_english")