In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from google.colab import files
from google.colab import drive
import glob
import zipfile
import os
import copy
from collections import Counter, defaultdict

In [2]:
# Open a file upload dialog
# Select here all files to upload!
# If already uploaded, just press 'Cancel Upload'
# Note that we are here uploading all the english comments that we have past filtering phase
# aswell as pre filtering phase. We need to keep the original ones, as we later on
# translate them to other languages and have to do specific preprocessing (such as stopwords etc.)
# for the respective language to fine tune them for the models.
# They are contained within a zip file.
uploaded = files.upload()

Saving english_data_original.zip to english_data_original.zip
Saving english_data_processed.zip to english_data_processed.zip


In [40]:
# Set the path to the data
# On local machine use the relative path, for example
# path = 'NLP labelled data preview/english set/'
# On Google Colab use this path
# '/content/'
path = '/content/'

In [41]:
# Unzip the folder
with zipfile.ZipFile(path + 'english_data_processed.zip', 'r') as zip_ref:
    zip_ref.extractall('english_data_processed')
with zipfile.ZipFile(path + 'english_data_original.zip', 'r') as zip_ref:
    zip_ref.extractall('english_data_original')

In [101]:
# Load the dataset
# For Mac users : do english_data/english_data/*.csv
# For Windows users : do english_data/*.csv
all_english_comments = glob.glob(path + 'english_data_processed/english_data_processed/*.csv')
all_english_comments_original = glob.glob(path + 'english_data_original/english_data_original/*.csv')

In [102]:
# Read in the data
all_english_comments = pd.concat([pd.read_csv(f) for f in all_english_comments], ignore_index = True)
all_english_comments_original = pd.concat([pd.read_csv(f) for f in all_english_comments_original], ignore_index = True)

In [103]:
# Remove unneccessary column
all_english_comments = all_english_comments.drop(columns = ['Unnamed: 0'])
all_english_comments_original = all_english_comments_original.drop(columns = ['Unnamed: 0'])

In [104]:
# TODO : CHANGE LOCATION OF THESE STEPS !!

# Remove comments with words like "video" and "channel" as they are associated with comments such as 'great video!'
all_english_comments = all_english_comments[~all_english_comments['Comment'].str.contains('video|channel', case=False)]
all_english_comments_original = all_english_comments_original[~all_english_comments_original['Comment'].str.contains('video|channel', case=False)]
# Remove comments whose length is less than 3 words
#all_english_comments = all_english_comments[all_english_comments['Comment'].str.split().str.len() > 3]
# TODO: can't do that here now when we keep the original ones, bc it will lead to mismatches
# between the original and the processed data


# We first need to concatenate the two dataframes
# We will then drop duplicates
# We will then split them again

all_english_comments_combined = pd.concat([all_english_comments.reset_index(drop=True), all_english_comments_original.reset_index(drop=True)], axis = 1)
all_english_comments_combined.columns = ['Comment processed', 'Comment original']
all_english_comments_combined = all_english_comments_combined.drop_duplicates(subset='Comment processed', keep='first')
# Split the dataframes again
all_english_comments = all_english_comments_combined[['Comment processed']].rename(columns={'Comment processed': 'Comment'})
all_english_comments_original = all_english_comments_combined[['Comment original']].rename(columns={'Comment original': 'Comment'})



In [105]:
# Now we prepare for the labelling phase using a pre-trained state-of-the-art model

# Turn dataframe into a list
comments = all_english_comments['Comment'].tolist()
comments_original = all_english_comments_original['Comment'].tolist()

# Turn all comments into strings
comments = [str(comment) for comment in comments]
comments_original = [str(comment) for comment in comments_original]

In [106]:
# Assure that we have the same amount of comments
assert len(comments) == len(comments_original), 'The number of comments is not equal!'

In [73]:
# Load the different models, trained on different datasets
tokenizer_1 = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model_1 = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

tokenizer_2 = AutoTokenizer.from_pretrained("aychang/roberta-base-imdb")
model_2 = AutoModelForSequenceClassification.from_pretrained("aychang/roberta-base-imdb")

tokenizer_3 = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
model_3 = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")

tokenizer_4 = AutoTokenizer.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student")
model_4 = AutoModelForSequenceClassification.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student")

tokenizer_5 = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")
model_5 = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")



Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [74]:
# Move the models to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_1.to(device)
model_2.to(device)
model_3.to(device)
model_4.to(device)
model_5.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [75]:
# Initialize the pipelines
classifier_1 = pipeline('sentiment-analysis', model=model_1, tokenizer=tokenizer_1)
classifier_2 = pipeline('sentiment-analysis', model=model_2, tokenizer=tokenizer_2)
classifier_3 = pipeline('sentiment-analysis', model=model_3, tokenizer=tokenizer_3)
classifier_4 = pipeline('sentiment-analysis', model=model_4, tokenizer=tokenizer_4)
classifier_5 = pipeline('sentiment-analysis', model=model_5, tokenizer=tokenizer_5)


In [110]:
# Predict sentiment labels for each classifier
predictions_1 = classifier_1(comments)
predictions_2 = classifier_2(comments)
predictions_3 = classifier_3(comments)
predictions_4 = classifier_4(comments)
predictions_5 = classifier_5(comments)

In [111]:
# Extract the scores from the predictions
scores_1 = [prediction['score'] for prediction in predictions_1]
scores_2 = [prediction['score'] for prediction in predictions_2]
scores_3 = [prediction['score'] for prediction in predictions_3]
scores_4 = [prediction['score'] for prediction in predictions_4]
scores_5 = [prediction['score'] for prediction in predictions_5]
# Extract the labels from the predictions
labels_1 = [prediction['label'] for prediction in predictions_1]
labels_2 = [prediction['label'] for prediction in predictions_2]
labels_3 = [prediction['label'] for prediction in predictions_3]
labels_4 = [prediction['label'] for prediction in predictions_4]
labels_5 = [prediction['label'] for prediction in predictions_5]



In [112]:
# Set up the right labels for the different models
# We want to transform all labels to the same format ; all should be numbers where 0 is negative, 1 is neutral and 2 is positive
# Model 1 gives Negative, Neutral, Positive as labels, so we will transform them to 0, 1, 2
labels_1 = [0 if label == 'Negative' else 1 if label == 'Neutral' else 2 for label in labels_1]
# Model 2 gives neg and pos as labels, so we will transform them to 0, 2
labels_2 = [0 if label == 'neg' else 2 for label in labels_2]
# Model 3 gives only POSITIVE, NEGATIVE as labels, so we will transform them to 0, 2
labels_3 = [0 if label == 'NEGATIVE' else 2 for label in labels_3]
# Model 4 gives negative, neutral, positive as labels, so we will transform them to 0,1,2
labels_4 = [0 if label == 'negative' else 1 if label == 'neutral' else 2 for label in labels_4]
# Model 5 gives negative, neutral, positive as labels, so we will transform them to 0,1,2
labels_5 = [0 if label == 'negative' else 1 if label == 'neutral' else 2 for label in labels_5]




In [113]:
# Only keep comments with a confidence score of above 0.80
conf_score = 0.80

high_confidence_comments_1 = []
high_confidence_comments_1_original = []
high_confidence_comments_2 = []
high_confidence_comments_2_original = []
high_confidence_comments_3 = []
high_confidence_comments_3_original = []
high_confidence_comments_4 = []
high_confidence_comments_4_original = []
high_confidence_comments_5 = []
high_confidence_comments_5_original = []


high_confidence_predictions_1 = []
high_confidence_predictions_2 = []
high_confidence_predictions_3 = []
high_confidence_predictions_4 = []
high_confidence_predictions_5 = []


In [114]:
# For model 1 :
for i in range(len(scores_1)):
    if scores_1[i] > conf_score:
        high_confidence_predictions_1.append(labels_1[i])
        high_confidence_comments_1.append(comments[i])
        high_confidence_comments_1_original.append(comments_original[i])


In [115]:
# For model 2 :
for i in range(len(scores_2)):
    if scores_2[i] > conf_score:
        high_confidence_predictions_2.append(labels_2[i])
        high_confidence_comments_2.append(comments[i])
        high_confidence_comments_2_original.append(comments_original[i])

In [116]:
# For model 3 :
for i in range(len(scores_3)):
    if scores_3[i] > conf_score:
        high_confidence_predictions_3.append(labels_3[i])
        high_confidence_comments_3.append(comments[i])
        high_confidence_comments_3_original.append(comments_original[i])

In [None]:
# For model 4 :
for i in range(len(scores_4)):
    if scores_4[i] > conf_score:
        high_confidence_predictions_4.append(labels_4[i])
        high_confidence_comments_4.append(comments[i])
        high_confidence_comments_4_original.append(comments_original[i])

In [None]:
# For model 5 :
for i in range(len(scores_5)):
    if scores_5[i] > conf_score:
        high_confidence_predictions_5.append(labels_5[i])
        high_confidence_comments_5.append(comments[i])
        high_confidence_comments_5_original.append(comments_original[i])


In [118]:
# Combining all comments and their respective labels
combined_comments = high_confidence_comments_1 + high_confidence_comments_2 + high_confidence_comments_3 + high_confidence_comments_4 + high_confidence_comments_5
combined_comments_original = high_confidence_comments_1_original + high_confidence_comments_2_original + high_confidence_comments_3_original + high_confidence_comments_4_original + high_confidence_comments_5_original
combined_predictions = high_confidence_predictions_1 + high_confidence_predictions_2 + high_confidence_predictions_3 + high_confidence_predictions_4 + high_confidence_predictions_5

# Counting the occurrence of each comment
comment_counter = Counter(combined_comments)
comment_original_counter = Counter(combined_comments_original)



# Filtering comments that appear in at least two lists
filtered_comments = {comment for comment, count in comment_counter.items() if count >= 3}
filtered_comments_original = {comment for comment, count in comment_original_counter.items() if count >= 3}

# Dictionary to keep track of labels for each comment
comment_labels = defaultdict(list)
comment_original_labels = defaultdict(list)

# Populate the dictionary with labels for each comment
for comment, label in zip(combined_comments, combined_predictions):
    if comment in filtered_comments:
        comment_labels[comment].append(label)

for comment, label in zip(combined_comments_original, combined_predictions):
    if comment in filtered_comments_original:
        comment_original_labels[comment].append(label)



# Keep only labels that appear at least twice for each comment
final_comments = []
final_labels = []
for comment, labels in comment_labels.items():
    label_count = Counter(labels)
    filtered_labels = [label for label, count in label_count.items() if count >= 3]
    if filtered_labels:
        final_comments.append(comment)
        final_labels.append(filtered_labels)

final_comments_original = []
final_labels_original = []
for comment, labels in comment_original_labels.items():
    label_count = Counter(labels)
    filtered_labels = [label for label, count in label_count.items() if count >= 3]
    if filtered_labels:
        final_comments_original.append(comment)
        final_labels_original.append(filtered_labels)

# The labels are all stored in lists, so we need to flatten them
final_labels = [label for labels in final_labels for label in labels]
final_labels_original = [label for labels in final_labels_original for label in labels]

print("Filtered Comments:", final_comments)
print("Respective Labels:", final_labels)


# Check how many predictions we have in the respective classes
print("We have ", final_labels.count(0), " negative predictions.")
print("We have ", final_labels.count(1), " neutral predictions.")
print("We have ", final_labels.count(2), " positive predictions.")


Respective Labels: [2, 0, 0, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0,

In [119]:
# Check how many comments are left after filtering by confidence score
print("We have ", len(final_comments), " comments left after filtering by confidence score " , conf_score , " .")


We have  5110  comments left after filtering by confidence score  0.8  .


In [120]:
# Save to csv the comments and their label
high_confidence_comments_df = pd.DataFrame(final_comments, columns=['Comment'])
high_confidence_comments_df['Label'] = final_labels
high_confidence_comments_df.to_csv(path + "High_Confidence_Comments_English.csv")
# Save original ones aswell
high_confidence_comments_original_df = pd.DataFrame(final_comments_original, columns=['Comment'])
high_confidence_comments_original_df['Label'] = final_labels_original
high_confidence_comments_original_df.to_csv(path + "High_Confidence_Comments_Original_English.csv")

In [122]:
# Download the file to your local machine (from google colab)

files.download(path + "High_Confidence_Comments_English.csv")
files.download(path + "High_Confidence_Comments_Original_English.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>