In [None]:
import pandas as pd 
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from google.colab import files
from google.colab import drive
import glob
import zipfile
import os

In [None]:
# Open a file upload dialog.
# Select all the files to upload here!
# If the files are already uploaded, just press 'Cancel Upload'.
# Note: we upload all the English comments that passed the filtering phase.
# They are contained within a single zip file.
uploaded = files.upload()

In [None]:
# Set the base path to the data.
# File names are appended to it by plain string concatenation, so it must end
# with a path separator.
# On a local machine use the relative path, for example
# path = 'NLP labelled data preview/english set/'
# On Google Colab use this path
# '/content/'
path = '/content/'

In [None]:
# Unzip the folder
# Extract under `path` (not relative to the current working directory) so the
# files end up exactly where the loading step looks for them
# (path + 'english_data/english_data/*.csv'), also when `path` is not the cwd.
with zipfile.ZipFile(path + 'english_data.zip', 'r') as zip_ref:
    zip_ref.extractall(path + 'english_data')

In [None]:
# Collect the paths of all extracted CSV files
# glob returns its matches in arbitrary (filesystem-dependent) order, so the
# paths are sorted to keep the row order of the combined dataset reproducible.
all_english_comments = sorted(glob.glob(path + 'english_data/english_data/*.csv'))

In [None]:
# Read every CSV file and stack the results into one dataframe with a fresh index
csv_frames = []
for csv_file in all_english_comments:
    csv_frames.append(pd.read_csv(csv_file))
all_english_comments = pd.concat(csv_frames, ignore_index=True)

In [None]:
# TODO : CHANGE LOCATION OF THESE STEPS !!

# Remove comments with words like "video" and "channel" as they are associated with comments such as 'great video!'
# na=False makes missing (NaN) comments count as "no match"; without it the
# mask contains NaN and the `~` negation / boolean indexing fails.
all_english_comments = all_english_comments[~all_english_comments['Comment'].str.contains('video|channel', case=False, na=False)]

# Keep only comments with more than 3 words
# (comments of 3 words or fewer, as well as missing comments, are removed)
all_english_comments = all_english_comments[all_english_comments['Comment'].str.split().str.len() > 3]


In [None]:
# Now we prepare for the labelling phase using a pre-trained state-of-the-art model

# Pull the comment column out of the dataframe as a plain Python list,
# casting every entry to a string along the way
comments = list(map(str, all_english_comments['Comment'].tolist()))

In [None]:
# Load the tokenizer and the sequence-classification model of each checkpoint.
# The checkpoints were trained on different datasets.
checkpoint_1 = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer_1 = AutoTokenizer.from_pretrained(checkpoint_1)
model_1 = AutoModelForSequenceClassification.from_pretrained(checkpoint_1)

# TODO : change to correct model, just for testing purposes
checkpoint_2 = "aychang/roberta-base-imdb"
tokenizer_2 = AutoTokenizer.from_pretrained(checkpoint_2)
model_2 = AutoModelForSequenceClassification.from_pretrained(checkpoint_2)

checkpoint_3 = "siebert/sentiment-roberta-large-english"
tokenizer_3 = AutoTokenizer.from_pretrained(checkpoint_3)
model_3 = AutoModelForSequenceClassification.from_pretrained(checkpoint_3)


In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Move all three models onto the selected device
model_1.to(device)
model_2.to(device)
model_3.to(device)

In [None]:
# Initialize the pipelines
# The device is passed explicitly so that each pipeline puts its input tensors
# on the same device the models were moved to. Without it, depending on the
# transformers version, a pipeline may keep its inputs on the CPU while the
# model sits on the GPU.
classifier_1 = pipeline('sentiment-analysis', model=model_1, tokenizer=tokenizer_1, device=device)
classifier_2 = pipeline('sentiment-analysis', model=model_2, tokenizer=tokenizer_2, device=device)
classifier_3 = pipeline('sentiment-analysis', model=model_3, tokenizer=tokenizer_3, device=device)


In [None]:
# Predict sentiment labels for each classifier
# Over-long comments are truncated to 512 tokens: without truncation a single
# comment that exceeds the maximum sequence length of these RoBERTa-based
# checkpoints makes the whole prediction call fail.
predictions_1 = classifier_1(comments, truncation=True, max_length=512)
predictions_2 = classifier_2(comments, truncation=True, max_length=512)
predictions_3 = classifier_3(comments, truncation=True, max_length=512)

In [None]:
# Split every prediction of each model into its confidence score and its label
scores_1, labels_1 = [], []
for prediction in predictions_1:
    scores_1.append(prediction['score'])
    labels_1.append(prediction['label'])

scores_2, labels_2 = [], []
for prediction in predictions_2:
    scores_2.append(prediction['score'])
    labels_2.append(prediction['label'])

scores_3, labels_3 = [], []
for prediction in predictions_3:
    scores_3.append(prediction['score'])
    labels_3.append(prediction['label'])



In [None]:
# Set up the right labels for the different models
# We want to transform all labels to the same format ; all should be numbers where 0 is negative, 1 is neutral and 2 is positive
# Label names are matched case-insensitively: the exact casing a checkpoint
# reports (e.g. 'negative' vs 'Negative') is easy to get wrong, and with an
# exact-case comparison a mismatch silently turns every label into 2 (positive).
SENTIMENT_LABEL_IDS = {
    'negative': 0, 'neg': 0,
    'neutral': 1,
    'positive': 2, 'pos': 2,
}


def _to_label_id(label):
    """Map a model label name to 0 (negative), 1 (neutral) or 2 (positive).

    Integer inputs are returned unchanged, so re-running this cell on labels
    that were already converted is harmless. An unknown label name raises a
    ValueError instead of being silently counted as positive.
    """
    if isinstance(label, int):
        return label
    key = str(label).strip().lower()
    if key not in SENTIMENT_LABEL_IDS:
        raise ValueError("Unknown sentiment label: " + repr(label))
    return SENTIMENT_LABEL_IDS[key]


# Model 1 gives negative, neutral, positive as labels, so we will transform them to 0, 1, 2
labels_1 = [_to_label_id(label) for label in labels_1]
# Model 2 gives neg and pos as labels, so we will transform them to 0, 2
labels_2 = [_to_label_id(label) for label in labels_2]
# Model 3 gives only NEGATIVE, POSITIVE as labels, so we will transform them to 0, 2
labels_3 = [_to_label_id(label) for label in labels_3]




In [None]:
# Only keep comments with a confidence score of above 0.80
conf_score = 0.80

# One pair of empty (predictions, comments) lists per model
high_confidence_predictions_1, high_confidence_comments_1 = [], []
high_confidence_predictions_2, high_confidence_comments_2 = [], []
high_confidence_predictions_3, high_confidence_comments_3 = [], []


In [None]:
# For model 1 :
# Both lists are rebuilt from scratch (rather than appended to) so that
# re-running this cell does not add every high-confidence comment a second time.
high_confidence_predictions_1 = []
high_confidence_comments_1 = []
for score, label, comment in zip(scores_1, labels_1, comments):
    # Keep the (label, comment) pair only when model 1 is confident enough
    if score > conf_score:
        high_confidence_predictions_1.append(label)
        high_confidence_comments_1.append(comment)



In [None]:
# For model 2 :
# Both lists are rebuilt from scratch (rather than appended to) so that
# re-running this cell does not add every high-confidence comment a second time.
high_confidence_predictions_2 = []
high_confidence_comments_2 = []
for score, label, comment in zip(scores_2, labels_2, comments):
    # Keep the (label, comment) pair only when model 2 is confident enough
    if score > conf_score:
        high_confidence_predictions_2.append(label)
        high_confidence_comments_2.append(comment)

In [None]:
# For model 3 :
# Both lists are rebuilt from scratch (rather than appended to) so that
# re-running this cell does not add every high-confidence comment a second time.
high_confidence_predictions_3 = []
high_confidence_comments_3 = []
for score, label, comment in zip(scores_3, labels_3, comments):
    # Keep the (label, comment) pair only when model 3 is confident enough
    if score > conf_score:
        high_confidence_predictions_3.append(label)
        high_confidence_comments_3.append(comment)

In [None]:
# Keep only the comments that received a high-confidence prediction from all 3 models
# (the intersection of the three high-confidence comment sets).
# The label that is kept for a comment is the one predicted by model 1.
# Comments and labels are collected together in a single pass so that position i
# of `high_confidence_labels` always belongs to position i of
# `high_confidence_comments`. Building the comments from a set and the labels
# from a list would leave the two in different orders (and with different
# lengths when a comment text occurs more than once).
confident_in_2_and_3 = set(high_confidence_comments_2) & set(high_confidence_comments_3)
high_confidence_comments = []
high_confidence_labels = []
seen_comments = set()
for comment, label in zip(high_confidence_comments_1, high_confidence_predictions_1):
    # Each distinct comment text is kept once (first occurrence wins)
    if comment in confident_in_2_and_3 and comment not in seen_comments:
        seen_comments.add(comment)
        high_confidence_comments.append(comment)
        high_confidence_labels.append(label)

In [None]:
# Report how many comments survived the confidence filter
n_kept = len(high_confidence_labels)
print("We have ", n_kept, " comments left after filtering by confidence score " , conf_score , " .")

# Report the class balance of the kept predictions (0 = negative, 1 = neutral, 2 = positive)
n_negative = high_confidence_labels.count(0)
n_neutral = high_confidence_labels.count(1)
n_positive = high_confidence_labels.count(2)
print("We have ", n_negative, " negative predictions.")
print("We have ", n_neutral, " neutral predictions.")
print("We have ", n_positive, " positive predictions.")



In [None]:
# Put the kept comments and their labels side by side in a dataframe
# and write it to a csv file under `path`
output_csv = path + "High_Confidence_Comments_English.csv"
high_confidence_comments_df = pd.DataFrame({'comments': high_confidence_comments})
high_confidence_comments_df['predictions'] = high_confidence_labels
high_confidence_comments_df.to_csv(output_csv)

In [None]:
# Fetch the result file from Google Colab onto your local machine
download_target = path + "High_Confidence_Comments_English.csv"
files.download(download_target)