In [1]:
import pandas as pd
from google.colab import files
from google.colab import drive
import regex as re
import glob
import zipfile
from textblob import TextBlob

In [2]:
# Open a file upload dialog and select all files to upload.
# If the files are already uploaded, just press 'Cancel Upload'.
# We upload all English comments both after the filtering phase (processed)
# and before it (original). The originals must be kept because they are later
# translated to other languages, and each language needs its own preprocessing
# (such as stopwords etc.) to fine-tune the comments for the models.
# Both sets are contained within zip files.
# NOTE(review): the unzip cell below opens 'english_data_processed.zip' and
# 'english_data_original.zip', while the saved output of this cell shows
# '*_test.zip' names - confirm that the uploaded archive names match.
uploaded = files.upload()

Saving english_data_original_test.zip to english_data_original_test.zip
Saving english_data_processed_test.zip to english_data_processed_test.zip


In [17]:
# Base directory that holds the data.
# It must end with a path separator, because the following cells build file
# names by plain string concatenation (path + 'file name').
# On a local machine use the relative path, for example
# path = 'NLP labelled data preview/english set/'
# On Google Colab use this path
# '/content/'
path = '/content/'

In [30]:
# Unzip both archives into folders of the same name located under `path`.
# The target folder is built from `path` as well (instead of being relative to
# the current working directory), because the loading cell searches for the
# CSV files under path + '<folder>/...'. On Google Colab both locations are
# identical; with a local relative `path` they are not.
for archive_name in ('english_data_processed', 'english_data_original'):
    with zipfile.ZipFile(path + archive_name + '.zip', 'r') as zip_ref:
        zip_ref.extractall(path + archive_name)

In [41]:
# Collect the CSV files of both data sets
# For Mac users : do english_data/english_data/*.csv
# For Windows users : do english_data/*.csv
# glob returns matches in arbitrary order, so both lists are sorted: the rows of
# the two data sets are paired by position later on, which requires both sets
# to be read in the same, reproducible file order.
# NOTE(review): this assumes corresponding files sort identically in both folders - confirm.
all_english_comments = sorted(glob.glob(path + 'english_data_processed/english_data_processed/*.csv'))
all_english_comments_original = sorted(glob.glob(path + 'english_data_original/english_data_original/*.csv'))

In [42]:
# Read every CSV file and stack the rows of each data set into a single
# dataframe with a fresh, continuous index
processed_frames = [pd.read_csv(csv_file) for csv_file in all_english_comments]
all_english_comments = pd.concat(processed_frames, ignore_index=True)
original_frames = [pd.read_csv(csv_file) for csv_file in all_english_comments_original]
all_english_comments_original = pd.concat(original_frames, ignore_index=True)

In [43]:
# Remove the unnecessary 'Unnamed: 0' column from both frames
# (presumably the row index that was stored when the CSV files were written)
unneeded_column = 'Unnamed: 0'
all_english_comments = all_english_comments.drop(unneeded_column, axis='columns')
all_english_comments_original = all_english_comments_original.drop(unneeded_column, axis='columns')

In [22]:
# After scraping we saw that some problematic comments are still left, which we will now address

In [44]:
# Remove comments with words like "video" and "channel", as they are associated
# with comments such as 'great video!'.
# Whole comments are removed here, so the original frame has to lose the very
# same rows as the processed frame.
unwanted_words = 'video|channel'
# na=True treats a missing comment as a match, so empty rows are dropped as well
# (the cleaning cells below apply string functions that cannot handle NaN).
processed_hits = all_english_comments['Comment'].str.contains(unwanted_words, case=False, na=True).to_numpy(dtype=bool)
original_hits = all_english_comments_original['Comment'].str.contains(unwanted_words, case=False, na=True).to_numpy(dtype=bool)
if len(processed_hits) == len(original_hits):
    # The frames are paired by position: drop a pair as soon as either version
    # matches, otherwise the two frames drift out of alignment.
    processed_hits = original_hits = processed_hits | original_hits
# else: the frames cannot be paired by position, so each one is filtered on its own.
# .copy() gives independent frames, so assigning to 'Comment' in the following
# cells does not raise a SettingWithCopyWarning.
all_english_comments = all_english_comments[~processed_hits].copy()
all_english_comments_original = all_english_comments_original[~original_hits].copy()

In [45]:
# Strip every run of digits from the processed comments
digit_run = re.compile(r'\d+')
all_english_comments['Comment'] = all_english_comments['Comment'].map(lambda comment: digit_run.sub('', comment))

In [46]:
# Drop mentions, i.e. every '@' that is directly followed by word characters
# (the '@' and those word characters are removed)
mention = re.compile(r'@\w+')
all_english_comments['Comment'] = all_english_comments['Comment'].map(lambda comment: mention.sub('', comment))


In [47]:
cleaned_comments = all_english_comments['Comment']
# Remove every character that is not an ASCII letter, a digit, whitespace or a period
cleaned_comments = cleaned_comments.map(lambda comment: re.sub(r'[^a-zA-Z0-9\s.]', '', comment))
# Turn periods into spaces, so that the neighbouring words are not merged
cleaned_comments = cleaned_comments.map(lambda comment: re.sub(r'\.', ' ', comment))
# Collapse each run of whitespace into one space and trim both ends
cleaned_comments = cleaned_comments.map(lambda comment: re.sub(r'\s+', ' ', comment).strip())
all_english_comments['Comment'] = cleaned_comments




In [None]:
# We also fix spelling mistakes, as they can heavily distort our sentiment analysis (we tested this on some models and got completely wrong sentiments due to spelling mistakes)
# The correction can also be applied directly to the original comments
# It will also be crucial later on for the translation into other languages!

# Spelling correction helper
def correct_spelling(text):
    """Return `text` with its spelling corrected by TextBlob.

    The correction is best effort: if anything goes wrong while correcting
    (any `Exception`), the input is handed back unchanged.
    """
    try:
        return str(TextBlob(text).correct())
    except Exception:
        return text
    
# Correct the spelling of the processed as well as the original comments
for comment_frame in (all_english_comments, all_english_comments_original):
    comment_frame['Comment'] = comment_frame['Comment'].map(correct_spelling)



In [48]:
# Drop duplicated comments while keeping each processed/original pair together:
# 1. put the two dataframes side by side (rows are paired by position),
# 2. drop the rows whose processed comment is a duplicate,
# 3. split the result into the two dataframes again.

processed_comments = all_english_comments.reset_index(drop=True)
original_comments = all_english_comments_original.reset_index(drop=True)
# Pairing by position is only meaningful for frames of equal length; a
# column-wise concat of unequal frames would silently pad the shorter one with
# NaN and produce wrong pairs.
if len(processed_comments) != len(original_comments):
    raise ValueError(
        'Cannot pair comments by position: {} processed vs. {} original rows'.format(
            len(processed_comments), len(original_comments)
        )
    )
all_english_comments_combined = pd.concat([processed_comments, original_comments], axis = 1)
all_english_comments_combined.columns = ['Comment processed', 'Comment original']
all_english_comments_combined = all_english_comments_combined.drop_duplicates(subset='Comment processed', keep='first')
# Split the dataframes again
all_english_comments = all_english_comments_combined[['Comment processed']].rename(columns={'Comment processed': 'Comment'})
all_english_comments_original = all_english_comments_combined[['Comment original']].rename(columns={'Comment original': 'Comment'})

In [49]:
# Save both dataframes as CSV files inside `path`.
# The download step reads the files from path + '<file name>', so they are
# written to that same location instead of the current working directory.
all_english_comments.to_csv(path + 'all_english_comments.csv')
all_english_comments_original.to_csv(path + 'all_english_comments_original.csv')

In [50]:
# Send both result files from Google Colab to the local machine
for csv_name in ("all_english_comments.csv", "all_english_comments_original.csv"):
    files.download(path + csv_name)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>