# Secondary cleaning of English comments
This file applies our secondary cleaning phase. It is based on what we marked as problematic for our sentiment analysis while looking through the scraped English comments. Note that we already performed some cleaning steps while scraping. However, after scraping we still found some problems when looking through the comments. These problems are addressed here.

In [None]:
!pip install textblob

In [1]:
import pandas as pd
import regex as re
import glob
from textblob import TextBlob

In [32]:
# Load the two comment tables produced by the scraping stage:
# the already pre-processed comments and their original counterparts
_processed_csv = 'Comments DB/english/Scraped/english_processed_full_unlabelled_uncleaned.csv'
_original_csv = 'Comments DB/english/Scraped/english_original_full_unlabelled_uncleaned.csv'
all_english_comments = pd.read_csv(_processed_csv)
all_english_comments_original = pd.read_csv(_original_csv)

In [34]:
# Remove comments with words like "video" and "channel" as they are associated with comments such as 'great video!'
# Whole rows are removed here, so exactly the same rows must disappear from both dataframes:
# the de-duplication step further down pairs the two dataframes by row position, and filtering
# each dataframe on its own text would shift that pairing as soon as the two texts disagree.
# NOTE(review): this assumes both CSVs are row-aligned (same length, same order) — confirm.
_unwanted_words = 'video|channel'
# na=True marks rows with missing text as matches, so they are dropped as well
# (negating a result that still contains NaN would fail, and the later regex steps need strings).
_hit_processed = all_english_comments['Comment'].str.contains(_unwanted_words, case=False, na=True).to_numpy()
_hit_original = all_english_comments_original['Comment'].str.contains(_unwanted_words, case=False, na=True).to_numpy()
# A row is kept only if neither its processed nor its original text matched
_keep_rows = ~(_hit_processed | _hit_original)
# .copy() gives independent dataframes, so the column assignments in the following cells
# do not operate on a slice of the unfiltered data (avoids SettingWithCopyWarning).
all_english_comments = all_english_comments[_keep_rows].copy()
all_english_comments_original = all_english_comments_original[_keep_rows].copy()


In [35]:
# Strip every run of digits from the comment text of both dataframes
_digit_run = re.compile(r'\d+')
for _frame in (all_english_comments, all_english_comments_original):
    _frame['Comment'] = _frame['Comment'].apply(lambda text: _digit_run.sub('', text))


In [36]:
# Delete tokens of the form '@something', i.e. an '@' directly followed by one or more word characters
def _drop_at_tokens(text):
    return re.sub(r'@\w+', '', text)


all_english_comments['Comment'] = all_english_comments['Comment'].apply(_drop_at_tokens)
all_english_comments_original['Comment'] = all_english_comments_original['Comment'].apply(_drop_at_tokens)

In [37]:
# Clean-up of the processed comments, in three steps
def _strip_symbols(text):
    # 1) keep only ASCII letters, digits, whitespace and the characters . , ? ! / ( ) -- delete everything else
    kept = re.sub(r'[^a-zA-Z0-9\s.,?!/()]', '', text)
    # 2) turn . , / ( and ) into spaces (so ? and ! are the only punctuation left)
    spaced = re.sub(r'[.,/()]', ' ', kept)
    # 3) collapse runs of whitespace into a single space and trim both ends
    return re.sub(r'\s+', ' ', spaced).strip()


all_english_comments['Comment'] = all_english_comments['Comment'].apply(_strip_symbols)

# The original comments keep their special characters; only their whitespace is normalised
all_english_comments_original['Comment'] = all_english_comments_original['Comment'].apply(
    lambda text: re.sub(r'\s+', ' ', text).strip()
)





In [None]:
# We also fix spelling mistakes, as they can heavily distort our sentiment analysis (we tested this on some models and got completely wrong sentiments due to spelling mistakes)
# We can also apply this directly to the original comments
# This will also be crucial later on for translation into other languages!

# Function to correct spelling mistakes
def correct_spelling(text):
    """Return `text` with its spelling corrected by TextBlob.

    Args:
        text: The comment to correct. Anything that is not a non-empty
            string (e.g. None or a NaN from an empty cell) is returned as is.

    Returns:
        The corrected comment as a str, or the unchanged input when there is
        nothing to correct or the correction fails.
    """
    # Nothing to correct for missing values or empty strings; handle them
    # explicitly instead of relying on an exception being raised and swallowed.
    if not isinstance(text, str) or not text:
        return text
    try:
        return str(TextBlob(text).correct())
    except Exception:
        # Deliberate best-effort: one comment that cannot be corrected must not
        # abort the whole run, so the uncorrected text is kept.
        return text
    
# Run the spelling correction over the comment text of both dataframes
for _frame in (all_english_comments, all_english_comments_original):
    _frame['Comment'] = _frame['Comment'].apply(correct_spelling)



In [4]:
# Drop duplicated comments
# Put the two dataframes side by side (paired by row position), so that removing a
# duplicated processed comment also removes the original comment in the same row
_processed_part = all_english_comments.reset_index(drop=True)
_original_part = all_english_comments_original.reset_index(drop=True)
all_english_comments_combined = pd.concat([_processed_part, _original_part], axis=1)
all_english_comments_combined.columns = ['Comment processed', 'Comment original']
# Keep only the first occurrence of every processed comment text
_repeated = all_english_comments_combined.duplicated(subset='Comment processed', keep='first')
all_english_comments_combined = all_english_comments_combined[~_repeated]
# Split into two single-column dataframes again, each with its column named 'Comment'
_back_to_comment = {'Comment processed': 'Comment', 'Comment original': 'Comment'}
all_english_comments = all_english_comments_combined[['Comment processed']].rename(columns=_back_to_comment)
all_english_comments_original = all_english_comments_combined[['Comment original']].rename(columns=_back_to_comment)

In [5]:
# Write both cleaned dataframes to the 'ReadyForLabelling' folder
# (to_csv is called with its defaults, so the dataframe index is written as well)
_outputs = (
    (all_english_comments, 'Comments DB/english/ReadyForLabelling/english_processed_full_unlabelled.csv'),
    (all_english_comments_original, 'Comments DB/english/ReadyForLabelling/english_original_full_unlabelled.csv'),
)
for _frame, _destination in _outputs:
    _frame.to_csv(_destination)