# Processing of Translated and Scraped Comments for Fine-Tuning

This file is used to process/clean the translated comments as well as the comments we are going to use for inference.
The idea is that we want to fine-tune and run inference with the models on data that has been preprocessed in the same way for all languages. Note that we are not using TextBlob for spell correction here, as it doesn't really work for languages other than English: we tested it on German, and the corrected comments no longer made sense.

In [2]:
from nltk.corpus import stopwords
import pandas as pd 
import regex as re
import glob


In [50]:
# Language of the comments being processed. It selects the input/output
# folders, the NLTK stopword list and the language-specific cleaning rules;
# the cleaning code in this file has branches for 'french', 'german',
# 'spanish' and 'italian'.
language = 'french'

In [None]:
# Read in single file
# The path is left empty as a placeholder: fill in the CSV to load before
# running this cell. The later cleaning steps read the 'Comment' column.
# NOTE: the multi-file cell that follows assigns `comments` again, so run
# only one of the two loading cells.
comments = pd.read_csv('')

In [51]:
# Read in multiple files: every CSV in the language's TranslatedFromEnglish
# folder is loaded and stacked into a single DataFrame.
path = 'Comments DB/{}/TranslatedFromEnglish/*.csv'.format(language)

# glob returns its matches in arbitrary order; sorting keeps the row order of
# the combined DataFrame reproducible between runs and machines.
csv_files = sorted(glob.glob(path))

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when the pattern matches nothing
# (e.g. wrong working directory or a misspelled language).
if not csv_files:
    raise FileNotFoundError('No CSV files found matching: {}'.format(path))

# Initialize an empty list to hold the DataFrames
dataframes = []

# Loop over the list of csv files
for file in csv_files:
    # Read the csv file into a DataFrame
    df = pd.read_csv(file)
    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames in the list into a single DataFrame;
# ignore_index=True renumbers the rows instead of repeating each file's index.
comments = pd.concat(dataframes, ignore_index=True)

In [52]:
# After translation, we still have to process the comments into the correct
# format, i.e. the same one we used on the English comments when we labelled
# them. In particular, we still have to:
#   1. remove empty comments
#   2. remove stopwords
#   3. remove all special characters except for . , ? ! / ( )
#      (letters, digits, whitespace and the language's accented letters stay)
#   4. replace . , / ( ) with whitespace, so only ? and ! survive as punctuation

# Characters to strip, per language: everything that is NOT an ASCII letter,
# a digit, whitespace, one of . , ? ! / ( ) or an accented letter of that
# language. French additionally keeps the ligatures œ/æ and Spanish keeps ü,
# so that words such as "cœur" or "pingüino" are not mangled.
CLEANING_PATTERNS = {
    'french': r'[^a-zA-Z0-9\s.,?!/()àâäéèêëîïôöùûüÿçœæÀÂÄÉÈÊËÎÏÔÖÙÛÜŸÇŒÆ]',
    'german': r'[^a-zA-Z0-9\s.,?!/()äöüßÄÖÜ]',
    'spanish': r'[^a-zA-Z0-9\s.,?!/()áéíóúüñÁÉÍÓÚÜÑ]',
    'italian': r'[^a-zA-Z0-9\s.,?!/()àèéìòùÀÈÉÌÒÙ]',
}

# Fail early and explicitly for a language without a cleaning pattern, rather
# than with a NameError on `pattern` after part of the work has been done.
if language not in CLEANING_PATTERNS:
    raise ValueError(
        'No cleaning pattern defined for language {!r}; expected one of {}'.format(
            language, sorted(CLEANING_PATTERNS)
        )
    )
pattern = CLEANING_PATTERNS[language]

# Remove empty comments. The explicit copy makes the filtered frame
# independent of the original, so the column assignments below cannot trigger
# pandas' chained-assignment (SettingWithCopy) warning.
comments = comments[comments['Comment'].notnull()].copy()

# Load the stopword list once and keep it in a set: calling
# stopwords.words(language) for every single word is needlessly slow.
# NOTE(review): the comparison is case-sensitive, so a stopword written with
# different casing than in the NLTK list is kept - confirm this matches the
# preprocessing used for the English comments.
stop_words = set(stopwords.words(language))
comments['Comment'] = comments['Comment'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

# Apply the regex pattern to clean the comments
comments['Comment'] = comments['Comment'].apply(lambda x: re.sub(pattern, '', x))

# Replace . , / ( and ) with whitespace
comments['Comment'] = comments['Comment'].apply(lambda x: re.sub(r'[.,/()]', ' ', x))




In [53]:
# Extra clean-up for inference comments.
# The translated comments do not need it: it was already applied to the
# English comments, and therefore carries over to their translations.


def _squeeze_whitespace(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()


def _drop_numbers_and_mentions(text):
    """Remove digit runs, then '@something' tokens, then tidy the whitespace."""
    without_digits = re.sub(r'\d+', '', text)
    # NOTE(review): the character-filter patterns used earlier in this file do
    # not list '@' among the kept characters, so if that step ran first the
    # '@' is already gone and this substitution has nothing left to match -
    # confirm the intended order of the two steps.
    without_mentions = re.sub(r'@\w+', '', without_digits)
    return _squeeze_whitespace(without_mentions)


# Normalise whitespace before the keyword filter
comments['Comment'] = comments['Comment'].apply(_squeeze_whitespace)

# Comments containing the words for "video" or "channel" are associated with
# generic praise such as 'great video!' and are dropped entirely.
# Note we also need to remove them from the original, because in this case we
# are removing whole comments!
generic_praise_keywords = {
    'french': 'vidéo|canal',
    'german': 'video|kanal',
    'spanish': 'video|canal',
    'italian': 'video|canale',
}
if language in generic_praise_keywords:
    mentions_keyword = comments['Comment'].str.contains(
        generic_praise_keywords[language], case=False
    )
    comments = comments[~mentions_keyword]

# Strip numbers and '@something' words, then remove trailing and excessive
# whitespace left behind by those removals
comments['Comment'] = comments['Comment'].apply(_drop_numbers_and_mentions)

In [54]:
# Write the cleaned comments to the language's Finetuning folder;
# index=False keeps the DataFrame index out of the CSV.
output_path = 'Comments DB/{}/Finetuning/{}_combined_ready_for_finetuning.csv'.format(language, language)
comments.to_csv(output_path, index=False)

