In [1]:
"""
Load the dataset and keep only the labeled rows of the relevant columns.
"""

import pandas as pd
import os

# Columns used by the rest of the notebook, in the order they should appear.
USED_COLUMNS = ['comment_text', 'toxicity', 'severe_toxicity']

# Something was wrong in the dataset and only certain intervals were labeled.
# Each pair is an inclusive (first_row, last_row) range of row positions.
LABELED_ROW_RANGES = [
    (0, 10563),
    (65648, 76239),
    (131624, 142097),
    (262632, 273122),
    (328454, 338864),
    (394284, 404764),
    (460040, 470467),
    (525920, 536568),
    (591589, 602305),
]

# os.path.abspath('') resolves to the current working directory; the data
# folder is looked up two directory levels above it.
dirname = os.path.abspath('')
input_file = os.path.join(dirname, '..', '..', 'data', 'all_data.csv')

# Parse only the columns that are needed instead of loading every column and
# discarding most of them. usecols keeps the file's own column order, so the
# frame is re-indexed with USED_COLUMNS to pin the expected order.
df = pd.read_csv(input_file, usecols=USED_COLUMNS)[USED_COLUMNS]

# Expand the inclusive ranges into one flat list of row positions and keep
# only those rows (the original row labels are preserved in the index).
labeled_positions = [
    position
    for first_row, last_row in LABELED_ROW_RANGES
    for position in range(first_row, last_row + 1)
]
df = df.iloc[labeled_positions]


In [2]:
"""
Run spelling autocorrection over every comment.
"""
from autocorrect import Speller

# fast=False would be preferable but is far too slow for this dataset;
# with fast=True, words containing more than one error are left unfixed.
spell = Speller(fast=True)

# Cast to str first so every value can be passed to the speller.
# Series.map() performed slightly better than Series.apply() in tests.
comment_strings = df['comment_text'].astype(str)
df['comment_text'] = comment_strings.map(spell)


In [3]:
"""
Strip punctuation from the comments, then convert them to lowercase.
"""

# Delete every run of characters that are neither word characters nor whitespace.
without_punctuation = df['comment_text'].str.replace(r'[^\w\s]+', '', regex=True)

# Lowercase after stripping, one comment at a time.
df['comment_text'] = without_punctuation.map(str.lower)


In [None]:
"""
Download the necessary nltk data.
"""
from nltk import download

# Resources fetched here: WordNet, the Open Multilingual WordNet data and the
# stop word lists.
for resource_name in ("wordnet", "omw-1.4", "stopwords"):
    download(resource_name)


In [4]:
"""
Tokenize messages: turn each comment string into a list of word tokens.
"""

import re  # no longer used for tokenizing, but the stop-word cell below still calls re.sub

# str.split() with no arguments splits on runs of whitespace and ignores
# leading/trailing whitespace, so no empty-string tokens are produced.
# re.split(r'\s+', x) returned '' tokens for comments that started or ended
# with whitespace (common once punctuation has been stripped) and [''] for an
# empty comment; an empty or all-whitespace comment now becomes [].
df['comment_text'] = df['comment_text'].map(str.split)


In [5]:
"""
Drop English stop words from every token list.
"""

from nltk.corpus import stopwords

# Load nltk's English stop words and strip their punctuation with the same
# pattern that was applied to the comments, so that both sides compare equal.
stop_words = {
    re.sub(r'[^\w\s]+', '', stop_word)
    for stop_word in stopwords.words('english')
}


def _without_stop_words(tokens):
    """Return the tokens that are not in stop_words, keeping their order."""
    return [token for token in tokens if token not in stop_words]


df['comment_text'] = df['comment_text'].apply(_without_stop_words)


In [6]:
"""
Lemmatize every token of every comment.
"""

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Lemmatize each token as a noun first and then as a verb."""
    lemmas = []
    for token in tokens:
        noun_lemma = wnl.lemmatize(token)  # first pass: lemmatizer's default (noun)
        lemmas.append(wnl.lemmatize(noun_lemma, pos='v'))  # second pass: verb
    return lemmas


df['comment_text'] = df['comment_text'].apply(_lemmatize_tokens)


In [9]:
"""
Write the processed dataframe out as a csv file.
"""

import os

# Build the output path from the current working directory: the file goes
# into the data folder two directory levels up.
dirname = os.path.abspath('')
output_file = os.path.join(
    dirname, '..', '..', 'data', 'all_data_processed.csv'
)

# NOTE(review): comment_text holds Python lists of tokens at this point, so
# the csv will presumably contain their string form - confirm that whatever
# reads this file parses the column accordingly.
df.to_csv(output_file)
