In [1]:
"""
Load the dataset and select relevant columns.
"""

import pandas as pd
import os

# Columns needed downstream, in the order the resulting frame should have.
columns = ['id', 'comment_text', 'toxicity', 'severe_toxicity']

# Build the path to the data file relative to the current working directory
# (os.path.abspath('') resolves to it).
dirname = os.path.abspath('')
filename = os.path.join(dirname, '..', '..', 'data', 'all_data.csv')

# Load the data, parsing only the needed columns instead of reading the whole
# file and discarding the rest afterwards.
df = pd.read_csv(filename, usecols=columns)

# usecols ignores the order of the list it is given, so index again to pin
# the column order.
df = df[columns]


Time elapsed for code block 1: 19.087236881256104 s.


In [11]:
"""
Performing autocorrect on dataset.
"""
from autocorrect import Speller

# fast=True trades accuracy for speed: words with more than one error are
# left uncorrected. fast=False would be preferable but is far too slow for
# this dataset.
spell = Speller(fast=True)

# Cast every value to str, then run the speller over each comment.
# .map() performed slightly better than .apply() in tests.
df['comment_text'] = (
    df['comment_text']
    .astype(str)
    .map(spell)
)


Time elapsed for code block 2: 1374.0026388168335 s.


In [44]:
"""
Remove punctuation, convert to lowercase.
"""

# Label of the last row to include in the working subset.
N_ROWS = 1000000

# Take the rows labelled 1..N_ROWS (label slices include both end points).
# A slice, unlike a list of explicit labels, does not raise KeyError when the
# frame has fewer rows than N_ROWS. .copy() gives df1 its own data so the
# column assignments below never write through to df.
# NOTE(review): the selection starts at label 1, so row 0 is left out --
# confirm this is intended.
df1 = df.loc[1:N_ROWS].copy()

# The vectorised .str methods leave missing values untouched instead of
# raising on them.
df1['comment_text'] = (
    df1['comment_text']
    .str.replace(r'[^\w\s]+', '', regex=True)  # remove punctuation
    .str.lower()  # make lowercase
)




In [None]:
"""
Download necessary nltk data.
"""
from nltk import download

# "wordnet" backs the WordNetLemmatizer and "stopwords" the stop-word list
# used further down; "omw-1.4" is presumably required alongside wordnet --
# verify against the installed nltk version.
for resource in ("wordnet", "omw-1.4", "stopwords"):
    download(resource)

In [45]:
"""
Tokenize messages
"""

import re  # not used in this cell; kept because the stop-word cell calls re.sub

# Split each message on runs of whitespace. With no separator given,
# .str.split() also ignores leading/trailing whitespace, so no empty-string
# tokens are produced (re.split(r'\s+', ...) yields '' at either end of
# padded text, and [''] for an empty message).
df1['comment_text'] = df1['comment_text'].str.split()


In [46]:
"""
Remove Stop words
"""

from nltk.corpus import stopwords

# NLTK's English stop words with punctuation stripped, using the same pattern
# that was applied to the comment text.
stop_words = {re.sub(r'[^\w\s]+', '', word) for word in stopwords.words('english')}


def drop_stop_words(tokens):
    """Return the tokens that are not stop words, in their original order."""
    return [token for token in tokens if token not in stop_words]


df1['comment_text'] = df1['comment_text'].apply(drop_stop_words)


In [47]:
"""
Lemmatize data
"""

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()


def lemmatize_token(token):
    """Lemmatize with the lemmatizer's default part of speech, then again as a verb."""
    base = wnl.lemmatize(token)
    return wnl.lemmatize(base, pos='v')


df1['comment_text'] = df1['comment_text'].apply(
    lambda tokens: [lemmatize_token(token) for token in tokens]
)


In [None]:
"""
Export the dataframe to a csv file.
"""

import os

# Build the output path relative to the current working directory.
dirname = os.path.abspath('')
filename = os.path.join(dirname, '..', '..', 'data', 'all_data_proccessed.csv')

# Write df1, the frame the cleaning cells operated on; df only holds the
# autocorrected text and none of the later processing.
# Note: comment_text holds token lists at this point, so to_csv writes each
# list as its string representation.
df1.to_csv(path_or_buf=filename)
