In [1]:
import os
import re
import sys  
import string
from pathlib import Path
sys.path.insert(0, str(Path().resolve().parents[1]))

import pandas as pd
from utils.data_preprocessing import text_preprocessors

import swifter
from nltk.stem.porter import PorterStemmer
from unicodedata import normalize
from utils.data_preprocessing.emo_unicode import EMOTICONS_EMO
from nltk.corpus import stopwords

In [2]:
# Dataset name plus the project-relative folders for raw and preprocessed data.
DATA_SET = 'jigsaw_toxic_comments'
DATA_DIR, OUT_DIR = Path('data/'), Path('data/preprocessed/')

# Column holding the raw text, and the name given to its cleaned counterpart.
INPUT_COL = 'comment_text'
PREPROCESSED_COL = f'Processed {INPUT_COL}'

# Two directory levels above the current working directory.
CWD = Path.cwd().parent.parent

# Reload the already-imported project module before reading PREPROCESSOR from it.
import importlib
importlib.reload(text_preprocessors);
PREPROCESSOR = text_preprocessors.PREPROCESSOR

In [3]:
# Anchor both data folders at CWD, then read the raw train / test splits.
DATA_DIR, OUT_DIR = CWD / DATA_DIR, CWD / OUT_DIR
train_data, test_data = (
    pd.read_csv(DATA_DIR / DATA_SET / file_name)
    for file_name in ('train.csv', 'test.csv')
)

In [4]:
# Show the first rows of the raw training frame.
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Single Porter stemmer instance reused by `stem`.
STEMMER = PorterStemmer()

# Length thresholds read by remove_small_words / remove_big_words
# (both comparisons there are strict).
MIN_WORD_SIZE, MAX_WORD_SIZE = 2, 20

# Any whitespace-free run that contains "http:" or "https:".
URL_PATTERN = re.compile(r"\S*https?:\S*")

# English stop-word list from NLTK, as a set for fast membership tests.
STOP_WORDS = set(stopwords.words("english"))

# PUNCTUATION / SPECIAL_CHARS are deleted outright;
# PUNCTUATION2 / SPECIAL_CHARS2 are replaced by a single space each.
PUNCTUATION = '!?\'\"[]{}()'
PUNCTUATION2 = '.,-;:'

SPECIAL_CHARS = '=~|_$'
SPECIAL_CHARS2 = '\\/#'

# str.translate tables keyed by code point: None deletes the character,
# ord(' ') substitutes a space.
DIGITS_TRANSLATOR = dict.fromkeys(map(ord, string.digits))
PUNCTUATION_TRANSLATOR = dict.fromkeys(map(ord, PUNCTUATION))
PUNCTUATION2_TRANSLATOR = dict.fromkeys(map(ord, PUNCTUATION2), ord(' '))
SPECIAL_CHARS_TRANSLATOR = dict.fromkeys(map(ord, SPECIAL_CHARS))
SPECIAL_CHARS2_TRANSLATOR = dict.fromkeys(map(ord, SPECIAL_CHARS2), ord(' '))

def remove_punctuation(s: str) -> str:
    """Delete the PUNCTUATION characters, then turn PUNCTUATION2 characters into spaces."""
    without_deleted = s.translate(PUNCTUATION_TRANSLATOR)
    return without_deleted.translate(PUNCTUATION2_TRANSLATOR)

def remove_digits(s: str) -> str:
    """Return `s` after applying DIGITS_TRANSLATOR, which deletes the `string.digits` characters."""
    digit_free = s.translate(DIGITS_TRANSLATOR)
    return digit_free

def replace_new_line_with_space(s: str) -> str:
    """Replace every line feed and carriage return in `s` with a single space."""
    for line_break in ('\n', '\r'):
        s = s.replace(line_break, ' ')
    return s

def convert_lower(s: str) -> str:
    """Return a lower-cased copy of `s`."""
    lowered = s.lower()
    return lowered

def map_emoji(s: str) -> str:
    """Swap each whitespace-separated token that is a key of EMOTICONS_EMO for its mapped value.

    Tokens without an entry are kept unchanged; the result is re-joined with
    single spaces.
    """
    translated = []
    for token in s.split():
        translated.append(EMOTICONS_EMO.get(token, token))
    return ' '.join(translated)

def remove_non_ascii(s: str) -> str:
    """Apply NFKD normalisation to `s`, then drop every character that is not ASCII."""
    decomposed = normalize('NFKD', s)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')

def remove_stop_words(s: str) -> str:
    """Drop every whitespace-separated word that is in STOP_WORDS and re-join with spaces."""
    kept = (word for word in s.split() if word not in STOP_WORDS)
    return ' '.join(kept)

def remove_small_words(s: str) -> str:
    """Keep only the words that are strictly longer than MIN_WORD_SIZE characters."""
    long_enough = (word for word in s.split() if len(word) > MIN_WORD_SIZE)
    return ' '.join(long_enough)

def remove_big_words(s: str) -> str:
    """Keep only the words that are strictly shorter than MAX_WORD_SIZE characters."""
    short_enough = (word for word in s.split() if len(word) < MAX_WORD_SIZE)
    return ' '.join(short_enough)

def remove_urls(s: str) -> str:
    """Delete every match of URL_PATTERN from `s` (surrounding whitespace is left in place)."""
    return re.sub(URL_PATTERN, r'', s)

def remove_special_chars(s: str) -> str:
    """Delete the SPECIAL_CHARS characters, then turn SPECIAL_CHARS2 characters into spaces."""
    without_deleted = s.translate(SPECIAL_CHARS_TRANSLATOR)
    return without_deleted.translate(SPECIAL_CHARS2_TRANSLATOR)

def stem(s: str) -> str:
    """Run STEMMER.stem on every whitespace-separated word and re-join with spaces."""
    return ' '.join(map(STEMMER.stem, s.split()))

In [6]:
# Ordered cleaning pipeline applied by clean_text; every step takes and returns a str.
#
# map_emoji runs before convert_lower and remove_punctuation: emoticon tokens
# such as ":)" or ":D" are made of punctuation and case-sensitive letters, so
# once those two steps have run no token can match an emoticon key any more.
# Running it early also sends the substituted text through the remaining
# normalisation steps (lower-casing, punctuation and stop-word removal, stemming).
# NOTE(review): assumes EMOTICONS_EMO is keyed by raw emoticon strings — confirm.
CLEANERS = [
    replace_new_line_with_space, map_emoji, convert_lower,
    remove_urls, remove_punctuation, remove_non_ascii,
    remove_digits, remove_stop_words, remove_special_chars,
    remove_small_words, remove_big_words, stem
]

def clean_text(s: str) -> str:
    """Feed `s` through every function in CLEANERS, in list order, and return the result."""
    cleaned = s
    for step in CLEANERS:
        cleaned = step(cleaned)
    return cleaned

def process_data(data: pd.DataFrame) -> pd.DataFrame:
    """Add the cleaned version of ``INPUT_COL`` to ``data`` as ``PREPROCESSED_COL``.

    ``clean_text`` is applied to every value of ``data[INPUT_COL]`` through
    swifter. The frame is modified in place: on the first call the new column
    is inserted directly after ``INPUT_COL``; on later calls the existing
    column is overwritten, so re-running the notebook cell does not raise.

    Args:
        data: Frame containing an ``INPUT_COL`` column of strings.

    Returns:
        The same ``data`` object, now carrying ``PREPROCESSED_COL``.
    """
    comments = (
        data[INPUT_COL]
        .swifter.allow_dask_on_strings(enable=True)
        .apply(clean_text)
    )

    if PREPROCESSED_COL in data.columns:
        # DataFrame.insert raises ValueError for an existing column name,
        # so refresh the values in place instead.
        data[PREPROCESSED_COL] = comments
    else:
        data.insert(
            data.columns.get_loc(INPUT_COL) + 1,
            PREPROCESSED_COL,
            comments
        )

    # Returned to honour the annotated return type; callers that ignore the
    # result still see the in-place change.
    return data

In [7]:
# process_data adds the PREPROCESSED_COL column to train_data in place;
# the preview below shows it right after the raw text column.
process_data(train_data)
train_data.head()

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

Unnamed: 0,id,comment_text,Processed comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,explan edit made usernam hardcor metallica fan...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,daww match background colour seemingli stuck t...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",hey man realli tri edit war guy constantli rem...,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",cant make real suggest improv wonder section s...,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",sir hero chanc rememb page that,0,0,0,0,0,0


In [8]:
# Same cleaning pass for the test split; test_data is modified in place.
process_data(test_data)
test_data.head()

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

Unnamed: 0,id,comment_text,Processed comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,bitch rule succes youll ever what hate sad mof...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,rfc titl fine imo
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",sourc zaw ashton lapland
3,00017563c3f7919a,":If you have a look back at the source, the in...",look back sourc inform updat correct form gues...
4,00017695ad8997eb,I don't anonymously edit articles at all.,dont anonym edit articl


In [9]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing both splits.
(OUT_DIR / DATA_SET).mkdir(parents=True, exist_ok=True)
train_data.to_csv(OUT_DIR / DATA_SET / 'train.csv', index=False)
test_data.to_csv(OUT_DIR / DATA_SET / 'test.csv', index=False)

In [28]:
# Read test_labels.csv from the same raw-data folder as train.csv and test.csv.
test_data_label = pd.read_csv(DATA_DIR / DATA_SET / 'test_labels.csv')

In [None]:
# Move 'id' from a column to the index on both frames so the join in the
# next cell aligns rows by id.
# Not idempotent: set_index removes the 'id' column, so running this cell a
# second time raises KeyError.
test_data = test_data.set_index('id')
test_data_label = test_data_label.set_index('id')

In [41]:
# Index-aligned outer join: an id present in only one of the two frames is
# kept, with NaN in the columns that come from the other frame.
# Running this cell twice fails, because the label columns would then
# already exist in test_data (overlapping column names).
test_data = test_data.join(test_data_label, how='outer')

In [42]:
# 'id' is the index of test_data at this point (see the set_index cell above),
# so the index has to be written: index=False would silently drop the id
# column from the saved file.
test_data.to_csv(OUT_DIR / DATA_SET / 'test.csv')