# Configuration

In [None]:
# modules needed to reach the project files stored on Google Drive
import sys
from google.colab import drive

# project folders (all of them live under BASE_DIR)
BASE_DIR = '/content/gdrive/MyDrive/AML_project/project/'
UTILS_DIR = f'{BASE_DIR}utils'
DATA_DIR = f'{BASE_DIR}data/'
TOKENIZER_DIR = f'{BASE_DIR}tokenizers/'

# (re)mount the drive so the folders above become reachable
drive.mount('/content/gdrive', force_remount=True)

# let Python import the custom modules stored in the utils folder
sys.path.append(UTILS_DIR)


Mounted at /content/gdrive


# Imports

In [None]:
import numpy as np
import random as rnd
import pandas as pd
import nltk
import string
import unicodedata
import pickle
import re
import io
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# custom imports
from utils import getComments
from contracted_forms import contractions
from emoticons import emoticons_replaces
from preprocessing import Preprocessor

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# NLTK resources fetched for this notebook
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
# Recent NLTK releases (3.9+) load the tokenizer and the POS tagger models
# from these renamed packages: without them word_tokenize and pos_tag raise
# a LookupError. On older releases the extra downloads are simply unused.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

# Load data

## Original data

In [None]:
# read, in this order, the train set, the test set and the test labels
train, test, test_labels = (
    pd.read_csv(DATA_DIR + file_name)
    for file_name in ('train.csv', 'test.csv', 'test_labels.csv')
)

## Data augmentation with back translations

Load the train comments that were translated to German (DE), French (FR) and Spanish (ES) and then back to English (EN).

In [None]:
# load each back translated train set and keep ONLY its not clean comments,
# so that the extra rows reduce the imbalance
train_de, train_fr, train_es = (
    getComments(pd.read_csv(DATA_DIR + file_name), clean=False)
    for file_name in ('train_de.csv', 'train_fr.csv', 'train_es.csv')
)

Add a label to identify each source.

In [None]:
# tag every frame with the language its comments come from
for frame, language in ((train, 'en'), (train_de, 'de'), (train_fr, 'fr'), (train_es, 'es')):
  frame['source'] = language

Augment original data

In [None]:
# stack the back translated comments under the original ones and renumber the rows
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
train = pd.concat([train, train_de, train_fr, train_es], ignore_index=True)

## Remove invalid data

In [None]:
# Missing comments are NaN after read_csv: turn them into '' BEFORE the cast,
# otherwise astype(str) produces the literal text 'nan' and the empty-comment
# filter below never matches them.
train['comment_text'] = train['comment_text'].fillna('').astype(str)

# remove empty comments
train.drop(index=train[train['comment_text'] == ''].index, inplace=True)

test['comment_text'] = test['comment_text'].fillna('').astype(str)

# Rows to discard from the test set: empty comments and invalid instances
# (those whose 'toxic' label is -1).
empty_test_indexes = test[test['comment_text'] == ''].index
unlabeled_indexes = test_labels[test_labels['toxic'] == -1].index

# Drop the union from BOTH frames in a single step, so that test and
# test_labels keep the same rows and no index label is dropped twice
# (dropping an already removed label raises a KeyError).
# NOTE(review): assumes test and test_labels share the same row index, as the
# original index-based drop already did; confirm against the CSV files.
indexes = empty_test_indexes.union(unlabeled_indexes)
test.drop(index=indexes, inplace=True)
test_labels.drop(index=indexes, inplace=True)

# Preprocess text

#### Instantiate a Preprocessor object responsible for all the preprocessing steps

In [None]:
# single Preprocessor instance, reused to clean both the train and the test comments
p = Preprocessor(
    keep_punct='!',
    replace_emojis=False,
    correct_bad_words=False,
)

### Clean text in train

In [None]:
# run every train comment through the Preprocessor and store the cleaned text back
cleaned_train_comments = train['comment_text'].map(p.preprocessDoc)
train['comment_text'] = cleaned_train_comments

In [None]:
# ids of the comments whose text is empty after the cleaning
empty_train_rows = train['comment_text'] == ''
ids = train.loc[empty_train_rows, 'id']
# keep only the rows whose id is not among them
# NOTE(review): rows are matched by 'id', so every row sharing an id with an
# empty comment is removed too; confirm this is intended for augmented rows
train = train[~train['id'].isin(ids)]


### Clean text in test

In [None]:
# clean the test comments with the same Preprocessor used for train
test['comment_text'] = test['comment_text'].map(p.preprocessDoc)

# rows whose text is empty after the cleaning are dropped from test and test_labels alike
empty_test_rows = test['comment_text'] == ''
indexes = test.loc[empty_test_rows].index
for frame in (test, test_labels):
  frame.drop(index=indexes, inplace=True)

### Remove useless columns

In [None]:
# the 'id' column is dropped, in place, from both frames
for frame in (train, test):
  frame.drop(columns=['id'], inplace=True)

# Save the tokenizer for future preprocessing

In [None]:
# store the Preprocessor's tokenizer in the tokenizers folder
tokenizer_path = TOKENIZER_DIR + 'base_tokenizer.pickle'
p.saveTokenizer(tokenizer_path)

# Data augmentation with synonyms

Generate new comments replacing words with synonyms

In [None]:
# keep ONLY the original language rows and, among them, ONLY the not clean
# comments (the ones to augment in order to reduce the imbalance)
is_english = train['source'] == 'en'
train_en = getComments(train[is_english], clean=False)

In [None]:
# Penn Treebank tag prefix -> WordNet part of speech
dict_pos = {'NN': wordnet.NOUN, 'JJ': wordnet.ADJ, 'VB': wordnet.VERB}

# get a synonym of a word depending on the part of speech
def getSynonym(word, pos):
  """Return a random WordNet synonym of `word` for the given part of speech.

  Args:
    word: the word to replace.
    pos: a key of `dict_pos` ('NN', 'JJ' or 'VB').

  Returns:
    A randomly picked lemma (with '_' and '-' turned into spaces) taken from
    the synsets of `word` with a matching part of speech, or `word` itself
    when `pos` is not in `dict_pos` or no other lemma is available.
  """
  target_pos = dict_pos.get(pos)
  if target_pos is None:
    # unknown tag: nothing to look up, keep the word as it is
    return word
  accepted_pos = {target_pos}
  if target_pos == wordnet.ADJ:
    # Synset.pos() reports satellite adjectives as 's', not 'a': accept both,
    # otherwise those adjective synsets are never used
    accepted_pos.add(wordnet.ADJ_SAT)

  syns = []
  # iterate over all possible synonyms
  for syn in wordnet.synsets(word):
    # check that words represent the same POS
    if syn.pos() in accepted_pos:
      # iterate over all possible lemmas
      for lemma in syn.lemmas():
        candidate = lemma.name().replace('_', ' ').replace('-', ' ')
        # the word itself is listed among its own lemmas: skip it, it would
        # make the replacement a no-op
        if candidate.lower() != word.lower():
          syns.append(candidate)
  # pick a random synonym, falling back to the word itself
  return rnd.choice(syns) if syns else word

# replace synonyms in a comment given a replacement probability
def replaceSynonyms(comment, replace_probabilty = 1):
  """Return `comment` with nouns, adjectives and verbs swapped for synonyms.

  The comment is tokenized and POS-tagged; each token whose tag starts with
  'NN', 'JJ' or 'VB' is replaced, with probability `replace_probabilty`, by
  the lower-cased result of `getSynonym`. Every other token is kept as it is.

  Args:
    comment: the text to augment.
    replace_probabilty: probability, in [0, 1], of replacing each eligible token.

  Returns:
    The tokens joined by a single space.
  """
  new_comment = []
  for token, tag in pos_tag(word_tokenize(comment)):
    tag_prefix = tag[0:2]
    # one random draw per token, whatever its tag
    if rnd.random() < replace_probabilty and tag_prefix in ('NN', 'JJ', 'VB'):
      token = getSynonym(token, tag_prefix).lower()
    # always keep the token: appending only inside the branch above would
    # delete the tokens that were not selected for replacement
    new_comment.append(token)
  return ' '.join(new_comment)

# create a new dataframe by substituting words with their synonyms
def augmentWithSynonyms(X, n_times = 1, replace_probability = 1):
  """Build `n_times` synonym-augmented copies of the rows of `X`.

  Args:
    X: DataFrame with a 'comment_text' column; it is not modified.
    n_times: number of augmented copies generated for each row.
    replace_probability: forwarded to `replaceSynonyms`.

  Returns:
    A new DataFrame with len(X) * n_times rows and a fresh 0..n-1 index. It
    has the columns of `X` (plus 'source' if missing); 'comment_text' holds
    the augmented text and 'source' is 'syn_<i>' for the i-th copy.
  """
  total_documents = len(X)*n_times
  columns = list(X.columns)
  if 'source' not in columns:
    columns.append('source')
  original_records = X.to_dict('records')
  # Rows are collected in a list and the frame is built once at the end:
  # DataFrame.append copied the whole frame at every row and was removed in
  # pandas 2.0.
  records = []
  for i in range(n_times):
    for original in original_records:
      print('\r' + '{0}/{1} documents'.format(len(records) + 1, total_documents), end='')
      record = dict(original)
      record['comment_text'] = replaceSynonyms(record['comment_text'], replace_probability)
      record['source'] = 'syn_' + str(i)
      records.append(record)
  return pd.DataFrame(records, columns=columns)


In [None]:
# the synonyms are picked with the `random` module: seed it so that the
# augmented comments are the same at every run of the notebook
rnd.seed(42)
# triplicate toxic comments with synonyms
train_syn = augmentWithSynonyms(train_en, n_times=3)

48636/48636 documents

In [None]:
# augment original data
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
train = pd.concat([train, train_syn], ignore_index=True)

# Add generic 'is_toxic' label

Set `is_toxic` to 1 for every comment that is not clean (and to 0 for all the others), so that it can be used as the target of a binary toxic-comment classifier.

In [None]:
# same labelling for the train set and for the test labels
for frame in (train, test_labels):
  # start with every comment marked as not toxic ...
  frame['is_toxic'] = 0
  # ... then set the flag on the rows that getComments returns as not clean
  toxic_indexes = getComments(frame, clean=False).index
  frame.loc[toxic_indexes, 'is_toxic'] = 1

# Generate undersample of not toxic comments

Basic undersample of the clean comments.

In [None]:
# number of clean comments to keep in the undersampled train set
N_CLEAN_SAMPLES = 90000

train_clean = getComments(train, clean=True)
train_toxic = getComments(train, clean=False)
print('Number of clean comments: ', len(train_clean))
print('Number of not clean comments: ', len(train_toxic))
# sample() raises when asked for more rows than available: cap the request
n_clean = min(N_CLEAN_SAMPLES, len(train_clean))
train_clean = train_clean.sample(n=n_clean, random_state=42)
train_undersample = pd.concat([train_clean, train_toxic], axis=0, ignore_index=True)
# report the number of rows actually kept
print('Clean comments undersampled to ', n_clean)

Number of clean comments:  143261
Number of not clean comments:  113484
Clean comments undersampled to  90000


# Save cleaned data

In [None]:
# output file -> frame; the last entry is the augmented train set with the
# clean comments undersampled
frames_to_save = {
    'train_aug_cleaned.csv': train,
    'test_cleaned.csv': test,
    'test_labels_cleaned.csv': test_labels,
    'train_aug_und_cleaned.csv': train_undersample,
}
# every file is written tab-separated and without the index column
for file_name, frame in frames_to_save.items():
  frame.to_csv(DATA_DIR + file_name, index=False, sep="\t")