In [25]:
import pandas as pd
import numpy as np

from textblob import TextBlob
from nltk.tokenize import WordPunctTokenizer, word_tokenize, StanfordSegmenter, sent_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords

import re, os, sys, string, itertools
from collections import defaultdict, Counter
import unicodedata
from tqdm import tqdm, tqdm_notebook
import sentencepiece as spm

In [2]:
def translate_quoted_words(series):
    """ 
    This functions replaces words which are written in quotes
    with quoted word e.g. "Damn" with "quoted "Damn""
     """
    reg = re.compile(r'((?<=")"[\w\s]+"(?="))')
    print("Total occurences of quoted words {}".format(series.str.count(reg).sum()))
    series = series.str.replace(reg, 'quoted \\1')
    return series

In [17]:
def remove_ips(series):
    """
    Remove all ip addresses
    """
    series = series.copy()
    reg = re.compile(r'(([0-9]{1,}\.){2,}[0-9]{1,})')
    print("Total unique ip addresses in data are {}".format(series.str.extract(reg).nunique()))
    series = series.str.replace(reg, ' ')
    return series

In [18]:
def remove_trailing_dates(series):
    """
    Remove date times at the end
    """
    series = series.copy()
    return series.str.replace("([0-9]{1,2}:[0-9]{1,2},{0,1}\s[0-9]{1,2}\s[a-zA-Z]{3,}\s[0-9]{2,4}\s\((utc|UTC)\))", " ")

In [33]:
def trim_repitions(series, thresh=5):
    """
    Check if comment has repitions, if more than than repitions then trim at 10 words
    """
    series = series.copy()
    total_words = series.str.count("\w+")
    unq_words = series.apply(lambda x: len(np.unique(x.split(' '))))
    
    rep_inds = total_words/unq_words > thresh
    print("Total comments with high repitions are {}".format(sum(rep_inds)))
    print("Some examples of high reps are {}".format(series.loc[rep_inds].sample(5).values))
    
    rep_inds_unq = iter(unq_words.loc[rep_inds])
    series.loc[rep_inds] = series[rep_inds].str.split(' ').str.slice(0, next(rep_inds_unq)).str.join(' ')
    
    print("Some corrected sample are {}".format(series.loc[rep_inds].sample(5).values))
    return series

In [46]:
def break_oovwords(series, vocab_filename, sp_file):
    """
    Breaks the words which are out of vocab using sentence piece and returns new text
    """
    series = series.copy()
    dict_words = set([o.rstrip().rsplit(' ')[0] for o in open(vocab_filename)])
    sp = spm.SentencePieceProcessor()
    sp.Load(sp_file)
    series = series.apply(lambda x: ' '.join(itertools.chain.from_iterable([sp.EncodeAsPieces(word) 
                                            if word not in dict_words else [word] 
                                            for word in x.split(' ') ])))
    series = series.str.replace("▁", "")
    return series

In [47]:
def preprocess_text(series, remove_ip=True, remove_date_stamps=True, tag_quoted=True, remove_puncts=True, lower=True,
                    remove_digits=True, remove_nonchars=True,
                   break_oov=True, break_vocab_file="", break_sp_file="", trim_reps=True):
    series = series.copy()
    series = series.str.replace(r"\\n{1,}", " line ")
    
    if remove_ips:
        series = remove_ips(series)
        
    if remove_date_stamps:
        series = remove_trailing_dates(series)
        
    if tag_quoted:
        series = translate_quoted_words(series)
        
    if remove_puncts:
        series = series.str.replace("'", "")
        series = series.str.translate(str.maketrans({s:" " for s in string.punctuation}))
        
    if lower:
        series = series.str.lower()
        
    if remove_digits:
        series = series.str.replace(r"\d", "")
    
    if remove_nonchars:
        series = series.str.replace(r"[^a-zA-Z0-9.,\"!]+", " ")
        
    if break_oov:
        series = break_oovwords(series, break_vocab_file, break_sp_file)
        
    if trim_reps:
        series = trim_repitions(series, thresh=10)
        
    return series

In [48]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
train.head()

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [49]:
embed_file ="../utility/glove.42B.300d.txt"
sp_file = "../utility/en.wiki.bpe.op200000.model"

train.comment_text = preprocess_text(train.comment_text, break_vocab_file=embed_file, break_sp_file=sp_file)
train.comment_text.sample(10).values

  import sys


Total unique ip addresses in data are 0    5565
1     282
dtype: int64
Total occurences of quoted words 57052
Total comments with high repitions are 369
Some examples of high reps are ['i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am gay i am

array([' \n\n wikipedia has a new administrator  \n thanks thank you for voicing your opinion in my rfa which passed with supports opposes and neutrals thanks for your support i really appreciate it i hope to exceed expectations if you have any advice please feel free to let me know thanks again',
       'your statement that she has “ ref used to talk about the matter all together ” is patently false in response to a t v show host ’ s question “ how are you responding to the rumors about you and eddie cibrian ” leann replied “ you know what everything is so not black and white ” and on another show regis and kelly the same response “ everything people read is not ... it ’ s not as easy as black and white ” also noteworthy is her statement on her blog that “ this is a difficult time for me and my loved ones ” – again with a complete absence of any denial of the affair no one has seriously argued that it was not her or eddie in the video and i submit that any attempt to do so would simpl