In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pdb
import os, re
from collections import OrderedDict

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_summary import DataFrameSummary

from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as spacy_STOPWORDS
spacy_en = spacy.load('en')

from wordcloud import WordCloud, STOPWORDS

# pandas and plotting config
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', -1)

In [3]:
PATH = 'data'

os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)

## Examine the data

In [4]:
raw_train_df = pd.read_csv(f'{PATH}/train.csv')
test_df = pd.read_csv(f'{PATH}/test.csv')
sample_subm_df = pd.read_csv(f'{PATH}/sample_submission.csv')

Let's **look at the data**

In [5]:
display(raw_train_df.head(3))
display(test_df.head(3))
display(sample_subm_df.head(3))

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0


Unnamed: 0,id,comment_text
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO."
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5


## Clean

First, we'll add a "none" column, replace missing values, and get rid of newline characters

In [6]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

In [7]:
raw_train_df.comment_text.fillna("<na>", inplace=True)
test_df.comment_text.fillna("<na>", inplace=True)

Lots of emojis, let's clean them up using this code:  https://www.kaggle.com/prashantkikani/pooled-gru-with-preprocessing

We'll include this list of various replacements from here (https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view) as well

In [8]:
repl = {
    "&lt;3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "m": "am",
    "i'll" : "i will",
    "its" : "it is",
    "it's" : "it is",
    "'s" : " is",
    "that's" : "that is",
    "weren't" : "were not",
}

#https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
# Aphost lookup dict
appos = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    "i'd" : "I would",
    "i'd" : "I had",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll":" will",
    "didn't": "did not",
    "tryin'": "trying"
}

repl = { **appos, **repl }  # repl becomes a merged dictionary with values from repl replacing those from appos

# display(repl)

Lets try this clean function from https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda

In [9]:
repl_keys = [i for i in repl.keys()]

def clean(comment):
    # convert to lower case , so that Hi and hi are the same
    comment = comment.lower()
    
    # remove \n 
    # torchtext cannot read the .csv files correctly if there are newline characters, so replace with " "
    comment = re.sub("\\n"," ",comment)
    
    # remove leaky elements like ip,user
    comment = re.sub("\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}"," ",comment)
    
    # removing usernames
    comment = re.sub("\[\[.*\]","",comment)
    
    # do any substitutions
    comment = " ".join([ repl[w] if (w in repl_keys) else w for w in comment.split() ])
    
    return(comment)

In [10]:
%time raw_train_df['comment_text_cleaned'] = raw_train_df.comment_text.apply(lambda x: clean(x))
print('train cleaned ...')

%time test_df['comment_text_cleaned'] = test_df.comment_text.apply(lambda x: clean(x))
print('test cleaned ...')

CPU times: user 19.8 s, sys: 64 ms, total: 19.9 s
Wall time: 19.9 s
train cleaned ...
CPU times: user 17.4 s, sys: 52 ms, total: 17.5 s
Wall time: 17.5 s
test cleaned ...


Feature engineering (adding columns that may be useful for modeling and analysis)

In [11]:
def add_cols(df, txt_col='text'):
    rows = []
    
    for index, row in df.iterrows():
        cols = OrderedDict({
            'token_count': 0.0,
            'named_entity_count': 0.0,
            'word_count': 0.0,
            'unique_word_count': 0.0,
            'unique_word_pct': 0.0,
            'upper_word_count': 0.0,
            'avg_word_len': 0.0,
            'char_count': 0.0,
            'unique_char_count': 0.0,
            'upper_char_count': 0.0,
            
            'stopwords_count': 0.0,
            'punctuation_count': 0.0,
            'punctuation_pct': 0.0,
            'symbol_count': 0.0,
            'number_count': 0.0,
            'alpha_count': 0.0,
            'noun_count': 0.0,
            'verb_count': 0.0,
            'adj_count': 0.0,
            'proper_name_count': 0.0,

            # named entity types
            'named_entity_count': 0.0,
            'named_entity_person_count': 0.0,
            'named_entity_norp_count': 0.0,
            'named_entity_facility_count': 0.0,
            'named_entity_org_count': 0.0,
            'named_entity_gpe_count': 0.0,
            'named_entity_non_gpe_loc_count': 0.0,
            'named_entity_product_count': 0.0,
            'named_entity_event_count': 0.0,
            'named_entity_woa_count': 0.0,
            'named_entity_lang_count': 0.0,
            'named_entity_date_count': 0.0,
            'named_entity_time_count': 0.0,
            'named_entity_money_count': 0.0,
            'named_entity_quantity_count': 0.0
        })
        
        # grab tokens, entities, and word tokens
        txt = row[txt_col]
        tokens = spacy_en(txt)
        ents = tokens.ents
        words = txt.split()
        
        # get counts
        cols['token_count'] = len(tokens)
        cols['ent_count'] = len(ents)
        cols['word_count'] = len(words)
        cols['unique_word_count'] = len(set(words))
        cols['upper_word_count'] = len([w for w in words if (w.isupper())])
        cols['avg_word_len'] = np.mean([len(w) for w in words])
        
        cols['char_count'] = len(txt)
        cols['unique_char_count'] = len(set(txt))
        cols['upper_char_count'] = len([c for c in txt if (c.isupper())])
        
        for t in tokens:
            if (t.is_punct): cols['punctuation_count'] += 1
            if (t.is_stop): cols['stopwords_count'] += 1
            if (t.pos_ == 'SYM'): cols['symbol_count'] += 1
            if (t.pos_ == 'NUM'): cols['number_count'] += 1
            if (t.is_alpha): cols['alpha_count'] += 1
                
            if (t.pos_ == 'NOUN') : cols['noun_count'] += 1
            if (t.pos_ == 'VERB') : cols['verb_count'] += 1
            if (t.pos_ == 'ADJ') : cols['adj_count'] += 1
            if (t.pos_ == 'PROPN') : cols['proper_name_count'] += 1
                
        for ent in ents:
            if (ent.label_ == 'PERSON'): cols['proper_name_count'] += 1
            if (ent.label_ == 'NORP'): cols['named_entity_norp_count'] += 1
            if (ent.label_ == 'FACILITY'): cols['named_entity_facility_count'] += 1
            if (ent.label_ == 'ORG'): cols['named_entity_org_count'] += 1
            if (ent.label_ == 'GPE'): cols['named_entity_gpe_count'] += 1
            if (ent.label_ == 'LOC'): cols['named_entity_non_gpe_loc_count'] += 1
            if (ent.label_ == 'PRODUCT'): cols['named_entity_product_count'] += 1
            if (ent.label_ == 'EVENT'): cols['named_entity_event_count'] += 1
            if (ent.label_ == 'WORK_OF_ART'): cols['named_entity_woa_count'] += 1
            if (ent.label_ == 'LANGUAGE'): cols['named_entity_lang_count'] += 1
            if (ent.label_ == 'DATE'): cols['named_entity_date_count'] += 1
            if (ent.label_ == 'TIME'): cols['named_entity_time_count'] += 1
            if (ent.label_ == 'MONEY'): cols['named_entity_money_count'] += 1
            if (ent.label_ == 'QUANTITY'): cols['named_entity_quantity_count'] += 1

        cols['unique_word_pct'] = (cols['unique_word_count'] + 1) / (cols['word_count'] + 1)
        cols['unique_char_pct'] = (cols['unique_char_count'] + 1) / (cols['char_count'] + 1)
        cols['punctuation_pct'] = (cols['punctuation_count'] + 1) / (cols['token_count'] + 1)
        
        rows.append(cols)
        
    return pd.DataFrame(rows, columns=cols.keys())

In [12]:
%time raw_train_df = pd.concat([raw_train_df, add_cols(raw_train_df, 'comment_text')], axis=1)
print('train feature engineering done ...')

%time test_df = pd.concat([test_df, add_cols(test_df, 'comment_text')], axis=1)
print('test feature engineering done ...')

CPU times: user 1h 50min 59s, sys: 3min 2s, total: 1h 54min 1s
Wall time: 59min 35s
train feature engineering done ...


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


CPU times: user 1h 45min 57s, sys: 2min 46s, total: 1h 48min 44s
Wall time: 56min 55s
test feature engineering done ...


Let's see what we've done

In [13]:
display(raw_train_df.head(3))
display(test_df.head(3))

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none,comment_text_cleaned,token_count,named_entity_count,word_count,unique_word_count,unique_word_pct,upper_word_count,avg_word_len,char_count,unique_char_count,upper_char_count,stopwords_count,punctuation_count,punctuation_pct,symbol_count,number_count,alpha_count,noun_count,verb_count,adj_count,proper_name_count,named_entity_person_count,named_entity_norp_count,named_entity_facility_count,named_entity_org_count,named_entity_gpe_count,named_entity_non_gpe_loc_count,named_entity_product_count,named_entity_event_count,named_entity_woa_count,named_entity_lang_count,named_entity_date_count,named_entity_time_count,named_entity_money_count,named_entity_quantity_count,ent_count,unique_char_pct
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0,1,"explanation why the edits made under my username hardcore metallica fan were reverted? they were not vandalisms, just closure on some gas after i voted at new york dolls fac. and please do not remove the template from the talk page since i am retired now.",50,0.0,43,41,0.954545,2,5.162791,264,49,17,17.0,3.0,0.078431,0.0,0.0,42.0,8.0,10.0,1.0,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.188679
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0,1,"d'aww! he matches this background colour i am seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",28,0.0,17,17,1.0,1,5.588235,112,39,8,2.0,9.0,0.344828,0.0,3.0,13.0,5.0,3.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2,0.353982
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0,1,"hey man, i am really not trying to edit war. it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",48,0.0,42,39,0.930233,0,4.571429,233,29,4,20.0,4.0,0.102041,0.0,0.0,42.0,9.0,9.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.128205


Unnamed: 0,id,comment_text,comment_text_cleaned,token_count,named_entity_count,word_count,unique_word_count,unique_word_pct,upper_word_count,avg_word_len,char_count,unique_char_count,upper_char_count,stopwords_count,punctuation_count,punctuation_pct,symbol_count,number_count,alpha_count,noun_count,verb_count,adj_count,proper_name_count,named_entity_person_count,named_entity_norp_count,named_entity_facility_count,named_entity_org_count,named_entity_gpe_count,named_entity_non_gpe_loc_count,named_entity_product_count,named_entity_event_count,named_entity_woa_count,named_entity_lang_count,named_entity_date_count,named_entity_time_count,named_entity_money_count,named_entity_quantity_count,ent_count,unique_char_pct
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,","yo bitch ja rule is more succesful then you will ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",84,0.0,72,61,0.849315,0,4.111111,367,30,4,36.0,7.0,0.094118,0.0,0.0,76.0,19.0,16.0,10.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.084239
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO.","== from rfc == the title is fine as it is, imo.",17,0.0,12,11,0.923077,1,3.0,50,24,7,4.0,2.0,0.166667,3.0,0.0,10.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.490196
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """,""" == sources == * zawe ashton on lapland — / """,18,0.0,12,10,0.846154,0,2.916667,54,25,4,1.0,5.0,0.315789,4.0,0.0,5.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.472727


Save preprocessed data

In [14]:
raw_train_df.to_csv(f'{PATH}/train_preproc.csv', index=None)
test_df.to_csv(f'{PATH}/test_preproc.csv', index=None)