In [14]:
import sys
import pandas as pd
import numpy as np
import re as regex
import nltk
from collections import Counter

%matplotlib inline

# Loading of Raw Data

In [2]:
trainData = pd.read_csv('../data/train.csv')
testData = pd.read_csv('../data/test.csv')

In [3]:
trainData.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
testData.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


# Accessing elements

In [5]:
testData.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [6]:
print(trainData.loc[trainData.loc[:, "toxic"] == 1,].shape[0])
print(trainData.loc[trainData.loc[:, "severe_toxic"] == 1,].shape[0])
print(trainData.loc[trainData.loc[:, "obscene"] == 1,].shape[0])
print(trainData.loc[trainData.loc[:, "threat"] == 1,].shape[0])
print(trainData.loc[trainData.loc[:, "insult"] == 1,].shape[0])
print(trainData.loc[trainData.loc[:, "identity_hate"] == 1,].shape[0])

9237
965
5109
305
4765
814


# Create BoW

## Cleaning: Removing URLs

In [7]:
class TwitterCleanuper:

    def iterate(self):
        for cleanup_method in [self.remove_urls,
                               self.remove_usernames,
                               self.remove_na,
                               self.remove_special_chars,
                               self.remove_numbers]:
            yield cleanup_method

    @staticmethod
    def remove_by_regex(tweets, regexp):
        tweets.loc[:, "comment_text"].replace(regexp, "", inplace=True)
        return tweets

    def remove_urls(self, tweets):
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"http.?://[^\s]+[\s]?"))

    def remove_na(self, tweets):
        return tweets[tweets["comment_text"] != "Not Available"]

    def remove_special_chars(self, tweets):  # it unrolls the hashtags to normal words
        for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$",
                                                                     "@", "%", "^", "*", "(", ")", "{", "}",
                                                                     "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                     "!", "?", ".", "'",
                                                                     "--", "---", "#", "\n"]):
            tweets.loc[:, "comment_text"].replace(remove, "", inplace=True)
        return tweets

    def remove_usernames(self, tweets):
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?"))

    def remove_numbers(self, tweets):
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))

In [8]:
class Cleaner():
    
    def __init__(self):
        self.processed_data = 0
    
    def cleanup(self, data, cleanuper):
        t = data
        for cleanup_method in cleanuper.iterate():
            t = cleanup_method(t)
        self.processed_data = t
        
    def getData(self):
        return self.processed_data

In [9]:
cleanup = TwitterCleanuper()
cleaner = Cleaner()
cleaner.cleanup(trainData, cleanup)
data = cleaner.getData()
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,Nonsense kiss off geek what I said is true I...,1,0,0,0,0,0
1,27450690,Please do not vandalize pages as you did with...,0,0,0,0,0,0
2,54037174,Points of interest I removed the points of in...,0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [10]:
class TwitterData_TokenStem():
    def __init__(self, data):
        self.processed_data = data
        
    def stem(self, stemmer=nltk.PorterStemmer()):
        def stem_and_join(row):
            row["comment_text"] = list(map(lambda str: stemmer.stem(str.lower()), row["comment_text"]))
            return row
        self.processed_data = self.processed_data.apply(stem_and_join, axis=1)

    def tokenize(self, tokenizer=nltk.word_tokenize):
        def tokenize_row(row):
            row["comment_text"] = tokenizer(row["comment_text"])
            row["tokenized_text"] = [] + row["comment_text"]
            return row
        self.processed_data = self.processed_data.apply(tokenize_row, axis=1)
        
    def getData(self):
        return self.processed_data

In [13]:
stemmer = TwitterData_TokenStem(data)
stemmer.tokenize()
stemmer.stem()
data = stemmer.getData()
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenized_text
0,22256635,"[nonsens, kiss, off, geek, what, i, said, is, ...",1,0,0,0,0,0,"[Nonsense, kiss, off, geek, what, I, said, is,..."
1,27450690,"[pleas, do, not, vandal, page, as, you, did, w...",0,0,0,0,0,0,"[Please, do, not, vandalize, pages, as, you, d..."
2,54037174,"[point, of, interest, i, remov, the, point, of...",0,0,0,0,0,0,"[Points, of, interest, I, removed, the, points..."
3,77493077,"[ask, some, hi, nation, is, a, racial, offenc,...",0,0,0,0,0,0,"[Asking, some, his, nationality, is, a, Racial..."
4,79357270,"[the, reader, here, is, not, go, by, my, say, ...",0,0,0,0,0,0,"[The, reader, here, is, not, going, by, my, sa..."


In [16]:
words = Counter()
for idx in data.index:
    words.update(data.loc[idx, "comment_text"])

words.most_common(5)

[('the', 293177),
 ('to', 177604),
 ('of', 134454),
 ('and', 132668),
 ('a', 128282)]

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Matthew\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
stopwords=nltk.corpus.stopwords.words("english")
whitelist = ["n't", "not"]
for idx, stop_word in enumerate(stopwords):
    if stop_word not in whitelist:
        del words[stop_word]
words.most_common(5)

[('not', 58082),
 ('thi', 56916),
 ('articl', 42454),
 ('page', 32858),
 ('wa', 32605)]