In [1]:
import sys
import pandas as pd
import numpy as np
import re as regex
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import Counter

# Loading of Raw Data

In [2]:
# Read the train and test CSV files; the first row of each file holds the column names.
trainData = pd.read_csv('../data/train.csv', header=0)
testData = pd.read_csv('../data/test.csv', header=0)

# Print the (rows, columns) shape of the training frame, then of the test frame.
print(trainData.shape, testData.shape, sep="\n")

(95851, 8)
(226998, 2)


In [3]:
# Preview the first rows of the training frame (head() shows five rows by default).
trainData.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
# Preview the first rows of the test frame (head() shows five rows by default).
testData.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


# Accessing elements (examples)

In [5]:
# For each label column, print how many training rows have that label set to 1.
for label_column in ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"):
    is_flagged = trainData[label_column] == 1
    print(len(trainData[is_flagged]))

9237
965
5109
305
4765
814


# Create Bag-of-Words (BoW) features with TF-IDF weighting

In [49]:
# scikit-learn's default token pattern is r'(?u)\b\w\w+\b'; the pattern used here
# additionally requires every token to start with an ASCII letter.
# It is written as a raw string because '\w' in a normal string literal is an
# invalid escape sequence.
# stop_words is passed as a list: recent scikit-learn versions validate this
# parameter and accept only 'english', a list, or None.
# NOTE(review): the recorded execution counts show this cell (In [49]) was run after
# the cleaning cell (In [48]); on a top-to-bottom run it would see the uncleaned
# text instead. Confirm the intended cell order.
vectorizer = TfidfVectorizer(max_df=1.0, min_df=0.001,
                             stop_words=list(ENGLISH_STOP_WORDS),
                             token_pattern=r'\b[a-zA-Z]\w+\b')

data_corpus = trainData.comment_text
X = vectorizer.fit_transform(data_corpus)

# Total TF-IDF weight of every vocabulary word, summed over all documents.
# One column-wise sum over the sparse matrix replaces slicing out one column per
# word; the totals can differ from the per-column sums in the last floating-point
# digits only.
column_totals = np.asarray(X.sum(axis=0)).ravel()
freqs = [(word, column_totals[idx]) for word, idx in vectorizer.vocabulary_.items()]

In [61]:
# Number of (word, total weight) pairs, one per entry in the vectorizer's vocabulary.
len(freqs)

3370

In [60]:
# Rank the (word, weight) pairs from the highest total weight to the lowest.
freqs = sorted(freqs, key=lambda pair: pair[1], reverse=True)

# Show the tail of the ranking: every entry after the first 3000.
freqs[3000:]

[('dislike', 25.86659697874105),
 ('joining', 25.844804751033227),
 ('temporarily', 25.836040808864276),
 ('victory', 25.833542546754035),
 ('patient', 25.824284129773989),
 ('terminology', 25.820773858778661),
 ('integrity', 25.806190109107348),
 ('fields', 25.805130850095548),
 ('addressing', 25.802649668039393),
 ('layout', 25.80070940597713),
 ('counter', 25.791183617487921),
 ('collaboration', 25.787223671611461),
 ('practices', 25.762527010257713),
 ('elaborate', 25.758347065767648),
 ('licensing', 25.753984558477022),
 ('participated', 25.704764992370418),
 ('wikipediaimage', 25.690792093072179),
 ('environment', 25.683539628929111),
 ('regularly', 25.677381999595585),
 ('theyll', 25.647315053885606),
 ('logging', 25.644119672412675),
 ('concepts', 25.640547972609387),
 ('scientist', 25.637091641016951),
 ('applicable', 25.62052048677365),
 ('spending', 25.617459755421471),
 ('deeply', 25.610291444694909),
 ('stance', 25.609973654560495),
 ('carried', 25.597290830699382),
 ('flo

## Cleaning: removing URLs, usernames, special characters and numbers (all code below is experimental)

In [46]:
class TwitterCleanuper:
    """Regex-based cleanup steps for the ``comment_text`` column of a DataFrame.

    Every step takes a DataFrame and returns a new DataFrame; the frame passed
    in is left untouched, so re-running a step on the same input always gives
    the same result.
    """

    # Characters deleted by remove_special_chars().
    _SPECIAL_CHARS = [",", ":", "\"", "=", "&", ";", "%", "$",
                      "@", "^", "*", "(", ")", "{", "}",
                      "[", "]", "|", "/", "\\", ">", "<", "-",
                      "!", "?", ".", "'", "#", "\n"]

    def iterate(self):
        """Yield the cleanup steps in the order in which they should be applied."""
        for cleanup_method in [self.remove_urls,
                               self.remove_usernames,
                               self.remove_na,
                               self.remove_special_chars,
                               self.remove_numbers]:
            yield cleanup_method

    @staticmethod
    def remove_by_regex(tweets, regexp):
        """Return a copy of ``tweets`` with every match of ``regexp`` deleted.

        Parameters
        ----------
        tweets : pandas.DataFrame
            Frame with a ``comment_text`` column.
        regexp : compiled regular expression
            Pattern whose matches are replaced by the empty string.

        Returns
        -------
        pandas.DataFrame
            A new frame; ``tweets`` itself is not modified.
        """
        # Assign the cleaned column to a copy instead of calling
        # `.loc[...].replace(..., inplace=True)`: that form updates a temporary
        # Series, which mutates the caller's frame on older pandas and silently
        # does nothing once Copy-on-Write is active.
        cleaned = tweets.copy()
        cleaned["comment_text"] = cleaned["comment_text"].replace(regexp, "", regex=True)
        return cleaned

    def remove_urls(self, tweets):
        """Delete http:// and https:// URLs plus one trailing whitespace character."""
        # `https?` instead of `http.?`: the latter accepted any character after "http".
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"https?://[^\s]+[\s]?"))

    def remove_na(self, tweets):
        """Drop the rows whose text is exactly the placeholder "Not Available"."""
        return tweets[tweets["comment_text"] != "Not Available"]

    def remove_special_chars(self, tweets):  # it unrolls the hashtags to normal words
        """Delete every character listed in ``_SPECIAL_CHARS`` (punctuation, "#", newline)."""
        # A single character class removes all listed characters in one pass; the
        # result is the same as deleting them one after another.
        char_class = "[" + "".join(regex.escape(char) for char in self._SPECIAL_CHARS) + "]"
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(char_class))

    def remove_usernames(self, tweets):
        """Delete @mentions (an "@" followed by non-whitespace) plus one trailing whitespace."""
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?"))

    def remove_numbers(self, tweets):
        """Delete integer and decimal numbers together with one leading whitespace character."""
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))

In [47]:
class Cleaner():
    """Applies the steps of a cleanuper to a dataset and stores the final result."""

    def __init__(self):
        # Placeholder value until cleanup() has been run.
        self.processed_data = 0

    def cleanup(self, data, cleanuper):
        """Feed ``data`` through every step yielded by ``cleanuper.iterate()``.

        Each step receives the output of the previous one. The final output is
        stored on the instance; nothing is returned.
        """
        current = data
        for step in cleanuper.iterate():
            current = step(current)
        # Store only after all steps have finished.
        self.processed_data = current

    def getData(self):
        """Return the result stored by the most recent cleanup() call."""
        return self.processed_data

In [48]:
# Run every cleanup step of TwitterCleanuper over the training frame.
cleanup = TwitterCleanuper()
cleaner = Cleaner()
cleaner.cleanup(trainData, cleanup)
# From here on, trainData refers to the cleaned frame.
trainData = cleaner.getData()
trainData.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,Nonsense kiss off geek what I said is true I...,1,0,0,0,0,0
1,27450690,Please do not vandalize pages as you did with...,0,0,0,0,0,0
2,54037174,Points of interest I removed the points of in...,0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [None]:
class TwitterData_TokenStem():
    """Tokenizes and stems the ``comment_text`` column of a DataFrame, row by row."""

    def __init__(self, data):
        self.processed_data = data

    def stem(self, stemmer=nltk.PorterStemmer()):
        """Lower-case and stem every item of each row's ``comment_text``.

        The notebook calls this after tokenize(), which stores a list of tokens
        in ``comment_text``.
        """
        def stem_and_join(row):
            stemmed_tokens = [stemmer.stem(token.lower()) for token in row["comment_text"]]
            row["comment_text"] = stemmed_tokens
            return row

        self.processed_data = self.processed_data.apply(stem_and_join, axis=1)

    def tokenize(self, tokenizer=nltk.word_tokenize):
        """Replace each row's ``comment_text`` by its tokens and copy them to ``tokenized_text``."""
        def tokenize_row(row):
            tokens = tokenizer(row["comment_text"])
            row["comment_text"] = tokens
            # `[] + tokens` creates a separate list holding the same tokens.
            row["tokenized_text"] = [] + tokens
            return row

        self.processed_data = self.processed_data.apply(tokenize_row, axis=1)

    def getData(self):
        """Return the frame in its current processing state."""
        return self.processed_data

In [None]:
# `data` was used here before any cell defined it, which raises a NameError on a
# fresh kernel; start from the cleaned training frame instead.
# tokenize() must run before stem(): stem() iterates over the token lists that
# tokenize() stores in "comment_text".
stemmer = TwitterData_TokenStem(trainData)
stemmer.tokenize()
stemmer.stem()
data = stemmer.getData()
data.head()

In [None]:
# Tally how often each token occurs across all rows of the processed frame.
words = Counter()
for tokens in data["comment_text"]:
    words.update(tokens)

# The five most frequent tokens.
words.most_common(5)

In [None]:
# Fetch the NLTK 'stopwords' resource that the following cell reads via nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
# Remove English stop words from the token counts, keeping the negations in `whitelist`.
# NOTE(review): the counted tokens were stemmed in an earlier cell while these stop
# words are not, so stop words whose stem differs from the word itself may remain
# in `words`. Confirm whether that is intended.
stopwords = nltk.corpus.stopwords.words("english")
whitelist = ["n't", "not"]
for stop_word in stopwords:
    if stop_word in whitelist:
        continue
    # Deleting a key that a Counter does not contain is a no-op.
    del words[stop_word]
words.most_common(5)

In [None]:
# Number of elements in data_corpus, the comment_text column selected in the BoW cell.
data_corpus.size