In [28]:
import pandas as pd
import numpy as np
import scipy as sp
import random
import re
import string
import collections
import itertools
import sklearn

from textblob import TextBlob
from nltk.stem.porter import PorterStemmer
from nltk.corpus import gutenberg
from nltk.corpus import stopwords

In [3]:
LONG_WORD_TOKEN = "<LONG_WORD>"
my_stop = set(stopwords.words('english')) # set of all stopwords

In [4]:
def get_train_data():
    return pd.read_csv("train.csv")

def get_test_data():
    test_comments = pd.read_csv("test.csv")
    test_labels = pd.read_csv("test_labels.csv")
    return test_comments, test_labels

def get_num_true_positives(predictions, truth):
    """
    Return number of true positives in predictions
    
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: the number of times we predicted 1 and the truth was 1
    """
    
    if len(predictions) != len(truth):
        print("get_num_true_positive: len(predictions) != len(truth)")
        return None
    
    count = 0
    for i in range(len(predictions)):
        if predictions[i] == 1 and truth[i] == 1:
            count += 1
    return count

def get_num_false_positives(predictions, truth):
    """
    Return number of false positives in predictions
    
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: the number of times we predicted 1 but the truth was 0
    """
    
    if len(predictions) != len(truth):
        print("get_num_false_positive: len(predictions) != len(truth)")
        return None
    
    count = 0
    for i in range(len(predictions)):
        if predictions[i] == 1 and truth[i] == 0:
            count += 1
    return count

def get_num_false_negatives(predictions, truth):
    """
    Return number of false negatives in predictions
    
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: the number of times we predicted 0 but the truth was 1
    """
    
    if len(predictions) != len(truth):
        print("get_num_false_negatives: len(predictions) != len(truth)")
        return None
    
    count = 0
    for i in range(len(predictions)):
        if predictions[i] == 0 and truth[i] == 1:
            count += 1
    
    return count

def get_precision(predictions, truth):
    """
    Calculates precision as defined by (true_positives) / (true_positives + false_positives)
    
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: precision
    """
    
    true_positives = get_num_true_positives(predictions, truth)
    false_positives = get_num_false_positives(predictions, truth)
    
    return true_positives / (true_positives + false_positives)

def get_recall(predictions, truth):
    """
    Calculates recall as defined by (true_positives) / (true_positives + false_negatives)
    
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: recall
    """
    
    true_positives = get_num_true_positives(predictions, truth)
    false_negatives = get_num_false_negatives(predictions, truth)
    
    return true_positives / (true_positives + false_negatives)

def f_beta_score(beta, predictions, truth):
    precision = get_precision(predictions, truth)
    recall = get_recall(predictions, truth)
    numer = precision * recall
    denom = ((beta**2) * precision) + recall
    if denom == 0:
        print("f_beta_score: denom == 0")
        return 0.0
    factor = (1 + (beta**2))
    
    return factor * (numer / denom)

def iter_ngram(n, words):
    """
    Iterate over n-grams.

    :param n: the "n"-gram
    :param words: an iterable of words
    :yield: the ngrams

    >>> list(iter_ngram(1, ['hello', 'world']))
    [('hello',), ('world',)]
    >>> list(iter_ngram(2, ['hello', 'world']))
    [('hello', 'world')]
    """
    words = iter(words)
    cache = collections.deque(maxlen=n)
    try:
        for _ in range(n - 1):
            cache.append(next(words))
    except StopIteration:
        return
    try:
        for w in words:
            cache.append(next(words))
            yield tuple(cache)
    except StopIteration:
        pass

def count_ngram(n, ngrams_list):
    """
    Count occurrences of
    :param n: the "n"-gram
    :param ngrams_list: list of ngrams
    :return: a dict of ngram-to-count

    >>> cnt = count_ngram(1, [[('hello',), ('world',)], [('again',)]])
    """
    c = collections.Counter()
    for words in corpus:
        for ng in iter_ngram(n, words):
            c[ng] += 1
    c = dict(c)
    return c

def preprocess_comments(corpus):
    """
    Remove capital letters, remove punctuations, split into words, stem the words.

    :param corpus: an iterable of comments
    :return: a iterable of processed comments
    """
    puncset = set(string.punctuation)
    stemmer = PorterStemmer()
    comments = corpus
    
    comments = iter(''.join(c for c in x if c not in puncset) for x in comments)
    comments = iter(x.lower() for x in comments)
    comments = iter(re.sub(r'([a-z])([0-9])', r'\1 \2', x) for x in comments)
    comments = iter(re.sub(r'([0-9])([a-z])', r'\1 \2', x) for x in comments)
    comments = map(str.split, comments)
    ret = []
    for comment in comments:
        stemmed = []
        for word in comment:
            try:
                stemmed.append(stemmer.stem(word))
            except RecursionError:
                print("stemmer recursion issue")
                stemmed.append(LONG_WORD_TOKEN)
        ret.append(' '.join(stemmed))
    return ret

In [5]:
def custom_preprocessor(raw_string):
    """
    Replace uppercase with lowercase, long words with LONG_WORD_TOKEN, and (maybe) remove punctuation
    
    :param raw_string: a raw comment (string)
    :return: a processed string
    """
    
    puncset = set(string.punctuation)
    puncset.discard("!")
    puncset.discard("?")
    puncset.discard("#")
    goodpuncset = set(["!", "?"])
    raw_string = ''.join(" " + c if c in goodpuncset else c for c in raw_string.lower() if c not in puncset)
    #raw_string = ''.join(c for c in raw_string.lower() if c not in puncset)
    words = raw_string.strip().split(' ')
    for i in range(len(words)):
        word = words[i]
        if len(word) > 30:
            words[i] = LONG_WORD_TOKEN
    return ' '.join(words)

def custom_tokenizer(raw_string):
    """
    Split words into tokens, preserving the LONG_WORD_TOKEN

    :param raw_string: one comment, post-processing
    :return: a list of processed tokens from comment
    """
    
    stemmer = PorterStemmer()
    tokenized_string = raw_string
    tokenized_string = re.sub(r'([a-z])([0-9])', r'\1 \2', tokenized_string)
    tokenized_string = re.sub(r'([0-9])([a-z])', r'\1 \2', tokenized_string)
    tokenized_string = tokenized_string.split(' ')
    return [word if word == LONG_WORD_TOKEN else stemmer.stem(word) for word in tokenized_string]

In [8]:
training_data = sklearn.utils.shuffle(get_train_data())
split = len(training_data)//2
validation_data = training_data[split:]
training_data = training_data[:split]
test_comments, test_labels = get_test_data()
test_data = test_comments.set_index('id').join(other=test_labels.set_index('id'))
test_data = test_data[test_data.toxic != -1]

In [9]:
training_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
154220,aa5dba16e10beeaa,Thank you for experimenting with the page Guit...,0,0,0,0,0,0
63835,aacc2f5b963c23e6,"By editing back, Wikibots can easily look thro...",0,0,0,0,0,0
106658,3a79f766187aea21,"""\n\n Re: """"Oblateness constant"""" \n\nHaha, my...",0,0,0,0,0,0
83573,df9e423f27161982,Deolis \n\nI have no idea what I'm supposed to...,0,0,0,0,0,0
5974,0ff1ed81db0f366a,REDIRECT Talk:Japanese gunboat Sumida (1906),0,0,0,0,0,0


In [10]:
raw_training_comment_text = training_data.comment_text.tolist()
training_toxic_labels = training_data.toxic.tolist()
training_severe_labels = training_data.severe_toxic.tolist()
training_obscene_labels = training_data.obscene.tolist()
training_threat_labels = training_data.threat.tolist()
training_insult_labels = training_data.insult.tolist()
training_hate_labels = training_data.identity_hate.tolist()
training_labels = list(zip(training_toxic_labels, training_severe_labels, training_obscene_labels, training_threat_labels, training_insult_labels, training_hate_labels))

In [11]:
raw_validation_comment_text = validation_data.comment_text.tolist()
validation_toxic_labels = validation_data.toxic.tolist()
validation_severe_labels = validation_data.severe_toxic.tolist()
validation_obscene_labels = validation_data.obscene.tolist()
validation_threat_labels = validation_data.threat.tolist()
validation_insult_labels = validation_data.insult.tolist()
validation_hate_labels = validation_data.identity_hate.tolist()
validation_labels = list(zip(validation_toxic_labels, validation_severe_labels, validation_obscene_labels, validation_threat_labels, validation_insult_labels, validation_hate_labels))

In [12]:
# develop baseline
"""
predict all are not toxic
"""
baseline_truth = [1 if sum(training_label) > 0 else 0 for training_label in training_labels]
baseline_predictions = [0] * len(baseline_truth)
for i in range(len(baseline_predictions)):
    baseline_predictions[i] = random.randint(0,1)

print(sklearn.metrics.fbeta_score(y_true=baseline_truth, y_pred=baseline_predictions, beta=1.5))

0.23008503189233592


In [13]:
# develop tf-idf model
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(preprocessor=custom_preprocessor, tokenizer=custom_tokenizer, ngram_range=(1,5), stop_words='english', norm='l2')

In [55]:
training_X = vectorizer.fit_transform(raw_documents=raw_training_comment_text)

In [43]:
# get sentiments for each comment in X using the raw comment
training_sentiments = [0] * training_X.shape[0]
for i in range(len(training_sentiments)):
    blob = TextBlob(raw_training_comment_text[i])
    training_sentiments[i] = (blob.sentiment)

In [56]:
lil_training_X = training_X.tolil()
num_rows, num_cols = training_X.shape
lil_training_X.resize((num_rows, num_cols + 1))
for i in range(num_rows):
    lil_training_X[i,-1] = training_sentiments[i][0]
training_X = lil_training_X.tocsr()

In [57]:
training_y = [1 if sum(training_label) > 0 else 0 for training_label in training_labels]

In [58]:
clf = sklearn.linear_model.Ridge(1.0)
clf.fit(training_X, training_y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [59]:
training_predictions = [0 if pred < 0.5 else 1 for pred in clf.predict(training_X)]

In [60]:
sklearn.metrics.fbeta_score(y_true=training_y, y_pred=training_predictions, beta=1.5)

0.9889488563351323

In [61]:
validation_X = vectorizer.transform(raw_documents=raw_validation_comment_text)

In [50]:
# get sentiments for each comment in X using the raw comment
validation_sentiments = [0] * validation_X.shape[0]
for i in range(len(validation_sentiments)):
    blob = TextBlob(raw_validation_comment_text[i])
    validation_sentiments[i] = (blob.sentiment)

In [62]:
lil_validation_X = validation_X.tolil()
num_rows, num_cols = validation_X.shape
lil_validation_X.resize((num_rows, num_cols + 1))
for i in range(num_rows):
    lil_validation_X[i,-1] = validation_sentiments[i][0]
validation_X = lil_validation_X.tocsr()

In [63]:
validation_y = [1 if sum(validation_label) > 0 else 0 for validation_label in validation_labels]

In [64]:
validation_predictions = [0 if pred < 0.5 else 1 for pred in clf.predict(validation_X)]

In [65]:
sklearn.metrics.fbeta_score(y_true=validation_y, y_pred=validation_predictions, beta=1.5)

0.7301888687964637

In [23]:
custom_tokenizer(custom_preprocessor("Hello, my name is jaishriramanujanchanduranjanbalasubranium mikey! #swag"))

['hello', 'my', 'name', 'is', '<LONG_WORD>', 'mikey', '!', '#swag']

In [70]:
blob = TextBlob("hello, thsi is speled incorrectily")

In [75]:
for word in blob.split():
    w = textblob.blob.Word(word)
    print(w.spellcheck())

[('hello', 1.0)]
[('this', 1.0)]
[('is', 1.0)]
[('speed', 0.96875), ('spelled', 0.03125)]
[('incorrectly', 1.0)]


In [72]:
textblob.blob.TextBlob.

<textblob.decorators.cached_property at 0x1a749d2978>