In [79]:
import pandas as pd
import numpy as np
import random
import re
import string
import collections
import itertools
import sklearn

from nltk.stem.porter import PorterStemmer

In [171]:
LONG_WORD_TOKEN = "<LONG_WORD>"

In [261]:
def get_train_data():
    return pd.read_csv("train.csv")

def get_test_data():
    test_comments = pd.read_csv("test.csv")
    test_labels = pd.read_csv("test_labels.csv")
    return test_comments, test_labels

def get_num_false_positives(predictions, truth):
    """
    Return number of false positives in predictions
    
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: the number of times we predicted 1 but the truth was 0
    """
    
    if len(predictions) != len(truth):
        print("get_num_false_positive: len(predictions) != len(truth)")
        return None
    
    count = 0
    for i in range(len(predictions)):
        if predictions[i] == 1 and truth[i] == 0:
            count += 1
    return count

def get_num_false_negatives(predictions, truth):
    """
    Return number of false negatives in predictions
    
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: the number of times we predicted 0 but the truth was 1
    """
    
    if len(predictions) != len(truth):
        print("get_num_false_negatives: len(predictions) != len(truth)")
        return None
    
    count = 0
    for i in range(len(predictions)):
        if predictions[i] == 0 and truth[i] == 1:
            count += 1
    
    return count

def get_precision(predictions, truth):
    """
    Calculates precision as defined by (true_positives) / (true_positives + false_positives)
    
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: precision
    """
    
    true_positives = sum(truth)
    false_positives = get_num_false_positives(predictions, truth)
    
    return true_positives / (true_positives + false_positives)

def get_recall(predictions, truth):
    """
    Calculates recall as defined by (true_positives) / (true_positives + false_negatives)
    
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: recall
    """
    
    true_positives = sum(truth)
    false_negatives = get_num_false_negatives(predictions, truth)
    
    return true_positives / (true_positives + false_negatives)

def f_beta_score(beta, predictions, truth):
    precision = get_precision(predictions, truth)
    recall = get_recall(predictions, truth)
    numer = precision * recall
    denom = ((beta**2) * precision) + recall
    if denom == 0:
        print("f_beta_score: denom == 0")
        return 0
    factor = (1 + (beta**2))
    
    return factor * (numer / denom)

def iter_ngram(n, words):
    """
    Iterate over n-grams.

    :param n: the "n"-gram
    :param words: an iterable of words
    :yield: the ngrams

    >>> list(iter_ngram(1, ['hello', 'world']))
    [('hello',), ('world',)]
    >>> list(iter_ngram(2, ['hello', 'world']))
    [('hello', 'world')]
    """
    words = iter(words)
    cache = collections.deque(maxlen=n)
    try:
        for _ in range(n - 1):
            cache.append(next(words))
    except StopIteration:
        return
    try:
        for w in words:
            cache.append(next(words))
            yield tuple(cache)
    except StopIteration:
        pass

def count_ngram(n, ngrams_list):
    """
    Count occurrences of
    :param n: the "n"-gram
    :param ngrams_list: list of ngrams
    :return: a dict of ngram-to-count

    >>> cnt = count_ngram(1, [[('hello',), ('world',)], [('again',)]])
    """
    c = collections.Counter()
    for words in corpus:
        for ng in iter_ngram(n, words):
            c[ng] += 1
    c = dict(c)
    return c

def preprocess_comments(corpus):
    """
    Remove capital letters, remove punctuations, split into words, stem the words.

    :param corpus: an iterable of comments
    :return: a iterable of processed comments
    """
    puncset = set(string.punctuation)
    stemmer = PorterStemmer()
    comments = corpus
    
    comments = iter(''.join(c for c in x if c not in puncset) for x in comments)
    comments = iter(x.lower() for x in comments)
    comments = iter(re.sub(r'([a-z])([0-9])', r'\1 \2', x) for x in comments)
    comments = iter(re.sub(r'([0-9])([a-z])', r'\1 \2', x) for x in comments)
    comments = map(str.split, comments)
    ret = []
    for comment in comments:
        stemmed = []
        for word in comment:
            try:
                stemmed.append(stemmer.stem(word))
            except RecursionError:
                print("stemmer recursion issue")
                stemmed.append(LONG_WORD_TOKEN)
        ret.append(stemmed)
    return ret

In [81]:
training_data = sklearn.utils.shuffle(get_train_data())
split = len(training_data)//2
validation_data = training_data[split:]
training_data = training_data[:split]
test_comments, test_labels = get_test_data()
test_data = test_comments.set_index('id').join(other=test_labels.set_index('id'))
test_data = test_data[test_data.toxic != -1]

In [87]:
training_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
131808,c148665705d8388d,What is it that you are trying to do?,0,0,0,0,0,0
108702,45321c8d812e4317,"""\n\n Why do you accuse me of making personal ...",0,0,0,0,0,0
50307,867ed4376adc4d4b,"Wow, crying like a baby to some guy is very ma...",1,0,0,0,0,0
158765,f35f98137911a3d1,Agreed. And simply changing section headings d...,0,0,0,0,0,0
61683,a51361093d3517b9,"""However alerting all who participated in the ...",0,0,0,0,0,0


In [226]:
raw_comment_text = training_data.comment_text.tolist()
toxic_labels = training_data.toxic.tolist()
severe_labels = training_data.severe_toxic.tolist()
obscene_labels = training_data.obscene.tolist()
threat_labels = training_data.threat.tolist()
insult_labels = training_data.insult.tolist()
hate_labels = training_data.identity_hate.tolist()
labels = list(zip(toxic_labels, severe_labels, obscene_labels, threat_labels, insult_labels, hate_labels))

In [228]:
labels[2]

(1, 0, 0, 0, 0, 0)

In [175]:
bag_of_comments = preprocess_comments(training_data.comment_text.tolist())

stemmer recursion issue


In [267]:
# develop baseline
"""
predict all are not toxic
"""
truth = [1 if sum(label) > 0 else 0 for label in labels]
predictions = [0] * len(truth)
predictions[0] = 1

#f_beta_score(beta=1.5, predictions=predictions, truth=truth)
#sklearn.metrics.fbeta_score(y_true=truth, y_pred=predictions, beta=0.01)
print(get_precision(predictions=predictions, truth=truth))
print(sklearn.metrics.precision_score(y_true=truth, y_pred=predictions))

print(get_recall(predictions=predictions, truth=truth))
print(sklearn.metrics.recall_score(y_true=truth, y_pred=predictions))

0.9998774359602892
0.0
0.5
0.0


In [262]:
get_num_false_negatives(predictions=predictions, truth=truth)

8158