In [1]:
import pandas as pd
import numpy as np
import random
import re
import string
import collections
import itertools
import sklearn

from nltk.stem.porter import PorterStemmer

In [2]:
# Placeholder token that preprocess_comments appends in place of a word whose
# stemming raised a RecursionError.
LONG_WORD_TOKEN = "<LONG_WORD>"

In [3]:
def get_train_data():
    """
    Load the training set.

    :return: the DataFrame parsed from "train.csv" (path is relative to the working directory)
    """
    train_frame = pd.read_csv("train.csv")
    return train_frame

def get_test_data():
    """
    Load the test comments together with their labels.

    :return: a (comments, labels) tuple of DataFrames parsed from "test.csv" and
             "test_labels.csv" (paths are relative to the working directory)
    """
    return pd.read_csv("test.csv"), pd.read_csv("test_labels.csv")

def get_num_true_positives(predictions, truth):
    """
    Return number of true positives in predictions

    The two vectors are paired by position, so lists, tuples, numpy arrays and
    pandas Series (whatever their index labels are) are all accepted.

    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: the number of times we predicted 1 and the truth was 1, or None
             (after printing a message) when the two vectors differ in length
    """

    if len(predictions) != len(truth):
        print("get_num_true_positive: len(predictions) != len(truth)")
        return None

    # zip() walks both vectors positionally. Indexing with predictions[i] would be a
    # label lookup on a pandas Series and raise KeyError on a shuffled/sliced index.
    return sum(
        1
        for predicted, actual in zip(predictions, truth)
        if predicted == 1 and actual == 1
    )

def get_num_false_positives(predictions, truth):
    """
    Return number of false positives in predictions

    The two vectors are paired by position, so lists, tuples, numpy arrays and
    pandas Series (whatever their index labels are) are all accepted.

    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: the number of times we predicted 1 but the truth was 0, or None
             (after printing a message) when the two vectors differ in length
    """

    if len(predictions) != len(truth):
        print("get_num_false_positive: len(predictions) != len(truth)")
        return None

    # zip() walks both vectors positionally. Indexing with predictions[i] would be a
    # label lookup on a pandas Series and raise KeyError on a shuffled/sliced index.
    return sum(
        1
        for predicted, actual in zip(predictions, truth)
        if predicted == 1 and actual == 0
    )

def get_num_false_negatives(predictions, truth):
    """
    Return number of false negatives in predictions

    The two vectors are paired by position, so lists, tuples, numpy arrays and
    pandas Series (whatever their index labels are) are all accepted.

    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: the number of times we predicted 0 but the truth was 1, or None
             (after printing a message) when the two vectors differ in length
    """

    if len(predictions) != len(truth):
        print("get_num_false_negatives: len(predictions) != len(truth)")
        return None

    # zip() walks both vectors positionally. Indexing with predictions[i] would be a
    # label lookup on a pandas Series and raise KeyError on a shuffled/sliced index.
    return sum(
        1
        for predicted, actual in zip(predictions, truth)
        if predicted == 0 and actual == 1
    )

def get_precision(predictions, truth):
    """
    Calculates precision as defined by (true_positives) / (true_positives + false_positives)

    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: precision, or 0.0 (after printing a message) when there are no
             positive predictions at all
    """

    true_positives = get_num_true_positives(predictions, truth)
    false_positives = get_num_false_positives(predictions, truth)

    predicted_positives = true_positives + false_positives
    if predicted_positives == 0:
        # Nothing was predicted positive, so the ratio is undefined. Report 0.0 with a
        # message (as f_beta_score does for a zero denominator) instead of raising
        # ZeroDivisionError.
        print("get_precision: true_positives + false_positives == 0")
        return 0.0

    return true_positives / predicted_positives

def get_recall(predictions, truth):
    """
    Calculates recall as defined by (true_positives) / (true_positives + false_negatives)

    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: recall, or 0.0 (after printing a message) when the truth contains
             no positives at all
    """

    true_positives = get_num_true_positives(predictions, truth)
    false_negatives = get_num_false_negatives(predictions, truth)

    actual_positives = true_positives + false_negatives
    if actual_positives == 0:
        # The truth has no positives, so the ratio is undefined. Report 0.0 with a
        # message (as f_beta_score does for a zero denominator) instead of raising
        # ZeroDivisionError.
        print("get_recall: true_positives + false_negatives == 0")
        return 0.0

    return true_positives / actual_positives

def f_beta_score(beta, predictions, truth):
    """
    Calculates the F-beta score as
    (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)

    :param beta: the beta of the formula above
    :param predictions: a vector of predicted classifications
    :param truth: a vector of true classifications
    :return: the F-beta score, or 0.0 (after printing a message) when the
             denominator of the formula is 0
    """
    p = get_precision(predictions, truth)
    r = get_recall(predictions, truth)
    beta_squared = beta**2

    weighted_sum = (beta_squared * p) + r
    if weighted_sum == 0:
        print("f_beta_score: denom == 0")
        return 0.0

    return (1 + beta_squared) * ((p * r) / weighted_sum)

def iter_ngram(n, words):
    """
    Iterate over n-grams.

    A sliding window of the last n words is kept in a bounded deque; once it is
    full, one tuple is yielded per incoming word. Nothing is yielded when the
    input has fewer than n words.

    :param n: the "n"-gram
    :param words: an iterable of words
    :yield: the ngrams

    >>> list(iter_ngram(1, ['hello', 'world']))
    [('hello',), ('world',)]
    >>> list(iter_ngram(2, ['hello', 'world']))
    [('hello', 'world')]
    """
    words = iter(words)
    cache = collections.deque(maxlen=n)
    # Prime the window with the first n - 1 words; if the input runs out before
    # that, there is no complete n-gram to emit.
    try:
        for _ in range(n - 1):
            cache.append(next(words))
    except StopIteration:
        return
    # Append the word the loop just produced. Calling next(words) here instead would
    # drop every other word. maxlen=n evicts the oldest word automatically.
    for w in words:
        cache.append(w)
        yield tuple(cache)

def count_ngram(n, ngrams_list):
    """
    Count occurrences of each n-gram across a collection of word sequences.

    :param n: the "n"-gram
    :param ngrams_list: an iterable whose elements are each handed to iter_ngram
                        as one sequence of words
    :return: a dict of ngram-to-count

    >>> cnt = count_ngram(1, [['hello', 'world'], ['hello']])
    """
    c = collections.Counter()
    # Iterate the parameter itself; the previous body referred to an undefined
    # name `corpus` and raised NameError when called.
    for words in ngrams_list:
        c.update(iter_ngram(n, words))
    return dict(c)

def preprocess_comments(corpus):
    """
    Normalise raw comments into lists of stemmed words.

    For every comment: punctuation characters are dropped, the text is
    lower-cased, a space is inserted between an a-z letter and an adjacent digit
    (in either order), the text is split on whitespace, and each word is stemmed.

    :param corpus: an iterable of comments
    :return: a list holding one list of stemmed words per comment
    """
    punctuation = set(string.punctuation)
    porter = PorterStemmer()

    processed = []
    for raw in corpus:
        text = ''.join(ch for ch in raw if ch not in punctuation).lower()
        text = re.sub(r'([a-z])([0-9])', r'\1 \2', text)
        text = re.sub(r'([0-9])([a-z])', r'\1 \2', text)

        tokens = []
        for token in text.split():
            try:
                tokens.append(porter.stem(token))
            except RecursionError:
                # The stemmer recursed too deeply on this word; keep a placeholder
                # so the comment still contributes a token in that position.
                print("stemmer recursion issue")
                tokens.append(LONG_WORD_TOKEN)
        processed.append(tokens)
    return processed

In [4]:
# Shuffle with a fixed seed so the train/validation split, and every score derived
# from it, is identical on each Restart & Run All.
training_data = sklearn.utils.shuffle(get_train_data(), random_state=0)
split = len(training_data)//2
# The second half of the shuffled rows becomes the validation set; it is sliced
# off first because training_data is overwritten with the first half right after.
validation_data = training_data[split:]
training_data = training_data[:split]
# Attach the labels to the test comments by id, then drop the rows whose
# `toxic` label is -1.
test_comments, test_labels = get_test_data()
test_data = test_comments.set_index('id').join(other=test_labels.set_index('id'))
test_data = test_data[test_data.toxic != -1]

In [5]:
# Preview the first rows of the training half of the shuffled data.
training_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
121942,8c58128b985073bd,"""\n\n Melanie Phillips \n\nHi, thanks for your...",0,0,0,0,0,0
125091,9d1a1f673e54014c,February 2006 \n\nPlease do not add spamlinks ...,0,0,0,0,0,0
37777,64d7880b6987daa6,"I'm accused of sockpuppetry, but the author is...",1,0,0,0,0,0
38419,6691db7d86256ed8,User:Indianacademy\n\n{{Quote box\n | quote =...,0,0,0,0,0,0
158088,e89c8db2c672aafb,User categorisation\nYou were listed on the Wi...,0,0,0,0,0,0


In [6]:
# Pull the comment text and each of the six label columns out of the training
# frame as plain Python lists.
raw_comment_text = training_data["comment_text"].tolist()
toxic_labels = training_data["toxic"].tolist()
severe_labels = training_data["severe_toxic"].tolist()
obscene_labels = training_data["obscene"].tolist()
threat_labels = training_data["threat"].tolist()
insult_labels = training_data["insult"].tolist()
hate_labels = training_data["identity_hate"].tolist()

# One (toxic, severe_toxic, obscene, threat, insult, identity_hate) tuple per comment.
labels = list(zip(
    toxic_labels,
    severe_labels,
    obscene_labels,
    threat_labels,
    insult_labels,
    hate_labels,
))

In [7]:
# Spot-check the label tuple of a single training comment.
labels[2]

(1, 0, 0, 0, 0, 0)

In [8]:
# Tokenise and stem every training comment, then gather the set of distinct
# stemmed words.
bag_of_comments = preprocess_comments(raw_comment_text)
words = set()
for comment in bag_of_comments:
    words.update(comment)

stemmer recursion issue


In [9]:
# develop baseline
# Cross-check the hand-written metrics against scikit-learn on a dummy predictor.
# A comment is a positive when at least one of its six labels is set. The dummy
# predictor outputs 0 for the first half of the comments and 1 for the second half
# (it is NOT an all-negative baseline). Each pair of printed lines should match.
truth = [1 if sum(label) > 0 else 0 for label in labels]
half = len(truth)//2
predictions = [0] * half + [1] * (len(truth) - half)

print(f_beta_score(beta=1.5, predictions=predictions, truth=truth))
print(sklearn.metrics.fbeta_score(y_true=truth, y_pred=predictions, beta=1.5))

print(get_precision(predictions=predictions, truth=truth))
print(sklearn.metrics.precision_score(y_true=truth, y_pred=predictions))

print(get_recall(predictions=predictions, truth=truth))
print(sklearn.metrics.recall_score(y_true=truth, y_pred=predictions))

0.2274369476180211
0.2274369476180211
0.10172210663524929
0.10172210663524929
0.5046008455608058
0.5046008455608058


In [10]:
# develop tf-idf model
# The vectorizer is fitted on the raw training comment strings (raw_comment_text),
# not on the stemmed bag_of_comments; X is the result of fit_transform on them.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=raw_comment_text)

In [12]:
# Transform the validation comments with the vectorizer already fitted on the
# training comments (transform only, no re-fitting).
raw_validation_comment_text = validation_data.comment_text.tolist()
valid_X = vectorizer.transform(raw_documents=raw_validation_comment_text)

In [14]:
# Fit a Ridge regressor (alpha 1.0, no intercept) on the training features, using
# the 0/1 "any label set" list `truth` built in the baseline cell as the target.
clf = sklearn.linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, truth)

Ridge(alpha=1.0, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [17]:
# Threshold the regressor's outputs into class predictions: values below 0.5
# become 0, everything else becomes 1.
valid_predictions = [0 if pred < 0.5 else 1 for pred in clf.predict(valid_X)]

In [20]:
# Build the validation ground truth: one tuple of the six label columns per
# comment, collapsed to 1 when any label is set and 0 otherwise. Then score the
# thresholded predictions with F-beta (beta=1.5).
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
valid_labels = list(zip(*(validation_data[column].tolist() for column in label_columns)))
valid_truth = [1 if sum(label) > 0 else 0 for label in valid_labels]
print(sklearn.metrics.fbeta_score(y_true=valid_truth, y_pred=valid_predictions, beta=1.5))

0.6238348796267295
