In [34]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import random
import re
import string
import collections
import itertools
import sklearn
import pickle
import itertools
import multiprocessing
import sys
import pdb
import logging
import time

from textblob import TextBlob
from nltk.stem.porter import PorterStemmer
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
from tqdm import tqdm
from functools import partial

logging.basicConfig(level=logging.INFO, filename='launch.log')

In [2]:
LONG_WORD_TOKEN = "<LONG_WORD>"
my_stop = set(stopwords.words('english')) # set of all stopwords

In [6]:
def get_train_data():
    return pd.read_csv("train.csv")


def get_test_data():
    test_comments = pd.read_csv("test.csv")
    test_labels = pd.read_csv("test_labels.csv")
    return test_comments, test_labels

In [42]:
def custom_preprocessor(raw_string):
    """
    Replace uppercase with lowercase, long words with LONG_WORD_TOKEN, and (maybe) remove punctuation
    
    :param raw_string: a raw comment (string)
    :return: a processed string
    """
    
    puncset = set(string.punctuation)
    puncset.discard("!")
    puncset.discard("?")
    puncset.discard("#")
    goodpuncset = set(["!", "?"])
    raw_string = ''.join(" " + c if c in goodpuncset else c for c in raw_string.lower() if c not in puncset)
    words = raw_string.strip().split()
    for i in range(len(words)):
        word = words[i]
        if len(word) > 30:
            words[i] = LONG_WORD_TOKEN
    return ' '.join(words)

def custom_tokenizer(raw_string):
    """
    Split words into tokens, preserving the LONG_WORD_TOKEN

    :param raw_string: one comment, post-processing
    :return: a list of processed tokens from comment
    """
    
    stemmer = PorterStemmer()
    tokenized_string = raw_string
    tokenized_string = re.sub(r'([a-z])([0-9])', r'\1 \2', tokenized_string)
    tokenized_string = re.sub(r'([0-9])([a-z])', r'\1 \2', tokenized_string)
    tokenized_string = list(
        map(
            lambda x: x[:15], 
            filter(
                lambda x: not re.match(r'^\d+$', x) and max(map(ord, x)) < 128, 
                tokenized_string.split()
            )
        )
    )
    return [word if word == LONG_WORD_TOKEN else stemmer.stem(word) for word in tokenized_string]

In [8]:
def modelspec2modelobj(cls, params):
    args, kwargs = params
    return cls(*args, **kwargs)


def model_eval(X, y, model):
    predictions = (model.predict(X) >= 0.5)
    return sklearn.metrics.fbeta_score(y_true=y, y_pred=predictions, beta=1.5)


def train_and_eval(X_train, y_train, X_valid, y_valid, model) -> float:
    model.fit(X_train, y_train)
    fbeta_train = model_eval(X_train, y_train, model)
    fbeta_valid = model_eval(X_valid, y_valid, model)
    logging.info('{} --- {}'.format(' '.join(
        map(str.strip, str(model).split('\n'))),
        (fbeta_train, fbeta_valid)))
    return fbeta_train, fbeta_valid

def run_models(te, models):
    """
    :param te: the ``train_and_eval`` function with the first four arguments
           filled out
    """
    results = {}
    with multiprocessing.Pool() as pool:
        for ModelClass, all_params in tqdm(models.items(), ascii=True):
            mobjs = map(partial(modelspec2modelobj, ModelClass), all_params)
            fbs = pool.map(te, mobjs)
            results[ModelClass.__name__] = list(zip(fbs, all_params))
    return results


models = {
    sklearn.svm.SVC: [
        ((2.0,), {'kernel': 'linear', 'gamma': 'auto', 'class_weight': 'balanced'})
    ],
}

In [9]:
training_data = sklearn.utils.shuffle(get_train_data())
split = len(training_data)//2
validation_data = training_data[split:]
training_data = training_data[:split]
test_comments, test_labels = get_test_data()
test_data = test_comments.set_index('id').join(other=test_labels.set_index('id'))
test_data = test_data[test_data.toxic != -1]

In [10]:
training_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
76496,cccd00f521d911d5,Legal immunity NOT relevant to this page. The...,0,0,0,0,0,0
20038,34f77901ec57e442,You currently appear to be engaged in an edit ...,0,0,0,0,0,0
52178,8b93c12382c926c5,"""::SV has recently gotten into an editing disp...",0,0,0,0,0,0
26987,477875a62a6eab1d,It looks like I got confused. Your energy calc...,0,0,0,0,0,0
130203,b894f7fcf621b189,Why is your user page the Green Day article? S...,0,0,0,0,0,0


In [11]:
raw_training_comment_text = training_data.comment_text.tolist()
training_toxic_labels = training_data.toxic.tolist()
training_severe_labels = training_data.severe_toxic.tolist()
training_obscene_labels = training_data.obscene.tolist()
training_threat_labels = training_data.threat.tolist()
training_insult_labels = training_data.insult.tolist()
training_hate_labels = training_data.identity_hate.tolist()
training_labels = np.array(list(zip(training_toxic_labels, training_severe_labels, training_obscene_labels, training_threat_labels, training_insult_labels, training_hate_labels)))

In [12]:
raw_validation_comment_text = validation_data.comment_text.tolist()
validation_toxic_labels = validation_data.toxic.tolist()
validation_severe_labels = validation_data.severe_toxic.tolist()
validation_obscene_labels = validation_data.obscene.tolist()
validation_threat_labels = validation_data.threat.tolist()
validation_insult_labels = validation_data.insult.tolist()
validation_hate_labels = validation_data.identity_hate.tolist()
validation_labels = np.array(list(zip(validation_toxic_labels, validation_severe_labels, validation_obscene_labels, validation_threat_labels, validation_insult_labels, validation_hate_labels)))

In [13]:
# develop baseline
"""
predict all are not toxic
"""
baseline_truth = [1 if sum(training_label) > 0 else 0 for training_label in training_labels]
baseline_predictions = [0] * len(baseline_truth)
for i in range(len(baseline_predictions)):
    baseline_predictions[i] = random.randint(0,1)

print(sklearn.metrics.fbeta_score(y_true=baseline_truth, y_pred=baseline_predictions, beta=1.5))

0.22498253074802696


In [47]:
# develop tf-idf model
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(preprocessor=custom_preprocessor, tokenizer=custom_tokenizer, strip_accents='ascii', stop_words='english', norm='l2')

In [48]:
training_X = vectorizer.fit_transform(raw_documents=raw_training_comment_text)

  sorted(inconsistent))


In [57]:
# get sentiments for each comment in X using the raw comment
training_sentiments = [0] * training_X.shape[0]
for i in range(len(training_sentiments)):
    blob = TextBlob(raw_training_comment_text[i])
    training_sentiments[i] = (blob.sentiment)

In [58]:
lil_training_X = training_X.tolil()
num_rows, num_cols = training_X.shape
lil_training_X.resize((num_rows, num_cols + 1))
for i in range(num_rows):
    lil_training_X[i,-1] = training_sentiments[i][0]
training_X = lil_training_X.tocsr()

In [49]:
training_y = [1 if sum(training_label) > 0 else 0 for training_label in training_labels]

In [50]:
validation_X = vectorizer.transform(raw_documents=raw_validation_comment_text)

In [59]:
# get sentiments for each comment in X using the raw comment
validation_sentiments = [0] * validation_X.shape[0]
for i in range(len(validation_sentiments)):
    blob = TextBlob(raw_validation_comment_text[i])
    validation_sentiments[i] = (blob.sentiment)

In [60]:
lil_validation_X = validation_X.tolil()
num_rows, num_cols = validation_X.shape
lil_validation_X.resize((num_rows, num_cols + 1))
for i in range(num_rows):
    lil_validation_X[i,-1] = validation_sentiments[i][0]
validation_X = lil_validation_X.tocsr()

In [51]:
validation_y = [1 if sum(validation_label) > 0 else 0 for validation_label in validation_labels]

In [None]:
print(time.time())
clf = sklearn.svm.SVC(C=2,kernel='linear',gamma='auto', class_weight='balanced')
clf.fit(training_X, training_y)
print(time.time())

1543802821.001256


In [None]:
#clf = sklearn.linear_model.Ridge(1.0)
#clf.fit(training_X, training_y)

In [53]:
training_predictions = [0 if pred < 0.5 else 1 for pred in clf.predict(training_X)]

In [54]:
sklearn.metrics.fbeta_score(y_true=training_y, y_pred=training_predictions, beta=1.5)

0.93389656353465

In [55]:
validation_predictions = [0 if pred < 0.5 else 1 for pred in clf.predict(validation_X)]

In [56]:
sklearn.metrics.fbeta_score(y_true=validation_y, y_pred=validation_predictions, beta=1.5)

0.7735140638920709

In [None]:
#vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(preprocessor=custom_preprocessor, tokenizer=custom_tokenizer, ngram_range=(1,5), stop_words='english', norm='l2')
#clf = sklearn.svm.SVC(C=2,kernel='linear',gamma='auto', class_weight='balanced')
#0.7688090369071965

#vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(preprocessor=custom_preprocessor, tokenizer=custom_tokenizer, strip_accents='ascii', stop_words='english', norm='l2')
#clf = sklearn.svm.SVC(C=2,kernel='linear',gamma='auto', class_weight='balanced')
#0.7735140638920709

In [23]:
custom_tokenizer(custom_preprocessor("Hello, my name is jaishriramanujanchanduranjanbalasubranium mikey! #swag"))

['hello', 'my', 'name', 'is', '<LONG_WORD>', 'mikey', '!', '#swag']

In [70]:
blob = TextBlob("hello, thsi is speled incorrectily")

In [75]:
for word in blob.split():
    w = textblob.blob.Word(word)
    print(w.spellcheck())

[('hello', 1.0)]
[('this', 1.0)]
[('is', 1.0)]
[('speed', 0.96875), ('spelled', 0.03125)]
[('incorrectly', 1.0)]


In [35]:
sorted(
    set(
        map(
            lambda x: x[:15], 
            filter(
                lambda x: not re.match(r'^\d+$', x) and max(map(ord, x)) < 128, 
                words_in_toxic_comments - words_in_nontoxic_comments
            )
        )
    ), 
    key=lambda x: (len(x),))

1543783101.677035


In [46]:
custom_tokenizer("hello, this boyyyyyyyyyyyyyyyyyyyyy is me")

['hello,', 'thi', 'boyyyyyyyyyyyyy', 'is', 'me']

In [61]:
test_data

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0
000663aff0fffc80,this other one from 1897,0,0,0,0,0,0
000689dd34e20979,== Reason for banning throwing == \n\n This ar...,0,0,0,0,0,0
000844b52dee5f3f,|blocked]] from editing Wikipedia. |,0,0,0,0,0,0
00091c35fa9d0465,"== Arabs are committing genocide in Iraq, but ...",1,0,0,0,0,0
000968ce11f5ee34,Please stop. If you continue to vandalize Wiki...,0,0,0,0,0,0
