In [155]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import hstack

from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

import time
import string

#### some helper stuff for tracking performance over the duration of the project

In [156]:
def write_model_timestamp(model_type, kfolds, scores, note):
    '''
    Parameters:
    model_type = string description of the model(s) used to make these scores
    kfolds     = how many folds in kfold cross validation used
    scores     = list of ROC AUC avg scores of models for each class, floats should be like 0.9784
    note       = string, whatever is of note about the model, made a change or whatever
    
    Returns:
    None, but writes (appends) a line to scores.txt in the root directory so that progress can be tracked
    The format is:
            time(s)~model_type~kfold~avg_roc_auc~toxic_auc~s_toxic_auc~obscene_auc~threat_auc~insult_auc~i_hate_auc~notes
            
    scores.txt is a tilde '~' seperated CSV like:
        time~model_type~kfold~avg_roc_auc~toxic_auc~s_toxic_auc~obscene_auc~threat_auc~insult_auc~i_hate_auc~notes
        1520303252~0.9794005980274005~note something
    '''

    out_text = "{:10.0f}~{:}~{:2d}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:}\n".format(time.time(), 
                                             model_type, 
                                             kfolds, 
                                             np.mean(scores),
                                             scores[0],
                                             scores[1],
                                             scores[2],
                                             scores[3],
                                             scores[4],
                                             scores[5],                                                
                                             note)
    
    with open("../scores.txt", 'a') as out_file:
        out_file.write(out_text)
        
        print("wrote:")
        print(out_text)
        print("to file")

#### Load data and light processing

In [157]:
train = pd.read_csv('../data/train.csv').fillna(' ')
test = pd.read_csv('../data/test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

#### Vectorize words from both corpuses (corpi?)

In [158]:
# optionally lemmatize words before vectorizing
lemming = WordNetLemmatizer()

def stringer(_word):
    return " ".join(lemming.lemmatize(word) for word in _word.split())

lem_alltext = all_text.apply(stringer)

print(lem_alltext[0])

0    Explanation Why the edits made under my userna...
0    Yo bitch Ja Rule is more succesful then you'll...
Name: comment_text, dtype: object


In [159]:
# test = '''Sorry if the word 'nonsense' was offensive to you. Anyway, I'm not intending to write anything in the article(wow they would jump on me for vandalism), I'm merely requesting that it be more encyclopedic so one can use it for school as a reference. I have been to the selective breeding page but it's almost a stub. It points to 'animal breeding' which is a short messy article that gives you no info. There must be someone around with expertise in eugenics? 93.161.107.169'''

test = '''I think that your a Fagget get a oife and burn in Hell I hate you 'm sorry we cant have any more sex i'm running out of conndoms'''

print(string.punctuation)

" ".join(lemming.lemmatize(word) for word in test.split())

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


"I think that your a Fagget get a oife and burn in Hell I hate you 'm sorry we cant have any more sex i'm running out of conndoms"

In [160]:
lem_alltext[0]

0    Explanation Why the edits made under my userna...
0    Yo bitch Ja Rule is more succesful then you'll...
Name: comment_text, dtype: object

In [161]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=50000)

word_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [162]:
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

print("train shape:", train_word_features.shape)
print("test shape:", test_word_features.shape)

train shape: (159571, 50000)
test shape: (153164, 50000)


In [163]:
scores = []

NUM_FOLDS = 10

train_features = train_word_features.copy()

# submission = pd.DataFrame.from_dict({'id': test['id']})

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')

    kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=1337)
    
#     results = cross_val_score(classifier, train_features, train_target, cv=5, n_jobs=-1, scoring='roc_auc')
    results = cross_val_score(classifier, train_features, train_target, cv=kfold, scoring='roc_auc')
    
    print('CV Spread for class "{}":'.format(class_name))
    for result in results:
        print("    {:0.4f}".format(result), end=" ")
        
    print(" ")
        
    cv_score = np.mean(results)
    scores.append(cv_score)
    
    print('    CV score for class "{}" is {:0.4}\n'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
#     submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {:0.4f}'.format(np.mean(scores)))

write_model_timestamp('logistic regression', NUM_FOLDS, scores, "first model: logistic regression, word to vec max 50k features, lemmatizing")

CV Spread for class "toxic":
    0.9709     0.9694     0.9714     0.9721     0.9732     0.9720     0.9684     0.9716     0.9727     0.9729  
    CV score for class "toxic" is 0.9715

CV Spread for class "severe_toxic":
    0.9838     0.9876     0.9816     0.9862     0.9876     0.9838     0.9883     0.9873     0.9878     0.9889  
    CV score for class "severe_toxic" is 0.9863

CV Spread for class "obscene":
    0.9884     0.9871     0.9842     0.9854     0.9878     0.9831     0.9868     0.9895     0.9876     0.9862  
    CV score for class "obscene" is 0.9866

CV Spread for class "threat":
    0.9723     0.9904     0.9919     0.9851     0.9727     0.9880     0.9897     0.9782     0.9870     0.9718  
    CV score for class "threat" is 0.9827

CV Spread for class "insult":
    0.9790     0.9765     0.9784     0.9754     0.9798     0.9749     0.9762     0.9776     0.9815     0.9790  
    CV score for class "insult" is 0.9778

CV Spread for class "identity_hate":
    0.9782     0.9723     

In [44]:
print("{: <14} {:}".format("CLASS", "AVG ROC AUC"))

for item in zip(class_names, scores):
    print("{: <14} {:0.4f}".format(item[0], item[1]))

CLASS          AVG ROC AUC
toxic          0.9698
severe_toxic   0.9859
obscene        0.9854
threat         0.9828
insult         0.9765
identity_hate  0.9761
