In [69]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import hstack

import time

#### Helpers for tracking model performance over the duration of the project

In [75]:
def write_model_timestamp(model_type, kfolds, scores, note):
    '''
    Append one line describing a cross-validation run to ../scores.txt.

    Parameters:
    model_type = string description of the model(s) used to make these scores
    kfolds     = int, how many folds were used in k-fold cross validation
    scores     = non-empty sequence of per-class average ROC AUC scores, floats like 0.9784;
                 every entry is written, in the order given
    note       = string, whatever is of note about the model, made a change or whatever

    Returns:
    None, but writes (appends) a line to scores.txt one directory above the working
    directory so that progress can be tracked, and echoes that line to stdout.

    Raises:
    ValueError if scores is empty.

    scores.txt is a tilde '~' separated file; with the six toxicity classes a line looks like:
        time~model_type~kfold~avg_roc_auc~toxic_auc~s_toxic_auc~obscene_auc~threat_auc~insult_auc~i_hate_auc~notes
        1520303252~logistic regression~10~0.97835000~0.96680000~0.98550000~0.98430000~0.98250000~0.97490000~0.97610000~note something
    '''

    per_class = [float(score) for score in scores]
    if not per_class:
        raise ValueError("scores must contain at least one value")

    # Build the columns one by one so that every per-class score is written;
    # a fixed-width format string silently dropped the last class.
    fields = [
        "{:10.0f}".format(time.time()),
        "{:}".format(model_type),
        "{:2d}".format(kfolds),
        "{:0.8f}".format(np.mean(per_class)),
    ]
    fields.extend("{:0.8f}".format(score) for score in per_class)
    fields.append("{:}".format(note))
    out_text = "~".join(fields) + "\n"

    with open("../scores.txt", 'a') as out_file:
        out_file.write(out_text)

    print("wrote:")
    print(out_text)
    print("to file")

#### Load data and light processing

In [71]:
def _read_filled(csv_path):
    """Read a CSV into a DataFrame, replacing every missing cell with a single space."""
    raw = pd.read_csv(csv_path)
    return raw.fillna(' ')


train = _read_filled('../data/train.csv')
test = _read_filled('../data/test.csv')

# Pull out the comment column of each split, then stack them into one series
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

#### Vectorize words from both corpora (train and test)

In [72]:
# Word-level TF-IDF settings, gathered in one mapping so they are easy to tweak
tfidf_settings = {
    'sublinear_tf': True,
    'strip_accents': 'unicode',
    'analyzer': 'word',
    'token_pattern': r'\w{1,}',
    'stop_words': 'english',
    'ngram_range': (1, 1),
    'max_features': 5000,    # 10k was initial
}
word_vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the combined train + test comments; left as the cell's last expression
# so the fitted vectorizer is displayed
word_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [73]:
# Turn each split into a TF-IDF matrix using the already-fitted vectorizer
train_word_features, test_word_features = (
    word_vectorizer.transform(corpus) for corpus in (train_text, test_text)
)

# Report the matrix dimensions of both splits
for label, matrix in (("train", train_word_features), ("test", test_word_features)):
    print("{} shape:".format(label), matrix.shape)

train shape: (159571, 5000)
test shape: (153164, 5000)


In [74]:
# Cross-validate one logistic regression per label and log the per-class ROC AUC averages.
scores = []

NUM_FOLDS = 10
SEED = 1337  # shared by the fold splitter and the stochastic 'sag' solver

train_features = train_word_features.copy()

# submission = pd.DataFrame.from_dict({'id': test['id']})

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# The splitter does not depend on the class, so build it once; with a fixed integer
# seed it produces the same shuffled folds every time it is asked to split.
kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

for class_name in class_names:
    train_target = train[class_name]
    # 'sag' shuffles the data while fitting; seed it so the CV scores are repeatable.
    classifier = LogisticRegression(solver='sag', random_state=SEED)

    results = cross_val_score(classifier, train_features, train_target, cv=kfold, scoring='roc_auc')

    print('CV Spread for class "{}":'.format(class_name))
    for result in results:
        print("    {:0.4f}".format(result), end=" ")

    print(" ")

    cv_score = np.mean(results)
    scores.append(cv_score)

    # Fixed-point format, matching the other prints; a bare '{:0.4}' means four
    # significant digits and drops trailing zeros (0.9850 would print as 0.985).
    print('    CV score for class "{}" is {:0.4f}\n'.format(class_name, cv_score))

    # Refit on the full training set; only needed by the (currently disabled) submission line.
    classifier.fit(train_features, train_target)
    # NOTE(review): test_features is not defined in the visible cells - presumably
    # test_word_features is meant; confirm before re-enabling.
#     submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {:0.4f}'.format(np.mean(scores)))

# Derive the fold count in the note from NUM_FOLDS so the log cannot drift from the run.
write_model_timestamp(
    'logistic regression',
    NUM_FOLDS,
    scores,
    "first model: logistic regression, word to vec max 5k features, kfold={}".format(NUM_FOLDS))

CV Spread for class "toxic":
    0.9653     0.9658     0.9659     0.9670     0.9690     0.9664     0.9637     0.9675     0.9683     0.9695  
    CV score for class "toxic" is 0.9668

CV Spread for class "severe_toxic":
    0.9824     0.9864     0.9782     0.9862     0.9870     0.9826     0.9879     0.9867     0.9890     0.9883  
    CV score for class "severe_toxic" is 0.9855

CV Spread for class "obscene":
    0.9866     0.9848     0.9803     0.9841     0.9862     0.9808     0.9844     0.9870     0.9858     0.9834  
    CV score for class "obscene" is 0.9843

CV Spread for class "threat":
    0.9714     0.9897     0.9928     0.9855     0.9773     0.9851     0.9871     0.9789     0.9871     0.9706  
    CV score for class "threat" is 0.9825

CV Spread for class "insult":
    0.9769     0.9723     0.9749     0.9718     0.9760     0.9724     0.9743     0.9743     0.9793     0.9766  
    CV score for class "insult" is 0.9749

CV Spread for class "identity_hate":
    0.9779     0.9676     

In [44]:
# Two-column table: class name left-aligned in a 14-character column, then its average ROC AUC
header = "{: <14} {:}".format("CLASS", "AVG ROC AUC")
print(header)

for label, auc in zip(class_names, scores):
    row = "{: <14} {:0.4f}".format(label, auc)
    print(row)

CLASS          AVG ROC AUC
toxic          0.9698
severe_toxic   0.9859
obscene        0.9854
threat         0.9828
insult         0.9765
identity_hate  0.9761
