In [15]:
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack

import time

#### some helper stuff for tracking performance over the duration of the project

In [3]:
def write_model_timestamp(model_type, kfolds, scores, note):
    '''
    Parameters:
    model_type = string description of the model(s) used to make these scores
    kfolds     = how many folds in kfold cross validation used
    scores     = list of ROC AUC avg scores of models for each class, floats should be like 0.9784
    note       = string, whatever is of note about the model, made a change or whatever
    
    Returns:
    None, but writes (appends) a line to scores.txt in the root directory so that progress can be tracked
    The format is:
            time(s)~model_type~kfold~avg_roc_auc~toxic_auc~s_toxic_auc~obscene_auc~threat_auc~insult_auc~i_hate_auc~notes
            
    scores.txt is a tilde '~' seperated CSV like:
        time~model_type~kfold~avg_roc_auc~toxic_auc~s_toxic_auc~obscene_auc~threat_auc~insult_auc~i_hate_auc~notes
        1520303252~0.9794005980274005~note something
    '''

    out_text = "{:10.0f}~{:}~{:2d}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:}\n".format(time.time(), 
                                             model_type, 
                                             kfolds, 
                                             np.mean(scores),
                                             scores[0],
                                             scores[1],
                                             scores[2],
                                             scores[3],
                                             scores[4],
                                             scores[5],                                                
                                             note)
    
    with open("../scores.txt", 'a') as out_file:
        out_file.write(out_text)
        
        print("wrote:")
        print(out_text)
        print("to file")

#### Load data and light processing

In [4]:
train = pd.read_csv('../data/train.csv').fillna(' ')
test = pd.read_csv('../data/test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

#### Vectorize words from both corpuses (corpi?)

In [5]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=50000)    # 10k was initial

word_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

print("train shape:", train_word_features.shape)
print("test shape:", test_word_features.shape)

train shape: (159571, 50000)
test shape: (153164, 50000)


#### Attempt to tune on a single split instead of 10 kfold since that will take forever

In [25]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for _class in class_names:
    x_train, x_test, y_train, y_test = train_test_split(train_word_features, train[_class], test_size=0.1, random_state=1337)
    
    lrs = [0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65]
    nests = [25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75]
    
    for lr in lrs:
        for nest in nests:
            model = GradientBoostingClassifier(learning_rate=0.5, n_estimators=50)
            model.fit(x_train, y_train)

            preds = model.predict(x_test)

            result = roc_auc_score(y_test, preds)

            print("lr: {:0.2f}   nests: {:2d}".format(lr, nest))
            print("Result for {:} is ROC AUC {:0.4f}".format(_class, result))
    

lr: 0.35   nests: 25
Result for toxic is ROC AUC 0.7918
lr: 0.35   nests: 30
Result for toxic is ROC AUC 0.7930
lr: 0.35   nests: 35
Result for toxic is ROC AUC 0.7918
lr: 0.35   nests: 40
Result for toxic is ROC AUC 0.7930
lr: 0.35   nests: 45
Result for toxic is ROC AUC 0.7933
lr: 0.35   nests: 50
Result for toxic is ROC AUC 0.7930
lr: 0.35   nests: 55
Result for toxic is ROC AUC 0.7923
lr: 0.35   nests: 60
Result for toxic is ROC AUC 0.7937
lr: 0.35   nests: 65
Result for toxic is ROC AUC 0.7933
lr: 0.35   nests: 70
Result for toxic is ROC AUC 0.7937
lr: 0.35   nests: 75
Result for toxic is ROC AUC 0.7922
lr: 0.40   nests: 25
Result for toxic is ROC AUC 0.7927
lr: 0.40   nests: 30
Result for toxic is ROC AUC 0.7928
lr: 0.40   nests: 35
Result for toxic is ROC AUC 0.7933
lr: 0.40   nests: 40
Result for toxic is ROC AUC 0.7928
lr: 0.40   nests: 45
Result for toxic is ROC AUC 0.7940
lr: 0.40   nests: 50
Result for toxic is ROC AUC 0.7930
lr: 0.40   nests: 55
Result for toxic is ROC AUC

lr: 0.60   nests: 60
Result for severe_toxic is ROC AUC 0.7301
lr: 0.60   nests: 65
Result for severe_toxic is ROC AUC 0.7301
lr: 0.60   nests: 70
Result for severe_toxic is ROC AUC 0.7301
lr: 0.60   nests: 75
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 25
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 30
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 35
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 40
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 45
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 50
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 55
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 60
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 65
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 70
Result for severe_toxic is ROC AUC 0.7301
lr: 0.65   nests: 75
Result for severe_toxic is ROC AUC 0.7301
lr: 0.35   nests: 25
Result for obscene is ROC AUC 0.85

lr: 0.55   nests: 50
Result for threat is ROC AUC 0.6817
lr: 0.55   nests: 55
Result for threat is ROC AUC 0.6817
lr: 0.55   nests: 60
Result for threat is ROC AUC 0.6817
lr: 0.55   nests: 65
Result for threat is ROC AUC 0.6817
lr: 0.55   nests: 70
Result for threat is ROC AUC 0.6817
lr: 0.55   nests: 75
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 25
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 30
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 35
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 40
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 45
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 50
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 55
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 60
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 65
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 70
Result for threat is ROC AUC 0.6817
lr: 0.60   nests: 75
Result for threat is ROC AUC 0.6817
lr: 0.65   nests: 25
Result for

lr: 0.50   nests: 35
Result for identity_hate is ROC AUC 0.6607
lr: 0.50   nests: 40
Result for identity_hate is ROC AUC 0.6607
lr: 0.50   nests: 45
Result for identity_hate is ROC AUC 0.6607
lr: 0.50   nests: 50
Result for identity_hate is ROC AUC 0.6607
lr: 0.50   nests: 55
Result for identity_hate is ROC AUC 0.6607
lr: 0.50   nests: 60
Result for identity_hate is ROC AUC 0.6607
lr: 0.50   nests: 65
Result for identity_hate is ROC AUC 0.6607
lr: 0.50   nests: 70
Result for identity_hate is ROC AUC 0.6607
lr: 0.50   nests: 75
Result for identity_hate is ROC AUC 0.6607
lr: 0.55   nests: 25
Result for identity_hate is ROC AUC 0.6607
lr: 0.55   nests: 30
Result for identity_hate is ROC AUC 0.6607
lr: 0.55   nests: 35
Result for identity_hate is ROC AUC 0.6607
lr: 0.55   nests: 40
Result for identity_hate is ROC AUC 0.6607
lr: 0.55   nests: 45
Result for identity_hate is ROC AUC 0.6607
lr: 0.55   nests: 50
Result for identity_hate is ROC AUC 0.6607
lr: 0.55   nests: 55
Result for identity

In [19]:
scores = []

NUM_FOLDS = 10

train_features = train_word_features.copy()

# submission = pd.DataFrame.from_dict({'id': test['id']})

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')

    kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=1337)
    
#     results = cross_val_score(classifier, train_features, train_target, cv=5, n_jobs=-1, scoring='roc_auc')
    results = cross_val_score(classifier, train_features, train_target, cv=kfold, scoring='roc_auc')
    
    print('CV Spread for class "{}":'.format(class_name))
    for result in results:
        print("    {:0.4f}".format(result), end=" ")
        
    print(" ")
        
    cv_score = np.mean(results)
    scores.append(cv_score)
    
    print('    CV score for class "{}" is {:0.4}\n'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
#     submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {:0.4f}'.format(np.mean(scores)))

write_model_timestamp('logistic regression', NUM_FOLDS, scores, "first model: word to vec max 100k features")

CV Spread for class "toxic":
    0.9706     0.9692     0.9715     0.9721     0.9732     0.9718     0.9682     0.9715     0.9726     0.9728  
    CV score for class "toxic" is 0.9714

CV Spread for class "severe_toxic":
    0.9841     0.9875     0.9817     0.9870     0.9875     0.9836     0.9882     0.9872     0.9878     0.9890  
    CV score for class "severe_toxic" is 0.9864

CV Spread for class "obscene":
    0.9881     0.9877     0.9840     0.9853     0.9880     0.9831     0.9867     0.9894     0.9876     0.9862  
    CV score for class "obscene" is 0.9866

CV Spread for class "threat":
    0.9722     0.9903     0.9919     0.9849     0.9723     0.9882     0.9898     0.9779     0.9874     0.9717  
    CV score for class "threat" is 0.9827

CV Spread for class "insult":
    0.9787     0.9766     0.9781     0.9754     0.9800     0.9747     0.9761     0.9774     0.9816     0.9787  
    CV score for class "insult" is 0.9777

CV Spread for class "identity_hate":
    0.9773     0.9727     

In [44]:
print("{: <14} {:}".format("CLASS", "AVG ROC AUC"))

for item in zip(class_names, scores):
    print("{: <14} {:0.4f}".format(item[0], item[1]))

CLASS          AVG ROC AUC
toxic          0.9698
severe_toxic   0.9859
obscene        0.9854
threat         0.9828
insult         0.9765
identity_hate  0.9761
