In [22]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import hstack


#### Load data and light processing

In [11]:
# Read both competition splits, replacing every missing value in the frames
# with a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Comment column of each split, plus the two stacked end to end (train rows
# first) as one combined corpus.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

#### Vectorize words from both corpora (train and test)

In [12]:
# Word-level TF-IDF over single tokens, with the vocabulary capped at
# 10,000 features.
word_vectorizer = TfidfVectorizer(
    # --- tokenisation ---
    analyzer='word',
    token_pattern=r'\w{1,}',  # a token is any run of one or more word characters
    ngram_range=(1, 1),
    # --- normalisation / filtering ---
    strip_accents='unicode',
    stop_words='english',
    # --- weighting and vocabulary size ---
    sublinear_tf=True,
    max_features=10000,
)

# Fit on the train and test comments together; left as the cell's last
# expression so the fitted estimator is displayed.
word_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
# Project each split onto the fitted vocabulary, train first and then test.
train_word_features, test_word_features = map(
    word_vectorizer.transform, (train_text, test_text)
)

In [30]:
NUM_FOLDS = 10
SEED = 1337  # shared by the fold shuffle and the stochastic 'sag' solver

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Mean ROC AUC per label, appended in the same order as class_names.
# Reset here so re-running the cell does not accumulate stale entries.
scores = []

train_features = train_word_features.copy()

# Submission generation is currently disabled (see the matching line in the loop).
# submission = pd.DataFrame.from_dict({'id': test['id']})

# The splitter configuration does not depend on the label, so build it once.
# With an integer random_state every call to split() is deterministic.
kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

for class_name in class_names:
    # One independent binary classifier per label column.
    train_target = train[class_name]
    # 'sag' shuffles the data internally; seed it so scores are repeatable.
    classifier = LogisticRegression(solver='sag', random_state=SEED)

    results = cross_val_score(classifier, train_features, train_target, cv=kfold, scoring='roc_auc')

    # Per-fold scores on a single line (same layout as printing each value
    # with end=" " followed by a closing print(" ")).
    print('CV Spread for class "{}":'.format(class_name))
    print("".join("    {:0.4f} ".format(result) for result in results) + " ")

    cv_score = np.mean(results)
    scores.append(cv_score)

    # Fixed-point format, consistent with the per-fold and total lines
    # (the previous '{:0.4}' meant four significant digits, not four decimals).
    print('    CV score for class "{}" is {:0.4f}\n'.format(class_name, cv_score))

    # Refit on all training rows; this only matters for the disabled
    # submission step below.
    classifier.fit(train_features, train_target)
    # NOTE(review): `test_features` is not defined in the visible cells --
    # presumably it should be `test_word_features`; confirm before re-enabling.
    # submission[class_name] = classifier.predict_proba(test_features)[:, 1]

# Unweighted mean of the per-label mean AUCs.
print('Total CV score is {:0.4f}'.format(np.mean(scores)))

CV Spread for class "toxic":
    0.9682     0.9680     0.9688     0.9702     0.9715     0.9709     0.9665     0.9706     0.9712     0.9717  
    CV score for class "toxic" is 0.9698

CV Spread for class "severe_toxic":
    0.9831     0.9868     0.9799     0.9860     0.9872     0.9836     0.9881     0.9876     0.9880     0.9885  
    CV score for class "severe_toxic" is 0.9859

CV Spread for class "obscene":
    0.9877     0.9858     0.9819     0.9849     0.9868     0.9809     0.9861     0.9884     0.9867     0.9849  
    CV score for class "obscene" is 0.9854

CV Spread for class "threat":
    0.9710     0.9901     0.9927     0.9860     0.9755     0.9887     0.9874     0.9783     0.9863     0.9721  
    CV score for class "threat" is 0.9828

CV Spread for class "insult":
    0.9781     0.9756     0.9760     0.9731     0.9778     0.9739     0.9759     0.9765     0.9801     0.9777  
    CV score for class "insult" is 0.9765

CV Spread for class "identity_hate":
    0.9784     0.9714     

In [44]:
# Summary table: label left-aligned in a 14-character column, followed by
# its mean cross-validated ROC AUC to four decimal places.
print("{: <14} {:}".format("CLASS", "AVG ROC AUC"))

for label, mean_auc in zip(class_names, scores):
    print("{: <14} {:0.4f}".format(label, mean_auc))

CLASS          AVG ROC AUC
toxic          0.9698
severe_toxic   0.9859
obscene        0.9854
threat         0.9828
insult         0.9765
identity_hate  0.9761
