# Simple Baseline Model (TF-IDF into Logistic Regression)

The generated features consist of word-based and character-based TF-IDF (term frequency - inverse document frequency) values.

The baseline model is a simple logistic regression.

A randomized cross-validated search tunes the vectorization and model parameters, achieving a mean local CV ROC-AUC of 0.9848 across the six classes. Pretty good for such a simple model! The substantially more complex neural models developed later achieve ROC-AUC scores of about 0.995, but at the cost of much longer training times and greater computational complexity.

In [23]:
from time import time

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from scipy.special import logit, expit
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion

In [24]:
# Target columns; the cells below train and score one model per entry.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


def _read_split(csv_path):
    """Read one CSV file and replace every missing value with a single space."""
    return pd.read_csv(csv_path).fillna(' ')


train = _read_split('single_model_predictions/train_split.csv')
valid = _read_split('single_model_predictions/valid/valid_split.csv')
test = _read_split('test.csv')

# Raw comment text of each split.
train_text, valid_text, test_text = (
    frame['comment_text'] for frame in (train, valid, test)
)

# All comments from the three splits, in train / valid / test order.
all_text = pd.concat([train_text, valid_text, test_text])

# TF-IDF Words and Characters

In [25]:
# Settings shared by the word-level and the character-level vectorizer.
tfidf_shared_kwargs = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    max_features=10000,
)

# Word-level TF-IDF; the token pattern accepts tokens of one or more word characters.
word_vectorizer = TfidfVectorizer(analyzer='word',
                                  token_pattern=r'\w{1,}',
                                  **tfidf_shared_kwargs)

# Character-level TF-IDF.
char_vectorizer = TfidfVectorizer(analyzer='char', **tfidf_shared_kwargs)

# Logistic regression fitted with the 'sag' solver; used as the final pipeline step.
classifier = LogisticRegression(solver='sag')

In [26]:
# Combine both vectorizers into one transformer; the step names ('word_vect',
# 'char_vect') are the prefixes used to address their parameters in `param_dist`.
word_char_vectorizer = FeatureUnion(
    transformer_list=[('word_vect', word_vectorizer), ('char_vect', char_vectorizer)]
)

In [27]:
# Raw text -> combined TF-IDF features ('vect') -> logistic regression ('clf').
pipeline = Pipeline(
    steps=[('vect', word_char_vectorizer), ('clf', classifier)]
)

In [28]:
# Candidate values for the randomized search. Keys follow the
# `<pipeline step>__<union member>__<parameter>` naming of the pipeline.
param_dist = {
    # Word n-grams always start at unigrams; the upper bound runs from 1 to 5.
    "vect__word_vect__ngram_range": [(1, max_n) for max_n in range(1, 6)],
    "vect__char_vect__ngram_range": [(1, 1), (1, 3), (1, 5), (2, 3), (2, 5), (3, 5)],
    # Candidate values for the logistic regression `C` parameter.
    "clf__C": [0.1, 1, 10],
}

# Random Search CV over the Pipeline

In [29]:
# Number of parameter combinations sampled from `param_dist`.
n_iter_search = 20

# Fixed seed for the candidate sampler, so every run evaluates the same
# combinations and reports a reproducible best setting.
# NOTE(review): the 'sag' logistic regression inside the pipeline is still
# unseeded, so scores can vary slightly between runs.
search_seed = 42

random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=n_iter_search, cv=3,
                                   scoring='roc_auc', n_jobs=7, verbose=1, random_state=search_seed)

start = time()

# The search is run against the first label only (class_names[0], i.e. 'toxic').
random_search.fit(train['comment_text'].values, train[class_names[0]].values)

print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

print("\nBest Score = " + str(random_search.best_score_))

print("\nBest Parameters = " + str(random_search.best_params_))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed: 33.9min
[Parallel(n_jobs=7)]: Done  60 out of  60 | elapsed: 52.1min finished


RandomizedSearchCV took 3234.59 seconds for 20 candidates parameter settings.

Best Score = 0.977276937629

Best Parameters = {'vect__word_vect__ngram_range': (1, 1), 'vect__char_vect__ngram_range': (1, 3), 'clf__C': 1}


In [13]:
# Quick check of the fitted search object: predict the positive-class probability
# of the first label for the test comments. This dict is rebuilt from scratch by
# the per-class loop in the following cell.
first_label = class_names[0]
test_predictions = dict(id=test['id'])
test_predictions[first_label] = random_search.predict_proba(test['comment_text'].values)[:, 1]

In [44]:
# Cross-validate and fit the tuned pipeline once per label, collecting the
# positive-class probabilities for the test and validation comments.
cv_scores = []  # mean 3-fold ROC-AUC per label (higher is better)
test_predictions  = {'id': test['id']}
valid_predictions = {'id': valid['id']}

# Read the winning settings from the search instead of copying them by hand,
# so this cell can never drift from what the search actually selected. The
# search was run on the first label only; its result is reused for every label.
best_params = random_search.best_params_

for class_name in class_names:
    train_target = train[class_name]
    # Unfitted copy per label: the shared `pipeline` (and the vectorizer objects
    # inside it) keep their original parameters and are never fitted here.
    classifier = clone(pipeline).set_params(**best_params)

    cv_score = np.mean(cross_val_score(classifier, train['comment_text'].values, train_target, cv=3, scoring='roc_auc'))
    cv_scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the whole training split before predicting.
    classifier.fit(train['comment_text'].values, train_target)
    test_predictions[class_name] = classifier.predict_proba(test['comment_text'].values)[:, 1]
    valid_predictions[class_name] = classifier.predict_proba(valid['comment_text'].values)[:, 1]

print('Total CV score is {}'.format(np.mean(cv_scores)))

CV score for class toxic is 0.9772769088526844
CV score for class severe_toxic is 0.9885642581995572
CV score for class obscene is 0.9898989711408408
CV score for class threat is 0.9884735470340512
CV score for class insult is 0.9817733068152982
CV score for class identity_hate is 0.9828762819399839
Total CV score is 0.9848105456637359


In [15]:
# Untuned baseline for comparison: logistic regression on TF-IDF features that
# are computed once up front instead of inside a pipeline.
#
# NOTE(review): this cell originally read `train_features`, `valid_features` and
# `test_features`, none of which is defined anywhere in the notebook, so it
# raised NameError on a fresh kernel. The features are rebuilt below from
# unfitted copies of the two vectorizers (clone copies their current
# parameters). Fitting on the training comments only is an assumption about the
# lost cell -- confirm against the intended baseline.
baseline_word_vectorizer = clone(word_vectorizer).fit(train_text)
baseline_char_vectorizer = clone(char_vectorizer).fit(train_text)


def _baseline_features(text):
    """Return word and character TF-IDF matrices for `text`, stacked column-wise as CSR."""
    return hstack([baseline_word_vectorizer.transform(text),
                   baseline_char_vectorizer.transform(text)], format='csr')


train_features = _baseline_features(train_text)
valid_features = _baseline_features(valid_text)
test_features = _baseline_features(test_text)

# Results go into their own containers: `test_predictions` and `valid_predictions`
# hold the tuned pipeline's output and are what the export cells write to disk.
baseline_scores = []  # mean 3-fold ROC-AUC per label
baseline_test_predictions = {'id': test['id']}
baseline_valid_predictions = {'id': valid['id']}

for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    baseline_scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the whole training split before predicting.
    classifier.fit(train_features, train_target)
    baseline_test_predictions[class_name] = classifier.predict_proba(test_features)[:, 1]
    baseline_valid_predictions[class_name] = classifier.predict_proba(valid_features)[:, 1]

print('Total CV score is {}'.format(np.mean(baseline_scores)))

CV score for class toxic is 0.9703823806373831
CV score for class severe_toxic is 0.9850204552076671
CV score for class obscene is 0.9840357267528151
CV score for class threat is 0.9854014125653093
CV score for class insult is 0.9774041293366643
CV score for class identity_hate is 0.9741863733994759
Total CV score is 0.979405079649886


# Export Predictions

In [45]:
# Write the test-set predictions (one column per key of `test_predictions`) to CSV
# without the DataFrame index.
test_output_path = 'single_model_predictions/other/test/18_02_16_BagOfWords_TFIDF_LogisticRegression_Test.csv'
test_submission = pd.DataFrame(test_predictions)
test_submission.to_csv(test_output_path, index=False)

In [46]:
# Write the validation-set predictions (one column per key of `valid_predictions`)
# to CSV without the DataFrame index.
valid_output_path = 'single_model_predictions/other/valid/18_02_16_BagOfWords_TFIDF_LogisticRegression_Valid.csv'
valid_submission = pd.DataFrame(valid_predictions)
valid_submission.to_csv(valid_output_path, index=False)