In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [2]:
# Label columns: one binary classifier is trained per entry further down.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Both input CSVs are looked up relative to the current working directory.
train_file, test_file = os.path.join('.', 'train.csv'), os.path.join('.', 'test.csv')

In [3]:
# Read the train and test tables (train first, then test).
train, test = (pd.read_csv(path) for path in (train_file, test_file))

# Pull out the raw comment column; missing comments become the "_na_" placeholder
# so every row is a string.
train_text, test_text = (
    frame["comment_text"].fillna("_na_") for frame in (train, test)
)

# Train and test comments stacked into one series (train rows first).
all_text = pd.concat([train_text, test_text])

In [4]:
# Word-level TF-IDF: unigrams only, English stop words removed, vocabulary
# capped at 20000 terms. The vectorizer is fit on the combined train + test text.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=20000,
).fit(all_text)

# Project each split onto the shared vocabulary (train first, then test).
train_word_features, test_word_features = (
    word_vectorizer.transform(text) for text in (train_text, test_text)
)

In [7]:
# Logistic Regression
# One binary classifier per label in class_names, trained on the word-level
# TF-IDF features. Reports the mean 5-fold ROC-AUC across labels and writes the
# test-set probabilities to submission3.csv.
scores = []
submission = pd.read_csv('./sample_submission.csv')
for c in class_names:
    train_target = train[c]
    # The 'sag' solver is stochastic; pin random_state so the CV score and the
    # submitted probabilities are reproducible after Restart & Run All.
    clf = LogisticRegression(C=0.1, solver='sag', random_state=42)

    # cross_val_score works on clones of clf, so it is independent of the
    # full-data fit performed below.
    cv_score = np.mean(
        cross_val_score(clf, train_word_features, train_target, cv=5, scoring='roc_auc')
    )
    scores.append(cv_score)

    # Refit on all training rows and score the test set.
    clf.fit(train_word_features, train_target)
    # Column 1 is the second entry of clf.classes_ -- presumably the positive
    # (1) label; verify the label columns are 0/1.
    submission[c] = clf.predict_proba(test_word_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))
submission.to_csv('submission3.csv', index=False)

Total CV score is 0.9735008430749542


In [None]:
# GBDT
# Same per-label procedure as the logistic-regression cell, using a gradient
# boosted tree classifier with default hyper-parameters. Reports the mean
# 5-fold ROC-AUC across labels and writes test-set probabilities to
# submission4.csv.
scores = []
submission = pd.read_csv('./sample_submission.csv')
for c in class_names:
    train_target = train[c]
    # Pin random_state: the underlying trees permute features at each split,
    # so ties between equally good splits could otherwise resolve differently
    # from run to run.
    clf = GradientBoostingClassifier(random_state=42)

    # cross_val_score works on clones of clf, so it is independent of the
    # full-data fit performed below.
    cv_score = np.mean(
        cross_val_score(clf, train_word_features, train_target, cv=5, scoring='roc_auc')
    )
    scores.append(cv_score)

    # Refit on all training rows and score the test set.
    clf.fit(train_word_features, train_target)
    # Column 1 is the second entry of clf.classes_ -- presumably the positive
    # (1) label; verify the label columns are 0/1.
    submission[c] = clf.predict_proba(test_word_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))
submission.to_csv('submission4.csv', index=False)