In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
import sklearn
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# Seed NumPy's global RNG so the np.random.rand-based train/test mask drawn
# further down is the same on every Restart & Run All.
np.random.seed(42)

In [2]:
# Target columns scored in this notebook; each one gets its own binary model.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Labelled training comments (path is relative to the notebook's working directory).
data = pd.read_csv("./input/train.csv")

In [3]:
# Hold-out split: each row lands in `train` with probability 0.8, otherwise in `test`.
# The mask is drawn from NumPy's global RNG, so the split depends on the seed set earlier.
msk = np.random.rand(len(data)) < 0.8
train, test = data[msk], data[~msk]

# Sanity-check the resulting sizes (train first, then test).
for part in (train, test):
    print(part.shape)

(127572, 8)
(31999, 8)


## Create Baseline ROC Score

In [4]:
def get_y_vec(df, column_name):
    """Return one column of `df` as a flat, 1-D NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains `column_name`.
    column_name : str
        Name of the column to extract.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(df),) holding the column's values in row order.

    Raises
    ------
    KeyError
        If `column_name` is not a column of `df`.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; `.values` yields the same
    # 2-D ndarray on both old and new pandas, which is then flattened to 1-D.
    return df[[column_name]].values.reshape((-1,))

In [14]:
# TF-IDF + SGD-trained logistic-regression text classifier (MultinomialNB is imported below but not used in the pipeline)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
# The logistic loss of SGDClassifier was spelled 'log' up to scikit-learn 1.0,
# renamed to 'log_loss' in 1.1, and the old spelling was removed in 1.3.
# Pick whichever name the installed version accepts so the notebook runs on both.
_sk_version = tuple(int(tok) for tok in sklearn.__version__.split('.')[:2] if tok.isdigit())
LOG_LOSS_NAME = 'log_loss' if _sk_version >= (1, 1) else 'log'

# Two-step text model: word uni/bi-gram TF-IDF features feeding a linear classifier
# trained with SGD. The logistic loss is what makes predict_proba available, which
# the evaluation and submission cells rely on.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=4, ngram_range=(1,2), stop_words='english', lowercase=True, binary=False)),
    ('clf', SGDClassifier(loss=LOG_LOSS_NAME, n_jobs=-1, max_iter = 1000, tol = 1e-3)),
    #('clf', LogisticRegression(C = 0.8))
])

In [15]:
# Build function to get balanced samples
def build_balanced_classifier(train, test = None, label = None):
    """Down-sample `train` to an (at most) 50/50 positive/negative set for one label.

    Despite the name, no classifier is built here: the function only prepares
    the arrays that are later passed to a model's fit / predict_proba.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a 'comment_text' column and a column named `label`.
        The label column is assumed to hold 0/1 values (positives == 1).
    test : pandas.DataFrame or None
        Optional hold-out frame with the same columns. It is returned
        unsampled, i.e. with its original class balance.
    label : str
        Name of the target column. Required, although it has a None default
        for signature compatibility.

    Returns
    -------
    (y_train_bal, X_train_bal, y_test, X_test)
        Balanced label and text arrays, followed by the full test label and
        text arrays (both None when `test` is None).

    Raises
    ------
    ValueError
        If `label` is not provided.
    """
    if label is None:
        # The None default would otherwise surface as an opaque KeyError on lookup.
        raise ValueError("label must be the name of a target column, e.g. 'toxic'")

    # np.asarray replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    y_all = np.asarray(train[label])
    X_all = np.asarray(train['comment_text'])

    num_pos_cases = (y_all == 1).sum()
    # Sort indices high to low so positive cases are at the front, then keep
    # twice as many rows as there are positives: all positives + equally many negatives.
    # NOTE: which negatives are kept is decided by argsort's tie ordering, not by
    # random sampling.
    idx_desc = np.argsort(-y_all)
    balance_samples = idx_desc[:2 * num_pos_cases]

    y_train_bal = y_all[balance_samples]
    X_train_bal = X_all[balance_samples]
    print(y_train_bal.mean())  # confirm the positive rate (0.5 when enough negatives exist)
    print(y_train_bal.shape, X_train_bal.shape)

    y_test, X_test = None, None
    if test is not None:
        y_test = np.asarray(test[label])
        X_test = np.asarray(test['comment_text'])
    return y_train_bal, X_train_bal, y_test, X_test

In [16]:
# Hold-out evaluation, one binary model per label: fit on the balanced training
# sample, score the untouched test split, and collect the ROC AUC of each label.
roc_scores = []
for label in LABELS:
    print("########### %s ####################\n"%(label))
    y_train_bal, X_train_bal, y_test, X_test = build_balanced_classifier(train, test, label)
    clf = text_clf.fit(X=X_train_bal, y=y_train_bal)

    # Column 1 of predict_proba is the positive-class probability.
    pos_proba = clf.predict_proba(X_test)[:, 1]
    hard_pred = pos_proba > 0.5  # hard decisions, only used for the report and confusion matrix

    print(classification_report(y_test, hard_pred))
    print(confusion_matrix(y_test, hard_pred))

    # ROC AUC is computed on the raw probabilities, not on the thresholded predictions.
    auc = roc_auc_score(y_test, pos_proba)
    print(auc)
    roc_scores.append(auc)
    print("########################################\n")

# Unweighted mean of the per-label ROC AUC values.
print(sum(roc_scores)/len(roc_scores))

########### toxic ####################

0.5
(24536,) (24536,)
             precision    recall  f1-score   support

          0       0.98      0.92      0.95     28973
          1       0.53      0.85      0.66      3026

avg / total       0.94      0.92      0.92     31999

[[26714  2259]
 [  443  2583]]
0.955326230869
########################################

########### severe_toxic ####################

0.5
(2556,) (2556,)
             precision    recall  f1-score   support

          0       1.00      0.95      0.98     31682
          1       0.17      0.93      0.29       317

avg / total       0.99      0.95      0.97     31999

[[30236  1446]
 [   21   296]]
0.984890563699
########################################

########### obscene ####################

0.5
(13524,) (13524,)
             precision    recall  f1-score   support

          0       0.99      0.96      0.98     30312
          1       0.55      0.87      0.67      1687

avg / total       0.97      0.96      0.

In [17]:
# Load the comments to be scored for the submission file; the cells below read
# its 'id' and 'comment_text' columns and add one probability column per label.
submission = pd.read_csv("./input/test.csv")

In [18]:
# Flat 1-D array of raw comment strings to score.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
X_submission = submission[['comment_text']].values.reshape((-1,))

In [19]:
# Create a model on the full data set (no hold-out) and score the submission comments.
# Iterates the shared LABELS constant instead of a second hard-coded copy of the
# list, so the evaluation and submission loops cannot drift apart.
results = {}
for label in LABELS:
    print("########### %s ####################\n"%(label))
    # Only the balanced training arrays are needed here; with test=None the last
    # two return values are None, so they are dropped rather than unpacked.
    y_train_bal, X_train_bal = build_balanced_classifier(train = data, test = None, label = label)[:2]
    # The same pipeline object is refit on every iteration, so the predictions
    # must be captured before the next label overwrites the fitted state.
    clf = text_clf.fit(X=X_train_bal, y=y_train_bal)
    results[label] = clf.predict_proba(X_submission)[:, 1]

########### toxic ####################

0.5
(30588,) (30588,)
########### severe_toxic ####################

0.5
(3190,) (3190,)
########### obscene ####################

0.5
(16898,) (16898,)
########### threat ####################

0.5
(956,) (956,)
########### insult ####################

0.5
(15754,) (15754,)
########### identity_hate ####################

0.5
(2810,) (2810,)


In [20]:
# Add one predicted-probability column per label to the submission frame.
# LABELS is the constant defined next to the training-data load; it is not
# redefined here so there is a single source of truth for the label list.
for label in LABELS:
    submission[label] = results[label]

In [21]:
# Eyeball the first three scored rows next to their full, untruncated comment text.
print(submission[0:3])
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
submission[0:3]['comment_text'].values

                 id                                       comment_text  \
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...   
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...   
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...   

     toxic  severe_toxic   obscene    threat    insult  identity_hate  
0  0.96596      0.952806  0.975238  0.969192  0.952758       0.978525  
1  0.17514      0.115041  0.152125  0.136733  0.210668       0.162814  
2  0.13609      0.068748  0.108716  0.043303  0.135494       0.184452  


array([ "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",
       '== From RfC == \n\n The title is fine as it is, IMO.',
       '" \n\n == Sources == \n\n * Zawe Ashton on Lapland —  /  "'], dtype=object)

In [22]:
# Columns written to the submission file: the row id followed by every label,
# in LABELS order.
headers = ["id"]
headers = headers + list(LABELS)
print(headers)
submission.loc[:, headers].head()

['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.96596,0.952806,0.975238,0.969192,0.952758,0.978525
1,0000247867823ef7,0.17514,0.115041,0.152125,0.136733,0.210668,0.162814
2,00013b17ad220c46,0.13609,0.068748,0.108716,0.043303,0.135494,0.184452
3,00017563c3f7919a,0.087092,0.088923,0.094538,0.10307,0.092825,0.050506
4,00017695ad8997eb,0.375896,0.136211,0.267587,0.075424,0.302074,0.162704


In [23]:
import time

# Stamp the file name with the current Unix time (whole seconds) so repeated
# exports do not overwrite each other.
export_stamp = str(int(time.time()))
export_path = "./input/submissions-%s.csv" % export_stamp
submission[headers].to_csv(export_path, index=False)