In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
import sklearn
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# Seed NumPy's global RNG so the np.random-based train/test mask is reproducible.
np.random.seed(42)

In [7]:
# Read the training comments from the local input directory.
data = pd.read_csv("./input/train.csv")

# Target columns; the loops further down fit one classifier per entry.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [8]:
# Random row-wise split: a row goes to `train` when its uniform draw is below 0.8,
# otherwise it goes to `test`.
msk = np.random.rand(len(data)) < 0.8
train, test = data[msk], data[~msk]

# Show the (rows, columns) shape of each partition, train first.
for part in (train, test):
    print(part.shape)

(127572, 8)
(31999, 8)


## Create Baseline ROC Score

In [9]:
def get_y_vec(df, column_name):
    """Return one DataFrame column as a flat 1-D NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Name of the column to extract.

    Returns
    -------
    numpy.ndarray
        The selected column's values flattened to one dimension.
    """
    # `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
    # `.values` returns the same 2-D ndarray on both old and new versions.
    return df[[column_name]].values.reshape((-1,))

In [10]:
# Baseline text classifier: TF-IDF features + logistic regression trained with SGD

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
# Two-step text model: TF-IDF features (unigrams and bigrams) feeding a linear
# classifier trained with SGD on the log loss.
text_clf = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            min_df=4,
            ngram_range=(1, 2),
            stop_words='english',
            lowercase=True,
            binary=False,
        ),
    ),
    (
        'clf',
        SGDClassifier(loss='log', n_jobs=-1, max_iter=1000, tol=1e-3),
    ),
    # Alternative final step (disabled): ('clf', LogisticRegression(C = 0.8))
])

In [12]:
# Evaluate the pipeline on the held-out split, fitting it once per label.

# Probability cut-off used for the hard predictions shown in the
# classification report and confusion matrix.
DECISION_THRESHOLD = 0.35

# The comment text is the same for every label, so extract it once instead of
# on every loop iteration. `.values` replaces `DataFrame.as_matrix()`, which
# was removed in pandas 1.0.
X_train = train[['comment_text']].values.reshape((-1,))
X_test = test[['comment_text']].values.reshape((-1,))

roc_scores = []
for label in LABELS:
    print("########### %s ####################\n"%(label))

    y_train = get_y_vec(df = train, column_name=label)
    y_test = get_y_vec(df = test, column_name=label)

    # Refit the whole pipeline (vectorizer + classifier) for this label.
    clf = text_clf.fit(X=X_train, y=y_train)
    y_ = clf.predict_proba(X_test)

    # Column 1 holds the probability of the second class; threshold it for the
    # report, but score ROC AUC on the raw probabilities.
    y_bin = y_[:,1] > DECISION_THRESHOLD
    print(classification_report(y_test, y_bin))
    print(confusion_matrix(y_test, y_bin))
    roc = roc_auc_score(y_test, y_[:, 1])
    print(roc)
    roc_scores.append(roc)
    print("########################################\n")

# Mean ROC AUC across all labels.
print(sum(roc_scores)/len(roc_scores))

########### toxic ####################

0.5
(24536,) (24536,)
             precision    recall  f1-score   support

          0       0.93      1.00      0.96     28973
          1       0.99      0.25      0.40      3026

avg / total       0.93      0.93      0.91     31999

[[28966     7]
 [ 2275   751]]
0.954919899556
########################################

########### severe_toxic ####################

0.5
(2556,) (2556,)
             precision    recall  f1-score   support

          0       0.99      1.00      0.99     31682
          1       0.42      0.02      0.03       317

avg / total       0.98      0.99      0.99     31999

[[31675     7]
 [  312     5]]
0.984964245438
########################################

########### obscene ####################

0.5
(13524,) (13524,)
             precision    recall  f1-score   support

          0       0.96      1.00      0.98     30312
          1       0.99      0.28      0.43      1687

avg / total       0.96      0.96      0.

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00     31915
          1       0.00      0.00      0.00        84

avg / total       0.99      1.00      1.00     31999

[[31915     0]
 [   84     0]]
0.971154965198
########################################

########### insult ####################

0.5
(12668,) (12668,)
             precision    recall  f1-score   support

          0       0.96      1.00      0.98     30456
          1       0.91      0.20      0.33      1543

avg / total       0.96      0.96      0.95     31999

[[30424    32]
 [ 1236   307]]
0.967242725862
########################################

########### identity_hate ####################

0.5
(2250,) (2250,)
             precision    recall  f1-score   support

          0       0.99      1.00      1.00     31719
          1       0.00      0.00      0.00       280

avg / total       0.98      0.99      0.99     31999

[[31719     0]
 [  280     0]]
0.966452227822
##

  'precision', 'predicted', average, warn_for)


In [13]:
# Load the comments that will be scored for the submission file.
submission = pd.read_csv("./input/test.csv")

In [14]:
# Flatten the comment text column into a 1-D array for the pipeline.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
X_submission = submission[['comment_text']].values.reshape((-1,))

In [15]:
# Create a model on the full data set and score the submission comments.
# NOTE(review): `build_balanced_classifier` is not defined in the visible part
# of this notebook; confirm it is defined before a fresh Restart & Run All.
results = {}
# Iterate over the shared LABELS constant rather than a duplicated literal list.
for label in LABELS:
    print("########### %s ####################\n"%(label))
    y_train_bal, X_train_bal, y_test, X_test = build_balanced_classifier(train = data, test = None, label = label)
    clf = text_clf.fit(X=X_train_bal, y=y_train_bal)
    y_ = clf.predict_proba(X_submission)
    # Keep only column 1 of the predicted probabilities for this label.
    results[label] = y_[:,1]

########### toxic ####################

0.5
(30588,) (30588,)
########### severe_toxic ####################

0.5
(3190,) (3190,)
########### obscene ####################

0.5
(16898,) (16898,)
########### threat ####################

0.5
(956,) (956,)
########### insult ####################

0.5
(15754,) (15754,)
########### identity_hate ####################

0.5
(2810,) (2810,)


In [16]:
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Add one column per label to the submission frame, filled from `results`.
for label in LABELS:
    submission[label] = results[label]

In [17]:
# Preview the first three scored rows, then display their full comment text.
print(submission[0:3])
# `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
submission[0:3]['comment_text'].values

                 id                                       comment_text  \
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...   
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...   
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...   

      toxic  severe_toxic   obscene    threat    insult  identity_hate  
0  0.674154      0.034076  0.491013  0.005864  0.391845       0.023581  
1  0.057150      0.008737  0.034132  0.004264  0.034122       0.008764  
2  0.069053      0.008961  0.037025  0.004286  0.036915       0.008922  


array([ "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",
       '== From RfC == \n\n The title is fine as it is, IMO.',
       '" \n\n == Sources == \n\n * Zawe Ashton on Lapland —  /  "'], dtype=object)

In [18]:
# Columns to export: the row id followed by one probability column per label.
headers = ["id"]+LABELS
print(headers)
# Display the first rows restricted to the export columns.
submission[headers].head()

['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.674154,0.034076,0.491013,0.005864,0.391845,0.023581
1,0000247867823ef7,0.05715,0.008737,0.034132,0.004264,0.034122,0.008764
2,00013b17ad220c46,0.069053,0.008961,0.037025,0.004286,0.036915,0.008922
3,00017563c3f7919a,0.041036,0.007786,0.026333,0.003927,0.026221,0.007497
4,00017695ad8997eb,0.081241,0.007832,0.038195,0.004001,0.036422,0.007717


In [19]:
import time

# Build a file name from the current Unix time in whole seconds, then write
# only the export columns, without the index.
submission_path = "./input/submissions-%s.csv"%str(int(time.time()))
submission[headers].to_csv(submission_path, index=False)