In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
import sklearn
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
np.random.seed(42)

In [2]:
data = pd.read_csv("./input/train.csv")
print(check_output(["ls", "./input"]).decode("utf8"))
print(data[['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']].sum())
print(data[['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']].mean())

sample_submission.csv
sample_submission.csv.zip
test.csv.zip
train.csv
train.csv.zip

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64
toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64


In [3]:
# Create train and Test Datasets
msk = np.random.rand(len(data)) < 0.8
train = data[msk]
test = data[~msk]

print(train.shape)
print(test.shape)

(127572, 8)
(31999, 8)


In [4]:
# Check that mean of th train and test set is comparable to the global population
print(train[['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']].sum())
print(train[['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']].mean())
print(test[['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']].sum())
print(test[['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']].mean())

toxic            12268
severe_toxic      1278
obscene           6762
threat             394
insult            6334
identity_hate     1125
dtype: int64
toxic            0.096165
severe_toxic     0.010018
obscene          0.053005
threat           0.003088
insult           0.049650
identity_hate    0.008819
dtype: float64
toxic            3026
severe_toxic      317
obscene          1687
threat             84
insult           1543
identity_hate     280
dtype: int64
toxic            0.094565
severe_toxic     0.009907
obscene          0.052720
threat           0.002625
insult           0.048220
identity_hate    0.008750
dtype: float64


## Create Baseline ROC Score

In [5]:
def get_y_vec(df, column_name):
    return df[[column_name]].as_matrix().reshape((-1,))

In [6]:
def compute_roc_score(df=None, column_name=None, prob_vec=None):
    y = get_y_vec(df, column_name)
    return roc_auc_score(y, prob_vec)

In [7]:
#random guessing gets you a ROC of 0.5
print('ROC Score:', compute_roc_score(test, 'toxic', np.random.random((31999,))))
print((get_y_vec(test, 'severe_toxic') == np.zeros((31999,))).mean())


ROC Score: 0.506340337971
0.99009344042


In [8]:
#naive bayes classifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [104]:
# Build function to get balanced samples
def build_balanced_classifier(train, test = None, label = None):
    y_ = get_y_vec(df = train, column_name=label)
    pos_cases = (y_ == 1)
    num_pos_cases = pos_cases.sum()
    idx_desc = np.argsort(-y_) #sort indices high to low so positive cases are at the front
    balance_samples = idx_desc[:2*num_pos_cases] #extract 2x the num pos cases so now 50/50 pos/neg
    print(y_[balance_samples].mean()) #confirm probability
    y_train_bal = y_[balance_samples]
    X_train_bal = train[['comment_text']].as_matrix().reshape((-1,))
    X_train_bal = X_train_bal[balance_samples]
    print(y_train_bal.shape, X_train_bal.shape)
    
    y_test, X_test = None, None
    if test is not None:
        y_test = get_y_vec(df = test, column_name=label)
        X_test = test[['comment_text']].as_matrix().reshape((-1,))    
    return y_train_bal, X_train_bal, y_test, X_test

In [147]:
roc_scores = []
for label in ['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']:
    print("########### %s ####################\n"%(label))
    y_train_bal, X_train_bal, y_test, X_test = build_balanced_classifier(train, test, label)
    clf = text_clf.fit(X=X_train_bal, y=y_train_bal)
    y_ = clf.predict_proba(X_test)
    y_bin = y_[:,1] > 0.5
    print(classification_report(y_test, y_bin))
    print(confusion_matrix(y_test, y_bin))
    roc = roc_auc_score(y_test, y_[:, 1])
    print(roc)
    roc_scores.append(roc)
    print("########################################\n")
print(sum(roc_scores)/len(roc_scores))

########### toxic ####################

0.5
(24536,) (24536,)
             precision    recall  f1-score   support

          0       0.98      0.91      0.94     28973
          1       0.49      0.86      0.62      3026

avg / total       0.94      0.90      0.91     31999

[[26226  2747]
 [  432  2594]]
0.948275195205
########################################

########### severe_toxic ####################

0.5
(2556,) (2556,)
             precision    recall  f1-score   support

          0       1.00      0.94      0.97     31682
          1       0.13      0.94      0.24       317

avg / total       0.99      0.94      0.96     31999

[[29773  1909]
 [   20   297]]
0.978573748551
########################################

########### obscene ####################

0.5
(13524,) (13524,)
             precision    recall  f1-score   support

          0       0.99      0.91      0.95     30312
          1       0.35      0.86      0.50      1687

avg / total       0.96      0.91      0.

In [111]:
submission = pd.read_csv("./input/test.csv")

In [112]:
X_submission = submission[['comment_text']].as_matrix().reshape((-1,))   

In [114]:
#Create a model on full data set:
results = {}
for label in ['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']:
    print("########### %s ####################\n"%(label))
    y_train_bal, X_train_bal, y_test, X_test = build_balanced_classifier(train = data, test = None, label = label)
    clf = text_clf.fit(X=X_train_bal, y=y_train_bal)
    y_ = clf.predict_proba(X_submission)
    results[label] = y_[:,1]

########### toxic ####################

0.5
(30588,) (30588,)
########### severe_toxic ####################

0.5
(3190,) (3190,)
########### obscene ####################

0.5
(16898,) (16898,)
########### threat ####################

0.5
(956,) (956,)
########### insult ####################

0.5
(15754,) (15754,)
########### identity_hate ####################

0.5
(2810,) (2810,)


In [138]:
LABELS = ['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']
for  label in LABELS:
    submission[label] = results[label]

In [139]:
print(submission[0:3])
submission[0:3]['comment_text'].as_matrix()

                 id                                       comment_text  \
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...   
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...   
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...   

      toxic  severe_toxic   obscene    threat    insult  identity_hate  
0  0.996461      0.930104  0.991982  0.886851  0.988576       0.958470  
1  0.098059      0.119510  0.091759  0.190254  0.098779       0.179650  
2  0.214485      0.122433  0.182039  0.221707  0.185183       0.207432  


array([ "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",
       '== From RfC == \n\n The title is fine as it is, IMO.',
       '" \n\n == Sources == \n\n * Zawe Ashton on Lapland —  /  "'], dtype=object)

In [144]:
headers = ["id"]+LABELS
print(headers)
submission[headers].head()

['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996461,0.930104,0.991982,0.886851,0.988576,0.95847
1,0000247867823ef7,0.098059,0.11951,0.091759,0.190254,0.098779,0.17965
2,00013b17ad220c46,0.214485,0.122433,0.182039,0.221707,0.185183,0.207432
3,00017563c3f7919a,0.064713,0.076143,0.067159,0.170788,0.070593,0.076639
4,00017695ad8997eb,0.245505,0.200181,0.374682,0.266965,0.363224,0.199277


In [146]:
submission[headers].to_csv("./input/submissions.csv", index=False)