In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
import sklearn
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# Seed NumPy's global RNG so the np.random.rand-based train/test mask built
# later in the notebook is the same on every run.
np.random.seed(42)

In [2]:
# The six target columns; each one is modelled as its own binary problem
# in the loops further down.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Full training table read from disk (comment text plus the label columns).
data = pd.read_csv("./input/train.csv")

In [3]:
# Hold-out split: draw one uniform random number per row and keep the rows
# whose draw is below 0.8 for training (about 80%); the rest become the test set.
msk = np.random.rand(len(data)) < 0.8
train, test = data[msk], data[~msk]

# Report the resulting (rows, columns) of each split.
print(train.shape)
print(test.shape)

(127572, 8)
(31999, 8)


## Create Baseline ROC Score

In [4]:
def get_y_vec(df, column_name):
    """Return a single column of ``df`` as a flat, 1-D NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    column_name : str
        Name of the column to extract.

    Returns
    -------
    numpy.ndarray
        The column's values with shape ``(len(df),)``.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    # ``DataFrame.as_matrix`` was deprecated in pandas 0.23 and removed in 1.0;
    # ``to_numpy`` is the supported replacement. The reshape flattens the
    # (n, 1) array produced by the single-column selection.
    return df[[column_name]].to_numpy().reshape((-1,))

In [5]:
#naive bayes classifier
#with C = 1.0 0.978368974747
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
# text_clf = Pipeline([
#     ('tfidf', TfidfVectorizer(
#         min_df=0.01,
#         max_df=0.99,
#         sublinear_tf=True,
#         analyzer='char',
#         ngram_range=(1,5), 
#         stop_words='english', 
#         lowercase=True, 
#         binary=False
#     )),
#     #('clf', SGDClassifier(loss='log', n_jobs=-1, max_iter = 1000, tol = 1e-3)),
#     ('clf', LogisticRegression())
# ])

In [16]:
# Settings shared by both vectorizers. Float min_df/max_df are document
# proportions: keep n-grams present in at least 0.01% and at most 50% of the
# documents. sublinear_tf replaces tf with 1 + log(tf).
_tfidf_common = dict(
    min_df=0.0001,
    max_df=0.50,
    sublinear_tf=True,
    lowercase=True,
    binary=False,
)

# Character-level n-grams of length 1 through 5.
char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    **_tfidf_common
)

# Word-level unigrams and bigrams with English stop words removed.
word_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    **_tfidf_common
)

# One logistic-regression estimator with default hyper-parameters; the loops
# below refit this same object once per label.
lr = LogisticRegression()


In [17]:
from scipy.sparse import hstack

In [18]:
# Raw comment strings of the training split as a 1-D array.
# (``DataFrame.as_matrix`` was removed in pandas 1.0; ``to_numpy`` replaces it.)
X_train = train['comment_text'].to_numpy()
# Fit each vectorizer on the training split only, so the vocabulary and IDF
# weights are learned without seeing the hold-out rows.
X_train_char_tfidf = char_tfidf.fit_transform(X_train)
print("finished char")
X_train_word_tfidf = word_tfidf.fit_transform(X_train)
# Place the char-level and word-level feature matrices side by side.
X_train_out = hstack([X_train_char_tfidf, X_train_word_tfidf])

finished char


ValueError: blocks must be 2-D

In [23]:
# Combine the char-level and word-level TF-IDF matrices column-wise
# (char features first, then word features).
X_train_out = hstack([X_train_char_tfidf, X_train_word_tfidf])

In [24]:
# Sanity check: (rows in the training split, char features + word features).
X_train_out.shape

(127572, 288261)

In [25]:
# Raw comment strings of the hold-out split as a 1-D array.
# (``DataFrame.as_matrix`` was removed in pandas 1.0; ``to_numpy`` replaces it.)
X_test = test['comment_text'].to_numpy()

# Only ``transform`` here: reuse the vocabulary and IDF weights already fitted
# on the training split.
X_test_word_tfidf = word_tfidf.transform(X_test)
X_test_char_tfidf = char_tfidf.transform(X_test)
# Column order must match X_train_out: char features first, then word features.
X_test_out = hstack([X_test_char_tfidf, X_test_word_tfidf])
X_test_out.shape

(31999, 288261)

In [26]:
# Hold-out evaluation: refit the shared logistic regression once per label,
# print per-label diagnostics, and collect each label's ROC AUC.
roc_scores = []
for label in LABELS:
    print("########### %s ####################\n" % label)

    y_train = get_y_vec(train, label)
    y_test = get_y_vec(test, label)

    # ``fit`` returns the estimator itself, so ``clf`` is the same object as ``lr``.
    clf = lr.fit(X_train_out, y_train)
    y_ = clf.predict_proba(X_test_out)
    # Second predict_proba column, i.e. the probability of clf.classes_[1].
    pos_proba = y_[:, 1]
    # Hard predictions at a 0.5 threshold, used only for the report and matrix.
    y_bin = pos_proba > 0.5

    print(classification_report(y_test, y_bin))
    print(confusion_matrix(y_test, y_bin))

    # ROC AUC is computed from the probabilities, not the thresholded labels.
    roc = roc_auc_score(y_test, pos_proba)
    print(roc)
    roc_scores.append(roc)
    print("########################################\n")

# Unweighted mean of the per-label ROC AUC values.
print(sum(roc_scores) / len(roc_scores))

########### toxic ####################

             precision    recall  f1-score   support

          0       0.97      0.99      0.98     28973
          1       0.91      0.68      0.78      3026

avg / total       0.96      0.96      0.96     31999

[[28776   197]
 [  980  2046]]
0.979954375098
########################################

########### severe_toxic ####################

             precision    recall  f1-score   support

          0       0.99      1.00      1.00     31682
          1       0.61      0.28      0.39       317

avg / total       0.99      0.99      0.99     31999

[[31624    58]
 [  227    90]]
0.989975898106
########################################

########### obscene ####################

             precision    recall  f1-score   support

          0       0.98      1.00      0.99     30312
          1       0.91      0.68      0.78      1687

avg / total       0.98      0.98      0.98     31999

[[30204   108]
 [  539  1148]]
0.991161237495
####

In [27]:
# Load the rows to be scored; per-label prediction columns are added to this
# frame further down before it is written out.
submission = pd.read_csv("./input/test.csv")

In [34]:
# Refit both vectorizers on the FULL labelled table (training + hold-out rows)
# for the final submission model.
# NOTE: this overwrites X_train / X_train_*_tfidf / X_train_out from the
# evaluation cells, so the hold-out evaluation loop cannot be re-run after this
# cell without first re-running the earlier feature cells.
# (``DataFrame.as_matrix`` was removed in pandas 1.0; ``to_numpy`` replaces it.)
X_train = data['comment_text'].to_numpy()
X_train_char_tfidf = char_tfidf.fit_transform(X_train)
print("finished char")
X_train_word_tfidf = word_tfidf.fit_transform(X_train)
print("finished word")
# Char features first, then word features.
X_train_out = hstack([X_train_char_tfidf, X_train_word_tfidf])
X_train_out.shape

finished char
finished word


(159571, 289060)

In [35]:
# Raw comment strings of the rows to score as a 1-D array.
# (``DataFrame.as_matrix`` was removed in pandas 1.0; ``to_numpy`` replaces it.)
X_submission = submission['comment_text'].to_numpy()
# Only ``transform``: reuse the vocabulary and IDF weights fitted on the full
# labelled table so the columns line up with X_train_out.
X_submission_char_tfidf = char_tfidf.transform(X_submission)
print("finished char")
X_submission_word_tfidf = word_tfidf.transform(X_submission)
print("finished word")
# Same column order as X_train_out: char features first, then word features.
X_submission_out = hstack([X_submission_char_tfidf, X_submission_word_tfidf])
X_submission_out.shape

finished char
finished word


(153164, 289060)

In [37]:
# Create a model on the full data set, one binary fit per label, and score the
# submission rows. ``lr`` is refit in place on every iteration, so only the
# predicted probabilities are kept; the fitted coefficients for a label are
# overwritten when the next label is trained.
results = {}
# Iterate the shared LABELS constant rather than repeating the literal list,
# so the label set cannot drift between cells.
for label in LABELS:
    print("########### %s ####################\n"%(label))
    y_train = get_y_vec(df = data, column_name=label)
    clf = lr.fit(X=X_train_out, y=y_train)
    y_ = clf.predict_proba(X_submission_out)
    # Second predict_proba column, i.e. the probability of clf.classes_[1].
    results[label] = y_[:,1]

########### toxic ####################

########### severe_toxic ####################

########### obscene ####################

########### threat ####################

########### insult ####################

########### identity_hate ####################



In [38]:
LABELS = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]
# Attach each label's predicted probabilities to the submission frame as a
# column named after the label.
for label in LABELS:
    submission[label] = results[label]

In [39]:
# Spot check: print the first three scored rows, then show their full,
# untruncated comment text to compare against the predicted probabilities.
print(submission[0:3])
# ``Series.as_matrix`` was removed in pandas 1.0; ``to_numpy`` replaces it.
submission[0:3]['comment_text'].to_numpy()

                 id                                       comment_text  \
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...   
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...   
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...   

      toxic  severe_toxic   obscene    threat    insult  identity_hate  
0  0.999856      0.241858  0.999507  0.033102  0.978433       0.344077  
1  0.008738      0.003096  0.004654  0.000889  0.006233       0.002726  
2  0.020692      0.002995  0.012085  0.000682  0.006730       0.001924  


array([ "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",
       '== From RfC == \n\n The title is fine as it is, IMO.',
       '" \n\n == Sources == \n\n * Zawe Ashton on Lapland â€”  /  "'], dtype=object)

In [40]:
# Output column order: the row id followed by one probability column per label.
headers = ["id"]
headers.extend(LABELS)
print(headers)
# Preview only the columns that will be written out.
submission[headers].head()

['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999856,0.241858,0.999507,0.033102,0.978433,0.344077
1,0000247867823ef7,0.008738,0.003096,0.004654,0.000889,0.006233,0.002726
2,00013b17ad220c46,0.020692,0.002995,0.012085,0.000682,0.00673,0.001924
3,00017563c3f7919a,0.004155,0.001605,0.002402,0.000891,0.002894,0.000579
4,00017695ad8997eb,0.02237,0.001843,0.006253,0.001035,0.01127,0.00156


In [41]:
import time

# Suffix the file name with the current Unix time in whole seconds so that
# runs made at different times write to different files.
stamp = str(int(time.time()))
submission[headers].to_csv("./input/submissions-%s.csv" % stamp, index=False)