In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
import sklearn
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
np.random.seed(42)

In [2]:
# Load the training CSV; the label columns named below are read from it later.
data = pd.read_csv("./input/train.csv")

# Target columns. A separate classifier is fitted for each of them.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [3]:
# Random row-wise split: a row goes to `train` when its uniform draw is below
# 0.8 and to `test` otherwise (roughly 80/20, driven by the global NumPy seed).
msk = np.random.rand(len(data)) < 0.8
train, test = data[msk], data[~msk]

for split in (train, test):
    print(split.shape)

(127572, 8)
(31999, 8)


## Create Baseline ROC Score

In [4]:
def get_y_vec(df, column_name):
    """Return one column of ``df`` as a flat, 1-D NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    column_name : str
        Name of the column to extract (here: one of the label columns).

    Returns
    -------
    numpy.ndarray
        The column values with shape ``(len(df),)``.
    """
    # `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0.
    # Selecting the Series and taking `.values` gives the same flat array on
    # both old and current pandas, so no reshape is needed.
    return df[column_name].values

In [9]:
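# Baseline classifier: logistic regression on character n-gram TF-IDF features
# (not naive Bayes -- MultinomialNB is imported below but not used in the pipeline).
# Score recorded with C = 1.0: 0.978368974747 (presumably the mean ROC AUC -- TODO confirm)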
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
# Reference pipeline: character n-gram TF-IDF features feeding a logistic
# regression. The cells below build the same two steps separately so the
# TF-IDF matrix can be computed once and reused for every label.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(
        min_df=4,            # drop n-grams seen in fewer than 4 documents
        sublinear_tf=True,   # use 1 + log(tf) instead of the raw term frequency
        analyzer='char',
        ngram_range=(1,5),
        # `stop_words` is deliberately not passed: scikit-learn only applies it
        # when analyzer == 'word', so with the char analyzer it had no effect
        # (and triggers an "unused parameter" warning on newer versions).
        lowercase=True,
        binary=False
    )),
    ('clf', LogisticRegression())
])

In [11]:
# Vectorizer and classifier used for the hold-out evaluation.
tfidf = TfidfVectorizer(
    min_df=4,            # drop n-grams seen in fewer than 4 documents
    sublinear_tf=True,   # use 1 + log(tf) instead of the raw term frequency
    analyzer='char',
    ngram_range=(1,5),
    # `stop_words` is deliberately not passed: scikit-learn only applies it
    # when analyzer == 'word', so with the char analyzer it had no effect.
    lowercase=True,
    binary=False
)
lr = LogisticRegression()


In [12]:
# Raw comment strings of the training split as a flat 1-D array.
# (`DataFrame.as_matrix()` was removed in pandas 1.0; `.values` on the Series
# yields the same array without the reshape.)
X_train = train['comment_text'].values
# Fit the vectorizer on the training split only, so the held-out rows do not
# influence the learned vocabulary or IDF weights.
tfidf.fit(X_train)

In [17]:
# Raw comment strings of the held-out split as a flat 1-D array.
# (`DataFrame.as_matrix()` was removed in pandas 1.0; `.values` replaces it.)
X_test = test['comment_text'].values

# Project both splits with the vectorizer that was fitted on the training split.
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [24]:
# Hold-out evaluation: fit the shared `lr` estimator once per label on the
# training split and report metrics on the test split.
roc_scores = []
for label in LABELS:
    print("########### %s ####################\n" % label)

    y_train = get_y_vec(train, label)
    y_test = get_y_vec(test, label)

    # `fit` returns the estimator, so `clf` is the same object as `lr`.
    clf = lr.fit(X_train_tfidf, y_train)
    y_ = clf.predict_proba(X_test_tfidf)

    # Column 1 of predict_proba is the probability of the second class
    # (label == 1 for 0/1 targets); threshold it at 0.5 for the hard metrics.
    positive_proba = y_[:, 1]
    y_bin = positive_proba > 0.5
    print(classification_report(y_test, y_bin))
    print(confusion_matrix(y_test, y_bin))

    # ROC AUC is computed on the raw probabilities, not on the thresholded labels.
    roc = roc_auc_score(y_test, positive_proba)
    print(roc)
    roc_scores.append(roc)
    print("########################################\n")

# Unweighted mean of the per-label ROC AUC scores.
print(sum(roc_scores) / len(roc_scores))

########### toxic ####################

             precision    recall  f1-score   support

          0       0.96      1.00      0.98     28973
          1       0.93      0.61      0.74      3026

avg / total       0.96      0.96      0.96     31999

[[28842   131]
 [ 1182  1844]]
0.975875207469
########################################

########### severe_toxic ####################

             precision    recall  f1-score   support

          0       0.99      1.00      1.00     31682
          1       0.62      0.22      0.33       317

avg / total       0.99      0.99      0.99     31999

[[31639    43]
 [  247    70]]
0.99029760851
########################################

########### obscene ####################

             precision    recall  f1-score   support

          0       0.98      1.00      0.99     30312
          1       0.93      0.63      0.75      1687

avg / total       0.98      0.98      0.98     31999

[[30236    76]
 [  629  1058]]
0.988338215966
#####

In [25]:
# Load the test CSV whose rows are scored below; the predicted label columns
# are added to this frame in a later cell.
submission = pd.read_csv("./input/test.csv")

In [29]:
# Fresh vectorizer and classifier for the final model, which is fitted on the
# full training data rather than on the 80% split.
tfidf2 = TfidfVectorizer(
    min_df=4,            # drop n-grams seen in fewer than 4 documents
    sublinear_tf=True,   # use 1 + log(tf) instead of the raw term frequency
    analyzer='char',
    ngram_range=(1,5),
    # `stop_words` is deliberately not passed: scikit-learn only applies it
    # when analyzer == 'word', so with the char analyzer it had no effect.
    lowercase=True,
    binary=False
)
lr2 = LogisticRegression()

In [30]:
# All training comments as a flat 1-D array of strings.
# (`DataFrame.as_matrix()` was removed in pandas 1.0; `.values` replaces it.)
#
# NOTE: this rebinds `X_train` / `X_train_tfidf` from the 80% split to the full
# data set. Re-running the hold-out evaluation cell after this one would pair
# these features with the split's labels, so run the notebook top to bottom.
X_train = data['comment_text'].values
X_train_tfidf = tfidf2.fit_transform(X_train)

In [31]:
# Comments to score, as a flat 1-D array of strings.
# (`DataFrame.as_matrix()` was removed in pandas 1.0; `.values` replaces it.)
X_submission = submission['comment_text'].values
# Reuse the vectorizer fitted on the full training data so both matrices share
# the same feature columns.
X_submission_tfidf = tfidf2.transform(X_submission)

In [34]:
# Show both matrix shapes; the second dimension (feature count) should match
# because both matrices were produced by `tfidf2`.
print(X_train_tfidf.shape, X_submission_tfidf.shape)

(159571, 611967) (153164, 611967)


In [35]:
# Create a model on the full data set: refit `lr2` once per label and keep the
# predicted probabilities for the submission rows.
results = {}
# Iterate over the shared LABELS constant instead of a second hard-coded copy
# of the same list, so the two cannot drift apart.
for label in LABELS:
    print("########### %s ####################\n"%(label))
    y_train = get_y_vec(df = data, column_name=label)
    clf = lr2.fit(X=X_train_tfidf, y=y_train)
    y_ = clf.predict_proba(X_submission_tfidf)
    # Column 1 of predict_proba is the probability of the second class
    # (label == 1 for 0/1 targets).
    results[label] = y_[:,1]

########### toxic ####################

########### severe_toxic ####################

########### obscene ####################

########### threat ####################

########### insult ####################

########### identity_hate ####################



In [36]:
# Attach one predicted-probability column per label to the submission frame.
# LABELS is the constant declared next to the training-data load; it is not
# redefined here, so the columns always match the labels declared up front.
for label in LABELS:
    submission[label] = results[label]

In [37]:
# Spot-check the first three scored rows: the frame summary first, then the
# full (untruncated) comment texts as the cell's displayed value.
print(submission.head(3))
# `DataFrame.as_matrix()` / `Series.as_matrix()` were removed in pandas 1.0;
# `.values` returns the same NumPy array.
submission.head(3)['comment_text'].values

                 id                                       comment_text  \
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...   
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...   
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...   

      toxic  severe_toxic   obscene    threat    insult  identity_hate  
0  0.999281      0.180704  0.998248  0.022054  0.969670       0.194899  
1  0.024184      0.005031  0.015103  0.001195  0.011767       0.005081  
2  0.037859      0.008330  0.023222  0.001970  0.011978       0.004530  


array([ "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",
       '== From RfC == \n\n The title is fine as it is, IMO.',
       '" \n\n == Sources == \n\n * Zawe Ashton on Lapland —  /  "'], dtype=object)

In [38]:
# Columns written to the submission file: the row id followed by one
# probability column per label.
headers = ["id", *LABELS]
print(headers)
submission[headers].head()

['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999281,0.180704,0.998248,0.022054,0.96967,0.194899
1,0000247867823ef7,0.024184,0.005031,0.015103,0.001195,0.011767,0.005081
2,00013b17ad220c46,0.037859,0.00833,0.023222,0.00197,0.011978,0.00453
3,00017563c3f7919a,0.005044,0.001354,0.003243,0.001022,0.003939,0.000857
4,00017695ad8997eb,0.02922,0.0016,0.009521,0.001065,0.010405,0.001771


In [39]:
import time

# Suffix the file name with the current Unix time (whole seconds) so repeated
# runs do not overwrite earlier submission files.
timestamp = int(time.time())
submission[headers].to_csv("./input/submissions-%s.csv" % timestamp, index=False)