In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter("ignore", UserWarning)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Target label columns; one binary classifier is fitted per label later on.
columnList = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train = pd.read_csv('/kaggle/input/jigsaw-toxic-comments-dataset-merged/train.csv')
test = pd.read_csv('/kaggle/input/jigsaw-toxic-comments-dataset-merged/test.csv')

# Hold-out split on the row index: labels <= 100_000 stay in train, the rest
# become valid. .copy() turns both splits into independent frames, so the
# column assignments below do not write into a slice of the loaded frame
# (which triggers pandas' SettingWithCopyWarning).
train, valid = train[train.index <= 100_000].copy(), train[train.index > 100_000].copy()

# Replace missing comments with a placeholder token, then lower-case the text.
train['comment_text'] = train['comment_text'].fillna('__nocomment__').map(str.lower)
valid['comment_text'] = valid['comment_text'].fillna('__nocomment__').map(str.lower)
test['comment_text'] = test['comment_text'].fillna('__nocomment__').map(str.lower)

In [3]:
def customAuc(yActual, yPred):
    """Return the area under the ROC curve of scores ``yPred`` against labels ``yActual``.

    The ROC curve is computed with ``metrics.roc_curve`` and integrated with
    ``metrics.auc``; the thresholds returned by ``roc_curve`` are not used.
    """
    falsePosRate, truePosRate, _thresholds = metrics.roc_curve(yActual, yPred)
    return metrics.auc(falsePosRate, truePosRate)

In [4]:
# Comment column of each split, plus the three of them concatenated (original
# index labels are kept) for fitting the vectorizers.
train_text, valid_text, test_text = (
    split['comment_text'] for split in (train, valid, test)
)
all_text = pd.concat([train_text, valid_text, test_text])

In [5]:
# Word-level TF-IDF over 1- to 3-grams, capped at 20,000 features.
# The idf/tf switches are passed as real booleans: recent scikit-learn releases
# validate these parameters and reject the integer 1 used previously.
tfv = TfidfVectorizer(min_df=3, max_features=20000, strip_accents='unicode', analyzer='word',
                      token_pattern=r'\w{1,}', ngram_range=(1, 3), use_idf=True, smooth_idf=True,
                      sublinear_tf=True, stop_words='english')

# The vocabulary and idf weights are learned from train + valid + test text.
tfv.fit(all_text)
xTrainTfv = tfv.transform(train.comment_text.values)
xValidTfv = tfv.transform(valid.comment_text.values)
xTestTfv  = tfv.transform(test.comment_text.values)



In [6]:
# Character-level TF-IDF over 2- to 6-grams, capped at 50,000 features.
# token_pattern and stop_words are intentionally not passed: scikit-learn ignores
# both when analyzer != 'word' and only emits "will not be used" warnings for them.
# The idf/tf switches are real booleans, as parameter validation expects.
cfv = TfidfVectorizer(min_df=3, max_features=50000, strip_accents='unicode', analyzer='char',
                      ngram_range=(2, 6), use_idf=True, smooth_idf=True, sublinear_tf=True)

# The vocabulary and idf weights are learned from train + valid + test text.
cfv.fit(all_text)
xTrainCfv = cfv.transform(train.comment_text.values)
xValidCfv = cfv.transform(valid.comment_text.values)
xTestCfv  = cfv.transform(test.comment_text.values)

In [7]:
# Concatenate word-level and char-level features column-wise for each split.
# CSR is requested explicitly so the result is row-sliceable and the estimator
# does not have to convert the matrix on every fit / predict_proba call
# (depending on the scipy version, hstack may otherwise return COO).
xTrainStack = hstack([xTrainTfv, xTrainCfv], format='csr')
xValidStack  = hstack([xValidTfv, xValidCfv], format='csr')
xTestStack  = hstack([xTestTfv, xTestCfv], format='csr')

In [8]:
# A single estimator object is re-fitted once per label column.
clf = LogisticRegression(C=1, class_weight='balanced')

# Prediction buffers: one row per sample, one column per label in columnList.
predVal = np.zeros((len(valid), len(columnList)))
predTest = np.zeros((len(test), len(columnList)))

for c_idx, col in enumerate(columnList):
    print('Fitting ...', col)
    clf.fit(xTrainStack, train[col])

    # Keep the second column of predict_proba, i.e. the class listed second in
    # clf.classes_ (presumably label 1 -- confirm the labels are 0/1).
    predVal[:, c_idx], predTest[:, c_idx] = (
        clf.predict_proba(features)[:, 1] for features in (xValidStack, xTestStack)
    )

Fitting ... toxic
Fitting ... severe_toxic
Fitting ... obscene
Fitting ... threat
Fitting ... insult
Fitting ... identity_hate


In [9]:
# Preview: test ids next to the per-label predicted probabilities, joined on row index.
pd.merge(
    left=test[['id']],
    right=pd.DataFrame(predTest, columns=columnList),
    left_index=True,
    right_index=True,
)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999989,0.881433,0.999988,0.703426,0.999242,0.953537
1,0000247867823ef7,0.024911,0.006315,0.007862,0.001169,0.012996,0.009804
2,00013b17ad220c46,0.038929,0.007975,0.042651,0.002391,0.012949,0.006981
3,00017563c3f7919a,0.025064,0.006692,0.005446,0.001726,0.010945,0.001820
4,00017695ad8997eb,0.031819,0.006029,0.018180,0.001511,0.021916,0.001879
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.615508,0.002517,0.551615,0.001529,0.045145,0.003489
153160,fffd7a9a6eb32c16,0.090815,0.015404,0.039543,0.011907,0.092635,0.147441
153161,fffda9e8d6fafa9e,0.007223,0.002732,0.037384,0.001122,0.005850,0.004749
153162,fffe8f1340a79fc2,0.010916,0.001353,0.015759,0.002833,0.013602,0.048923


In [10]:
# Write the test-set predictions (id + one probability column per label) to disk.
pd.merge(
    left=test[['id']],
    right=pd.DataFrame(predTest, columns=columnList),
    left_index=True,
    right_index=True,
).to_csv('submissions.csv', index=False)

In [12]:
# Persist the validation-set predictions under their own file name. Writing them
# to 'submissions.csv' would overwrite the test-set submission saved by the cell above.
# reset_index() re-labels the validation rows from 0 so they line up with predVal's rows.
pd.merge(
    left=valid.reset_index().id.to_frame(),
    right=pd.DataFrame(predVal, columns=columnList),
    left_index=True,
    right_index=True,
).to_csv('valid_predictions.csv', index=False)

In [13]:
# Preview: validation ids next to the per-label predicted probabilities.
# The validation index is reset to 0..n-1 so it lines up with predVal's rows.
pd.merge(
    left=valid[['id']].reset_index(drop=True),
    right=pd.DataFrame(predVal, columns=columnList),
    left_index=True,
    right_index=True,
)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,17359f940f219872,0.017774,0.002258,0.013155,0.003512,0.005302,0.003336
1,173693e634e1a6cf,0.013506,0.004301,0.004685,0.001201,0.004076,0.004044
2,17375e3fff2d9d1a,0.059074,0.017080,0.014669,0.004560,0.094791,0.018011
3,173a0ca6d52dd01d,0.015409,0.002021,0.012450,0.001593,0.002421,0.015694
4,173a573b6011f226,0.033028,0.010775,0.016398,0.016151,0.031743,0.030341
...,...,...,...,...,...,...,...
59565,ffe987279560d7ff,0.101306,0.003382,0.034490,0.000852,0.035017,0.002574
59566,ffea4adeee384e90,0.862643,0.040218,0.156955,0.039451,0.144673,0.004782
59567,ffee36eab5c267c9,0.056790,0.001246,0.017186,0.001518,0.115756,0.012555
59568,fff125370e4aaaf3,0.049825,0.000711,0.013862,0.000957,0.016796,0.003203
