In [16]:
import numpy as np
import pandas as pd
from sklearn import *
from textblob import TextBlob # for sentiment analysis


In [17]:
# Names of the six target label columns.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [18]:
# Directory that holds the CSV files; defined once so the location can be
# changed in a single place instead of in every read_csv call.
DATA_DIR = 'jigsaw-toxic-comment-classification-challenge'

# Labelled comments, unlabelled comments, the labels for the unlabelled
# comments, and the sample submission file.
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')
test_labels = pd.read_csv(DATA_DIR + '/test_labels.csv')
sample_submission = pd.read_csv(DATA_DIR + '/sample_submission.csv')

In [19]:
# Quick look at train: dimensions and column names on stdout, then the
# first three rows as the cell's displayed value.
print(train.shape, train.columns, sep='\n')
train.head(3)

(159571, 8)
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [20]:
# Column sums of the label columns; the train labels are 0/1, so each sum is
# the number of rows where that label is set.
train[LABELS].sum()

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [21]:
# Summary statistics for train; the output below covers the six label columns.
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Summary of test; the output below reports count/unique/top/freq for the
# id and comment_text columns.
test.describe()

Unnamed: 0,id,comment_text
count,153164,153164
unique,153164,153164
top,49932200b816730c,""" \n ::I agree with Indian, I don't think Wiki..."
freq,1,1


In [23]:
from collections import Counter

In [24]:
# Distribution of how many labels are set on each training row, printed as a
# share of all training rows.
labels_per_row = train[LABELS].sum(axis=1)
for n_labels, n_rows in Counter(labels_per_row).items():
    print(n_labels, '{:0.2%}'.format(n_rows / len(train)))

0 89.83%
4 1.10%
1 3.99%
3 2.64%
2 2.18%
5 0.24%
6 0.02%


In [25]:
# In test_labels a value of -1 marks a label that is not used for scoring.
# A row whose six labels are all -1 therefore sums to -6 and shows up as its
# own bucket in the distribution printed here.
row_sums = test_labels[LABELS].sum(axis=1)
for row_sum, n_rows in Counter(row_sums).items():
    print(row_sum, '{:0.2%}'.format(n_rows / len(test_labels)))

-6 58.23%
0 37.69%
1 1.20%
4 0.40%
3 1.36%
2 1.00%
5 0.11%
6 0.01%


In [26]:
# Pool every label cell of train into one flat array and tally the values.
Counter(train[LABELS].values.ravel())

Counter({0: 922328, 1: 35098})

In [27]:
# Pool every label cell of test_labels into one flat array and tally the
# values (including the -1 entries).
Counter(test_labels[LABELS].values.ravel())

Counter({-1: 535116, 0: 369370, 1: 14498})

In [28]:
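# Observations so far:
# - multilabel: a single comment can carry several labels at once
# - imbalanced: positives are rare for every label, and most rows have no label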

In [29]:
# For each label: share of training rows where it is set, plus the raw
# value counts.
for label in LABELS:
    counts = Counter(train[label])
    positive_share = counts[1] / sum(counts.values())
    print(label, '{:.2%}'.format(positive_share), counts)

toxic 9.58% Counter({0: 144277, 1: 15294})
severe_toxic 1.00% Counter({0: 157976, 1: 1595})
obscene 5.29% Counter({0: 151122, 1: 8449})
threat 0.30% Counter({0: 159093, 1: 478})
insult 4.94% Counter({0: 151694, 1: 7877})
identity_hate 0.88% Counter({0: 158166, 1: 1405})


In [31]:
# Among the rows where each label is set, what share is also marked 'toxic'?
for label in LABELS:
    toxic_flags = train.loc[train[label] == 1, 'toxic']
    counts = Counter(toxic_flags)
    print(label, '{:.2%}'.format(counts[1] / sum(counts.values())), counts)

toxic 100.00% Counter({1: 15294})
severe_toxic 100.00% Counter({1: 1595})
obscene 93.81% Counter({1: 7926, 0: 523})
threat 93.93% Counter({1: 449, 0: 29})
insult 93.23% Counter({1: 7344, 0: 533})
identity_hate 92.67% Counter({1: 1302, 0: 103})


In [35]:
# Pairwise correlation matrix of the label columns (DataFrame.corr with its
# default method).
train[LABELS].corr()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
toxic,1.0,0.308619,0.676515,0.157058,0.647518,0.266009
severe_toxic,0.308619,1.0,0.403014,0.123601,0.375807,0.2016
obscene,0.676515,0.403014,1.0,0.141179,0.741272,0.286867
threat,0.157058,0.123601,0.141179,1.0,0.150022,0.115128
insult,0.647518,0.375807,0.741272,0.150022,1.0,0.337736
identity_hate,0.266009,0.2016,0.286867,0.115128,0.337736,1.0


In [36]:
# For every ordered pair (i, j) print the share of rows with j == 1 that also
# have i == 1.
# The row filter depends only on j, so each subset is built once up front
# rather than re-filtering train for every (i, j) pair.
rows_with = {j: train[train[j] == 1] for j in LABELS}
for i in LABELS:
    for j in LABELS:
        c = Counter(rows_with[j][i])
        print('%{} when {}==1 {:.2%}'.format(i, j, c[1]/sum(c.values())))

%toxic when toxic==1 100.00%
%toxic when severe_toxic==1 100.00%
%toxic when obscene==1 93.81%
%toxic when threat==1 93.93%
%toxic when insult==1 93.23%
%toxic when identity_hate==1 92.67%
%severe_toxic when toxic==1 10.43%
%severe_toxic when severe_toxic==1 100.00%
%severe_toxic when obscene==1 17.95%
%severe_toxic when threat==1 23.43%
%severe_toxic when insult==1 17.41%
%severe_toxic when identity_hate==1 22.28%
%obscene when toxic==1 51.82%
%obscene when severe_toxic==1 95.11%
%obscene when obscene==1 100.00%
%obscene when threat==1 62.97%
%obscene when insult==1 78.14%
%obscene when identity_hate==1 73.45%
%threat when toxic==1 2.94%
%threat when severe_toxic==1 7.02%
%threat when obscene==1 3.56%
%threat when threat==1 100.00%
%threat when insult==1 3.90%
%threat when identity_hate==1 6.98%
%insult when toxic==1 48.02%
%insult when severe_toxic==1 85.96%
%insult when obscene==1 72.85%
%insult when threat==1 64.23%
%insult when insult==1 100.00%
%insult when identity_hate==1 82.56%

In [32]:
# Quick look at test: dimensions and column names on stdout, then the first
# three rows as the cell's displayed value.
print(test.shape, test.columns, sep='\n')
test.head(3)

(153164, 2)
Index(['id', 'comment_text'], dtype='object')


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."


In [33]:
# Quick look at test_labels: dimensions and column names on stdout, then the
# first three rows as the cell's displayed value.
print(test_labels.shape, test_labels.columns, sep='\n')
test_labels.head(3)

(153164, 7)
Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1


In [34]:
# Quick look at sample_submission: dimensions and column names on stdout,
# then the first three rows as the cell's displayed value.
print(sample_submission.shape, sample_submission.columns, sep='\n')
sample_submission.head(3)

(153164, 7)
Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
