# This notebook is for basic exploration of the training data. Since this dataset does not have many feature columns, there aren't too many things to be done here.

## Basic imports we'll need

In [40]:
import re

import pandas as pd

## Read in the train data as a Pandas DataFrame and then find some basic info. In order to run this notebook, you'll need the train.csv file in the same directory

In [2]:
# Load the training set; the relative path is resolved against the working directory.
train_csv_path = 'train.csv'
train_data = pd.read_csv(train_csv_path)

In [3]:
# Print the frame's schema: column names, non-null counts, dtypes and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95851 entries, 0 to 95850
Data columns (total 8 columns):
id               95851 non-null int64
comment_text     95851 non-null object
toxic            95851 non-null int64
severe_toxic     95851 non-null int64
obscene          95851 non-null int64
threat           95851 non-null int64
insult           95851 non-null int64
identity_hate    95851 non-null int64
dtypes: int64(7), object(1)
memory usage: 5.9+ MB


In [4]:
# Preview the first five raw rows.
train_data.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [10]:
# Every column other than the identifier and the raw text is treated as a label.
# Run this before any derived feature columns are added to train_data, otherwise
# those columns would be picked up as labels too.
non_label_columns = ('id', 'comment_text')
label_columns = [
    column for column in train_data.columns
    if column not in non_label_columns
]

## Let's check out the distribution of the labels

In [11]:
# Print, for each label, the column sum divided by the number of rows, i.e. the
# share of comments carrying that label (assuming 0/1 indicator columns, as in the
# previewed rows). NOTE: the value is a fraction, despite the "Count for" wording.
# The denominator comes from the frame itself rather than a hard-coded row count,
# so the result stays correct if the input file changes.
num_rows = len(train_data)
for label in label_columns:
    print(f"Count for {label}: {train_data[label].sum()/num_rows}")

Count for toxic: 0.09636832166591898
Count for severe_toxic: 0.010067709257076087
Count for obscene: 0.05330147833616759
Count for threat: 0.003182022096796069
Count for insult: 0.04971257472535498
Count for identity_hate: 0.008492347497678689


## Define a function to apply to the comment_text column that replaces punctuation (every non-word character) with a space.

In [7]:
def remove_punctuation(row_str):
    """Replace every non-word character in ``row_str`` with a single space.

    A non-word character is anything matched by the regex ``\\W``: whatever is
    not a letter, digit or underscore. That includes punctuation and whitespace
    such as newlines. Characters are replaced one-for-one, so the result has
    the same length as the input.

    Args:
        row_str: The text to clean.

    Returns:
        The text with each non-word character swapped for ``" "``.
    """
    non_word_char = re.compile(r"\W")
    return non_word_char.sub(" ", row_str)

## Now apply this function to comment_text and observe the result

In [8]:
# Overwrite comment_text with its cleaned version (non-word characters -> spaces).
train_data = train_data.assign(
    comment_text=lambda frame: frame["comment_text"].apply(remove_punctuation)
)

In [9]:
# Preview the first ten rows after cleaning.
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,Nonsense kiss off geek what I said is true...,1,0,0,0,0,0
1,27450690,Please do not vandalize pages as you did ...,0,0,0,0,0,0
2,54037174,Points of interest I removed the p...,0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0
5,82428052,Fried chickens Is dat sum fried chickens,0,0,0,0,0,0
6,87311443,Why can you put English for example on some pl...,0,0,0,0,0,0
7,114749757,Guy Fawkes im a resident in bridgwater and i...,0,0,0,0,0,0
8,138560519,as far as nicknames go this article is embarra...,0,0,0,0,0,0
9,139353149,Woodland Meadows Good to hear that you correct...,0,0,0,0,0,0


## Create a new column that stores the lengths of the comment_text column

In [21]:
# Character length of each cleaned comment. Spaces count towards the length.
train_data = train_data.assign(
    comment_len=lambda frame: frame["comment_text"].str.len()
)

## Let's explore the distribution of lengths of comments.

In [23]:
# Report every decile (0.1, 0.2, ..., 0.9) alongside the default summary statistics.
deciles = [tenth/10.0 for tenth in range(1, 10)]
train_data["comment_len"].describe(percentiles=deciles)

count    95851.000000
mean       395.341864
std        595.102072
min          6.000000
10%         47.000000
20%         79.000000
30%        114.000000
40%        155.000000
50%        206.000000
60%        274.000000
70%        367.000000
80%        528.000000
90%        890.000000
max       5000.000000
Name: comment_len, dtype: float64

## Could there be a relationship between the length of a comment and its label?

In [25]:
# Correlation (pandas' default for Series.corr) between each label and comment length.
for label in label_columns:
    length_corr = train_data[label].corr(train_data["comment_len"])
    print(f"Correlation with comment length for {label}: {length_corr}")

Correlation with comment length for toxic: -0.05028472393348948
Correlation with comment length for severe_toxic: 0.015504336062655102
Correlation with comment length for obscene: -0.03775477187679391
Correlation with comment length for threat: -0.00474216034143737
Correlation with comment length for insult: -0.044176762264248694
Correlation with comment length for identity_hate: -0.007863956166948306


## What if we considered the number of words instead of characters?

## There are two ways we can do this: an absolute word count, and the number of unique words. We'll start with the absolute word count.

### Define a function to find the number of words in comment_text

In [35]:
def get_num_words(row_str):
    """Count the whitespace-separated tokens in ``row_str``.

    Args:
        row_str: The text to measure.

    Returns:
        The number of tokens produced by ``str.split()`` with no arguments;
        an empty or all-whitespace string gives 0.
    """
    words = row_str.split()
    return len(words)

## Create the new column

In [36]:
# Total number of whitespace-separated words in each cleaned comment.
train_data = train_data.assign(
    num_words=lambda frame: frame["comment_text"].apply(get_num_words)
)

### And look at the distribution of word counts

In [37]:
# Summary statistics and deciles of the per-comment word count.
train_data["num_words"].describe(percentiles=deciles)

count    95851.000000
mean        69.348322
std        102.843453
min          1.000000
10%          8.000000
20%         14.000000
30%         21.000000
40%         28.000000
50%         37.000000
60%         49.000000
70%         65.000000
80%         93.000000
90%        156.000000
max       1403.000000
Name: num_words, dtype: float64

### Now do the same thing with number of unique words

### First, let's define a function to apply to the comment_text column to calculate the number of unique words

In [29]:
def get_unique_words(row_str):
    """Count the distinct words in ``row_str``, ignoring case.

    Despite the name, this returns a count rather than the words themselves.
    The text is lower-cased first and then split on whitespace, so "The" and
    "the" count as one word.

    Args:
        row_str: The text to measure.

    Returns:
        The number of distinct lower-cased tokens; 0 for an empty string.
    """
    lowered = row_str.lower()
    distinct_words = set(lowered.split())
    return len(distinct_words)

### Now let's create that column

In [30]:
# Number of distinct (case-insensitive) words in each cleaned comment.
train_data = train_data.assign(
    unique_words=lambda frame: frame["comment_text"].apply(get_unique_words)
)

### What does the distribution of unique words look like?

In [31]:
# Summary statistics and deciles of the per-comment unique-word count.
train_data["unique_words"].describe(percentiles=deciles)

count    95851.000000
mean        45.571095
std         48.412922
min          1.000000
10%          8.000000
20%         13.000000
30%         18.000000
40%         24.000000
50%         31.000000
60%         39.000000
70%         50.000000
80%         66.000000
90%         98.000000
max        551.000000
Name: unique_words, dtype: float64

### Finally, investigate the relationship between word counts and labels

In [38]:
# Correlation between each label and the total word count.
for label in label_columns:
    word_count_corr = train_data[label].corr(train_data["num_words"])
    print(f"Correlation with number of words for {label}: {word_count_corr}")

Correlation with number of words for toxic: -0.046702138423240426
Correlation with number of words for severe_toxic: 0.015734045089604583
Correlation with number of words for obscene: -0.03434576025978322
Correlation with number of words for threat: -0.0032372413308730218
Correlation with number of words for insult: -0.04044975242800978
Correlation with number of words for identity_hate: -0.008824860172021191


In [39]:
# Correlation between each label and the unique-word count.
for label in label_columns:
    unique_count_corr = train_data[label].corr(train_data["unique_words"])
    print(f"Correlation with unique words for {label}: {unique_count_corr}")

Correlation with unique words for toxic: -0.09929011909770134
Correlation with unique words for severe_toxic: -0.04906782663421947
Correlation with unique words for obscene: -0.08234877737844694
Correlation with unique words for threat: -0.022897587603277824
Correlation with unique words for insult: -0.08451991900919688
Correlation with unique words for identity_hate: -0.03236846513420009


## One more thing would be to look at mean word length

In [41]:
# Mean word length = characters per comment divided by words per comment.
# Built with .assign (like the other derived columns) instead of an in-place
# eval, so the cell rebinds train_data rather than mutating the existing frame.
# NOTE: comment_len is measured on the cleaned text, where every non-word
# character became a space, so spaces inflate this ratio for comments that are
# mostly punctuation or whitespace.
train_data = train_data.assign(
    mean_word_length=train_data["comment_len"] / train_data["num_words"]
)

## Once again, check out the distribution of mean word length values

In [42]:
# Summary statistics and deciles of the per-comment mean word length.
train_data["mean_word_length"].describe(percentiles=deciles)

count    95851.000000
mean         5.732251
std          6.197771
min          1.916667
10%          4.866667
20%          5.133333
30%          5.312757
40%          5.466667
50%          5.604651
60%          5.750000
70%          5.905882
80%          6.120000
90%          6.492063
max       1242.250000
Name: mean_word_length, dtype: float64

## There is an obvious outlier, given that the max mean word length is more than two orders of magnitude greater than the 99th percentile

In [45]:
# 99th percentile of mean word length, for comparison against the maximum.
train_data["mean_word_length"].quantile(q=0.99)

8.35140048228529

## Any possible correlations between mean word length and label?

In [46]:
# Correlation between each label and the mean word length.
# The printed label names the metric actually being correlated; it previously
# repeated the "unique words" wording from the unique-word correlation cell.
for label in label_columns:
    mean_length_corr = train_data[label].corr(train_data["mean_word_length"])
    print(f"Correlation with mean word length for {label}: {mean_length_corr}")

Correlation with unique words for toxic: 0.00921057466949899
Correlation with unique words for severe_toxic: 0.018322106930992788
Correlation with unique words for obscene: 0.0031221024568144386
Correlation with unique words for threat: -0.004002386620745559
Correlation with unique words for insult: 0.0044692499804441016
Correlation with unique words for identity_hate: 0.01916976206227629


# In summary, there weren't any obvious connections between various basic string metrics and the label. Using some real NLP techniques such as POS tagging, semantic analysis, and removal of stop words could yield interesting results