In [1]:
import numpy as np
import pandas as pd
from textstat.textstat import textstat

In [2]:
# Locations of the competition CSV files, relative to the notebook directory.
train_file, test_file, sample_submission_file = (
    '../data/train.csv',
    '../data/test.csv',
    '../data/sample_submission.csv',
)

# Read the three files in the same order as the paths above.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_file, test_file, sample_submission_file)
)

# Extract features

In [3]:

def extract_features(df, text_col='comment_text'):
    """Build a frame of length and readability statistics for a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the raw text in ``text_col``.
    text_col : str, default 'comment_text'
        Name of the text column. It is also used as the prefix of every
        generated feature column, so the default reproduces the original
        ``comment_text_*`` column names.

    Returns
    -------
    pandas.DataFrame
        A frame indexed like ``df`` with the columns ``<text_col>_len``,
        ``<text_col>_lex_count``, ``<text_col>_syl_count``,
        ``<text_col>_sent_count``, ``<text_col>_flesch_reading_ease``,
        ``<text_col>_flesch_kincaid_grade`` and ``<text_col>_syl_over_lex``.
        The ratio column is NaN for rows whose lexicon count is zero.

    Raises
    ------
    KeyError
        If ``text_col`` is not a column of ``df``.
    """
    text = df[text_col]

    # (feature suffix, per-document function); list order fixes column order.
    extractors = [
        ('len', len),
        ('lex_count', textstat.lexicon_count),
        ('syl_count', textstat.syllable_count),
        ('sent_count', textstat.sentence_count),
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
    ]

    # Start from the input index so the result always lines up row-for-row
    # with ``df`` when the two are concatenated column-wise.
    features_df = pd.DataFrame(index=df.index)
    for suffix, func in extractors:
        features_df[text_col + '_' + suffix] = text.apply(func)

    # Average syllables per word. A zero lexicon count would yield +/-inf,
    # which poisons describe()/corr(); mask it to NaN, which pandas skips.
    lex_count = features_df[text_col + '_lex_count']
    features_df[text_col + '_syl_over_lex'] = (
        features_df[text_col + '_syl_count'] / lex_count.where(lex_count != 0)
    )

    return features_df


In [4]:
# Compute the text-statistics feature frame for the training comments.
train_features = extract_features(train)

In [5]:
# Sanity check on the first row: the comment-length feature must equal the
# length of the raw comment, which also shows that the column-wise concat
# paired the features with the right source row.
r = pd.concat([train, train_features], axis=1).iloc[0]
len(r['comment_text']), r['comment_text_len']

(83, 83)

# Describe data and feature/label correlation

In [6]:
# Place the original training columns and the engineered features side by side
# (column-wise) so they can be described and correlated together.
train_plus_features = pd.concat([train, train_features], axis=1)

In [7]:
# Preview the first ten rows of the combined frame.
train_plus_features.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_len,comment_text_lex_count,comment_text_syl_count,comment_text_sent_count,comment_text_flesch_reading_ease,comment_text_flesch_kincaid_grade,comment_text_syl_over_lex
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,83,14,21,3,75.16,3.9,1.5
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,142,27,39,2,74.69,6.2,1.444444
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,411,67,98,4,62.88,8.7,1.462687
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,148,25,39,4,65.08,5.7,1.56
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,266,49,68,3,71.85,7.3,1.387755
5,82428052,Fried chickens \n\nIs dat sum fried chickens?,0,0,0,0,0,0,43,7,9,1,89.75,2.5,1.285714
6,87311443,Why can you put English for example on some pl...,0,0,0,0,0,0,90,17,24,1,71.14,7.6,1.411765
7,114749757,Guy Fawkes \n\nim a resident in bridgwater and...,0,0,0,0,0,0,355,64,91,1,23.44,25.9,1.421875
8,138560519,as far as nicknames go this article is embarra...,0,0,0,0,0,0,239,42,59,3,74.19,6.4,1.404762
9,139353149,Woodland Meadows\nGood to hear that you correc...,0,0,0,0,0,0,54,9,13,1,79.26,4.4,1.444444


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the combined frame.
train_plus_features.describe()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_len,comment_text_lex_count,comment_text_syl_count,comment_text_sent_count,comment_text_flesch_reading_ease,comment_text_flesch_kincaid_grade,comment_text_syl_over_lex
count,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0
mean,499435900000.0,0.096368,0.010068,0.053301,0.003182,0.049713,0.008492,395.341864,66.413444,98.688715,4.132821,63.063657,8.398637,1.507688
std,289013600000.0,0.295097,0.099832,0.224635,0.05632,0.217352,0.091762,595.102072,99.080553,148.834461,6.607542,156.816828,23.56254,1.832507
min,22256640.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,1.0,1.0,-36707.2,-10.2,0.090909
25%,247343700000.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0,16.0,23.0,1.0,54.05,5.0,1.333333
50%,500129700000.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,35.0,51.0,2.0,67.15,7.6,1.454545
75%,750108800000.0,0.0,0.0,0.0,0.0,0.0,0.0,435.0,74.0,109.0,5.0,79.77,10.3,1.586207
max,999988200000.0,1.0,1.0,1.0,1.0,1.0,1.0,5000.0,1403.0,2132.0,333.0,187.21,5133.9,436.333333


In [9]:
# Pairwise correlation between the id, the labels and the engineered features.
# Keep only numeric/bool columns first: `comment_text` holds strings, and
# recent pandas versions raise inside corr() instead of silently dropping
# non-numeric columns. select_dtypes works the same on old and new versions.
corr = train_plus_features.select_dtypes(include=['number', 'bool']).corr()
# Display as percentages (rounded to 5 decimals before scaling by 100).
corr.round(5) * 100

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_len,comment_text_lex_count,comment_text_syl_count,comment_text_sent_count,comment_text_flesch_reading_ease,comment_text_flesch_kincaid_grade,comment_text_syl_over_lex
id,100.0,-0.003,0.402,0.587,-0.287,0.366,0.195,0.437,0.522,0.446,0.461,-0.056,0.142,-0.0
toxic,-0.003,100.0,30.881,67.749,16.297,64.833,25.912,-5.028,-4.75,-5.594,-0.128,0.037,0.876,-0.618
severe_toxic,0.402,30.881,100.0,40.454,13.347,37.745,19.339,1.55,1.355,1.031,5.686,-2.738,4.721,1.405
obscene,0.587,67.749,40.454,100.0,14.987,74.468,28.779,-3.775,-3.599,-4.228,0.138,0.216,0.605,-0.729
threat,-0.287,16.297,13.347,14.987,100.0,15.753,12.397,-0.474,-0.337,-0.839,0.652,0.362,-0.084,-0.521
insult,0.366,64.833,37.745,74.468,15.753,100.0,33.192,-4.418,-4.17,-4.822,0.398,0.44,-0.028,-0.691
identity_hate,0.195,25.912,19.339,28.779,12.397,33.192,100.0,-0.786,-0.942,-0.913,0.098,-2.09,2.793,1.584
comment_text_len,0.437,-5.028,1.55,-3.775,-0.474,-4.418,-0.786,100.0,98.635,99.205,77.985,-8.557,15.442,4.046
comment_text_lex_count,0.522,-4.75,1.355,-3.599,-0.337,-4.17,-0.942,98.635,100.0,98.738,78.391,-4.139,11.942,-0.794
comment_text_syl_count,0.446,-5.594,1.031,-4.228,-0.839,-4.822,-0.913,99.205,98.738,100.0,77.116,-8.988,15.755,4.538


In [10]:
# List every pairwise correlation once, from largest to smallest, in percent.
# NOTE(review): np.matrix is discouraged by NumPy in favour of plain arrays;
# it is kept here because the 2-D (1 x N) result shape below depends on it and
# later cells may rely on `corr_mat` being a matrix -- confirm before changing.
corr_mat = np.matrix(corr)
# triu_indices(n, 1) addresses the entries strictly above the diagonal, so the
# self-correlations and the mirrored lower triangle are excluded. The values
# are sorted ascending, reversed to descending, rounded to 3 decimals and
# scaled by 100.
np.round(np.sort(corr_mat[np.triu_indices(corr_mat.shape[0], 1)])[:,::-1], 3) * 100

array([[ 99.2,  98.7,  98.6,  91.4,  78.4,  78. ,  77.1,  74.5,  67.7,
         64.8,  40.5,  37.7,  33.2,  30.9,  28.8,  25.9,  19.3,  16.3,
         15.8,  15.8,  15.4,  15. ,  13.3,  12.4,  11.9,   5.7,   4.7,
          4.5,   4. ,   2.8,   1.6,   1.6,   1.4,   1.4,   1. ,   0.9,
          0.7,   0.7,   0.6,   0.6,   0.5,   0.5,   0.4,   0.4,   0.4,
          0.4,   0.4,   0.4,   0.4,   0.2,   0.2,   0.1,   0.1,   0.1,
          0. ,  -0. ,  -0. ,  -0. ,  -0.1,  -0.1,  -0.1,  -0.3,  -0.3,
         -0.5,  -0.5,  -0.5,  -0.6,  -0.7,  -0.7,  -0.7,  -0.8,  -0.8,
         -0.8,  -0.9,  -0.9,  -2.1,  -2.7,  -3.6,  -3.8,  -4.1,  -4.2,
         -4.2,  -4.4,  -4.7,  -4.8,  -5. ,  -5.6,  -8.6,  -9. , -96.7,
        -98.7]])