In [2]:
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


In [3]:
warnings.filterwarnings('ignore')

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

X_train = train["comment_text"].fillna(" ")
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
X_test = test["comment_text"].fillna(" ")

max_features = 10000  #max number of words extracted from train corpus
maxlen = 200 #max char length kept for each comment 
embed_size = 300 #dimension of embedded vector  

In [4]:
#here num_words is set to max_features, to notice that
#Tokenizer indexes words by the descending order of their frequency
#the texts_to_sequences will skip the word with a index larger than
#num_words, this doesn't harm because the skipped words have low frequency

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [5]:
X_train_cut = sequence.pad_sequences(X_train, maxlen=maxlen, value=0, truncating="post", padding="post")
X_test_cut = sequence.pad_sequences(X_test, maxlen=maxlen, value=0, truncating="post", padding="post")

In [7]:

print("cut X_train:")
print(X_train_cut)
word_index = tokenizer.word_index
nb_words = min(len(word_index), max_features) + 1 #index begin from 1, as we added 0 so nb_words = indexed_words + 1
print("number of index including the padding one:")
print(nb_words)

cut X_train:
[[ 688   75    1 ...    0    0    0]
 [  52 2635   13 ...    0    0    0]
 [ 412  437   73 ...    0    0    0]
 ...
 [7330 5208   46 ...    0    0    0]
 [   4   11  574 ...    0    0    0]
 [   4    7  134 ...    0    0    0]]
number of index including the padding one:
10001


In [15]:


def create_cnn(maxlen, nb_words, filter_sizes, num_filters, dp1=0.2, dp2=0.1, solver='adam'):
    """
    Define all tunnable params here for model building
    :param filter_sizes:example: [2,3,4,5] different weights for filters, the filter's length is embedding size
    :param num_filters: the height of each filters, number of recognizable pattern
    :param dp1: dropout rate 1
    :param dp2: dropout rate 2
    :param solver: the name of optimozer
    """
    inp = Input(shape=(maxlen,))
    x = Embedding(nb_words, embed_size, input_length=maxlen, embeddings_initializer="uniform")(inp)
    x = SpatialDropout1D(dp1)(x)
    x = Reshape((maxlen, embed_size, 1))(x)

    outs = []
    for size in filter_sizes:
        conv = Conv2D(num_filters, kernel_size=(size, embed_size), kernel_initializer='normal',
                      activation='elu')(x)
        print(conv.shape)
        w = int(conv.shape[1])
        maxpool = MaxPool2D(pool_size=(w, 1))(conv)
        outs.append(maxpool)
    z = Concatenate(axis=1)(outs)
    z = Flatten()(z)
    z = Dropout(dp2)(z)

    outp = Dense(6, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer=solver,
                  metrics=['accuracy'])
    model.summary()
    return model

In [16]:
def run(subfile='subm/submission_basic_cnn.csv', test_size=0.10, n_splits=3):
    """
    [CV]  batch_size=256, epochs=10, filter_sizes=[1, 2, 3, 4, 5], maxlen=200, nb_words=100000, num_filters=32, score=0.9774102756913097, total= 3.8min
    """
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        cv_sets = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=0)
        parameters = {
            "maxlen": [maxlen],
            "nb_words": [nb_words],
            "filter_sizes": [[1, 2, 3, 5], [1, 2, 3, 4]],
            "num_filters": [50],
            "batch_size": [128],
            "epochs": [3]
        }
        # define params used in fit() method here
        model = KerasClassifier(build_fn=create_cnn, verbose=10, epochs=10, batch_size=256)
        grid = GridSearchCV(model, param_grid=parameters, scoring='roc_auc', cv=cv_sets, verbose=10, n_jobs=1)

        grid = grid.fit(X_train_cut, y_train)
        best_clf = grid.best_estimator_
        print('CV score is {}'.format(grid.best_score_))
        print("best params are:")
        print(grid.best_params_)
        print(best_clf)

        y_pred = best_clf.predict_proba(X_test_cut, batch_size=1024)
        print(y_pred.shape)
        submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
        submission.to_csv(subfile, index=False)

In [17]:
run()

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 5], maxlen=200, nb_words=10001, num_filters=50 
(?, 200, 1, 50)
(?, 199, 1, 50)
(?, 198, 1, 50)
(?, 196, 1, 50)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 300)     3000300     input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 200, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.4min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 5], maxlen=200, nb_words=10001, num_filters=50, score=0.9864994706318994, total= 2.8min
[CV] batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 5], maxlen=200, nb_words=10001, num_filters=50 
(?, 200, 1, 50)
(?, 199, 1, 50)
(?, 198, 1, 50)
(?, 196, 1, 50)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 200, 300)     3000300     input_4[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_4 (SpatialDro (None, 200, 300)     0           em

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  6.6min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 5], maxlen=200, nb_words=10001, num_filters=50, score=0.9812648747743816, total= 2.3min
[CV] batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 4], maxlen=200, nb_words=10001, num_filters=50 
(?, 200, 1, 50)
(?, 199, 1, 50)
(?, 198, 1, 50)
(?, 197, 1, 50)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 200, 300)     3000300     input_5[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_5 (SpatialDro (None, 200, 300)     0           em

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  9.2min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 4], maxlen=200, nb_words=10001, num_filters=50, score=0.984286459971537, total= 1.6min
[CV] batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 4], maxlen=200, nb_words=10001, num_filters=50 
(?, 200, 1, 50)
(?, 199, 1, 50)
(?, 198, 1, 50)
(?, 197, 1, 50)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 200, 300)     3000300     input_6[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_6 (SpatialDro (None, 200, 300)     0           emb

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 11.0min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 4], maxlen=200, nb_words=10001, num_filters=50, score=0.9860000104369903, total= 1.6min
[CV] batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 4], maxlen=200, nb_words=10001, num_filters=50 
(?, 200, 1, 50)
(?, 199, 1, 50)
(?, 198, 1, 50)
(?, 197, 1, 50)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 200, 300)     3000300     input_7[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_7 (SpatialDro (None, 200, 300)     0           em

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.9min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  batch_size=128, epochs=3, filter_sizes=[1, 2, 3, 4], maxlen=200, nb_words=10001, num_filters=50, score=0.9805766172389597, total= 1.7min
(?, 200, 1, 50)
(?, 199, 1, 50)
(?, 198, 1, 50)
(?, 196, 1, 50)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 200, 300)     3000300     input_8[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_8 (SpatialDro (None, 200, 300)     0           embedding_8[0][0]                
______________________________________________________________________

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 14.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 14.8min finished


Epoch 1/3
Epoch 2/3
Epoch 3/3
CV score is 0.9839840237547389
best params are:
{'batch_size': 128, 'epochs': 3, 'filter_sizes': [1, 2, 3, 5], 'maxlen': 200, 'nb_words': 10001, 'num_filters': 50}
<keras.wrappers.scikit_learn.KerasClassifier object at 0x7f941d820fd0>
(153164, 6)
