In [35]:
import numpy as np
np.random.seed(42)
import pandas as pd

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv1D, MaxPool2D,merge
from keras.layers import Reshape, Flatten, concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models.keyedvectors import KeyedVectors
from keras import backend as K
from keras.engine import InputSpec, Layer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
EMBEDDING_FILE = 'crawl-300d-2M.vec'

train = pd.read_csv('train_pre2.csv')
test = pd.read_csv('test_pre2.csv')
submission = pd.read_csv('sample_submission.csv')

In [3]:
X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

In [4]:
max_features = 100000
max_len = 200
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [41]:
X_train_sequences_pad = sequence.pad_sequences(X_train, maxlen=max_len)
X_test_sequences_pad = sequence.pad_sequences(X_test, maxlen=max_len)

In [6]:
word_index = tokenizer.word_index

In [7]:
nb_words = min(max_features, len(word_index))

In [8]:
model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=False)

In [9]:
def get_embeddings():
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = model[word] if word in model else None
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [10]:
embedding_matrix = get_embeddings()

In [11]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data


    def on_epoch_end(self, epoch, logs = {}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.4f \n" % (epoch+1, score))

In [14]:
class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.
    """

    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k

    def compute_output_shape(self, input_shape):
        return (input_shape[0], (input_shape[2] * self.k))

    def call(self, inputs):
        # swap last two dimensions since top_k will be applied along the last dimension
        shifted_input = tf.transpose(inputs, [0, 2, 1])

        # extract top_k, returns two tensors [values, indices]
        top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0]

        # return flattened output
        return Flatten()(top_k)

In [15]:
filter_sizes = [2,3,4,5]
num_filters = 128

In [37]:
def get_model():    
    inp = Input(shape=(max_len,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    
    conv_0 = Conv1D(num_filters, filter_sizes[0], kernel_initializer='normal',activation='elu')(x)
    conv_1 = Conv1D(num_filters, filter_sizes[1], kernel_initializer='normal',activation='elu')(x)
    conv_2 = Conv1D(num_filters, filter_sizes[2], kernel_initializer='normal',activation='elu')(x)
    conv_3 = Conv1D(num_filters, filter_sizes[3], kernel_initializer='normal',activation='elu')(x)
    
    maxpool_0 = KMaxPooling(k=3)(conv_0)
    maxpool_1 = KMaxPooling(k=3)(conv_1)
    maxpool_2 = KMaxPooling(k=3)(conv_2)
    maxpool_3 = KMaxPooling(k=3)(conv_3)
    
    merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2, maxpool_3],axis=1)
    
    output = Dropout(0.6)(merged_tensor)
    output = Dense(64, activation='relu')(output)        
    output = Dense(6, activation="sigmoid")(output)
    
    model = Model(inputs=inp, outputs=output)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [18]:
mskf = MultilabelStratifiedKFold(n_splits= 10, random_state=423)

In [19]:
early = EarlyStopping(monitor="val_loss", mode="auto", patience=3)

In [38]:
model = get_model()

In [39]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 200, 300)     30000000    input_9[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_9 (SpatialDro (None, 200, 300)     0           embedding_9[0][0]                
__________________________________________________________________________________________________
conv1d_22 (Conv1D)              (None, 199, 128)     76928       spatial_dropout1d_9[0][0]        
__________________________________________________________________________________________________
conv1d_23 

In [44]:
subs = []
for train_index,text_index in mskf.split(X_train_sequences_pad,y_train):        
    print('train_nums,',len(train_index),'test_nums:',len(text_index))
    train_x,text_x = X_train_sequences_pad[train_index],X_train_sequences_pad[text_index]
    train_y,text_y = y_train[train_index],y_train[text_index]
    hist = model.fit(train_x,train_y,batch_size = 256,epochs = 50,validation_data = (text_x,text_y), verbose = 2,callbacks = [early])
        
    #result = model1.predict(text_x,batch_size = 256)
    #s = roc_auc_score(text_y,result)
    #print('roc_auc_score:',s)
        
    '''tpd = pd.DataFrame(columns=[['id']+list_classes])
    tpd[list_classes] = result
    tpd ['id'] = text_index
    text_cnn_result.append(tpd)'''
        
    sub = model.predict(X_test_sequences_pad,batch_size = 256)
        
    subs.append(sub)

train_nums, 143614 test_nums: 15957
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
 - 145s - loss: 0.0632 - acc: 0.9781 - val_loss: 0.0456 - val_acc: 0.9829
Epoch 2/50
 - 141s - loss: 0.0426 - acc: 0.9835 - val_loss: 0.0422 - val_acc: 0.9834
Epoch 3/50
 - 141s - loss: 0.0373 - acc: 0.9851 - val_loss: 0.0417 - val_acc: 0.9835
Epoch 4/50
 - 141s - loss: 0.0327 - acc: 0.9867 - val_loss: 0.0428 - val_acc: 0.9832
Epoch 5/50
 - 141s - loss: 0.0289 - acc: 0.9882 - val_loss: 0.0464 - val_acc: 0.9834
Epoch 6/50
 - 141s - loss: 0.0256 - acc: 0.9895 - val_loss: 0.0498 - val_acc: 0.9828
train_nums, 143614 test_nums: 15957
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
 - 141s - loss: 0.0263 - acc: 0.9898 - val_loss: 0.0197 - val_acc: 0.9924
Epoch 2/50
 - 141s - loss: 0.0220 - acc: 0.9913 - val_loss: 0.0210 - val_acc: 0.9917
Epoch 3/50
 - 141s - loss: 0.0191 - acc: 0.9925 - val_loss: 0.0229 - val_acc: 0.9909
Epoch 4/50
 - 141s - loss: 0.0168 - acc: 0.9935 - val_loss: 0

In [45]:
sub_cv = sum(subs)/10

In [47]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [49]:
submission[list_classes] = sub_cv
submission.to_csv("kmax_cnn_sub.csv",index=False)