In [2]:
import numpy as np
import pandas as pd
import random
import copy

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models.keyedvectors import KeyedVectors
 
import matplotlib.pyplot as plt
%matplotlib inline 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
EMBEDDING_FILE = 'crawl-300d-2M.vec'

train = pd.read_csv('train_pre2.csv')
test = pd.read_csv('test_pre2.csv')
submission = pd.read_csv('sample_submission.csv')

In [4]:
len(train)

159571

In [5]:
train['cv_id'] = [random.randint(1,10) for _ in range(len(train))]

In [6]:
test['cv_id'] = -1

In [7]:
X_train_list = train["comment_text"].fillna("fillna").values
y_train_list = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test_list = test["comment_text"].fillna("fillna").values

In [8]:
max_features = 100000
max_len = 200
embed_size = 300


In [9]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_list) + list(X_test_list))

In [10]:
X_train_sequences = tokenizer.texts_to_sequences(X_train_list)
X_test_sequences = tokenizer.texts_to_sequences(X_test_list)

In [11]:
X_train_sequences_pad = sequence.pad_sequences(X_train_sequences, maxlen=max_len)
X_test_sequences_pad = sequence.pad_sequences(X_test_sequences, maxlen=max_len)

In [12]:
word_index = tokenizer.word_index

In [13]:
nb_words = min(max_features, len(word_index))

In [14]:
model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=False)

In [15]:
def get_embeddings():
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = model[word] if word in model else None
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [16]:
embedding_matrix_fasttext = get_embeddings()

In [17]:
embedding_glove = 'glove.840B.300d.txt'

In [18]:
embedding_index_glove = {}
with open(embedding_glove,encoding ='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        cof = np.asarray(values[1:],dtype = 'float32')
        embedding_index_glove[word] = cof
print(len(embedding_index_glove))

2196016


In [19]:
embedding_matrix_glove = np.zeros((nb_words,embed_size))
for word,i in word_index.items():
    if i >= max_features:
        continue
    embedding_vec_glove = embedding_index_glove.get(word)
    if embedding_vec_glove is not None:
        embedding_matrix_glove[i] = embedding_vec_glove

In [20]:
embedding_matrix_glove.shape

(100000, 300)

In [21]:
filter_sizes = [3,4,5]
num_filters =64

In [22]:
def get_model():    
    inp = Input(shape=(max_len, ))
    #fasttext
    fast_text_embedding = Embedding(max_features, embed_size, weights=[embedding_matrix_fasttext])(inp)
    x = SpatialDropout1D(0.1)(fast_text_embedding)
    x = Reshape((max_len, embed_size, 1))(x)
    

    conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',activation='elu')(x)
    conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',activation='elu')(x)
    conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',activation='elu')(x)
    
    maxpool_0 = MaxPool2D(pool_size=(max_len - filter_sizes[0] + 1, 1),strides=(1,1), padding='valid')(conv_0)
    maxpool_1 = MaxPool2D(pool_size=(max_len - filter_sizes[1] + 1, 1),strides=(1,1), padding='valid')(conv_1)
    maxpool_2 = MaxPool2D(pool_size=(max_len - filter_sizes[2] + 1, 1),strides=(1,1), padding='valid')(conv_2)
        
    concatenated_tensor1  = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])   

    
    
    #glove
    glove_embedding = Embedding(max_features, embed_size, weights=[embedding_matrix_glove])(inp)
    y = SpatialDropout1D(0.1)(glove_embedding)
    y = Reshape((max_len, embed_size, 1))(y)
    

    conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',activation='elu')(x)
    conv_4 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',activation='elu')(x)
    conv_5 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',activation='elu')(x)
    
    maxpool_3 = MaxPool2D(pool_size=(max_len - filter_sizes[0] + 1, 1),strides=(1,1), padding='valid')(conv_3)
    maxpool_4 = MaxPool2D(pool_size=(max_len - filter_sizes[1] + 1, 1),strides=(1,1), padding='valid')(conv_4)
    maxpool_5 = MaxPool2D(pool_size=(max_len - filter_sizes[2] + 1, 1),strides=(1,1), padding='valid')(conv_5)
        
    concatenated_tensor2  = Concatenate(axis=1)([maxpool_3, maxpool_4, maxpool_5])   

    
    #
    concatenated_tensor = Concatenate([concatenated_tensor1,concatenated_tensor2])
    flatten = Flatten()(concatenated_tensor2)
    dropout = Dropout(0.1)(flatten)
        
    output = Dense(6, activation="sigmoid")(dropout)
    
    model = Model(inputs=inp, outputs=output)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [23]:
cv_models=[]
cv_results=[]
cv_scores=[]

In [24]:
Kfold = 10
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [25]:
idx_train = train[train['cv_id'] != 1].index
data_train = X_train_sequences_pad[idx_train]
data_train.shape

(143512, 200)

In [26]:
for i in range(1,Kfold+1):
    
    idx_train = train[train['cv_id'] != i].index
    idx_val = train[train['cv_id'] == i].index
    valid_id = train[train['cv_id'] == i]['id'].values
    data_train = X_train_sequences_pad[idx_train]
    labels_train = y_train_list[idx_train]
    data_val = X_train_sequences_pad[idx_val]
    labels_val = y_train_list[idx_val]
    print("fold %d"%(i))
    print("train_shape")
    print(data_train.shape, labels_train.shape)
    print("val_shape")
    print(data_val.shape, labels_val.shape)
    model = get_model()
    best = [-1, 0, 0, 0]
    earlystop = 3
    
    for epoch in range(20):
        model.fit(data_train,labels_train,batch_size=256, epochs=1, verbose=1)
        r = model.predict(data_val ,batch_size=256)
        s = roc_auc_score(labels_val,r)
        print(i,epoch,s)
        if s > best[0]:
            print("epoch " + str(epoch) + " improved from " + str(best[0]) + " to " + str(s))
            best = [s,epoch,copy.copy(model),r]
        if epoch-best[1]>earlystop:
            break
    #save cv_results
    tpd=pd.DataFrame(columns=[['id']+list_classes])
    tpd['id'] = valid_id
    tpd[list_classes] = best[-1]
    cv_results.append(tpd)
    cv_models.append(best[2])
    cv_scores.append(best[0])

fold 1
train_shape
(143512, 200) (143512, 6)
val_shape
(16059, 200) (16059, 6)
Epoch 1/1
1 0 0.9841871640524714
epoch 0 improved from -1 to 0.9841871640524714
Epoch 1/1
1 1 0.9868000516562327
epoch 1 improved from 0.9841871640524714 to 0.9868000516562327
Epoch 1/1
1 2 0.98601408978588
Epoch 1/1
1 3 0.984810757080663
Epoch 1/1
1 4 0.9831064138932907
Epoch 1/1
1 5 0.9813887012442492
fold 2
train_shape
(143637, 200) (143637, 6)
val_shape
(15934, 200) (15934, 6)
Epoch 1/1
2 0 0.9854065169519162
epoch 0 improved from -1 to 0.9854065169519162
Epoch 1/1
2 1 0.9876680432947991
epoch 1 improved from 0.9854065169519162 to 0.9876680432947991
Epoch 1/1
2 2 0.9873545635845247
Epoch 1/1
2 3 0.9867533966206001
Epoch 1/1
2 4 0.9855158241511793
Epoch 1/1
2 5 0.9841354108421826
fold 3
train_shape
(143677, 200) (143677, 6)
val_shape
(15894, 200) (15894, 6)
Epoch 1/1
3 0 0.98793734706529
epoch 0 improved from -1 to 0.98793734706529
Epoch 1/1
3 1 0.9893580013575226
epoch 1 improved from 0.98793734706529 to

9 1 0.9864503151044315
epoch 1 improved from 0.9848435398872649 to 0.9864503151044315
Epoch 1/1
9 2 0.9861379280327207
Epoch 1/1
9 3 0.9851777456434953
Epoch 1/1
9 4 0.9833522075081591
Epoch 1/1
9 5 0.981676716624043
fold 10
train_shape
(143489, 200) (143489, 6)
val_shape
(16082, 200) (16082, 6)
Epoch 1/1
10 0 0.9783977962383074
epoch 0 improved from -1 to 0.9783977962383074
Epoch 1/1
10 1 0.9783471275310512
Epoch 1/1
10 2 0.9780853694506488
Epoch 1/1
10 3 0.9768960037122637
Epoch 1/1
10 4 0.975235550608324


In [28]:
len(cv_scores)

10

In [29]:
r=[]
avg_val_score = np.average(cv_scores)
print(cv_scores,avg_val_score)
print("prediction begin....")

[0.9868000516562327, 0.9876680432947991, 0.9893580013575226, 0.9889511199044531, 0.9873759209042451, 0.9884787237197993, 0.9886640903206971, 0.986410475752124, 0.9864503151044315, 0.9783977962383074] 0.9868554538252612
prediction begin....


In [30]:
len(cv_models)

10

In [31]:
for i in range(Kfold):
    print("prediction "+ str(i))
    if len(r) == 0:
        r = cv_models[i].predict(X_test_sequences_pad,batch_size=256)
    else:
        r += cv_models[i].predict(X_test_sequences_pad,batch_size=256)

prediction 0
prediction 1
prediction 2
prediction 3
prediction 4
prediction 5
prediction 6
prediction 7
prediction 8
prediction 9


In [32]:
r /= 10
index = 'cnn-10-fold'

In [33]:
pd.concat(cv_results).to_csv("%.4ftextcnn_cv_"% (avg_val_score)+str(index)+".csv",index=False)

In [34]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission[list_classes] = r

sample_submission.to_csv("%.4ftextcnn_submssion"% (avg_val_score)+ index+".csv",index=False)