In [1]:
import numpy as np 
import pandas as pd 
import random
import copy

from keras.models import Model
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, Dropout, Embedding
from keras.layers import GlobalAveragePooling1D, concatenate, CuDNNLSTM
from keras.layers.core import SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Input, LSTM, Bidirectional, Conv1D
from gensim.models.keyedvectors import KeyedVectors

from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback


import matplotlib.pyplot as plt
%matplotlib inline 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the train/test comment tables and the sample submission template
# (file names suggest an earlier pre-processing pass -- not shown in this notebook).
train, test, subm = [
    pd.read_csv(csv_path)
    for csv_path in ('train_pre2.csv', 'test_pre2.csv', 'sample_submission.csv')
]

In [3]:
# Randomly assign every training row to one of 10 cross-validation folds (ids 1..10).
# The draw comes from a dedicated, seeded generator so the fold split -- and therefore
# every per-fold validation score -- is reproducible after a kernel restart, without
# touching the global `random` state.
# NOTE: the literal 10 must stay in sync with `Kfold`, which the training loop iterates over.
CV_SEED = 2018
cv_rng = random.Random(CV_SEED)
train['cv_id'] = [cv_rng.randint(1, 10) for _ in range(len(train))]
# Test rows belong to no fold; -1 marks them as such.
test['cv_id'] = -1

In [4]:
# Target columns: one binary label per column.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw comment strings; missing comments are replaced by the literal placeholder "fillna".
X_train_list = train["comment_text"].fillna("fillna").values
X_test_list = test["comment_text"].fillna("fillna").values

# Label matrix with one column per entry of list_classes.
y_train_list = train[list_classes].values

In [None]:
# Debug check: display the Python type of the first training comment.
type(X_train_list[0])

In [5]:
# Vocabulary cap handed to the tokenizer (num_words).
max_features = 100000
# Every comment is padded / truncated to this many tokens.
max_len = 150
# Width of the embedding matrices built for the pretrained vectors.
embed_size = 300

In [6]:
# Fit the vocabulary on train and test comments together so both share one word index.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train_list, *X_test_list])

In [7]:
# Convert each comment into its list of integer word indices, train first, then test.
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(comments)
    for comments in (X_train_list, X_test_list)
)

In [8]:
# Bring every index sequence to exactly max_len entries. No padding/truncating
# arguments are passed, so the library defaults apply.
X_train_sequences_pad, X_test_sequences_pad = [
    sequence.pad_sequences(token_ids, maxlen=max_len)
    for token_ids in (X_train_sequences, X_test_sequences)
]

In [9]:
# Word -> integer index mapping learned by the tokenizer; iterated below as
# (word, index) pairs to fill the pretrained embedding matrices.
word_index = tokenizer.word_index

In [10]:
# Number of rows the embedding matrices need. The Keras Tokenizer numbers words from 1
# (index 0 is reserved and never assigned to a word), so a vocabulary of N words uses
# indices 0..N and needs N + 1 rows. Without the "+ 1" the last word's row would be out
# of bounds whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)

In [11]:
# Paths of the two pretrained embedding files.
# embedding_vec is read with KeyedVectors.load_word2vec_format (text mode);
# embedding_glove is parsed line by line as "<word> <float> <float> ...".
embedding_vec = 'crawl-300d-2M.vec'
embedding_glove = 'glove.840B.300d.txt'

In [12]:
# Load the word vectors stored at `embedding_vec` (text format, binary=False).
# NOTE(review): the name `model` is rebound to a Keras model by the training loop
# further down, so this cell has to be re-run before the fastText embedding matrix
# can be rebuilt in the same kernel session.
model = KeyedVectors.load_word2vec_format(embedding_vec, binary=False)

In [13]:
# Build the fastText embedding matrix: row i holds the vector of the word whose
# tokenizer index is i. Words absent from the loaded vectors keep an all-zero row.
embedding_matrix_fast_text = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Bound the index by the matrix's actual row count (nb_words), not by max_features:
    # when the vocabulary is smaller than max_features the matrix has fewer rows and
    # the old `i >= max_features` guard would let an out-of-range index through.
    if i >= nb_words:
        continue
    if word in model:
        embedding_matrix_fast_text[i] = model[word]
# Last expression: show the matrix dimensions as the cell output.
embedding_matrix_fast_text.shape

(100000, 300)

In [14]:
# Parse the GloVe text file into a {word: float32 vector} dictionary.
# Each line is split on single spaces: the first field is the word, the rest its coefficients.
embedding_index_glove = {}
with open(embedding_glove, encoding='utf-8') as glove_file:
    for row in glove_file:
        token, *coefs = row.split(' ')
        embedding_index_glove[token] = np.asarray(coefs, dtype='float32')

In [15]:
# Build the GloVe embedding matrix: row i holds the vector of the word whose
# tokenizer index is i. Words absent from the GloVe dictionary keep an all-zero row.
embedding_matrix_glove = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Bound the index by the matrix's actual row count (nb_words), not by max_features:
    # when the vocabulary is smaller than max_features the matrix has fewer rows and
    # the old `i >= max_features` guard would let an out-of-range index through.
    if i >= nb_words:
        continue
    embedding_vec_glove = embedding_index_glove.get(word)
    if embedding_vec_glove is not None:
        embedding_matrix_glove[i] = embedding_vec_glove

In [16]:
# Display the GloVe embedding matrix dimensions as the cell output.
embedding_matrix_glove.shape

(100000, 300)

In [17]:
def _bilstm_branch(inp, embedding_matrix):
    """Embed `inp` with a pretrained matrix and encode it with a bidirectional LSTM.

    The Embedding dimensions are read from `embedding_matrix` itself, so the layer
    always agrees with the weights it is initialised from (the matrix has fewer than
    `max_features` rows when the fitted vocabulary is smaller than the cap).

    Returns the per-timestep BiLSTM output tensor (return_sequences=True).
    """
    vocab_rows, vector_dim = embedding_matrix.shape
    x = Embedding(vocab_rows, vector_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.1)(x)
    return Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)


def get_model():
    """Build and compile the two-branch BiLSTM classifier.

    One shared integer input of length `max_len` feeds two parallel branches, the
    first initialised from `embedding_matrix_glove`, the second from
    `embedding_matrix_fast_text`. Their sequence outputs are concatenated, reduced
    with global average and global max pooling, and passed through a small dense
    head ending in 6 sigmoid units (one per label).

    Returns:
        A compiled Keras `Model` (binary cross-entropy loss, Adam optimizer).

    Note:
        The recurrent layers are `CuDNNLSTM`, the cuDNN-backed implementation.
    """
    inp = Input(shape=(max_len,))

    # Branch order (GloVe first, fastText second) is kept as in the original model.
    glove_seq = _bilstm_branch(inp, embedding_matrix_glove)
    fast_text_seq = _bilstm_branch(inp, embedding_matrix_fast_text)

    concat = concatenate([glove_seq, fast_text_seq])
    avg_pool = GlobalAveragePooling1D()(concat)
    max_pool = GlobalMaxPooling1D()(concat)
    concat_pool = concatenate([avg_pool, max_pool])

    x = Dense(32, activation="relu")(concat_pool)
    x = Dropout(0.1)(x)
    out = Dense(6, activation='sigmoid')(x)

    model = Model(inp, out)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [18]:
# Per-fold accumulators filled by the training loop:
# the kept model, the out-of-fold prediction frame and the best validation score.
cv_models, cv_results, cv_scores = [], [], []
# Number of cross-validation folds; folds are numbered 1..Kfold.
Kfold = 10

In [19]:
# Train one model per fold, keep the epoch with the best validation ROC-AUC, and
# collect the out-of-fold predictions, the model and the score for every fold.
for i in range(1, Kfold + 1):

    # Boolean masks are positional, so the NumPy arrays are sliced correctly even if
    # `train` does not carry a default 0..n-1 index.
    val_mask = (train['cv_id'] == i).values
    train_mask = ~val_mask
    valid_id = train.loc[val_mask, 'id'].values
    data_train = X_train_sequences_pad[train_mask]
    labels_train = y_train_list[train_mask]
    data_val = X_train_sequences_pad[val_mask]
    labels_val = y_train_list[val_mask]
    print("fold %d"%(i))
    print("train_shape")
    print(data_train.shape, labels_train.shape)
    print("val_shape")
    print(data_val.shape, labels_val.shape)
    # NOTE: this rebinds the notebook-level name `model`, which an earlier cell used
    # for the loaded fastText vectors.
    model = get_model()
    best_score, best_epoch = -1, 0
    best_weights, best_val_pred = None, None
    # Training stops once more than `earlystop` epochs pass without improvement.
    earlystop = 3

    for epoch in range(20):
        model.fit(data_train,labels_train,batch_size=256, epochs=1, verbose=1)
        val_pred = model.predict(data_val ,batch_size=256)
        s = roc_auc_score(labels_val,val_pred)
        print(i,epoch,s)
        if s > best_score:
            print("epoch " + str(epoch) + " improved from " + str(best_score) + " to " + str(s))
            # Snapshot the weights as NumPy arrays. copy.copy(model) only made a
            # shallow copy that shares the live weight variables, so the "best"
            # model silently kept training and ended up equal to the last epoch.
            best_score, best_epoch = s, epoch
            best_weights = model.get_weights()
            best_val_pred = val_pred
        if epoch - best_epoch > earlystop:
            break

    # Roll the model back to its best epoch before storing it for test prediction.
    if best_weights is not None:
        model.set_weights(best_weights)

    #save cv_results
    # Flat column labels: the previous nested list ([['id', ...]]) produced
    # multi-level column labels instead of plain names.
    tpd = pd.DataFrame(best_val_pred, columns=list_classes)
    tpd.insert(0, 'id', valid_id)
    cv_results.append(tpd)
    cv_models.append(model)
    cv_scores.append(best_score)

fold 1
train_shape
(143341, 150) (143341, 6)
val_shape
(16230, 150) (16230, 6)
Epoch 1/1
1 0 0.9800507026381684
epoch 0 improved from -1 to 0.9800507026381684
Epoch 1/1
1 1 0.9843001310513645
epoch 1 improved from 0.9800507026381684 to 0.9843001310513645
Epoch 1/1
1 2 0.9842615258589947
Epoch 1/1
1 3 0.9838517260689285
Epoch 1/1
1 4 0.9832659035889563
Epoch 1/1
1 5 0.9814830798272206
fold 2
train_shape
(143479, 150) (143479, 6)
val_shape
(16092, 150) (16092, 6)
Epoch 1/1
2 0 0.9806708856331957
epoch 0 improved from -1 to 0.9806708856331957
Epoch 1/1
2 1 0.9849125787914712
epoch 1 improved from 0.9806708856331957 to 0.9849125787914712
Epoch 1/1
2 2 0.9861773692176916
epoch 2 improved from 0.9849125787914712 to 0.9861773692176916
Epoch 1/1
2 3 0.9860001294990237
Epoch 1/1
2 4 0.9858581700978838
Epoch 1/1
2 5 0.9848965510777802
Epoch 1/1
2 6 0.985125797450729
fold 3
train_shape
(143772, 150) (143772, 6)
val_shape
(15799, 150) (15799, 6)
Epoch 1/1
3 0 0.9799039655431621
epoch 0 improved fr

In [20]:
# Empty accumulator for the summed test predictions (filled by the prediction loop).
r = list()
# Unweighted mean of the per-fold best validation scores; also used in output file names.
avg_val_score = np.mean(cv_scores)
print(cv_scores, avg_val_score)
print("prediction begin....")

[0.9843001310513645, 0.9861773692176916, 0.9863888724993432, 0.987653416089057, 0.9896646228479963, 0.9847420020607728, 0.9886027054089528, 0.9864531247771567, 0.9891467313781334, 0.9884405537693558] 0.9871569529099824
prediction begin....


In [24]:
# Sum the test-set predictions of the first Kfold stored models.
# The accumulator is reset here, inside the cell, so re-running the cell starts from
# scratch instead of adding a second round of predictions on top of the previous sum.
r = None
for i in range(Kfold):
    print("prediction "+ str(i))
    fold_pred = cv_models[i].predict(X_test_sequences_pad,batch_size=256)
    r = fold_pred if r is None else r + fold_pred

prediction 0
prediction 1
prediction 2
prediction 3
prediction 4
prediction 5
prediction 6
prediction 7
prediction 8
prediction 9


In [25]:
# Turn the summed fold predictions into their mean. Dividing by Kfold (the number of
# models that were summed) instead of a literal 10 keeps the average correct if the
# fold count is changed.
# NOTE: the division is in place -- re-running only this cell divides a second time.
r /= Kfold
# Tag appended to the output file names.
index = 'bi-lstm-10-fold'

In [30]:
# Write the stacked out-of-fold predictions of all folds to one CSV.
# The old pattern "%.4lstm_cv" was parsed as the conversion "%.4ls" (string, precision 4,
# ignored length modifier "l"), so the score was cut to "0.98" and the letters "ls"
# were swallowed, giving "0.98tm_cv...". "%.4f" formats the score with four decimals.
pd.concat(cv_results).to_csv("%.4f_lstm_cv_%s.csv" % (avg_val_score, index), index=False)

In [31]:
# Fill the submission template with the averaged test predictions, one column per label.
subm[list_classes] = r

# The old pattern "%.4lstm_submssion" was parsed as the conversion "%.4ls" (string,
# precision 4, ignored length modifier "l"), so the score was cut to "0.98" and the
# file was named "0.98tm_submssion...". "%.4f" formats the score with four decimals;
# the misspelled "submssion" in the file name is corrected as well.
subm.to_csv("%.4f_lstm_submission_%s.csv" % (avg_val_score, index), index=False)