In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D,Bidirectional
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import matplotlib.pyplot as plt
%matplotlib inline
import gensim.models.keyedvectors as word2vec
import gc
import numpy as np

Using TensorFlow backend.


In [2]:
# Read the training and test CSV files from the local input/ directory.
train, test = pd.read_csv("input/train.csv"), pd.read_csv("input/test.csv")

In [3]:
# Label columns predicted by the model; y holds them as a 2-D array (one row per comment).
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
# pandas reads a missing comment as NaN (a float), which the Keras Tokenizer cannot
# lowercase/split. Replace missing comments with an empty string so they tokenize to
# an empty sequence instead of raising.
list_sentences_train = train["comment_text"].fillna("")
list_sentences_test = test["comment_text"].fillna("")

In [4]:
# Size of the vocabulary the tokenizer keeps when converting text to id sequences.
max_features = 20000

# Fit the vocabulary on the training comments only, then encode both splits with it.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

In [5]:
# Every comment is padded/truncated to exactly this many token ids.
maxlen = 100

# Same fixed length for both splits so they can share one model input shape.
X_t, X_te = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

In [6]:
def loadEmbeddingMatrix(typeToLoad):
    """Build an embedding matrix for the notebook's ``tokenizer`` from pretrained vectors.

    Words present in both ``tokenizer.word_index`` and the pretrained vocabulary get
    their pretrained vector; every other row is drawn from a normal distribution with
    the mean and standard deviation of the loaded pretrained weights.

    Parameters
    ----------
    typeToLoad : str
        Which pretrained set to load: ``"glove"`` (50-d text file), ``"fasttext"``
        (300-d text file) or ``"word2vec"`` (300-d binary file read through gensim).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tokenizer.word_index), embed_size)``.
        NOTE(review): the row for a word with tokenizer id ``i`` is ``i - 1`` (ids are
        shifted down by one). A consumer that indexes rows with raw tokenizer ids would
        be off by one - confirm against the layer that uses this matrix.

    Raises
    ------
    ValueError
        If ``typeToLoad`` is not one of the supported names, or if no usable vector
        could be loaded from the source.
    """
    if typeToLoad == "glove":
        EMBEDDING_FILE = './input/glove.6B.50d.txt'
        # Must equal the width of the vectors in the file, otherwise filling the
        # matrix rows below fails with a shape mismatch.
        embed_size = 50
    elif typeToLoad == "fasttext":
        EMBEDDING_FILE = './input/crawl-300d-2M.vec'
        embed_size = 300
    elif typeToLoad == "word2vec":
        EMBEDDING_FILE = None  # binary format, loaded through gensim below
        embed_size = 300
    else:
        raise ValueError(
            "Unknown embedding type %r; expected 'glove', 'word2vec' or 'fasttext'." % (typeToLoad,)
        )

    embeddings_index = dict()
    if EMBEDDING_FILE is not None:
        # Transfer the embedding weights into a dictionary by iterating through every
        # line of the file. The context manager closes the file even if parsing fails.
        with open(EMBEDDING_FILE, encoding='utf-8') as f:
            for line in f:
                values = line.split()
                # Skip header lines and rows whose token contained whitespace: anything
                # that is not "<word> <embed_size floats>" cannot be stacked later.
                if len(values) != embed_size + 1:
                    continue
                try:
                    coefs = np.asarray(values[1:], dtype='float32')
                except ValueError:
                    # Non-numeric coefficients: drop this row rather than storing a
                    # stale vector left over from the previous line.
                    continue
                # first field is the word, the rest is its vector
                embeddings_index[values[0]] = coefs
    else:
        word2vecDict = word2vec.KeyedVectors.load_word2vec_format("../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin", binary=True)
        # gensim >= 4 exposes the vocabulary as ``key_to_index``; older releases use ``vocab``.
        vocab = getattr(word2vecDict, 'key_to_index', None)
        if vocab is None:
            vocab = word2vecDict.vocab
        for word in vocab:
            embeddings_index[word] = word2vecDict[word]
    print('Loaded %s word vectors.' % len(embeddings_index))

    if not embeddings_index:
        raise ValueError("No usable %d-dimensional vectors were loaded for %r." % (embed_size, typeToLoad))

    gc.collect()
    # We get the mean and standard deviation of the embedding weights so that we can
    # keep the same statistics for our own randomly generated weights.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    # The stacked copy is only needed for the two statistics above.
    del all_embs

    nb_words = len(tokenizer.word_index)
    # The matrix has one row per word in our vocabulary and one column per pretrained
    # dimension; rows start out random and are overwritten where a pretrained vector exists.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    gc.collect()

    # Fill in the rows for the words that exist in both our own dictionary and the
    # loaded pretrained embedding.
    embeddedCount = 0
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Tokenizer ids start at 1, rows start at 0 (see the NOTE in the docstring).
            embedding_matrix[i - 1] = embedding_vector
            embeddedCount += 1
    print('total embedded:', embeddedCount, 'common words')

    del embeddings_index
    gc.collect()

    # finally, return the embedding matrix
    return embedding_matrix

The function returns a new embedding matrix that holds the loaded weights from the pretrained embeddings for the words we have in common, and randomly initialized numbers with the same mean and standard deviation for the rest of the weights in this matrix.

Let's move on and load our first embeddings, from GloVe.

In [7]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Parameters
    ----------
    word : the token the vector belongs to (returned unchanged).
    *arr : the remaining fields of an embedding row; each must be convertible to float.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D ``float32`` numpy array.
    """
    vector = np.array(arr, dtype='float32')
    return (word, vector)

def embed_vec(EMBEDDING_FILE,embed_size, max_features = 20000,maxlen = 100):
    """Build an embedding matrix indexed by tokenizer id from a text embedding file.

    Each usable line of ``EMBEDDING_FILE`` is ``<word> <embed_size floats>`` separated
    by single spaces. Row ``i`` of the result holds the pretrained vector of the word
    whose id in ``tokenizer.word_index`` is ``i``; rows without a pretrained vector
    (including row 0, which no word uses) keep a random draw from a normal
    distribution with the mean/std of all loaded coefficients.

    Parameters
    ----------
    EMBEDDING_FILE : path of the UTF-8 text file holding the pretrained vectors.
    embed_size : number of coefficients per vector; lines of any other width
        (e.g. a header line) are skipped.
    max_features : upper bound on the number of rows; words with an id at or above
        it are ignored.
    maxlen : unused; kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_features, len(tokenizer.word_index) + 1), embed_size)``.

    Raises
    ------
    ValueError
        If the file contains no line with exactly ``embed_size`` coefficients, or a
        coefficient on such a line is not numeric.
    """
    embeddings_index = {}
    # The context manager guarantees the (large) file is closed once it has been read.
    with open(EMBEDDING_FILE, encoding='utf-8') as f:
        for o in f:
            fields = o.rstrip().rsplit(' ')
            if len(fields) != embed_size + 1:
                continue
            word, coefs = get_coefs(*fields)
            embeddings_index[word] = coefs
    if not embeddings_index:
        raise ValueError("No %d-dimensional vectors found in %r." % (embed_size, EMBEDDING_FILE))

    # np.stack needs a real sequence; a dict view is not one on Python 3.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()

    word_index = tokenizer.word_index
    # Word ids start at 1, so a vocabulary of N words needs N + 1 rows for the highest
    # id to be a valid row index.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    print(embedding_matrix.shape)
    return embedding_matrix

In [8]:
# Location of the GloVe text embeddings (the "50d" file, loaded below with embed_size=50).
EMBEDDING_FILE_glove = './input/glove.6B.50d.txt'

In [11]:
# Build the GloVe-initialised embedding matrix; embed_vec prints the resulting shape.
embedding_matrix_glove = embed_vec(EMBEDDING_FILE_glove,embed_size=50)

(20000, 50)


In [12]:
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Merge, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional 
from keras.models import Model
from keras.callbacks import EarlyStopping

In [13]:
def cnn_model(embedding_matrix):
    """Build and compile a multi-filter-size 1-D CNN on frozen pretrained embeddings.

    The input is a sequence of ``maxlen`` token ids. It is embedded with
    ``embedding_matrix`` (not trainable), passed through three parallel
    Conv1D -> BatchNormalization -> MaxPooling1D branches with kernel sizes 3, 4 and 5,
    concatenated along the sequence axis, flattened, and classified by a 128-unit dense
    layer followed by one sigmoid unit per entry of ``list_classes``.

    Parameters
    ----------
    embedding_matrix : 2-D array of shape ``(vocabulary rows, embedding dimension)``.
        Every token id fed to the model must be a valid row index of this matrix.

    Returns
    -------
    A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer, accuracy metric).
    """
    # Imported here because the notebook's import cell does not bring Concatenate in.
    from keras.layers import Concatenate

    # Size the layer from the matrix itself so the pretrained weights always fit.
    vocab_rows, embed_dim = embedding_matrix.shape
    embedding_layer = Embedding(vocab_rows,
                                embed_dim,
                                weights=[embedding_matrix],
                                trainable=False)

    sequence_input = Input(shape=(maxlen,))
    embedded_sequences = embedding_layer(sequence_input)

    # One convolutional branch per filter size, all reading the same embedded sequence.
    convs = []
    filter_sizes = [3,4,5]
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_conv = BatchNormalization()(l_conv)
        l_pool = MaxPooling1D(pool_size=3)(l_conv)
        convs.append(l_pool)

    # Join the branches along axis 1 (the sequence axis).
    l_merge = Concatenate(axis=1)(convs)

    x = Dropout(0.5)(l_merge)
    x = Flatten()(x)
    x = BatchNormalization()(Dense(128, activation='relu')(x))
    x = Dropout(0.5)(x)

    # Independent sigmoid per class: a comment may carry several labels at once.
    preds = Dense(len(list_classes), activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    return model

In [14]:
# Instantiate the CNN on top of the GloVe embedding matrix built above.
model = cnn_model(embedding_matrix_glove)



In [15]:
# Define callbacks: stop training once val_loss has not improved by at least 0.01
# for 4 consecutive epochs.
# NOTE(review): the fit cell below trains for only 3 epochs, so a patience of 4 can
# never trigger as configured - confirm the intended patience / epoch count.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [16]:
# Training schedule: number of passes over the data and mini-batch size.
num_epochs = 3
batch_size = 32

# Fit on the padded training sequences, holding out 20% of them for validation.
hist = model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.2,
    shuffle=True,
    callbacks=callbacks_list,
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [17]:
# Run the trained model on the padded test sequences.
# NOTE(review): despite its name, y_test holds the model's predictions, not ground-truth labels.
y_test = model.predict(X_te, batch_size=32, verbose=1)




In [None]:
# Write the trained weights to an HDF5 file in the working directory.
# NOTE(review): save_weights stores the weights only, so the model must be rebuilt
# with cnn_model before they can be loaded back - the printed message says "model".
model.save_weights("cnn_model_glove.h5")
print("Saved model to disk")