In [1]:
## Imports
## Data handling and pretrained-embedding loaders (gensim)
import pandas as pd
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

## Keras import
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences


## CNN
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.callbacks import ModelCheckpoint

import numpy as np
import math


Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
## Load data
# Input CSVs are read from ./data, relative to the notebook's working directory.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
# Loaded for reference; no other cell visible in this notebook uses `sub`.
sub = pd.read_csv("./data/sample_submission.csv")
# Last expression of the cell: renders the first rows of the training frame.
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [3]:
## Pre Processing
# Fit a single vocabulary over the train and test questions so both share the same word ids.
all_sentence = pd.concat([train["question_text"], test["question_text"]])
# Characters the Tokenizer filters out. The backslash is spelled "\\" because "\]" is not a
# valid escape sequence (DeprecationWarning, SyntaxWarning on newer Pythons); the resulting
# string is character-for-character the same as before.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ \'')
tokenizer.fit_on_texts(all_sentence)
# Mapping word -> integer id, used later to align the pretrained embedding rows.
word_dict = tokenizer.word_index
print("Found %s unique words" % len(word_dict))


Found 209967 unique words


In [4]:
# Number of distinct words the tokenizer found; embedding matrices get nb_words+1 rows.
nb_words = len(word_dict)
# Length of each pretrained word vector copied into the embedding matrices.
embed_size=300

def make_embedding_matrix(word_vec):
    """Align pretrained vectors with the tokenizer vocabulary.

    Args:
        word_vec: mapping with a ``.get(word)`` lookup that returns the
            vector for ``word`` or ``None`` when the word is unknown.

    Returns:
        A ``(nb_words + 1, embed_size)`` array in which row ``i`` holds the
        vector of the word whose id in ``word_dict`` is ``i``. Rows of words
        missing from ``word_vec`` stay all-zero.
    """
    matrix = np.zeros((nb_words + 1, embed_size))
    for token, row in word_dict.items():
        vector = word_vec.get(token)
        if vector is None:
            # Out-of-vocabulary for this embedding: leave the zero row in place.
            continue
        matrix[row] = vector
    return matrix

def _read_text_vectors(path, **open_kwargs):
    """Parse a space-separated text embedding file into ``{word: float32 vector}``.

    ``open_kwargs`` are forwarded to ``open`` (encoding, errors, ...). The file
    is always closed, even if a line fails to parse.
    """
    vectors = {}
    with open(path, **open_kwargs) as handle:
        for line in handle:
            values = line.split(" ")
            # Skip blank lines and the "<count> <dim>" header that word2vec/fastText
            # text files start with; storing it would create a bogus 1-element
            # "vector" that broadcasts over a whole embedding row.
            if len(values) < 3:
                continue
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


def loadEmbeddings(name):
    """Load one pretrained embedding and align it with the tokenizer vocabulary.

    Args:
        name: one of ``"glove"``, ``"google"``, ``"paragram"`` or ``"wiki"``.

    Returns:
        The matrix produced by ``make_embedding_matrix`` for the selected
        embedding (one row per tokenizer word id).

    Raises:
        ValueError: if ``name`` is not one of the supported embeddings.
            Previously an unknown name silently produced an all-zero matrix.
    """
    supported = ("glove", "google", "paragram", "wiki")
    if name not in supported:
        raise ValueError("Unknown embedding %r; expected one of %s" % (name, ", ".join(supported)))

    if name == "glove":
        # Explicit encoding so the result does not depend on the platform locale.
        embeddings_index = _read_text_vectors('./embedding/glove.840B.300d.txt', encoding="utf8")

    elif name == "google":
        model = KeyedVectors.load_word2vec_format('./embedding/GoogleNews-vectors-negative300.bin', binary=True)
        # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases as `vocab`.
        vocab = getattr(model, "key_to_index", None)
        if vocab is None:
            vocab = model.vocab
        # KeyedVectors is indexable directly; the `.wv` indirection is deprecated.
        embeddings_index = {word: model[word] for word in vocab}
        del model
        del vocab

    elif name == "paragram":
        print("paragram")
        EMBEDDING_FILE = './embedding/paragram_300_sl999/paragram_300_sl999.txt'
        # Undecodable bytes in this file are dropped rather than raising.
        embeddings_index = _read_text_vectors(EMBEDDING_FILE, encoding="utf8", errors='ignore')

    else:  # name == "wiki"
        EMBEDDING_FILE = './embedding/wiki-news-300d-1M/wiki-news-300d-1M.vec'
        embeddings_index = _read_text_vectors(EMBEDDING_FILE, encoding="utf8")

    embedding_matrix = make_embedding_matrix(embeddings_index)
    # Drop the (large) word -> vector dict as soon as the aligned matrix exists.
    del embeddings_index
    return embedding_matrix



In [6]:
## Load Embeddings

# Each call reads one pretrained embedding from disk and returns the matrix
# built by make_embedding_matrix for the tokenizer vocabulary.
embedding_matrix_wordvec = loadEmbeddings("google")
embedding_matrix_glove = loadEmbeddings("glove")

## Additional channels (wiki, paragram) could be added here.
## They are left disabled because of memory constraints; only the two matrices above are loaded.
#embedding_matrix_wiki  = loadEmbeddings("wiki")
#embedding_matrix_paragram = loadEmbeddings("paragram")



In [7]:
## Hyperparameters
# Width of each embedding vector; same value as embed_size used to build the matrices.
EMBEDDING_DIM = 300
# Every question is padded/truncated to this many token ids before entering the model.
sequence_length = 30
# Number of rows of each Embedding layer (matches the nb_words+1 rows of the matrices).
vocab_size = nb_words+1

# Rows per batch produced by the training and validation generators.
batch_size_train = 128
# Convolution window heights; create_model uses entries 0-3 for two channels and 1-4 for the other two.
filter_sizes = [2,3,4,5,6]
# Output filters of every Conv2D layer.
num_filters = 10
# Not referenced by the visible cells (the Dropout layer in create_model is commented out).
drop = 0.2
# Not referenced by the visible cells (the training call passes epochs=2 literally).
epochs = 100

In [8]:
## Model Definition

def _conv_pool_channel(token_ids, embedding_weights, kernel_heights):
    """Build one embedding channel with parallel convolution/pooling branches.

    Args:
        token_ids: the shared integer input tensor (one row of
            ``sequence_length`` word ids per sample).
        embedding_weights: matrix passed as the initial weights of this
            channel's own Embedding layer.
        kernel_heights: convolution window heights (in tokens), one branch each.

    Returns:
        List with one pooled tensor per kernel height, in the given order.
    """
    embedded = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM,
                         input_length=sequence_length, weights=[embedding_weights])(token_ids)
    # Add a trailing channel axis so the sequence can be treated as a 2-D "image".
    as_image = Reshape((sequence_length, EMBEDDING_DIM, 1))(embedded)

    pooled = []
    for height in kernel_heights:
        # The kernel spans the full embedding width, so it slides along the token axis only.
        conv = Conv2D(num_filters, kernel_size=(height, EMBEDDING_DIM), padding='valid',
                      kernel_initializer='he_normal', activation='relu')(as_image)
        # With 'valid' padding the conv output has sequence_length - height + 1 rows;
        # pooling over all of them keeps a single maximum per filter.
        pooled.append(MaxPool2D(pool_size=(sequence_length - height + 1, 1),
                                strides=(1, 1), padding='valid')(conv))
    return pooled


def create_model():
    """Create and compile the multi-channel CNN classifier.

    Four embedding channels read the same token-id input. Each channel feeds
    four Conv2D + MaxPool2D branches; all sixteen pooled outputs are
    concatenated, flattened and mapped to a single sigmoid unit.

    Channels 3 and 4 reuse the GloVe and word2vec matrices (with the window
    heights shifted by one) in place of the wiki and paragram embeddings,
    which are not loaded because of memory constraints.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    inputs_sent = Input(shape=(sequence_length,), dtype='int32')

    # (initial embedding weights, window heights) for each channel, in concatenation order.
    channels = [
        (embedding_matrix_glove, filter_sizes[0:4]),    # channel 1 - GloVe
        (embedding_matrix_wordvec, filter_sizes[0:4]),  # channel 2 - Google word2vec
        (embedding_matrix_glove, filter_sizes[1:5]),    # channel 3 - GloVe again (stand-in for wiki)
        (embedding_matrix_wordvec, filter_sizes[1:5]),  # channel 4 - word2vec again (stand-in for paragram)
    ]

    pooled_outputs = []
    for weights, heights in channels:
        pooled_outputs.extend(_conv_pool_channel(inputs_sent, weights, heights))

    ## Concatenation of all pooled branches
    concatenated_tensor = Concatenate(axis=1)(pooled_outputs)
    flatten = Flatten()(concatenated_tensor)
    output_prob = Dense(units=1, activation='sigmoid')(flatten)

    model = Model(inputs=inputs_sent, outputs=output_prob)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print("Model Created")
    return model


# Build and compile the CNN once; create_model prints "Model Created" when it finishes.
model= create_model()



Model Created


In [9]:
def preProcessing(text):
    """Convert texts to token ids and return the padded ids of the first text.

    Args:
        text: list of strings handed to the fitted tokenizer.

    Returns:
        Tuple ``(sequences, first_row)``: the unpadded id sequences of every
        text, and the first text's ids padded at the end (``padding="post"``)
        to ``sequence_length`` entries.
    """
    token_ids = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(token_ids, maxlen=sequence_length, padding="post")
    # Only the first row is returned; callers in this notebook pass one text at a time.
    first_row = padded[0]
    return (token_ids, first_row)

def batch_gen_train_cnn(data,batch_size, text_column,label_column):
    """Endlessly yield shuffled ``(token_id_matrix, labels)`` batches.

    Args:
        data: DataFrame holding the text and label columns.
        batch_size: maximum number of rows per batch; must be positive.
        text_column: name of the column with the raw question text.
        label_column: name of the column with the targets.

    Yields:
        Tuple ``(wrd_vec, lable_indx)``: an array with one padded id row per
        question in the batch, and the array of the matching labels.

    Raises:
        ValueError: on the first ``next()`` if ``data`` is empty or
            ``batch_size`` is not positive. Without this check the generator
            would loop forever without ever yielding.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))
    if len(data) == 0:
        raise ValueError("cannot generate training batches from an empty data set")

    n_batches = math.ceil(len(data)/batch_size)
    while True:
        # Reshuffle at the start of every pass so batches differ between epochs.
        shuffled = data.sample(frac = 1.0)
        for i in range(n_batches):
            # Explicit positional slicing; the last batch may be shorter.
            sample_data = shuffled.iloc[batch_size*i:batch_size*(i+1)]
            wrd_vec = np.array([preProcessing([X_text])[1] for X_text in sample_data[text_column]])
            lable_indx = np.array(sample_data[label_column])
            yield wrd_vec,lable_indx
    

In [11]:
## Train / validation split
from sklearn.model_selection import train_test_split

# Fixed seed so the same rows land in the validation fold on every run.
SPLIT_SEED = 42

# Stratify on the target so both folds keep the same class ratio.
split_train , split_val = train_test_split(train, test_size = 0.2, random_state = SPLIT_SEED, stratify = train["target"])
train_batch = batch_gen_train_cnn(split_train,batch_size_train,"question_text","target")
val_batch = batch_gen_train_cnn(split_val,batch_size_train,"question_text","target")

print("Traning Model...")
# Full passes over the data would use these step counts; the short fixed values
# below keep the run quick.
#no_validation_steps = int(len(split_val)/batch_size_train)
#steps_epoch_train = int(len(split_train)/batch_size_train)
steps_epoch_train=100
no_validation_steps = 10
model.fit_generator(train_batch, epochs=2, steps_per_epoch=steps_epoch_train, validation_data = val_batch, validation_steps = no_validation_steps,verbose=True)




Traning Model...
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2670abe80>

In [12]:
from tqdm import tqdm


def batch_gen_test_cnn(data,batch_size, text_column):
    """Yield padded token-id matrices for ``data`` in order, one pass only.

    Args:
        data: DataFrame holding the text column.
        batch_size: maximum number of rows per yielded matrix.
        text_column: name of the column with the raw question text.

    Yields:
        An array with one padded id row per question of the current slice.
    """
    total_rows = len(data)
    n_batches = math.ceil(total_rows / batch_size)
    for start in (batch_size * k for k in range(n_batches)):
        chunk = data[start:start + batch_size]
        rows = [preProcessing([question])[1] for question in chunk[text_column]]
        yield np.array(rows)
    
# Score the first 100 test questions in batches of 20.
batch_size_test = 20
test_sample = test[:100]
test_data_gen = batch_gen_test_cnn(test_sample, batch_size_test, 'question_text')
all_preds = []
for batch_matrix in tqdm(test_data_gen):
    batch_scores = model.predict(batch_matrix)
    # Flatten the per-row outputs into one probability per question.
    all_preds.extend(batch_scores.flatten())

5it [00:00,  2.20it/s]


In [13]:
# Collect ids, predicted probabilities and question text in one frame.
submit_df = pd.DataFrame(
    {
        "qid": test_sample["qid"],
        "prediction_prob": all_preds,
        "q": test_sample["question_text"],
    }
)
# Label columns (stored as the strings '1'/'0') at two decision thresholds.
probabilities = submit_df['prediction_prob']
submit_df["prediction_5"] = np.where(probabilities > 0.5, '1', '0')
submit_df["prediction_3"] = np.where(probabilities > 0.3, '1', '0')

submit_df.head()

Unnamed: 0,qid,prediction_prob,q,prediction_5,prediction_3
0,00014894849d00ba98a9,0.009947,My voice range is A2-C5. My chest voice goes u...,0,0
1,000156468431f09b3cae,0.001019,How much does a tutor earn in Bangalore?,0,0
2,000227734433360e1aae,0.002606,What are the best made pocket knives under $20...,0,0
3,0005e06fbe3045bd2a92,0.023742,Why would they add a hypothetical scenario tha...,0,0
4,00068a0f7f41f50fc399,0.005262,What is the dresscode for Techmahindra freshers?,0,0


In [14]:
# Show the questions flagged at the 0.3 threshold.
submit_df.loc[submit_df["prediction_3"] == '1']

Unnamed: 0,qid,prediction_prob,q,prediction_5,prediction_3
39,003069ba70645b15c3ba,0.510671,Why don't India start a War with Pakistan ? Th...,1,1
69,005af7396a84a515f67c,0.35668,Are Hindus allowed to build new temples in Pak...,0,1
77,0061c39bba71f03ac780,0.547797,Why do people think white privilege is real wh...,1,1
86,0069468befb619ce22c6,0.373943,Is the Indian Fuhrer capable of operating a sm...,0,1
98,007e65e4441890f4416b,0.570884,Why does Quora send me a notice because I told...,1,1
