In [1]:
## Load Embeddings
## 1. Glove
import pandas as pd
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors


In [2]:
## Keras import
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Activation
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
def loadEmbeddings(name):
    if name == "glove":
        glove_file = datapath('./embedding/glove.840B.300d.txt')
        tmp_file = get_tmpfile("./embedding/glove_word2vec.txt")
        glove2word2vec(glove_file, tmp_file)
        model = KeyedVectors.load_word2vec_format(tmp_file)
        return model

    if name == "google":
        model = KeyedVectors.load_word2vec_format('./embedding/GoogleNews-vectors-negative300.bin', binary=True)
        vocab = model.vocab.keys()
        print('Found %s word vectors.' % len(vocab))
        return model
        
    if name == "paragram":
        print("paragram")
        
    if name == "wiki":
        print("wiki")


In [4]:
## Load Embeddings
google_vec = loadEmbeddings("google")
#tmp = loadEmbeddings("glove")

Found 3000000 word vectors.


In [5]:
## Load data
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
sub = pd.read_csv("./data/sample_submission.csv")
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [6]:
## EDA
## 1. Size of data - train and test Count
print("Train Data: ",str(len(train)))
print("Test Data: ",str(len(test)) )

## 2. Classes distributions in train and test
print("Train Data Positive Class: ",str(len(train[(train.target == 1)])))
print("Train Data Negative Class: ",str(len(train[(train.target == 0)])))
print("% Split +ve class",str(len(train[(train.target == 1)])/len(train) *100)) ## Unbalanced class
print("\n")
## 3. Positive and Negative sample
print(train[(train.target == 0)].head(10))
train[(train.target == 1)].head(10)


Train Data:  1306122
Test Data:  56370
Train Data Positive Class:  80810
Train Data Negative Class:  1225312
% Split +ve class 6.187017751787352


                    qid                                      question_text  \
0  00002165364db923c7e6  How did Quebec nationalists see their province...   
1  000032939017120e6e44  Do you have an adopted dog, how would you enco...   
2  0000412ca6e4628ce2cf  Why does velocity affect time? Does velocity a...   
3  000042bf85aa498cd78e  How did Otto von Guericke used the Magdeburg h...   
4  0000455dfa3e01eae3af  Can I convert montra helicon D to a mountain b...   
5  00004f9a462a357c33be  Is Gaza slowly becoming Auschwitz, Dachau or T...   
6  00005059a06ee19e11ad  Why does Quora automatically ban conservative ...   
7  0000559f875832745e2e  Is it crazy if I wash or wipe my groceries off...   
8  00005bd3426b2d0c8305  Is there such a thing as dressing moderately, ...   
9  00006e6928c5df60eacb  Is it just me or have you ever been in this ph..

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,Has the United States become the largest dicta...,1
30,00013ceca3f624b09f42,Which babies are more sweeter to their parents...,1
110,0004a7fcb2bf73076489,If blacks support school choice and mandatory ...,1
114,00052793eaa287aff1e1,I am gay boy and I love my cousin (boy). He is...,1
115,000537213b01fd77b58a,Which races have the smallest penis?,1
119,00056d45a1ce63856fc6,Why do females find penises ugly?,1
127,0005de07b07a17046e27,How do I marry an American woman for a Green C...,1
144,00068875d7c82a5bcf88,Why do Europeans say they're the superior race...,1
156,0006ffd99a6599ff35b3,Did Julius Caesar bring a tyrannosaurus rex on...,1
167,00075f7061837807c69f,In what manner has Republican backing of 'stat...,1


In [7]:
## 4. Top bi-gram and tri-gram in positive and negative classes


In [8]:
## Train and Test split
from sklearn.model_selection import train_test_split
split_train , split_test = train_test_split(train, test_size = 0.3)

In [9]:
## Pre Processing
all_sentence = pd.concat([train["question_text"], test["question_text"]])
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ \'')
tokenizer.fit_on_texts(all_sentence)
word_dict = tokenizer.word_index
print("Found %s unique words" % len(word_dict))

## Get index to word mapping after tokenization
word_index={}
for wrd,indx in word_dict.items():
    word_index[indx] = wrd
    

Found 209967 unique words


In [10]:
def preProcessing(text):
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, padding = "post", maxlen=100)
    return (sequences,data[0])


In [11]:
embedding = google_vec
embedding_word_vectors = embedding.wv

## For zero padding
dummy_word =  "__TEMP__"
word_index[0] = dummy_word

## Get word vectors for sentences
def getWordVectors(sentence):
    sentence_embd = []
    for wrd_indx in sentence:
        word = word_index.get(wrd_indx)
        if word ==dummy_word:
            break
        elif word in embedding_word_vectors:
            sentence_embd.append(embedding.wv[word])
        else:
            continue
            #print("%s word does not exists" % word)  ## For Non existing words : skip word and add padding in the end
    
    for i in range(len(sentence_embd),100):
        tmp =[0]*300
        sentence_embd.append(tmp)
    return np.array(sentence_embd)
 

## For each sentence
# sentence_indx, data_indx = preProcessing(["hello world"])
# rt = getWordVectors(data_indx)
# print(len(rt))
# #print(rt)


  


In [75]:
## Generate batch data
batch_size = 128
import math
import numpy as np
def batch_gen(data, text_column, label_column= None):
    n_batches = math.ceil(len(data)/batch_size)
    while True:
        data = data.sample(frac = 1.0) ## resuffle
        for i in range(n_batches):
            sample_data = data[128*i:128*(i+1)]
            sample_data["text_column_vec"] = sample_data[text_column].apply(lambda x: getWordVectors(preProcessing([x])[1]))
            wrd_vec  = np.array([sample_data.text_column_vec])
            if label_column:
                lable_indx = np.array(sample_data[label_column])
                yield wrd_vec[0,:,:,:],lable_indx
            else:
                yield wrd_vec[0,:,:,:]
            
#y = batch_gen(split_test[:20],'question_text','target')


In [76]:
## Model
model = Sequential()
model.add(Bidirectional(LSTM(64, return_sequences = True), input_shape = (100,300)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])


In [None]:
no_validation_steps = int(len(split_test)/batch_zise)
train_data_gen =  batch_gen(split_train,'question_text','target')
validation_data_gen =  batch_gen(split_test,'question_text','target')

model.fit_generator(train_data_gen, epochs=5, steps_per_epoch=500, validation_data = validation_data_gen, validation_steps = 100)

Epoch 1/5


  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [29]:
## Validate 
print(model.history.history.items())

dict_items([('val_loss', [0.13326924592256545]), ('val_acc', [0.945078125]), ('loss', [0.15399025445431472]), ('acc', [0.9454375])])


In [53]:
batch_size=128
test_data_gen = batch_gen(test,'question_text')
test_size = math.ceil(len(test)/batch_size)+1

In [54]:
## Predict on test set
results = model.predict_generator(test_data_gen,steps=test_size)

  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [70]:
## Organize output in req. format
predictions = results.flatten()
result_df = pd.DataFrame({"qid": test["qid"],"text":test["question_text"], "prediction_prob": predictions[:len(test)]})
result_df["prediction"] = np.where(result_df['prediction_prob']>0.5, '1', '0')

In [71]:
result_df.head()

Unnamed: 0,qid,text,prediction_prob,prediction
0,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...,0.056964,0
1,000156468431f09b3cae,How much does a tutor earn in Bangalore?,0.017584,0
2,000227734433360e1aae,What are the best made pocket knives under $20...,0.002196,0
3,0005e06fbe3045bd2a92,Why would they add a hypothetical scenario tha...,0.003624,0
4,00068a0f7f41f50fc399,What is the dresscode for Techmahindra freshers?,0.015095,0


In [74]:
result_df[result_df["prediction"]=='1'].head(10)

Unnamed: 0,qid,text,prediction_prob,prediction
39,003069ba70645b15c3ba,Why don't India start a War with Pakistan ? Th...,0.533687,1
42,0036696fb9d739e9afbf,What proof is required to claim input tax credit?,0.60903,1
80,00627daf7194fcf7cbad,What are some latest trending outfits of 2018?,0.62225,1
111,00914f8020ca8495e229,What does it mean when people say you are cute?,0.572724,1
139,00b73d8f97f862044b8f,Whatis distance between toll plaza & yellow line?,0.583141,1
171,00db83cea2561cdf958f,What does to feel like to have sex with a big ...,0.596127,1
210,010a8dddaa14a80b73c7,Is more of America and the world in general fi...,0.526013,1
242,0130a98dab4078e68ec7,Has India stopped crying over Pakistan CPEC?,0.569267,1
244,0135bc547ede7a40f502,What are the uses of wool grease?,0.692081,1
284,0165894435a98dbd50d1,Why can't child abusers be put to death?,0.568771,1
