# Finalized data pipelining and model experimentations

We experiment with the data and model.

In [1]:
import tensorflow as tf
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%time df = pd.read_csv('data/cleaned_data.csv')
df = df.drop(['Unnamed: 0'], axis = 1)

df.head()

CPU times: user 21.1 s, sys: 1.7 s, total: 22.8 s
Wall time: 23.1 s


Unnamed: 0,query,passage_text,label
0,what is a corporation,A company is incorporated in a specific nation...,0
1,what is a corporation,"Today, there is a growing community of more th...",0
2,what is a corporation,"Corporation definition, an association of indi...",0
3,what is a corporation,Examples of corporation in a Sentence. 1 He w...,0
4,what is a corporation,1: a government-owned corporation (as a utilit...,0


We find the max sizes of the query sentence and the passage sentences

In [3]:
query_set = list(set(df['query']))
lengths = [len(x) for x in tqdm(query_set)]

print(lengths.index(max(lengths)))

100%|██████████| 521737/521737 [00:00<00:00, 1000653.25it/s]

439507





In [4]:
len(query_set[38430])

19

In [5]:
passage_set = list(set(df['passage_text']))
lengths = [len(x) for x in tqdm(passage_set)]
print(lengths.index(max(lengths)))

100%|██████████| 4732639/4732639 [00:03<00:00, 1423735.03it/s]


116841


In [6]:
len(passage_set[2972819])

269

The max size for a query is **214** and the max size for a passage is **1397**. Based on these findings we provide 0 paddings to the inputs for both the query and the passage embeddings.

## Fetching the data and forming data pipeline

In [3]:
data = df.values

In [4]:
d = data[:20]

In [5]:
d = d.astype(np.unicode_)

In [6]:
d.dtype

dtype('<U432')

In [7]:
with open('embedings/glove.6B.50d.txt','r') as file:
    %time embeddings = file.read()
e = embeddings.split('\n')

CPU times: user 484 ms, sys: 196 ms, total: 680 ms
Wall time: 681 ms


In [8]:
e[:5]

['the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581',
 ', 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392',
 '. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.4132

In [8]:
embeddings = [x.split(' ') for x in tqdm(e)]
e = {x[0] : x[1:] for x in tqdm(embeddings)}

100%|██████████| 400001/400001 [00:04<00:00, 93697.40it/s]
100%|██████████| 400001/400001 [00:03<00:00, 131901.64it/s]


In [9]:
#print(e.get)
for k, v in tqdm(e.items()):
    v = [float(x) for x in v]
    v = np.array(v)
    try:
        v.reshape(50, 1)
    except Exception:
        print(v.shape)
        print(k)
    e[k] = v
    
e['the']

100%|██████████| 400001/400001 [00:08<00:00, 49728.09it/s]

(0,)






array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01])

In [14]:
def fetch_data(data):
    #print(data)
    #print(data.shape)
    #return 1
    query = data[0]
    passage = data[1]
    return tf.stack((query, passage)), tf.reshape(data[2],(1,))

In [15]:
q = tf.constant(d, dtype=tf.string)

dataset = tf.data.Dataset.from_tensor_slices(q).map(fetch_data)
dataset = dataset.batch(1).repeat()

iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

In [17]:
with tf.Session() as sess:
    #sess.run(tf.global_variable_initializer())
    for i in range():
        query_em = []; passage_em = []
        print('batch ' + str(i))
        r = sess.run(next_element)
        inputs = r[0]
        #print(inputs.shape)
        labels = r[1]
        for j in tqdm(inputs):
            
            #print(temp)
            #print(str(j[0]))
            query = j[0].decode('utf-8').split(' ')
            passage = str.lower(j[1].decode('utf-8'))
            passage = re.split('\W+', passage)
#             print(passage)
#             print(len(passage))
#             print(passage[-1])
            query_em.append([e[str(x)] for x in query])
            embeds = []
            for x in passage:
                #print('entered loop')
                try:
                    if(e[x].shape == (50,)):
                        em = e[x]
                        embeds.append(em)
                    #print(len(embeds))
                except KeyError:
                    #em = np.zeros(shape = (50, 1), dtype = np.float64)
                    pass
            embeds = np.array(embeds)
            #print(type(embeds))
            #print()
            #print(embeds)
#             for q in embeds:
#                 print(q.shape)
#             print(embeds.shape)
#             print(np.array(embeds).shape)
            passage_em.append(embeds)
            
#         print(r)
#         print(query_em[0])
#         print(passage_em[0])
        query_em = np.array(query_em)
        passage_em = np.array(passage_em)
        print(query_em.shape)
        print(passage_em.shape)

100%|██████████| 1/1 [00:00<00:00, 366.73it/s]

batch 0
(1, 4, 50)
(1, 73, 50)





In [15]:
passage_em

array([[[ 0.21705 ,  0.46515 , -0.46757 , ..., -0.043782,  0.41013 ,
          0.1796  ],
        [ 0.62583 , -0.57703 ,  0.41163 , ...,  0.21582 , -0.15586 ,
          0.64018 ],
        [ 0.6185  ,  0.64254 , -0.46552 , ..., -0.27557 ,  0.30899 ,
          0.48497 ],
        ...,
        [ 0.92212 , -0.14503 ,  0.70623 , ...,  0.014533, -0.072347,
         -0.29128 ],
        [ 0.26358 ,  0.18747 ,  0.044394, ..., -0.42936 ,  0.52879 ,
         -0.12598 ],
        [ 0.87013 , -0.53648 ,  0.70927 , ...,  0.73918 , -0.11095 ,
         -0.083541]]])

In [16]:
inp = tf.keras.Input(shape = (None, 50))
inp1 = tf.keras.Input(shape = (None, 50))
x = tf.keras.layers.LSTM(100, return_sequences = True)(inp)

x1 = tf.keras.layers.LSTM(100, return_sequences = True)(inp1)
con = tf.keras.layers.concatenate([x, x1], axis = -1)
output = tf.keras.layers.Dense(1, activation = 'sigmoid')(con)
model = tf.keras.Model([inp, inp1], output)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 50)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 50)     0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, None, 100)    60400       input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, None, 100)    60400       input_2[0][0]                    
__________________________________________________________________________________________________
concatenat