In [2]:
from __future__ import unicode_literals, division
import re
import sys
from tqdm import tqdm_notebook

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
from embedding import get_embedding
from config import Config
from data_utils import tokenize_sent

# Get configurations

In [6]:
config = Config()
max_len = 70
embedding_size = config.embedding_size

In [7]:
path = '../train.csv'

In [8]:
data = pd.read_csv(path)
data = data[:50000]


In [9]:
columns = data.columns

In [10]:
columns

Index([u'id', u'qid1', u'qid2', u'question1', u'question2', u'is_duplicate'], dtype='object')

In [11]:
data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [12]:
data.isna().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

# sequence lengths

In [13]:
data['seql_one'] = data['question1'].apply(lambda row: len(tokenize_sent(str(row).lower())))
data['seql_two'] = data['question2'].apply(lambda row: len(tokenize_sent(str(row).lower())))

In [14]:
data = data.loc[(data.seql_one != 0)  & (data.seql_two!=0) ]

In [15]:
data.columns

Index([          u'id',         u'qid1',         u'qid2',    u'question1',
          u'question2', u'is_duplicate',     u'seql_one',     u'seql_two'],
      dtype='object')

# spliting data to train and dev

80% train data and 20% test data

In [16]:
split_ratio = 0.8

In [17]:
l = len(data)

In [18]:
train_length = int(l*split_ratio)

In [19]:
data.groupby('is_duplicate').count()

Unnamed: 0_level_0,id,qid1,qid2,question1,question2,seql_one,seql_two
is_duplicate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,31346,31346,31346,31346,31346,31346,31346
1,18649,18649,18649,18649,18649,18649,18649


# word2vec features


In [20]:
def get_sent_embedding(sent):
    sent = str(sent).lower().strip()
    vector = []
    counter = 0
    vector = [get_embedding(w) for w in tokenize_sent(sent)]
    return vector

In [21]:
# get_sent_embedding('dsfhgdsj sajkfgksdgf')

In [22]:
len(get_sent_embedding('i love reading'))

loading w2v


3

In [23]:
train_data = data[:train_length]


train_data['vec_1'] = train_data['question1'].apply(lambda row: get_sent_embedding(row))
train_data['vec_2'] = train_data['question2'].apply(lambda row: get_sent_embedding(row))

train_data = zip(train_data.vec_1, train_data.vec_2, 
                 train_data.seql_one, train_data.seql_two, 
                 train_data.is_duplicate)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [24]:
dev_data = data[train_length:]


dev_data['vec_1'] = dev_data['question1'].apply(lambda row: get_sent_embedding(row))
dev_data['vec_2'] = dev_data['question2'].apply(lambda row: get_sent_embedding(row))

dev_data = zip(dev_data.vec_1, dev_data.vec_2, 
               dev_data.seql_one, dev_data.seql_two, 
               dev_data.is_duplicate)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [25]:
dev_data[0][4]

1

# Building and training Model

In [26]:
from model import Model
model = Model()

In [27]:
tf.reset_default_graph()
model.build()

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [28]:
model.train(train_data=train_data, dev_data=dev_data)

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 0 loss is..0.806667316302
At epoch 0 training accuracy is..0.654397226823


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 0 dev acc..0.710820312798


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 1 loss is..0.560248920302
At epoch 1 training accuracy is..0.710043458043


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 1 dev acc..0.716770833731


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 2 loss is..0.53181825227
At epoch 2 training accuracy is..0.728997478819


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 2 dev acc..0.726634114981


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 3 loss is..0.505417032227
At epoch 3 training accuracy is..0.748678012259


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 3 dev acc..0.728300781548


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 4 loss is..0.481678633838
At epoch 4 training accuracy is..0.760448182085


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 4 dev acc..0.725364583731


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 5 loss is..0.449789798279
At epoch 5 training accuracy is..0.780732484001


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 5 dev acc..0.720983073115


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 6 loss is..0.420289531255
At epoch 6 training accuracy is..0.794167993555


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 6 dev acc..0.721940104663


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 7 loss is..0.394134924955
At epoch 7 training accuracy is..0.807603503109


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 7 dev acc..0.728307291865


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 8 loss is..0.370588906726
At epoch 8 training accuracy is..0.82230294586


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


At epoch 8 dev acc..0.722122396529


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

KeyboardInterrupt: 