In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
df_train = pd.read_csv("../input/quora-question-pairs/train.csv.zip")

In [3]:
df_train.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
len(df_train)

404290

In [5]:
df_train = df_train[df_train['question1'].apply(lambda x: isinstance(x, str))]
df_train = df_train[df_train['question2'].apply(lambda x: isinstance(x, str))]

In [6]:
len(df_train)

404287

In [31]:
import re, string, six

from nltk.corpus import stopwords
import pandas as pd
import numpy as np

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()

def clean_text(s):
    try:
        return re.sub(r'[^A-Za-z0-9,?"\'. ]+', '', s).encode('utf-8').decode('utf-8')
    except:
        return ""

stops = set(stopwords.words("english"))

def word_match_share(row):
    q1words = {}
    q2words = {}
    try:
        for word in tokenize(row['question1']):
            if word not in stops:
                q1words[word] = 1
        for word in tokenize(row['question2']):
            if word not in stops:
                q2words[word] = 1
        if len(q1words) == 0 or len(q2words) == 0:
            return 0
        shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
        shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
        return (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    except:
        return 0


In [8]:
df_train['wms'] = df_train.apply(word_match_share, axis=1)

In [9]:
df_train['question1'] = df_train['question1'].apply(lambda x: clean_text(x))
df_train['question2'] = df_train['question2'].apply(lambda x: clean_text(x))

In [13]:
import tensorflow as tf
import tensorflow_hub as hub

hub_url = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
embed = hub.KerasLayer(hub_url, trainable=False)

In [14]:
def euc_dist(x, y):
    return np.sqrt(np.dot((x-y), (x-y)))

In [15]:
X_train_q1 = df_train['question1'].tolist()
X_train_q2 = df_train['question2'].tolist()
X_wms = df_train['wms'].tolist()
y_train = (1-df_train['is_duplicate']).tolist()

In [16]:
from sklearn.model_selection import train_test_split

X_train_q1, X_test_q1, X_train_q2, X_test_q2, X_wms_train, X_wms_test, y_train, y_test = train_test_split(X_train_q1, X_train_q2, X_wms, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [17]:
X_wms_train[:4]

[0.9230769230769231,
 0.4444444444444444,
 0.6666666666666666,
 0.1111111111111111]

In [18]:
import tensorflow as tf
# tf.config.run_functions_eagerly(False)
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

In [21]:
input1 = Input(shape=(), dtype=tf.string)
input2 = Input(shape=(), dtype=tf.string)
input_wms = Input(shape=(1,), dtype=tf.float16)

embed1 = embed(input1)
embed2 = embed(input2)

dist = Lambda(lambda x: K.sqrt(K.sum(K.square(x[0] - x[1]), axis=-1, keepdims=True)))([embed1,embed2])

concat = Concatenate(axis=1)([dist, input_wms])

out = Dense(1, activation="sigmoid", kernel_regularizer=l2(1e-4))(concat)
model = Model(inputs=[input1, input2, input_wms], outputs=out)

In [22]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      (None, 128)          124642688   input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 1)            0           keras_layer_2[4][0]        

In [23]:
model.compile(optimizer=Adam(1e-3), loss="binary_crossentropy", metrics=["accuracy"])

In [26]:
# callbacks defined

# learning rate schedule
def step_decay(epoch):
    initial_lrate = 0.003
    drop = 0.5
    epochs_drop = 3
    lrate = initial_lrate * (drop**((1 + epoch)/epochs_drop))
    return lrate

lrate_scheduler = LearningRateScheduler(step_decay)
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
model_chkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

model.fit(x=[np.array(X_train_q1), np.array(X_train_q2), np.array(X_wms_train)],
          y=np.array(y_train),
          batch_size=128,
          epochs=5,
          validation_data=([np.array(X_test_q1), np.array(X_test_q2), np.array(X_wms_test)], np.array(y_test)),
          callbacks=[lrate_scheduler, early_stop, model_chkpoint])

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.57685, saving model to best_model.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.57685 to 0.57635, saving model to best_model.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.57635 to 0.57613, saving model to best_model.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.57613 to 0.57600, saving model to best_model.h5
Epoch 5/5

Epoch 00005: val_loss improved from 0.57600 to 0.57591, saving model to best_model.h5


<tensorflow.python.keras.callbacks.History at 0x7fc834f2b190>

In [28]:
df_test = pd.read_csv("../input/quora-question-pairs/test.csv")

In [29]:
df_test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [32]:
df_test['wms'] = df_test.apply(word_match_share, axis=1)

In [33]:
df_test['question1'] = df_test['question1'].apply(lambda x: clean_text(x))
df_test['question2'] = df_test['question2'].apply(lambda x: clean_text(x))

In [34]:
df_test.head()

Unnamed: 0,test_id,question1,question2,wms
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,0.444444
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,0.666667
2,2,What but is the best way to send money from Ch...,What you send money to China?,0.769231
3,3,Which food not emulsifiers?,What foods fibre?,0.25
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,0.727273


In [35]:
X_test_q1 = df_test['question1'].tolist()
X_test_q2 = df_test['question2'].tolist()
X_test_wms = df_test['wms'].tolist()

In [36]:
from tqdm import tqdm
preds = []
batch_size = 512
steps = len(X_test_q1) // batch_size + 1
for i in tqdm(range(0, steps)):
    X_test_q1_batch = np.array(X_test_q1[i*batch_size: i*batch_size+batch_size])
    X_test_q2_batch = np.array(X_test_q2[i*batch_size: i*batch_size+batch_size])
    X_test_wms_batch = np.array(X_test_wms[i*batch_size: i*batch_size+batch_size])
    preds.extend(model.predict([X_test_q1_batch, X_test_q2_batch, X_test_wms_batch]))

100%|█████████▉| 4582/4583 [06:43<00:00, 11.34it/s]


ValueError: Expect x to be a non-empty array or dataset.

In [38]:
preds = [1 - x[0] for x in preds]

In [40]:
df_test['is_duplicate'] = preds

In [41]:
df_test = df_test.drop(['question1', 'question2', 'wms'], axis=1)

In [42]:
df_test.set_index('test_id').to_csv("submission.csv")