In [2]:
import numpy as np
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
import torch



In [3]:
# Fixed seed so the train/test split (and everything derived from it) is the
# same after every "Restart Kernel -> Run All".
RANDOM_SEED = 42

# Read the question-pair CSV with an explicit dtype for every expected column.
raw_data = pd.read_csv(
    'data/train.csv',
    dtype={
        'id': int,
        'qid1': int,
        'qid2': int,
        'question1': str,
        'question2': str,
        'is_duplicate': int,
    },
)

# Drop the first three columns by position. A non-mutating drop keeps this cell
# idempotent: `raw_data` is never modified, so `data` is always rebuilt from it.
data = raw_data.drop(columns=raw_data.columns[[0, 1, 2]])

# Hold out 40% of the rows as the test split.
train_df, test_df = train_test_split(data, test_size=0.4, random_state=RANDOM_SEED)
train_df.head()

Unnamed: 0,question1,question2,is_duplicate
80226,Why is email encryption important?,Why isn't email encrypted?,0
292700,How can I learn to focus on something?,How can I focus on something?,1
329203,Why is bluing agent used?,Why is bluing agent used to whiten the white c...,1
142931,"If I ask a question and then unfollow it, will...","If I ask a question anonymously, do I get cred...",0
314893,What type of economy does Greece have? How eff...,What type of government does Greece have? How ...,0


In [4]:
from podium import Vocab, Field, LabelField
from podium.datasets import TabularDataset
from podium.vectorizers import GloVe

In [5]:
from podium.preproc import TextCleanUp

# Pre-tokenization hook configured to remove punctuation.
cleanup = TextCleanUp(remove_punct=True)

# Vocabulary used as the numericalizer of both question fields: capped at
# `max_vocab_size` entries and requiring a minimum frequency of 2.
max_vocab_size = 10_000
vocab = Vocab(min_freq=2, max_size=max_vocab_size)

def lowercase(raw: str) -> str:
    """Return `raw` converted to lower case (used as a pre-tokenization hook)."""
    lowered = raw.lower()
    return lowered

def _make_question_field(column_name):
    """Build the text field for one question column.

    Both question columns use the same configuration: the `cleanup` and
    `lowercase` hooks run before tokenization, the 'split' tokenizer is used,
    and the shared `vocab` acts as numericalizer.
    """
    return Field(
        name=column_name,
        tokenizer='split',
        numericalizer=vocab,
        pretokenize_hooks=[cleanup, lowercase],
    )


Q1 = _make_question_field('question1')
Q2 = _make_question_field('question2')
IS_DUPLICATE = LabelField(name='is_duplicate')

fields = [Q1, Q2, IS_DUPLICATE]

# Both splits are built from the same field objects; fields are finalized
# through the training dataset only.
train, test = (
    TabularDataset.from_pandas(frame, fields) for frame in (train_df, test_df)
)
train.finalize_fields()

# Load only the vectors of vocab words.
glove = GloVe()
embeddings = glove.load_vocab(vocab)

# Generate padded batch.
train_batch, test_batch = (
    dataset.batch(add_padding=True) for dataset in (train, test)
)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [6]:
# Cast the question id matrices of both batches to integers so they can be
# used as indices later on. Order: train q1, train q2, test q1, test q2.
for _batch in (train_batch, test_batch):
    for _column in ('question1', 'question2'):
        setattr(_batch, _column, getattr(_batch, _column).astype(int))

In [7]:
def cosine_similarity(a, b):
    """
    Receives two 2D numpy arrays and calculates cosine similarity across the second axis.
    For example, if `a` and `b` have shape (32, 10), the resulting array has shape (32,).

    Rows where either vector has zero norm have no defined cosine; they are
    reported as 0.0 instead of NaN so that downstream ranking (e.g. `top_n`)
    is not polluted by NaN values.

    Args:
        a: 2D array of shape (n, d).
        b: 2D array of shape (n, d).

    Returns:
        1D numpy array (float64, length n) with cosine similarities
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # Row-wise dot products and norm products, computed without a Python loop.
    dots = np.sum(a * b, axis=1)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)

    # Start from zeros and only divide where the denominator is non-zero.
    res = np.zeros(a.shape[0])
    np.divide(dots, norms, out=res, where=norms > 0)
    return res

def top_n(sims, n=10):
    """
    Receives a 1D numpy array `sims` and finds the indices of the top `n` highest similarities.
    The indices are returned in the ascending order (from lowest to highest index).

    Args:
        sims: 1D array of similarity scores.
        n: number of indices to return; values larger than `len(sims)` are
            clamped, and `n <= 0` yields an empty result.

    Returns:
        1D integer numpy array with at most `n` indices, sorted ascending.
    """
    sims = np.asarray(sims)
    n = min(n, sims.size)
    if n <= 0:
        # Guard needed because `arr[-0:]` would select the whole array.
        return np.empty(0, dtype=np.intp)

    # Source - https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array
    # argpartition only guarantees that the last `n` positions hold the largest
    # values, in no particular order, so the indices are sorted explicitly.
    return np.sort(np.argpartition(sims, -n)[-n:])

In [8]:
def embed_questions(token_ids, embedding_matrix):
    """Map a padded matrix of token ids to word vectors and their per-row mean.

    Args:
        token_ids: 2D integer array of shape (n_rows, seq_len).
        embedding_matrix: 2D array indexed by token id along its first axis.

    Returns:
        Tuple `(vectors, means)` of float64 arrays with shapes
        (n_rows, seq_len, dim) and (n_rows, dim), where `dim` is the second
        dimension of `embedding_matrix`.
    """
    token_ids = np.asarray(token_ids)
    embedding_matrix = np.asarray(embedding_matrix)
    n_rows, seq_len = token_ids.shape
    dim = embedding_matrix.shape[1]

    vectors = np.empty(shape=(n_rows, seq_len, dim))
    means = np.empty(shape=(n_rows, dim))
    # Processed row by row so that only one row of looked-up vectors is held
    # as a temporary at any time.
    for i, row in enumerate(token_ids):
        row_vectors = embedding_matrix[row]  # integer-array lookup: (seq_len, dim)
        vectors[i] = row_vectors
        # NOTE(review): the mean runs over every position of the padded row,
        # so padding ids contribute their embedding too - confirm intended.
        means[i] = np.mean(row_vectors, axis=0)
    return vectors, means


N_train = len(train_batch.question1)
N_test = len(test_batch.question1)

# Padded sequence length of each question matrix.
train_s1_dim = train_batch.question1.shape[1]
train_s2_dim = train_batch.question2.shape[1]
test_s1_dim = test_batch.question1.shape[1]
test_s2_dim = test_batch.question2.shape[1]

# Per-token vectors and per-question mean vectors for every split/column pair.
question1_train, question1_train_mean = embed_questions(train_batch.question1, embeddings)
question2_train, question2_train_mean = embed_questions(train_batch.question2, embeddings)
question1_test, question1_test_mean = embed_questions(test_batch.question1, embeddings)
question2_test, question2_test_mean = embed_questions(test_batch.question2, embeddings)

# for i, si in enumerate(reversed(top_n(cosine_similarity(question1_train_mean, question2_train_mean), n=10))):
#     row = train_df.iloc[si]
#     print(f"{i+1})\n {row['question1']}\n {row['question2']}\n")

: 

In [1]:
from torch import nn

class MLP(nn.Module):
    """Single-hidden-layer perceptron that outputs one probability per input row.

    Args:
        input_size: number of input features.
        hidden_layer_shape: sequence whose first entry is the width of the
            hidden layer. Only `hidden_layer_shape[0]` is used: the second
            linear layer must accept exactly what the first one produces.
    """

    def __init__(self, input_size, hidden_layer_shape):
        # nn.Module must be initialised before any submodule is assigned.
        super().__init__()
        hidden_size = hidden_layer_shape[0]
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)
        # A softmax over a single logit is always 1.0; a sigmoid turns the
        # single logit into a probability in (0, 1).
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return probabilities of shape (..., 1) for inputs of shape (..., input_size)."""
        hidden = self.relu(self.fc1(x))
        output = self.fc2(hidden)
        return self.sigmoid(output)

In [None]:
# 'en_core_web_lg' is the name of an installable pipeline package, so it has
# to be loaded with `spacy.load`; `spacy.blank` expects a language code such
# as 'en'. The package must be installed in the kernel's environment.
nlp = spacy.load('en_core_web_lg')
nlp