In [None]:
# default_exp model

# GloVe embeddings and MatchPyramid

> How to load GloVe embeddings and implement the MatchPyramid model

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import torchtext

from inspect import signature
from fastai.text.all import *
from sklearn.feature_extraction.text import CountVectorizer

from matchpyramid.data import *


## Prepare Dataset

In [None]:
#slow
# Load the raw question-pair data through the project's `load_dataset` helper.
# Later cells index it with `.iloc` and column names, i.e. it is used as a DataFrame.
train = load_dataset()

In [None]:
#slow

# The last 20% of the row positions are handed to `IndexSplitter` as the held-out indices.
splits = IndexSplitter(
    np.arange(len(train) - int(.2 * len(train)), len(train))
)(train)

# Collect the unique questions (both columns) of the rows selected by `splits[0]`
# into a single `text` column so they can be tokenized in one pass.
combined_df = pd.DataFrame({
    'text': [
        *train.iloc[splits[0]]['question1'].unique(),
        *train.iloc[splits[0]]['question2'].unique(),
    ]
})

# Only the second value returned by `tokenize_df` is kept; its keys become the vocabulary below.
_, cnt = tokenize_df(combined_df, text_cols='text')

In [None]:
#slow
# Two transform pipelines:
#   inputs  - tokenize `question1` and `question2`, then numericalize the pair with the
#             vocabulary taken from the keys of `cnt`;
#   targets - read `is_duplicate` and categorize it.
dset = Datasets(
    train,
    [
        [
            Tokenizer.from_df('question1', tok_text_col='q1'),
            Tokenizer.from_df('question2', tok_text_col='q2'),
            NumericalizePair(vocab=list(cnt.keys())),
        ],
        [
            ItemGetter('is_duplicate'),
            Categorize(),
        ],
    ],
    splits=splits,
)

In [None]:
#slow
# Sequence length shared by `Pad_Chunk_Pair` and `dataloaders`.
seq_len = 72

# Batch-level hooks forwarded verbatim to `dataloaders`.
dls_kwargs = dict(
    before_batch=Pad_Chunk_Pair(seq_len=seq_len),
    after_batch=Undict(),
    create_batch=fa_convert,
)

dls = dset.dataloaders(bs=128, seq_len=seq_len, **dls_kwargs)

In [None]:
#export
def get_dls(bs=128, seq_len=72, valid_pct=.2):
    """Build the question-pair `DataLoaders` together with the token counter.

    The dataset is loaded with `load_dataset`, the last `valid_pct` fraction of
    its rows is passed to `IndexSplitter` as the held-out indices, and the
    vocabulary is built only from the rows selected by `splits[0]`.

    Args:
        bs: batch size forwarded to `Datasets.dataloaders` (default 128).
        seq_len: sequence length given to `Pad_Chunk_Pair` and `dataloaders`
            (default 72).
        valid_pct: fraction of trailing rows to hold out, strictly between
            0 and 1 (default 0.2).

    Returns:
        `(dls, cnt)` - the dataloaders and the second value returned by
        `tokenize_df`, whose keys are used as the vocabulary.

    Raises:
        ValueError: if `valid_pct` is not strictly between 0 and 1.
    """
    if not 0 < valid_pct < 1:
        raise ValueError(f"valid_pct must be in (0, 1), got {valid_pct!r}")

    train  = load_dataset()
    n_rows = len(train)
    splits = IndexSplitter(np.arange(n_rows - int(valid_pct * n_rows), n_rows))(train)

    # Slice the rows once and reuse them for both question columns.
    train_rows  = train.iloc[splits[0]]
    combined_df = pd.DataFrame({'text': list(train_rows['question1'].unique()) + list(train_rows['question2'].unique())})
    _, cnt      = tokenize_df(combined_df, text_cols='text')

    x_tfms = [Tokenizer.from_df('question1', tok_text_col='q1'),
              Tokenizer.from_df('question2', tok_text_col='q2'),
              NumericalizePair(vocab=list(cnt.keys()))]
    y_tfms = [ItemGetter('is_duplicate'), Categorize()]
    dset   = Datasets(train, [x_tfms, y_tfms], splits=splits)

    dls_kwargs = {
                  'before_batch': Pad_Chunk_Pair(seq_len=seq_len),
                  'after_batch': Undict(),
                  'create_batch': fa_convert
                 }

    dls = dset.dataloaders(bs=bs, seq_len=seq_len, **dls_kwargs)
    return dls, cnt

## GloVe Embeddings

In [None]:
#export
def get_glove_embeddings(name='6B', dim=100):
    """Load pretrained GloVe vectors through `torchtext.vocab.GloVe`.

    Args:
        name: GloVe corpus identifier understood by torchtext (default '6B').
        dim: embedding dimension to load (default 100).

    Returns:
        The `torchtext.vocab.GloVe` vectors object.
    """
    return torchtext.vocab.GloVe(name=name, dim=dim)

In [None]:
#slow
# Pretrained word vectors/embeddings, loaded with `get_glove_embeddings`.
glove = get_glove_embeddings()
# Display the shape of the loaded vector table.
glove.vectors.shape

torch.Size([400000, 100])

In [None]:
#export
def get_my_vocab(cnt):
    """Wrap the token counter `cnt` in a torchtext vocabulary.

    `min_freq=1` is passed to `torchtext.vocab.vocab`, so no minimum-frequency
    cut-off above one occurrence is applied.
    """
    my_vocab = torchtext.vocab.vocab(cnt, min_freq=1)
    return my_vocab

In [None]:
#slow
# Build the torchtext vocabulary from the token counter produced during dataset preparation.
my_vocab = get_my_vocab(cnt)

In [None]:
#export
def convert_cnt_to_glove_emb(my_vocab, glove=None):
    """Look up a GloVe vector for every token of `my_vocab`, in vocabulary order.

    Args:
        my_vocab: object exposing `get_itos()`, the index-to-token list.
        glove: object exposing `get_vecs_by_tokens(tokens)`. When omitted, a
            module-level `glove` is used if one exists; otherwise the vectors
            are loaded with `get_glove_embeddings()`.

    Returns:
        Whatever `glove.get_vecs_by_tokens` returns for the token list.
    """
    if glove is None:
        # Backward compatible with notebook use, where `glove` is a global; in the
        # exported module that global does not exist and used to raise NameError.
        glove = globals().get('glove')
    if glove is None:
        glove = get_glove_embeddings()
    return glove.get_vecs_by_tokens(my_vocab.get_itos())

In [None]:
#slow
# Attach the GloVe vectors looked up for the vocabulary's tokens (in `get_itos()` order).
my_vocab.vectors = convert_cnt_to_glove_emb(my_vocab)
# Show one token next to its own vector: the same index must be used on both sides
# (previously token 13 was displayed beside the vector of token 3).
my_vocab.get_itos()[13], my_vocab.vectors[13]

('disprove',
 tensor([-0.5153,  0.8319,  0.2246, -0.7387,  0.1872,  0.2602, -0.4256,  0.6712,
         -0.3108, -0.6127,  0.0895, -0.2401,  1.1878,  0.6761, -0.0229, -0.9253,
          0.0712,  0.3884, -0.4292,  0.3714,  0.3267,  0.4314,  0.8749,  0.3401,
         -0.2319, -0.4114,  0.4906, -0.3291, -0.4911, -0.1899,  0.3341, -0.2124,
         -0.3839, -0.0805,  1.1161,  0.2362,  0.3133,  0.4929,  0.1000, -0.1513,
         -0.1418, -0.2802, -0.2388, -0.3549,  0.1828, -0.1913,  0.6054,  0.0746,
         -0.2073, -0.6097,  0.1991, -0.5702, -0.1743,  1.4419, -0.2502, -1.8648,
          0.4167, -0.2461,  1.5010,  0.8741, -0.6714,  1.2762, -0.2721,  0.1758,
          1.2242,  0.2824,  0.6237,  0.6395,  0.3691, -0.8468, -0.3227, -0.6715,
         -0.1963, -0.4079, -0.2097, -0.1962,  0.0419,  0.5397, -1.1105, -0.3952,
          0.6659, -0.2330, -1.0820,  0.0465, -2.0993, -0.2849,  0.0800, -0.1296,
         -0.3001, -0.4676, -0.8183, -0.0485, -0.3223, -0.3201, -1.1207, -0.0568,
         -0.730

## Model

In [None]:
#export
class MatchPyramid(Module):
    """Match two token sequences through a convolutional net over their similarity matrix.

    Both inputs are embedded with a shared, trainable embedding initialised from
    `vocab.vectors`. Their scaled dot-product similarity matrix is zero-padded to
    `max_len x max_len`, passed through two conv/ReLU/adaptive-max-pool stages and
    a two-layer MLP that emits one value per pair.

    Args:
        vocab: object whose `vectors` attribute is a 2-D float tensor of shape
            `(vocab_size, emb_size)`.
        max_len: side length the similarity matrix is padded to.
    """

    def __init__(self, vocab, max_len):
        # fastai's `Module` already initialises `nn.Module`; the explicit call is
        # harmless there and keeps the class valid on a plain `nn.Module` base.
        super().__init__()
        self.max_len = max_len
        # Public constructor instead of the private `_weight=` keyword;
        # `freeze=False` keeps the embedding trainable, as before.
        self.emb = nn.Embedding.from_pretrained(vocab.vectors, freeze=False)

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8,
                               kernel_size=(3, 3), padding=0, bias=True)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=16,
                               kernel_size=(3, 3), padding=0, bias=True)

        self.pool1 = nn.AdaptiveMaxPool2d(10)
        self.pool2 = nn.AdaptiveMaxPool2d(5)
        # pool2 always yields 16 feature maps of 5 x 5.
        self.linear1 = nn.Linear(5 * 5 * 16, 128, bias=True)
        self.linear2 = nn.Linear(128, 1, bias=True)
        self.relu = nn.ReLU()

    def forward(self, xa, xb):
        """Score a batch of sequence pairs.

        Args:
            xa: integer token ids of shape `(batch, len_a)`.
            xb: integer token ids of shape `(batch, len_b)`.

        Returns:
            Tensor of shape `(batch, 1)` with one raw (pre-sigmoid) score per pair.
        """
        emb_a, emb_b = self.emb(xa), self.emb(xb)

        # Dot-product similarity, scaled by sqrt(emb_size): (batch, len_a, len_b).
        simi_img = torch.matmul(emb_a, emb_b.transpose(1, 2)) / np.sqrt(emb_a.size(-1))
        # Add the single input channel expected by conv1.
        simi_img = simi_img.unsqueeze(1)

        # Pad the bottom/right of the image up to max_len x max_len.
        pad_a = self.max_len - xa.size(1)
        pad_b = self.max_len - xb.size(1)
        if pad_a != 0 or pad_b != 0:
            simi_img = F.pad(simi_img, (0, pad_b, 0, pad_a))

        simi_img = self.pool1(self.relu(self.conv1(simi_img)))
        simi_img = self.pool2(self.relu(self.conv2(simi_img)))

        # (batch, 16, 5, 5) -> (batch, 400); replaces a no-op squeeze(1) plus view.
        simi_img = simi_img.flatten(1)

        return self.linear2(self.relu(self.linear1(simi_img)))

In [None]:
#slow
# Fetch a batch first so the model can be placed on the device the data is on;
# a hard-coded `.cuda()` fails on machines without a GPU.
x = dls.one_batch()
m = MatchPyramid(my_vocab, max_len=seq_len).to(x[0]['pa'].device)
out = m(x[0]['pa'], x[0]['pb'])

# Sanity check: loss of the untrained model on a single batch.
l = BCEWithLogitsLossFlat()
res = l(out, x[0]['labels'])
print(res)

TensorBase(0.6836, device='cuda:0', grad_fn=<AliasBackward0>)
