In [1]:
import pickle
import os
import numpy as np
from fasttext import load_model
import torch
from torch.nn.modules.sparse import EmbeddingBag
from torch.autograd import Variable
import numpy as np

In [2]:
# Path template of the pickled dataset; the '{}' placeholder is filled with TYPE at load time.
DATASET = '/home/kvassay/data/z/data/reviews_train_test_dev1_{}.pickle'
# Dataset variant substituted into the DATASET template.
TYPE = 'sent_tok'
# Plain-text corpus file written by this notebook and passed to the fastText CLI as -input.
FASTTEXT_INPUT = '/home/kvassay/data/z/data/fasttext/train.txt'
# Output prefix passed to the fastText CLI as -output; the model is later loaded from this prefix + '.bin'.
FASTTEXT_MODEL_SAVE = '/home/kvassay/data/z/models/fasttext/sent_tok_lc_100'

In [3]:
%%time
# Unpickle the dataset (a 3-item sequence) and keep only its first item as `train`;
# the other two items are discarded.
# NOTE(review): pickle.load can execute arbitrary code — only use it on trusted files.
with open(DATASET.format(TYPE), 'rb') as pickle_file:
    train, _, _ = pickle.load(pickle_file)

CPU times: user 10.3 s, sys: 1.24 s, total: 11.5 s
Wall time: 11.5 s


## Prepare training data

In [4]:
# Build the fastText training corpus: one lowercased, space-joined, newline-terminated
# line per entry of sample['text'], followed by one line for sample['summary'].
# Presumably each entry (and the summary) is a list of token strings — TODO confirm.
SHUFFLE_SEED = 42  # fixed seed so the shuffled corpus is identical on every re-run

all_sents = []
for sample in train:
    all_sents.extend(' '.join(tokens).lower() + '\n' for tokens in sample['text'])
    all_sents.append(' '.join(sample['summary']).lower() + '\n')

# Shuffle in place with a local, seeded generator instead of the unseeded global one,
# so the corpus order is reproducible and the global NumPy RNG state is left alone.
np.random.RandomState(SHUFFLE_SEED).shuffle(all_sents)

In [5]:
# Write the corpus lines (each already ends with '\n') to the fastText input file.
# The encoding is pinned to UTF-8 because fastText assumes UTF-8 encoded text; relying on
# the platform default could corrupt or reject non-ASCII characters.
with open(FASTTEXT_INPUT, 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(all_sents)

## Train fastText

In [6]:
# Train a skip-gram model with the fastText CLI on the corpus file written to FASTTEXT_INPUT.
# Flags, per the fastText CLI help (confirm against the installed version):
#   -dim 100  size of the word vectors
#   -neg 10   number of negatives sampled
#   -epoch 2  number of passes over the corpus
# IPython interpolates $FASTTEXT_INPUT and $FASTTEXT_MODEL_SAVE from the Python variables.
# A later cell loads the trained model from FASTTEXT_MODEL_SAVE + '.bin'.
!cd ~/project/fastText && ./fasttext skipgram -dim 100 -neg 10 -epoch 2 -input $FASTTEXT_INPUT -output $FASTTEXT_MODEL_SAVE

Read 58M words
Number of words:  48498
Number of labels: 0
Progress: 100.0% words/sec/thread:  107377 lr:  0.000000 loss:  2.632658 ETA:   0h 0m 105586 lr:  0.044902 loss:  2.677036 ETA:   0h 1m lr:  0.030798 loss:  2.640083 ETA:   0h 0m lr:  0.019923 loss:  2.619680 ETA:   0h 0m


## Sanity check
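Load the trained vectors into a PyTorch `EmbeddingBag` and check the shape of the output. The `FastTextEmbeddingBag` class below follows the official fastText example: https://github.com/facebookresearch/fastText/blob/master/python/doc/examples/FastTextEmbeddingBag.py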

In [7]:
class FastTextEmbeddingBag(EmbeddingBag):
    """EmbeddingBag whose weights are initialised from a trained fastText model.

    The embedding table is a copy of the fastText model's input matrix. Each word
    passed to ``forward`` is looked up as the bag of subword indices fastText
    reports for it, and the bag is reduced with EmbeddingBag's default mode
    ('mean' per the PyTorch documentation), giving one vector per word.
    """

    def __init__(self, model_path):
        """Load the fastText model at ``model_path`` and copy its input matrix.

        Args:
            model_path: path of the fastText model file, forwarded to
                ``fasttext.load_model``.
        """
        fasttext_model = load_model(model_path)
        input_matrix = fasttext_model.get_input_matrix()
        num_embeddings, embedding_dim = input_matrix.shape
        super().__init__(num_embeddings, embedding_dim)
        # Assigned only after nn.Module.__init__ has run, so attribute handling
        # never sees a half-initialised module.
        self.model = fasttext_model
        # Copy without autograd tracking instead of going through the legacy `.data`.
        with torch.no_grad():
            self.weight.copy_(torch.as_tensor(input_matrix, dtype=self.weight.dtype))

    def forward(self, words):
        """Embed an iterable of word strings.

        Args:
            words: iterable of strings. Note that a single ``str`` is iterated
                character by character, so wrap a lone word in a list.

        Returns:
            Tensor of shape ``(number of words, embedding_dim)``. An empty
            ``words`` gives a ``(0, embedding_dim)`` tensor.
        """
        subword_chunks = []
        word_offsets = []
        cursor = 0
        for word in words:
            # get_subwords returns a pair; the second item is used as row indices
            # into the embedding table.
            _, subinds = self.model.get_subwords(word)
            word_offsets.append(cursor)  # start of this word's bag in the flat index array
            subword_chunks.append(np.asarray(subinds, dtype=np.int64))
            cursor += len(subinds)

        if not word_offsets:
            # Nothing to look up: return an explicitly empty result.
            return self.weight.new_zeros((0, self.embedding_dim))

        # Concatenate once (linear) instead of once per word (quadratic), and build
        # the index tensors on the weight's device so a moved module still works.
        device = self.weight.device
        ind = torch.from_numpy(np.concatenate(subword_chunks)).to(device)
        offsets = torch.tensor(word_offsets, dtype=torch.long, device=device)
        return super().forward(ind, offsets)

In [8]:
# Load the trained fastText binary (output prefix + '.bin') into the EmbeddingBag wrapper.
model = FastTextEmbeddingBag('{}.bin'.format(FASTTEXT_MODEL_SAVE))




In [9]:
# Call the module itself rather than .forward() so nn.Module's __call__ machinery
# (e.g. registered hooks) runs. Expect one 100-dimensional row per input word.
model(['this', 'is', '!!!']).shape

torch.Size([3, 100])

In [10]:
# Command to inspect nearest neighbours (k-NN, k=10) interactively; run it from ~/project/fastText:
# ./fasttext nn /home/kvassay/data/z/models/fasttext/sent_tok_lc_100.bin 10