In [1]:
import nltk
import torch
import numpy as np

In [2]:
# Fetch the NLTK 'punkt' tokenizer data; the encode call later in this
# notebook is invoked with tokenize=True (presumably relying on it — confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/coder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from InferSent.models import InferSent 

In [4]:
# Load model
model_version = 2
MODEL_PATH = "encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)

# Deserialize the weights onto the CPU so the checkpoint also loads on hosts
# without CUDA; the model is moved to the selected device in a later cell.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))

# If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

Vocab size : 100000


In [5]:
# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Move the encoder to the chosen device and show its layer summary.
model = model.to(device)
print(model)

InferSent(
  (enc_lstm): LSTM(300, 2048, bidirectional=True)
)


In [24]:
# Encode every '<split>/<label>.txt' file (one sentence per line) with the
# loaded model and store the resulting array as '<split>/<label>.npy'.
for split in ['dev', 'test']:
    for file in ['neutral', 'entailment', 'contradiction']:
        print('{}/{}'.format(split, file))

        # Read the sentences, dropping surrounding whitespace and the trailing
        # newline. The encoding is pinned so the result does not depend on the
        # locale of the machine running the notebook.
        with open('{}/{}.txt'.format(split, file), encoding='utf-8') as f:
            sentences = [line.strip() for line in f]
        print(len(sentences))

        embeddings = model.encode(sentences, bsize=128, tokenize=True, verbose=True)
        print('nb sentences encoded : {0}'.format(len(embeddings)))

        # Row i of the saved array must correspond to line i of the text file;
        # fail loudly rather than write a misaligned array.
        assert len(embeddings) == len(sentences), (
            'expected {} embeddings, got {}'.format(len(sentences), len(embeddings)))

        # np.save appends the '.npy' extension to the given path.
        np.save('{}/{}'.format(split, file), embeddings)

        # Drop the reference so the array can be reclaimed before the next file.
        del embeddings

dev/neutral
6470
Nb words kept : 91009/91468 (99.5%)
Speed : 218.8 sentences/s (gpu mode, bsize=128)
nb sentences encoded : 6470
dev/entailment
6658
Nb words kept : 89074/89513 (99.5%)
Speed : 417.2 sentences/s (gpu mode, bsize=128)
nb sentences encoded : 6658
dev/contradiction
6556
Nb words kept : 89578/90051 (99.5%)
Speed : 362.9 sentences/s (gpu mode, bsize=128)
nb sentences encoded : 6556
test/neutral
6438
Nb words kept : 90362/90869 (99.4%)
Speed : 351.0 sentences/s (gpu mode, bsize=128)
nb sentences encoded : 6438
test/entailment
6736
Nb words kept : 90185/90712 (99.4%)
Speed : 482.9 sentences/s (gpu mode, bsize=128)
nb sentences encoded : 6736
test/contradiction
6474
Nb words kept : 87768/88302 (99.4%)
Speed : 381.1 sentences/s (gpu mode, bsize=128)
nb sentences encoded : 6474


In [16]:
# Memory-map the saved array read-only instead of reading it fully into RAM.
# NOTE(review): the encoding loop in this notebook only covers 'dev' and 'test';
# this 'train' file was presumably produced by an earlier run — confirm.
a = np.load("train/entailment.npy", mmap_mode='r')

In [17]:
# Check the dimensions of the memory-mapped array
# (presumably one row per encoded sentence — confirm).
a.shape

(366832, 4096)

In [18]:
# Peek at the first row of the array.
a[0]

memmap([ 0.00746889, -0.04436943,  0.1555068 , ..., -0.00516149,
         0.02470247, -0.01385642], dtype=float32)

In [19]:
# Peek at the second row for comparison with the first.
a[1]

memmap([ 0.00746889, -0.04436943,  0.15578723, ..., -0.02317487,
        -0.04125318, -0.01385642], dtype=float32)