In [15]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint

import numpy as np
import torch
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the pretrained InferSent sentence encoder.
from InferSent.models import InferSent

# model_version selects the checkpoint file below and is also passed to the
# encoder through the 'version' entry of params_model.
model_version = 1
MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)

# NOTE(review): torch.load unpickles the file, so MODEL_PATH must only ever
# point at a trusted checkpoint.
# map_location='cpu' makes the load succeed on machines without CUDA even when
# the checkpoint was saved from GPU tensors; the model itself is still on the
# CPU at this point, so nothing is lost by loading the weights there.
state_dict = torch.load(MODEL_PATH, map_location='cpu')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [3]:
# Device selection: the encoder stays where it is unless use_cuda is switched on.
use_cuda = False
if use_cuda:
    model = model.cuda()

# Word vectors must match the encoder version: version 1 uses the GloVe file,
# any other version uses the fastText crawl vectors.
if model_version == 1:
    W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt'
else:
    W2V_PATH = 'InferSent/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

# Build the vocabulary from the K most frequent words in the vector file.
model.build_vocab_k_words(K=100000)

Vocab size : 100000


In [30]:
def embedSentence(text):
    """Encode one piece of text with the notebook-level InferSent ``model``.

    ``text`` is wrapped in a one-element list and handed to ``model.encode``
    with tokenization and verbose logging switched on.

    Returns whatever ``model.encode`` produces for that single-item batch
    (presumably an array with one row -- TODO confirm against InferSent).
    """
    single_item_batch = [text]
    return model.encode(single_item_batch, bsize=128, tokenize=True, verbose=True)

In [54]:
# Categories in label order, as (category name, size) pairs.
# NOTE(review): the size is presumably the number of examples per category;
# it is not used by the visible cells -- confirm before relying on it.
_category_sizes = [
    ('Other', 193203),
    ('Politics', 15734),
    ('Media', 4468),
    ('Fashion', 2270),
    ('Foreign Policy', 2205),
    ('Immigration', 1739),
    ('Economy', 1629),
    ('Health', 1434),
    ('Art', 1235),
    ('Gender', 1068),
    ('Sport', 951),
    ('Violence', 766),
    ('Climate', 574),
]

# Each entry is (category name, size, label index); the label index is simply
# the category's position in the list above.
names = [(label, count, position)
         for position, (label, count) in enumerate(_category_sizes)]

In [50]:
# Encode every category file with InferSent and save one array per category:
# all columns but the last hold the sentence embedding, the last column holds
# the category's label index.

# Probability with which each line of the 'Other' file is kept; every other
# category is read in full.
# NOTE(review): no seed is set in the visible cells, so the sampled subset
# differs between runs.
OTHER_KEEP_PROB = 0.08

for name, size, idx in names:
    # The context manager closes the file handle even if reading fails.
    with open('data/%s.txt' % name, 'r') as fh:
        if name == 'Other':
            lines = [line for line in fh if np.random.rand() < OTHER_KEEP_PROB]
        else:
            lines = fh.readlines()
    print("Read the file %s" % name)
    embed = model.encode(lines, bsize=128, tokenize=True, verbose=True)
    # One extra column is appended for the label.
    results = np.zeros((embed.shape[0], embed.shape[1] + 1))
    # Slice relative to the end so the embedding width is never hard-coded.
    results[:, :-1] = embed
    results[:, -1] = idx
    np.save('data/np_array/%s.npy' % name, results)

Read the file Other
Nb words kept : 559704/597757 (93.6%)
Speed : 36.7 sentences/s (cpu mode, bsize=128)
Read the file Politics
Nb words kept : 532875/562984 (94.7%)
Speed : 40.1 sentences/s (cpu mode, bsize=128)
Read the file Election
Nb words kept : 532875/562984 (94.7%)
Speed : 39.8 sentences/s (cpu mode, bsize=128)
Read the file Media
Nb words kept : 161249/171200 (94.2%)
Speed : 33.2 sentences/s (cpu mode, bsize=128)
Read the file Fashion
Nb words kept : 67512/72342 (93.3%)
Speed : 40.1 sentences/s (cpu mode, bsize=128)
Read the file Foreign Policy
Nb words kept : 79309/83998 (94.4%)
Speed : 32.4 sentences/s (cpu mode, bsize=128)
Read the file Immigration
Nb words kept : 59314/62721 (94.6%)
Speed : 32.5 sentences/s (cpu mode, bsize=128)
Read the file Economy
Nb words kept : 56716/59850 (94.8%)
Speed : 31.6 sentences/s (cpu mode, bsize=128)
Read the file Health
Nb words kept : 47073/49346 (95.4%)
Speed : 32.2 sentences/s (cpu mode, bsize=128)
Read the file Art
Nb words kept : 43081

In [55]:
# Stack the per-category arrays into one array: 'Other' first, then the
# remaining categories in the order they appear in `names`.
# 'Other' is skipped in the loop because it is loaded explicitly up front;
# 'Election' is skipped as in the original cell (it is not in the visible
# `names` list, so that guard only matters for a stale list).
# All arrays are concatenated in a single call so the rows are copied once,
# instead of re-copying the growing array for every category. The list is
# built inline so no extra reference to the per-category arrays lingers in the
# notebook namespace afterwards.
data = np.concatenate(
    [np.load('data/np_array/Other.npy')]
    + [np.load('data/np_array/%s.npy' % name)
       for name, _size, _idx in names
       if name not in ('Election', 'Other')],
    axis=0,
)

In [62]:
# One-off label correction: run at most once (not idempotent, since every run shifts labels > 2 down by 1)
# for i in range(49603):
#     if data[i, -1] > 2:
#         data[i, -1] -= 1

In [65]:
# Shuffle the rows of `data` in place (np.random.shuffle permutes along the
# first axis), then write the shuffled array to disk.
# NOTE(review): no seed is set in the visible cells, so the row order differs
# between runs.
shuffled_path = 'data/np_array/all.npy'
np.random.shuffle(data)
np.save(shuffled_path, data)