# Trying to import external embeddings

In [1]:
import numpy as np
import os
import pickle
import shutil

from tqdm import tqdm_notebook
from glob import glob
from zipfile import ZipFile
from collections import Counter
from gensim.models import KeyedVectors


In [2]:
base_path = "D:/USP/Mestrado/Stance/checkpoints/embeddings/"
base_prefix = "NILC"

In [3]:
paths_list = glob("D:/USP/Mestrado/data/embeddings/pre_trained/*")
paths_list

['D:/USP/Mestrado/data/embeddings/pre_trained\\fast_skip_s300.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s100.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s1000.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s300.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s600.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_cbow_s100.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_cbow_s1000.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_cbow_s300.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_cbow_s50.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s100.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s1000.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s300.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s50.zip',
 'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s600.zip']

In [10]:
paths_list = [
    'D:/USP/Mestrado/data/embeddings/pre_trained\\fast_skip_s300.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s100.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s1000.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s300.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s600.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_cbow_s100.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_cbow_s1000.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_cbow_s300.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_cbow_s50.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s100.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s1000.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s300.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s50.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\w2v_skip_s600.zip'
    'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s100.zip',
    'D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s300.zip',
]

In [11]:
def extract_zip(file_path):
    shutil.rmtree("./temp", ignore_errors=True)
    
    with ZipFile(file_path, "r") as f_:
        f_.extractall('./temp')
    
    return glob("./temp/*")

def get_weights_vocab(model):
    int_to_vocab = {id_:word for id_,word in enumerate(tqdm_notebook(model.index2word, desc="Vocab_to_int", leave=True))}
    cntr = Counter(model.index2word)
    weights = np.stack([model.word_vec(word) for word in tqdm_notebook(model.index2word, desc="Weights", leave=True)])
    
    return weights, int_to_vocab, cntr

def save_pickle(weights, int_to_vocab, cntr, base_path, prefix, name=""):
    os.makedirs(f"{base_path}/{prefix}", exist_ok=True)
    
    with open(f"{base_path}/{prefix}/emb_weights.pkl", "wb") as f_:
        pickle.dump(weights, f_)
    
    np.save(f"{base_path}/{prefix}/{name}.vectorsF", weights)
    
    with open(f"{base_path}/{prefix}/vocab_counter.pkl", "wb") as f_:
        pickle.dump({"int_to_vocab":int_to_vocab, "wrd_cnt":cntr}, f_)
    
    wrd2idx = {wrd: idx for idx, wrd in int_to_vocab.items()}
    with open(f"{base_path}/{prefix}/{name}.vocabF.pkl", "wb") as f_:
        pickle.dump(wrd2idx, f_)

def format_ext_emb_data(base_path, base_prefix, paths_list):
    for model_path in tqdm_notebook(paths_list):
        temp_file_name = extract_zip(model_path)[0]
        model = KeyedVectors.load_word2vec_format(temp_file_name)

        weights, int_to_vocab, cntr = get_weights_vocab(model)

        curr_prefix_ = base_prefix + temp_file_name.split("/")[-1]
        name = model_path.split("\\")[-1].replace(".zip", "")
        save_pickle(weights, int_to_vocab, cntr, base_path, curr_prefix_, name)
    
    shutil.rmtree("./temp", ignore_errors=True)

In [12]:
format_ext_emb_data(base_path, base_prefix, paths_list)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


Vocab_to_int:   0%|          | 0/929605 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


Weights:   0%|          | 0/929605 [00:00<?, ?it/s]

In [9]:
base_path, base_prefix, paths_list

('D:/USP/Mestrado/Stance/checkpoints/embeddings/',
 'NILC',
 ['D:/USP/Mestrado/data/embeddings/pre_trained\\glove_s100.zip'])

---