In [1]:
import re
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize  
from toxic import embedding_utils

In [2]:
# Root directory of the input data (relative path).
DATA_DIR = '../../input'
# Directory holding every embedding file referenced below.
EMBEDDING_PATH = DATA_DIR + '/embeddings'

# 100d embedding files carrying the `toxic_` prefix.
local_list = [
  f'{EMBEDDING_PATH}/{relative_path}'
  for relative_path in (
    'glove/toxic_glove_100d.txt',
    'fasttext/toxic_fasttext_100d.txt',
  )
]

# 300d embedding files from external sources.
external_list = [
  f'{EMBEDDING_PATH}/{relative_path}'
  for relative_path in (
    'fasttext/common_crawl_and_en_wiki_fasttext_300d.txt',
    'glove/840B_glove_300d.txt',
    'lexvec/common_crawl_lexvec_300d.txt',
  )
]

In [26]:
def get_save_name(local, external):
  """Build the output path for a local embedding imputed from an external one.

  Args:
    local: path of the local embedding file ('/'-separated).
    external: path of the external embedding file ('/'-separated).

  Returns:
    A string of the form
    '{EMBEDDING_PATH}/imputed/<local base>_impute_<external base>.txt',
    where each base is the last path component with a trailing '.txt'
    removed.
  """
  def get_base_name(file_name):
    """Return the last '/'-separated component without a trailing '.txt'."""
    base = file_name.split('/')[-1]
    # The dot is escaped and the pattern anchored to the end: an unescaped
    # '.txt' matches any character followed by 'txt' anywhere in the name
    # (e.g. it would turn 'my_txt_vecs' into 'my_vecs').
    return re.sub(r'\.txt$', '', base)

  return f'{EMBEDDING_PATH}/imputed/'\
    f'{get_base_name(local)}_impute_'\
    f'{get_base_name(external)}.txt'

In [29]:
# Write one imputed embedding file for every (local, external) pairing.
for local_file in local_list:

  # the local vectors are read once and reused for each external file
  local_vectors = embedding_utils.read_embedding(local_file)

  # external files are visited from last to first
  for external_file in reversed(external_list):

    # destination path, derived from the two input file names
    save_name = get_save_name(local_file, external_file)

    external_vectors = embedding_utils.read_embedding(external_file)

    # NOTE(review): impute_missing presumably fills in vectors absent from
    # the local set using the external set -- confirm against embedding_utils
    imputed_vectors = embedding_utils.impute_missing(
      local_vectors, external_vectors, use_gpu=True)

    embedding_utils.write_embedding(imputed_vectors, save_name)