## Generating 'all_data' dict

In [None]:
import pickle
import gensim
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from collections import defaultdict
from tqdm import tqdm

In [None]:
def create_embedding_matrix(language):
    """Load a pickled word2vec model and build its embedding matrix.

    Args:
        language: Language code used to locate
            ``../../models/{language}_w2v.pickle``.

    Returns:
        A ``(matrix, word_mapping)`` tuple. ``word_mapping`` is the model's
        ``key_to_index`` dict and ``matrix`` is a torch tensor (built from a
        NumPy float64 array) in which row ``word_mapping[word]`` holds the
        vector of ``word``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'../../models/{language}_w2v.pickle', 'rb') as f:
        w2v = pickle.load(f)

    keyed_vectors = w2v.wv

    # Mapping from each vocabulary word to its integer index.
    word_mapping = keyed_vectors.key_to_index

    # Take the embedding width from the model rather than assuming d = 200,
    # so models trained with another dimensionality also work.
    embedding_matrix = np.zeros((len(word_mapping), keyed_vectors.vector_size))

    # Write each vector at the index the lookup assigns to its word, so the
    # matrix rows and the returned mapping cannot drift apart.
    for word, index in word_mapping.items():
        embedding_matrix[index] = keyed_vectors.get_vector(word)
    return torch.from_numpy(embedding_matrix), word_mapping

def get_numeric_corpus(corpus,lookup):
    """Translate every token of every sequence through ``lookup``.

    Args:
        corpus: Iterable of token sequences.
        lookup: Mapping from token to its numeric value.

    Returns:
        A list with one inner list per sequence, holding ``lookup[token]``
        for each token in order.

    Raises:
        KeyError: If a token is missing from ``lookup``.
    """
    return [[lookup[token] for token in tokens] for tokens in corpus]

def padding(sentences, seq_len):
    """Left-pad (or truncate) each sequence to exactly ``seq_len`` entries.

    Args:
        sentences: Sized collection of numeric sequences.
        seq_len: Width of every output row.

    Returns:
        An integer array of shape ``(len(sentences), seq_len)``. Shorter
        sequences are right-aligned with zeros in front; longer ones keep
        only their first ``seq_len`` entries. Empty sequences stay all-zero.
    """
    # NOTE(review): 0 is the fill value; if 0 is also a valid token index,
    # padded positions are indistinguishable from that token - confirm
    # against the lookup used by the caller.
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, tokens in zip(padded, sentences):
        if len(tokens) == 0:
            continue
        kept = np.array(tokens)[:seq_len]
        # Each row is a view into `padded`, so this writes in place.
        row[seq_len - len(kept):] = kept
    return padded

def compute_seq_len():
    """Return the fixed sequence length (200)."""
    seq_len = 200
    return seq_len

In [None]:
def get_data(lang,split,lookup):
    """Load one language/split and derive its numeric and padded forms.

    Args:
        lang: Language code used in the pickle file names.
        split: Split name, used as both directory and file-name component.
        lookup: Mapping from token to numeric value, passed to
            ``get_numeric_corpus``.

    Returns:
        A 4-tuple ``(corpus, numeric corpus, padded array, labels)``, where
        the labels are the unpickled ``y`` file wrapped in ``np.array``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'../../data/amazon_reviews/{split}/processed_data/{split}_tokens_{lang}.pickle', 'rb') as token_file:
        token_corpus = pickle.load(token_file)

    index_corpus = get_numeric_corpus(token_corpus, lookup)
    padded_corpus = padding(index_corpus, compute_seq_len())

    with open(f'../../data/amazon_reviews/{split}/processed_data/y_{split}_{lang}.pickle', 'rb') as label_file:
        raw_labels = pickle.load(label_file)

    labels = np.array(raw_labels)
    return token_corpus, index_corpus, padded_corpus, labels

In [None]:
def dd2():
    """Return a fresh empty dict (innermost default factory)."""
    return {}


def dd():
    """Return a defaultdict whose missing keys default to ``dd2()``.

    Both factories are named module-level functions; presumably this is so
    the nested defaultdicts can be pickled (lambdas cannot) - confirm.
    """
    return defaultdict(dd2)

In [None]:
# Nested containers: data[lang][split][step] and embedding_dict[lang][name].
data = defaultdict(dd)
embedding_dict = defaultdict(dd)
languages = ['en','fr','jp']
splits = ['train','test'] # can try validation
steps = ['corpus', 'ncorp', 'padded','y']

for lang in tqdm(languages):
    # Build the embedding matrix and word lookup once per language.
    matrix, lookup = create_embedding_matrix(lang)
    embedding_dict[lang]['matrix'] = matrix
    embedding_dict[lang]['lookup'] = lookup

    for split in tqdm(splits,leave=False):
        # get_data returns its four outputs in the same order as `steps`.
        outputs = get_data(lang, split, lookup)
        for step, value in zip(steps, outputs):
            data[lang][split][step] = value


In [None]:
# Sanity check: print the shape of every stored artifact.
for lang_key, per_split in data.items():
    for split_key, artifacts in per_split.items():
        for step_key, value in artifacts.items():
            try:
                shape = np.shape(value)
            except ValueError:
                # Variable-length token lists cannot form a rectangular
                # array; NumPy >= 1.24 raises here instead of building an
                # object array, so report the outer length only.
                shape = (len(value),)
            print(lang_key, split_key, step_key, shape)

In [None]:
# Persist the processed corpora and the per-language embeddings, in that order.
for out_path, payload in (
    ('../../data/amazon_reviews/all_data.pickle', data),
    ('../../data/amazon_reviews/mono_lang_embeddings.pickle', embedding_dict),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle)