This notebook loads the training and validation sets into fastai data structures and builds embeddings for both the input and output vocabularies:

In [1]:
import numpy as np
import random
import torch

In [2]:
def setReproducibility(seed_value, use_cuda):
    """Seed every random number generator this notebook relies on.

    Args:
        seed_value: Seed passed to NumPy, PyTorch and the standard-library
            ``random`` module (and to the CUDA generators when requested).
        use_cuda: When truthy, additionally seed the CUDA generators and
            set cuDNN to deterministic mode with benchmarking turned off.
    """
    # CPU-side generators, seeded in the same order as before.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not use_cuda:
        return

    # CUDA generators: current device first, then every device.
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    # Ask cuDNN for deterministic algorithms and disable its autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all generators up front, including the CUDA ones.
setReproducibility(seed_value=42, use_cuda=True)

We change the working directory to the parent directory so that the project's Python packages can be imported:

In [None]:
import os
from pathlib import Path

# Remember the directory the kernel started in the first time this cell runs.
# Without this, every re-run of the cell would climb one more level up the
# tree and break the relative paths and package imports used further down.
if "_notebook_start_dir" not in globals():
    _notebook_start_dir = os.getcwd()

cwd = os.getcwd()
print("Working directory before change:",cwd)
# Always move to the parent of the *original* directory, so running the cell
# once or many times ends in the same place.
path = Path(_notebook_start_dir)
os.chdir(path.parent)
print("Working directory after change:", os.getcwd())

In [3]:
# Paths used by the cells below, all relative to the working directory.
# Alternative dataset locations that were used previously:
#   "/kaggle/input/e2e-dataset"
#   "/floyd/input/e2e_nlg/"
main_path = "."
out_path = "output"
dataset_path = "datasets/e2e-nlg"  # directory passed to E2ENLGDataLoader
fasttext_path = "datasets/fasttext/cc.en.300.bin"  # pretrained model file

In [5]:
import pandas as pd
from utils.fastai_custom import *
from e2e_nlg.loading.loader import E2ENLGDataLoader
from utils.seq2seq_embeds import Seq2SeqEmbeddings
from seq2seq import *
from utils import randm

In [15]:
# Seed again through the packaged helper (seed 42, CUDA flag on).
randm.setReproducibility(42, True)

# Let pandas print up to 1000 characters per column before truncating.
pd.set_option("display.max_colwidth", 1000)

We load the datasets from CSV files and create the data bunch:

In [7]:
# Point the loader at the dataset directory and the two CSV files.
# NOTE(review): percentile=100 presumably keeps sequences of every length;
# confirm against E2ENLGDataLoader.
dl = E2ENLGDataLoader(
    dataset_path,
    "trainset.csv",
    "devset.csv",
    percentile=100,
)

# Prepare the data with a batch size of 32, then persist it.
dl.setDataAndMaxSize(bs=32)
dl.save_data()

2019-12-29 10:38:12,075 - root - INFO - Maximum size for inputs and outputs is: 95

2019-12-29 10:38:13,948 - root - INFO - Size of input vocabulary=56
2019-12-29 10:38:13,949 - root - INFO - Size of output vocabulary=1216
2019-12-29 10:38:13,950 - root - INFO - Seq2SeqDataBunch;

Train: LabelList (42061 items)
x: Seq2SeqTextList
xxbos xxmaj the xxup xxx , coffee shop , xxmaj italian , customer rating low , price range less than £ 20 , family friendly yes , area riverside , near xxup yyy,xxbos xxup xxx , coffee shop , xxmaj italian , customer rating average , area city centre , near xxup yyy,xxbos xxup xxx , xxmaj english , family friendly yes , area riverside , near the xxup yyy,xxbos xxmaj the xxup xxx , coffee shop , xxmaj french , customer rating low , price range less than £ 20 , family friendly no , near xxup yyy,xxbos xxup xxx , coffee shop , xxmaj french , customer rating 3 out of 5 , price range high , family friendly yes , area riverside
y: TextList
xxbos xxmaj there is an in

We create fastText embeddings for each word in the input and output vocabularies:

In [8]:
# Wrap the loaded data in the embeddings helper, rooted at main_path.
embs = Seq2SeqEmbeddings(dl.data, main_path)

# Load the pretrained fastText model, then build the embeddings from it.
embs.set_pretrained_embeddings(model_path=fasttext_path)
embs.set_embeddings()




xxunk
xxpad
xxbos
xxeos
xxfld
xxmaj
xxup
xxrep
xxwrep
xxunk
xxpad
xxbos
xxeos
xxfld
xxmaj
xxup
xxrep
xxwrep
n't
xxxs
clentele
d'oeuvres
yyyn
d'oeuvre
caféteria
xxfake
xxfake
xxfake
xxfake
xxfake
xxfake


In [None]:
# Persist the embeddings built above so they can be reused without recomputing.
# NOTE(review): the output location is decided inside Seq2SeqEmbeddings — confirm.
embs.save_embeddings()