In [None]:
!pip install -q import_ipynb
import import_ipynb

import env_setup
import datasets

import numpy as np
import pickle
import pandas

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

def fitBins(value: float, atol: float = 1e-6) -> np.ndarray:
    """One-hot encode a quantised level over eight bins.

    Bin ``k`` (0-based) stands for the level ``(k + 1) / 8``:

        .125  ->  [1, 0, 0, 0, 0, 0, 0, 0]
        .250  ->  [0, 1, 0, 0, 0, 0, 0, 0]
        .375  ->  [0, 0, 1, 0, 0, 0, 0, 0]
                    ..........
        .875  ->  [0, 0, 0, 0, 0, 0, 1, 0]
       1.000  ->  [0, 0, 0, 0, 0, 0, 0, 1]

    Args:
        value: Level to encode (a float, despite the former ``int`` hint).
        atol: Absolute tolerance used when matching ``value`` to a level.
            Exact ``==`` on floats silently drops values that carry
            round-off noise, so a small tolerance is used instead.  It is
            far smaller than the 0.125 bin spacing, so at most one bin
            can match.

    Returns:
        Integer array of length 8 with a single 1 in the matching bin, or
        all zeros when ``value`` matches no level (e.g. 0.0).
    """
    levels = np.arange(1, 9) / 8
    return np.isclose(value, levels, rtol=0.0, atol=atol).astype(int)


#                      time note  vol
#                      row  col   bins
# list(list(float)) -> list(list(list(int)))
def performance2BinsMHE(perfomance: np.array):
    """Expand every value of a 2-D performance matrix into its 8 bins.

    Each row (one time frame) is encoded value by value with ``fitBins``
    and the resulting 8-element vectors are laid side by side, so an
    input of shape (N, M) becomes an integer array of shape (N, 8 * M),
    e.g. (N, 88) -> (N, 704).
    """
    n_frames = perfomance.shape[0]
    n_columns = perfomance.shape[1] * 8
    encoded = np.empty([n_frames, n_columns], dtype=int)

    for row, frame in enumerate(perfomance):
        # Flatten the per-value bin vectors of this frame into one row.
        encoded[row] = [bit for value in frame for bit in fitBins(value)]

    return encoded

# row bins
def buildVocab(corpus):
    """Return the distinct rows of ``corpus`` as a 2-D array.

    Uniqueness is evaluated per row (``axis=0``); ``np.unique`` returns
    the surviving rows in sorted order.
    """
    return np.unique(corpus, axis=0)

# fitBins(.375)

In [None]:
# Load the pickled table for one piece. read_pickle unpickles the file, so
# only point it at files you trust.
serial = pandas.read_pickle('Data/Aguas_De_Marco.pkl')

# Keep every row and the columns from position 9 onward.
# NOTE(review): presumably the first 9 columns are metadata and the rest hold
# the performance data — confirm against the dataset schema.
perf = serial.iloc[:, 9:]

# performanceDecode returns a pair; only its second element is kept here.
# NOTE(review): `datasets` looks like a project notebook loaded through
# import_ipynb, not a published package — confirm.
_, velocities = datasets.performanceDecode(perf)

# Encode each frame as concatenated 8-bin one-hot vectors (8 columns per value).
corpus = performance2BinsMHE(velocities)
print(corpus.shape)

# The vocabulary is the set of distinct encoded frames (unique rows of corpus).
vocab = buildVocab(corpus)
print(vocab.shape)

In [None]:
# Two-way lookup between an integer id and a vocabulary row.
# ix_to_word maps id -> row; word_to_ix maps tuple(row) -> id (the row is
# converted to a tuple so that it can be used as a dict key).
ix_to_word = {ix:word for ix, word in enumerate(vocab)}
word_to_ix = {tuple(word):ix for ix, word in enumerate(vocab)}
# NOTE(review): this prints the whole mapping, one full-length tuple key per
# vocabulary row; printing len(word_to_ix) would keep the output readable.
print(word_to_ix)
# Persist both mappings. np.save is handed plain dicts here.
# NOTE(review): NumPy should wrap each dict in a 0-d object array and pickle
# it, so reading it back would need np.load(..., allow_pickle=True).item() —
# confirm with whatever loads these files.
np.save('Embeddings/ix_to_word', ix_to_word)
np.save('Embeddings/word_to_ix', word_to_ix)

In [None]:
CONTEXT_SIZE = 8 # number of preceding frames used as context for each target
EMBEDDING_DIM = 64 # size of each embedding vector handed to the model

# NOTE(review): the two notes below are unexplained leftovers — clarify or remove.
# F32
#

# F1, F2,...

# Build a list of (context, target) tuples, one per frame index i >= CONTEXT_SIZE.
# The context is ordered most-recent-first:
#   ([ frame_i-1, frame_i-2, ..., frame_i-CONTEXT_SIZE ], frame_i)
ngrams = [
    (
        [corpus[i - j - 1] for j in range(CONTEXT_SIZE)],
        corpus[i]
    )
    for i in range(CONTEXT_SIZE, len(corpus))
]
# Uncomment to show how many n-grams were built and what the first one looks like.
# print(len(ngrams), ngrams[0])

In [None]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram model: embed the context, predict the next item.

    The context indices are embedded, the embeddings are concatenated,
    passed through a 128-unit ReLU layer and projected to log-probabilities
    over the vocabulary.

    Args:
        vocab_size: Number of distinct items (rows of the embedding table
            and width of the output).
        embedding_dim: Size of each embedding vector.
        context_size: Number of context items fed in per prediction.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of the next item for each context.

        Args:
            inputs: Long tensor of vocabulary indices, either a single
                context of shape ``(context_size,)`` or a batch of shape
                ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding log-softmax
            scores; a single context yields ``(1, vocab_size)``.
        """
        # Group the embeddings into rows of one full context each instead of
        # forcing a single row, so a batch of contexts works as well as the
        # single-context call.
        embeds = self.embeddings(inputs).view(-1, self.linear1.in_features)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        return F.log_softmax(out, dim=1)

In [None]:
    def plot_loss_update(i, n, mb, train_loss):
        """Redraw the live loss curve while training is running.

        Args:
            i: Current epoch, counted from 1.
            n: Total number of epochs (sets the right edge of the x axis).
            mb: Progress object exposing ``names`` and ``update_graph``.
            train_loss: Loss values recorded so far, one per epoch.
        """
        x_pad, y_pad = 0.2, 0.05

        mb.names = ['Loss']
        epochs = range(1, i+1)
        curves = [[epochs, train_loss]]

        # Leave a small margin around the data on both axes.
        x_limits = [1-x_pad, n+x_pad]
        y_limits = [np.min(train_loss)-y_pad, np.max(train_loss)+y_pad]

        mb.update_graph(curves, x_limits, y_limits)

In [None]:
from fastprogress.fastprogress import master_bar, progress_bar


losses = []  # mean loss per epoch, in epoch order
loss_function = nn.NLLLoss()

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
# Move the model to its device *before* building the optimizer, as the
# torch.optim documentation recommends.
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

N_EPOCHS = 6

# Without any n-gram the epoch average below would divide by zero; fail early
# with a message that names the cause.
if len(ngrams) == 0:
    raise ValueError('ngrams is empty: the corpus must contain more than CONTEXT_SIZE frames')

# The vocabulary lookups are the same in every epoch, so turn all contexts and
# targets into index tensors once (already on the training device) instead of
# rebuilding tuples and tensors for every sample of every epoch.
context_ixs = torch.tensor(
    [[word_to_ix[tuple(w)] for w in context] for context, target in ngrams],
    dtype=torch.long,
    device=device,
)
target_ixs = torch.tensor(
    [word_to_ix[tuple(target)] for context, target in ngrams],
    dtype=torch.long,
    device=device,
)

mb = master_bar(range(1, N_EPOCHS+1))

for epoch in mb:
    total_loss = 0
    for index in progress_bar(range(len(ngrams)), parent=mb):
        # Step 1. torch *accumulates* gradients, so clear what the previous
        # sample left behind.
        model.zero_grad()

        # Step 2. Forward pass: log probabilities over the next word given
        # the (CONTEXT_SIZE,) vector of context indices.
        log_probs = model(context_ixs[index])

        # Step 3. NLLLoss wants the target as a 1-element index tensor, hence
        # the length-1 slice rather than a scalar.
        loss = loss_function(log_probs, target_ixs[index:index + 1])

        # Step 4. Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        loss_step = loss.item()

        mb.child.comment = f'[Loss step {loss_step:.8f}]'

        total_loss += loss_step

    # The plotted curve uses the mean loss per sample; the bar comment shows
    # the summed loss of the epoch.
    losses.append(total_loss/len(ngrams))

    mb.main_bar.comment = f'[Epoch {epoch} | Loss {total_loss:.8f}]'
    plot_loss_update(epoch, N_EPOCHS, mb, losses)

# NOTE(review): this pickles the whole embedding module rather than a
# state_dict; kept as is because other code may load the file in this form.
torch.save(model.embeddings, 'Embeddings/test_embeddings.pkl')

print(model.embeddings)

In [None]:
# Inspect learned embeddings for individual vocabulary rows.

# Show the vocabulary row stored at position 55.
print(vocab[55])
# Fetch the embedding vector of a row: convert the row to a tuple, look up its
# id in word_to_ix and index the embedding weight matrix with that id.
A = model.embeddings.weight[word_to_ix[tuple(vocab[55])]]
# NOTE(review): B is computed but not used in the code shown here.
B = model.embeddings.weight[word_to_ix[tuple(vocab[56])]]
print(A)