<a href="https://colab.research.google.com/github/manuelladron/mmml_f20/blob/main/Skipgram_ADARI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive; the paths used further down all live under it.
from google.colab import drive 
#drive.flush_and_unmount()
drive.mount('/content/gdrive') 

Mounted at /content/gdrive


In [None]:
import sys
# Extend the import search path with the project folders on Drive
# (presumably where the star-imported vocab4embeddings module lives — confirm).
sys.path.append('/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021')
sys.path.append('/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/notebooks/')

In [None]:
import json, pickle
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time, sys, os, random, io
from operator import itemgetter
from IPython.display import clear_output
from vocab4embeddings import *

CUDA is True
8 cuda


In [None]:
#Check if cuda is available
cuda = torch.cuda.is_available()
print('CUDA is', cuda)
# Use the GPU when one is present, otherwise fall back to the CPU.
device = torch.device("cuda" if cuda else "cpu")
# NOTE(review): presumably meant for DataLoader(num_workers=...); the loaders
# created later in this notebook do not pass it.
num_workers = 8 if cuda else 0
print(num_workers, device)

CUDA is True
8 cuda


In [None]:
def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    Args:
        progress: Fraction completed. Ints are converted to float, any other
            non-float value is treated as 0, and the result is clamped to
            the range [0, 1].
    """
    bar_length = 20

    # Normalise the input type first, then clamp the value.
    if isinstance(progress, int):
        progress = float(progress)
    elif not isinstance(progress, float):
        progress = 0

    if progress < 0:
        progress = 0
    elif progress >= 1:
        progress = 1

    filled = int(round(bar_length * progress))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wait=True delays clearing until the replacement output is available.
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, progress * 100))

### GloVe embeddings

In [None]:
# Parse the pre-trained GloVe text file into {word: vector}.
# Each non-empty line is "<word> <v1> <v2> ... <vN>" separated by whitespace.
glove_path = '/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/data/embeddings/glove.6B.50d.txt'
with io.open(glove_path, 'r', encoding='utf8') as f:
    glove_file = f.read()

glove_sentences = glove_file.splitlines()
glove_vocab = {}
for sentence in glove_sentences:
    # Split once per line instead of twice.
    fields = sentence.split()
    if not fields:
        # A blank / whitespace-only line has no word token; skip it instead
        # of failing with IndexError.
        continue
    word = fields[0]
    embedding = np.array(fields[1:], dtype = float)
    glove_vocab[word] = embedding

### Our embeddings

### Train embeddings

In [None]:
def load_vocabulary(file_path):
    """Load a pickled vocabulary object from disk.

    Args:
        file_path: Path to a file written with ``pickle.dump``.

    Returns:
        Whatever object was pickled into the file.

    Note:
        ``pickle.load`` can execute arbitrary code while unpickling; only
        call this on files from a trusted source.
    """
    # The context manager closes the handle even if unpickling raises.
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# The file is read with pickle.load (see load_vocabulary) even though its name ends in .json.
vocab = load_vocabulary('/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/data/vocabulary/vocab4embeds_fur_c5sw.json')

In [None]:
# Only the last expression of a cell is echoed, so the tuple on the first
# line is evaluated but not displayed.
vocab.word2idx['angular'], vocab.word2idx_context['angular']
vocab.last_target_id

3458

In [None]:
# Count vocabulary words that have no pre-trained GloVe vector.
# A generator expression is used so no loop variable leaks into the notebook
# namespace (the previous loop rebound the builtin `id`).
c = sum(1 for w in vocab.word2idx if w not in glove_vocab)
c

9247

In [None]:
# Number of samples in vocab.data, then number of entries in vocab.word2idx.
print(len(vocab.data))
print(len(vocab.word2idx))
# Evaluated but not displayed: only the cell's last expression is echoed.
vocab.data[1000:1010]
vocab.last_target_id

46670
26926


3458

**Data set**

In [None]:
def create_datasets(data_, split=0.8):
    """Shuffle the samples and split them into training and test lists.

    Args:
        data_: Object exposing a ``data`` sequence of samples.
        split: Fraction of the samples assigned to the training set.

    Returns:
        ``(training_data, test_data)`` as two new lists.

    The shuffle is applied to a copy, so ``data_.data`` keeps its original
    order and re-running the cell does not reshuffle already shuffled data.
    Uses the global ``random`` state; seed it beforehand for a reproducible
    split.
    """
    samples = list(data_.data)
    if samples:
        # Show one sample (from the original order) as a sanity check.
        print(samples[0])

    random.shuffle(samples)

    training_number = int(len(samples) * split)
    training_data = samples[:training_number]
    test_data = samples[training_number:]
    return training_data, test_data

# Split the vocabulary's (context, target) samples into training and test lists.
training_data, test_data = create_datasets(vocab)

(['maple', 'plywood', 'boards', '.', 'objects', 'originally', 'designed', 'purpose'], '"-conceptualizing')


In [None]:
# Sanity check: the ids in idx2word are wrapped in a plain list before sampling from them.
print(type(list(vocab.idx2word.keys())))

<class 'list'>


In [None]:
class EmbeddingDataset(Dataset):
    """Dataset of (context, target) samples with random negative samples.

    Args:
        X: Sequence of ``(context_words, target_word)`` pairs.
        vocabulary: Object exposing ``word2idx`` and ``idx2word``. Defaults to
            the notebook-level ``vocab`` so ``EmbeddingDataset(X)`` keeps
            working.
        num_negatives: Number of negative ("fake") context ids drawn per
            sample.

    The candidate ids for negative sampling are snapshotted once at
    construction time instead of being rebuilt from the vocabulary on every
    ``__getitem__`` call.
    """
    def __init__(self, X, vocabulary=None, num_negatives=10):
        self.X = X
        self.vocab = vocab if vocabulary is None else vocabulary
        self.num_negatives = num_negatives
        # int64 so torch.from_numpy yields a LongTensor on every platform.
        self._candidate_ids = np.asarray(list(self.vocab.idx2word.keys()), dtype=np.int64)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        """Return ``(context_ids, target_id, context_fakes)`` for one sample.

        ``context_ids`` has one id per context word, ``target_id`` has shape
        ``[1]`` and ``context_fakes`` has shape ``[num_negatives]``.
        """
        context, target = self.X[index][0], self.X[index][1]
        word2idx = self.vocab.word2idx

        context_ids = torch.tensor([word2idx[w] for w in context], dtype=torch.long)

        # TODO: exclude ids that belong to the real context; negatives are
        # currently drawn uniformly (with replacement) from the whole vocabulary.
        context_fakes = torch.from_numpy(
            np.random.choice(self._candidate_ids, size=self.num_negatives)
        )

        target_id = torch.tensor([word2idx[target]], dtype=torch.long)

        return context_ids, target_id, context_fakes
    
def collate(sequence, context_size=10):
    """Batch the samples produced by ``EmbeddingDataset.__getitem__``.

    Args:
        sequence: List with one ``(context_ids, target_id, context_fakes)``
            tuple per sample in the batch.
        context_size: Fixed context width. Shorter contexts are right-padded
            with id 0 and longer ones are truncated to this length.

    Returns:
        ``(contexts, targets, context_fakes)`` where ``contexts`` is
        ``[batch, context_size]``, ``targets`` is ``[batch]`` and
        ``context_fakes`` is ``[batch, n_fakes]``.
    """
    padded_contexts = []
    for context_ids, _, _ in sequence:
        # Truncate explicitly rather than relying on negative padding.
        context_ids = context_ids[:context_size]
        missing = context_size - context_ids.shape[0]
        if missing:
            context_ids = F.pad(context_ids, (0, missing), 'constant', 0)
        padded_contexts.append(context_ids.view(1, context_size))
    contexts = torch.cat(padded_contexts, dim=0)  # [batch, context_size]

    targets = torch.cat([sample[1] for sample in sequence], dim=0)  # [batch]

    # The per-sample fakes are concatenated flat and then folded back into one
    # row per sample.
    context_fakes = torch.cat([sample[2] for sample in sequence], dim=0)
    context_fakes = context_fakes.view(targets.shape[0], -1)

    return contexts, targets, context_fakes


In [None]:
batch_size = 64
train_dataset = EmbeddingDataset(training_data)
# drop_last=True discards a final training batch that is smaller than batch_size.
train_loader = DataLoader(train_dataset, shuffle = True, batch_size = batch_size, collate_fn = collate, drop_last=True)

test_dataset = EmbeddingDataset(test_data)
# Evaluation keeps the sample order and the (possibly smaller) final batch.
test_loader = DataLoader(test_dataset, shuffle = False, batch_size = batch_size, collate_fn = collate)

In [None]:
def create_embedding_layer(weights_matrix, non_trainable=False):
    """Build an ``nn.Embedding`` initialised with a copy of ``weights_matrix``.

    Args:
        weights_matrix: 2-D tensor of shape ``[num_embeddings, embedding_dim]``.
        non_trainable: When True the embedding weights are frozen.

    Returns:
        The initialised ``nn.Embedding`` layer.
    """
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        # Freezing must target the parameter: setting `requires_grad` on the
        # Module only creates an unrelated attribute and leaves the weights
        # trainable.
        emb_layer.weight.requires_grad = False

    return emb_layer

# Build the initial embedding matrix: one 50-d row per vocabulary word.
# Words found in GloVe get their pre-trained vector; all others get a random
# normal vector (std 0.6).
weights_matrix = torch.zeros((len(vocab.word2idx), 50))
for token, row in vocab.word2idx.items():  # avoid rebinding the builtin `id`
    vector = glove_vocab.get(token)
    if vector is None:
        vector = np.random.normal(scale=0.6, size=(50,))
    weights_matrix[row] = torch.from_numpy(vector)

In [None]:
class SkipGramModel(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Attributes:
        emb_size: Number of rows in the embedding tables (vocabulary size).
        emb_dimension: Width of each embedding vector.
        u_embeddings: Embedding table for the center (target) word,
            initialised from the notebook-level ``weights_matrix``.
        v_embeddings: Embedding table for context / negative words,
            initialised to zero.
    """
    def __init__(self, emb_size, emb_dimension):
        super().__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        # NOTE: the shape of u_embeddings comes from `weights_matrix`, not from
        # the constructor arguments; the two must agree for forward() to work.
        self.u_embeddings = create_embedding_layer(weights_matrix, False) # init weights with glove vectors
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=False).to(device)
        self.init_emb()

    def init_emb(self):
        """Zero the context embeddings; the target embeddings keep their pre-trained values."""
        self.v_embeddings.weight.data.zero_()

    def forward(self, pos_u, pos_v, neg_v):
        """Return the mean negative-sampling loss over the batch.

        Args:
            pos_u: Target word ids, shape ``[batch]``.
            pos_v: Context word ids, shape ``[batch, n_context]``.
            neg_v: Negative-sample word ids, shape ``[batch, n_neg]``.

        Returns:
            0-dim tensor: ``-(log s(u.v_ctx) + sum_k log s(-u.v_neg_k))``
            averaged over the batch, where ``s`` is the sigmoid and ``v_ctx``
            is the mean of the context embeddings.
        """
        emb_u = self.u_embeddings(pos_u)               # [batch, dim]
        emb_v = self.v_embeddings(pos_v).mean(dim=1)   # averaged context -> [batch, dim]

        # Reduce over the embedding axis explicitly; a bare squeeze() would
        # also drop the batch axis when the batch holds a single sample.
        score = torch.sum(emb_u * emb_v, dim=1)        # [batch]
        log_target = F.logsigmoid(score)

        neg_emb_v = self.v_embeddings(neg_v)           # [batch, n_neg, dim]
        # [batch, n_neg, dim] x [batch, dim, 1] -> [batch, n_neg, 1] -> [batch, n_neg]
        neg_score = torch.bmm(neg_emb_v, emb_u.unsqueeze(2)).squeeze(2)
        # Apply log-sigmoid to each negative sample and then sum. Summing the
        # raw scores first would let one strongly negative score mask the rest.
        sum_log_sampled = F.logsigmoid(-neg_score).sum(dim=1)  # [batch]

        loss = log_target + sum_log_sampled
        bs = emb_u.shape[0]

        return -1*loss.sum()/bs



In [None]:
def train_epoch_s(model, train_loader, criterion, optimizer):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module whose ``forward(target, pos, neg)`` returns a scalar loss.
        train_loader: Iterable of ``(pos, target, neg)`` batches.
        criterion: Unused (the model computes its own loss); kept so existing
            callers do not break.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        Mean of the per-batch losses, or 0.0 when the loader yields nothing.
    """
    model.train()

    # Run on whatever device the model lives on instead of hard-coding CUDA,
    # so the loop also works on CPU-only runtimes.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0
    num_batches = 0

    start_time = time.time()
    for batch_idx, (pos, target, neg) in enumerate(train_loader):
        optimizer.zero_grad()   # .backward() accumulates gradients

        pos = pos.to(run_device)        # [batch, context_window]
        target = target.to(run_device)  # [batch]
        neg = neg.to(run_device)

        loss = model(target, pos, neg)  # scalar loss
        running_loss += loss.item()
        num_batches += 1

        if batch_idx%50 == 0:
            print('loss: ', loss.item())

        loss.backward()
        optimizer.step()

    end_time = time.time()

    print('------ Training -----')
    if num_batches:
        running_loss /= num_batches
    print('Training Loss: ', running_loss, 'Time: ',end_time - start_time, 's')
    return running_loss

def validate_model_s(model, validate_loader, criterion):
    """Evaluate the model without gradient tracking and return the mean batch loss.

    Args:
        model: Module whose ``forward(target, pos, neg)`` returns a scalar loss.
        validate_loader: Iterable of ``(pos, target, neg)`` batches.
        criterion: Unused (the model computes its own loss); kept so existing
            callers do not break.

    Returns:
        Mean of the per-batch losses, or 0.0 when the loader yields nothing.
    """
    with torch.no_grad():
        model.eval()

        # Run on whatever device the model lives on instead of hard-coding
        # CUDA, so evaluation also works on CPU-only runtimes.
        first_param = next(model.parameters(), None)
        run_device = first_param.device if first_param is not None else torch.device('cpu')

        running_loss = 0.0
        num_batches = 0

        for batch_idx, (pos, target, neg) in enumerate(validate_loader):
            pos = pos.to(run_device)        # [batch, context_window]
            target = target.to(run_device)  # [batch]
            neg = neg.to(run_device)

            loss = model(target, pos, neg)  # scalar loss
            running_loss += loss.item()
            num_batches += 1

            if batch_idx%50 == 0:
                print('loss: ', loss.item())

        print('------ Testing -----')
        if num_batches:
            running_loss /= num_batches
        print('Testing Loss: ', running_loss)
        return running_loss

In [None]:
def create_run_id(name, base_dir='/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/experiments'):
    """Create a fresh ``<unix-timestamp>_<name>`` directory for a run.

    Args:
        name: Human-readable suffix for the run directory.
        base_dir: Directory that holds all experiment runs. Missing parent
            directories are created.

    Returns:
        Path of the newly created run directory.

    Raises:
        FileExistsError: If the run directory already exists (same name
            requested twice within one second).
    """
    run_id = str(int(time.time()))
    path_name = os.path.join(base_dir, '{}_{}'.format(run_id, name))
    # makedirs creates any missing parents in one call; the run directory
    # itself must be new.
    os.makedirs(path_name)
    print("Saving models, and predictions to {}".format(path_name))
    return path_name

In [None]:
# Create a timestamped experiment directory; checkpoints from the training loop are written into it.
save_path = create_run_id('skipgram_6')

Saving models, and predictions to /content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/experiments/1597271606_skipgram_6


In [None]:
# The skip-gram model returns its own loss, so `criterion` is only passed
# through to satisfy the train/validate function signatures.
criterion = nn.CrossEntropyLoss(ignore_index=0)
vocab_size = len(vocab.word2idx)
embedding_dimension = 50
context_size = 10

model = SkipGramModel(vocab_size, embedding_dimension)
# Move the model to the device selected at the top of the notebook instead of
# calling .cuda(), which fails on a CPU-only runtime.
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.003)
best_loss = 1e30

for epoch in range(20):
    print('\nEpoch: ', epoch)
    train_loss = train_epoch_s(model, train_loader, criterion, optimizer)
    test_loss = validate_model_s(model, test_loader, criterion)
    print('='*20)

    # Checkpoint only when the test loss improves on the best seen so far.
    if test_loss < best_loss:
        best_loss = test_loss
        print("Saving model, predictions and generated output for epoch " + str(epoch) + " with Loss: " + str(best_loss))

        # Saves the whole pickled module (not just a state_dict).
        torch.save(model, save_path + '/nlp_embed_' + str(epoch) + '.pth')


Epoch:  0
loss:  1.3862943649291992
loss:  0.9968018531799316
loss:  0.5957505106925964
loss:  0.4017864465713501
loss:  0.3436018228530884
loss:  0.1795228123664856
loss:  0.28758037090301514
loss:  0.27599671483039856
loss:  0.19278834760189056
loss:  0.0950952023267746
loss:  0.21500767767429352
loss:  0.15024398267269135
------ Training -----
Training Loss:  0.38883759080085706 Time:  84.07404232025146 s
loss:  0.15371014177799225
loss:  0.135403111577034
loss:  0.18737682700157166
------ Testing -----
Testing Loss:  0.1407396647415749
Saving model, predictions and generated output for epoch 0 with Loss: 0.1407396647415749

Epoch:  1
loss:  0.1107078567147255
loss:  0.08864519000053406
loss:  0.07819347083568573
loss:  0.1453486680984497
loss:  0.12177090346813202
loss:  0.050636645406484604
loss:  0.036164697259664536
loss:  0.045713160187006
loss:  0.09270841628313065
loss:  0.07459942996501923
loss:  0.1530609428882599
loss:  0.025021545588970184
------ Training -----
Training 

In [None]:
# Save embeddings

# Checkpoint file written by an earlier training run.
load_path = '/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/experiments/1597271606_skipgram_6/nlp_embed_9.pth'
# NOTE(review): rebinds save_path (previously the run directory) to the JSON output file.
save_path = '/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/data/embeddings/fur_5c_50d_sk_glove_ft.json'
class Embedding(object):
    """Export a model's target-word embeddings as a ``{word: vector}`` JSON file.

    Args:
        file_name: Destination path of the JSON file.
        load_file: Path of a checkpoint saved with ``torch.save(model, ...)``,
            or None to use the notebook-level ``model``.
        vocabulary: Object exposing ``idx2word``. Defaults to the
            notebook-level ``vocab``.

    Constructing the object immediately builds the mapping and writes it to
    ``file_name``.
    """
    def __init__(self, file_name, load_file, vocabulary=None):
        if load_file is not None:
            # Keep the loaded model on the instance; it used to be bound to a
            # local variable and discarded, so load_file had no effect.
            # NOTE(review): recent PyTorch releases may need weights_only=False
            # to unpickle a whole module — confirm against the installed version.
            self.model = torch.load(load_file)
        else:
            self.model = model
        self.vocab = vocab if vocabulary is None else vocabulary
        self.file_name = file_name
        self.create_embedding()
        self.save_json()

    def create_embedding(self):
        """Build ``self.embedding``: word -> list of floats from ``u_embeddings``."""
        embeddings = self.model.u_embeddings.weight.detach().cpu().numpy()
        self.embedding = {
            w: embeddings[idx].tolist() for idx, w in self.vocab.idx2word.items()
        }

    def save_json(self):
        """Write ``self.embedding`` to ``self.file_name`` as JSON."""
        with open(self.file_name, "w") as out_file:
            json.dump(self.embedding, out_file)
    
# NOTE(review): load_file is None, so the checkpoint at load_path is not read and the
# embeddings come from the notebook-level `model` — confirm this is intended.
E = Embedding(save_path, None)

In [None]:
def open_json(path):
    """Read the JSON file at ``path`` and return the decoded object."""
    # The context manager closes the handle even if decoding raises.
    with open(path) as f:
        return json.load(f)

# Read the exported embeddings back from the JSON file as a {word: vector} dict.
emb = open_json(save_path)