<a href="https://colab.research.google.com/github/manuelladron/mmml_f20/blob/main/Skipgram_ADARI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive 
#drive.flush_and_unmount()
drive.mount('/content/gdrive') 

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import sys
sys.path.append('/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021')
sys.path.append('/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/notebooks/')

In [3]:
import json, pickle
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time, sys, os, random, io
from operator import itemgetter
from collections import Counter
from IPython.display import clear_output

CUDA is True
8 cuda


In [4]:
#Check if cuda is available
# Pick the compute device once; later cells read `cuda`, `device` and `num_workers`.
cuda = torch.cuda.is_available()
print('CUDA is', cuda)
# Falls back to CPU when no GPU is visible to torch.
device = torch.device("cuda" if cuda else "cpu")
# NOTE(review): `num_workers` is computed here but the DataLoader built later in
# this notebook does not pass it — confirm whether that is intentional.
num_workers = 8 if cuda else 0
print(num_workers, device)

CUDA is True
8 cuda


In [5]:
def open_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is opened in a ``with`` block so the handle is released even
    when ``json.load`` raises (for example on malformed JSON).
    """
    with open(path) as f:
        return json.load(f)

def save_json(file_path, data):
    """Serialize ``data`` as JSON into ``file_path`` (overwriting it).

    The file is written inside a ``with`` block so the handle is closed and
    flushed even when ``json.dump`` raises (e.g. on a non-serializable value).
    """
    with open(file_path, "w") as out_file:
        json.dump(data, out_file)

In [6]:
def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    ``progress`` is the fraction of work done. Ints are converted to float,
    any other non-float value is treated as 0, and the value is clamped to
    the [0, 1] range before the bar is rendered.
    """
    width = 20

    # Normalise the input type: int -> float, anything else non-float -> 0.
    if isinstance(progress, int):
        progress = float(progress)
    elif not isinstance(progress, float):
        progress = 0

    # Clamp to [0, 1].
    if progress < 0:
        progress = 0
    elif progress >= 1:
        progress = 1

    filled = int(round(width * progress))
    # Replace the previous bar instead of appending a new line of output.
    clear_output(wait = True)
    bar = "#" * filled + "-" * (width - filled)
    print("Progress: [{0}] {1:.1f}%".format(bar, progress * 100))

### GloVe embeddings

### Our embeddings

In [7]:
fur_vocab_path = '/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/data/vocabulary/ADARI_furniture_vocab.json'
context_w_path = '/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/data/context_words/furniture_context_size_9_all.json'
fur_path = '/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/data/clean_files/ADARI_furniture_info.json'

In [9]:
def sampling_rate_and_negative_sample(vocab, w2i):
    """Derive frequency-based sampling tables from word counts.

    Args:
        vocab: mapping word -> occurrence count.
        w2i: mapping word -> integer id (must contain every word of ``vocab``).

    Returns:
        A 5-tuple ``(frequencies, frequencies_ids, neg_sampling,
        neg_sampling_ids, p_drop)`` where
          - ``frequencies`` maps word -> count / total count (sums to 1),
          - ``frequencies_ids`` is the same table keyed by word id,
          - ``neg_sampling`` maps word -> frequency**(3/4), normalised to sum to 1,
          - ``neg_sampling_ids`` is the same table keyed by word id,
          - ``p_drop`` maps word -> 1 - sqrt(1e-5 / frequency); the value is
            not clipped, so it is negative for words rarer than the threshold.
    """
    subsample_threshold = 1e-5
    corpus_size = sum(vocab.values())

    # Relative frequency of each word, keyed by word and by id.
    frequencies = {}
    frequencies_ids = {}
    for token, n_occurrences in vocab.items():
        rel_freq = n_occurrences / corpus_size
        frequencies[token] = rel_freq
        frequencies_ids[w2i[token]] = rel_freq

    # Un-normalised noise distribution: frequency raised to the 3/4 power.
    unigram_pow = {token: freq**(3/4) for token, freq in frequencies.items()}

    # Sub-sampling drop value per word.
    p_drop = {token: 1 - np.sqrt(subsample_threshold/frequencies[token]) for token in vocab}

    # Normalise the noise distribution so it sums to 1.
    normaliser = sum(unigram_pow.values())
    neg_sampling = {}
    neg_sampling_ids = {}
    for token, weight in unigram_pow.items():
        share = weight/normaliser
        neg_sampling[token] = share
        neg_sampling_ids[w2i[token]] = share

    return frequencies, frequencies_ids, neg_sampling, neg_sampling_ids, p_drop

In [10]:
def get_vocab_adjs(path):
    """Build the adjective vocabulary, co-occurrence targets and sampling tables.

    The JSON file at ``path`` is read as a sequence of articles, each with the
    keys ``'q_adjs'`` and ``'nq_adjs'`` (lists of adjectives). Within one
    article every distinct adjective receives every other distinct adjective
    of that article as a context word.

    Returns:
        ``(vocab_d, targets, w2i, i2w, s_rate, n_rate, s_rate_ids, n_rate_ids,
        p_drop)`` — the adjective Counter, the word -> context-words dict
        (may contain repeats across articles), the word<->id maps with
        ``'<pad>'`` at id 0, and the five tables produced by
        ``sampling_rate_and_negative_sample``.
    """
    articles = open_json(path)
    adjective_stream = []
    targets = {}
    last_position = len(articles) - 1

    for position, article in enumerate(articles):
        # NOTE(review): the final entry of the file is deliberately skipped here,
        # exactly as before; the reason is not visible in this notebook — confirm.
        if position == last_position:
            continue

        article_adjs = article['q_adjs'] + article['nq_adjs']
        adjective_stream.extend(article_adjs)

        # Every distinct adjective gets all the other distinct ones as context.
        distinct = list(set(article_adjs))
        for i, center in enumerate(distinct):
            companions = distinct[:i] + distinct[i + 1:]
            if companions:
                targets.setdefault(center, []).extend(companions)

    vocab_d = Counter(adjective_stream)

    # Make sure the padding token exists in both tables.
    if '<pad>' not in targets:
        targets['<pad>'] = ['<pad>']
    if '<pad>' not in vocab_d:
        vocab_d['<pad>'] = 1

    # Every counted adjective must also have a context list.
    assert(len(vocab_d) == len(targets))

    # '<pad>' is pinned to id 0; the remaining words get ids 1..N in Counter order.
    w2i = {'<pad>': 0}
    i2w = {0: '<pad>'}
    real_words = [w for w in vocab_d if w != '<pad>']
    for word_id, w in enumerate(real_words, start=1):
        w2i[w] = word_id
        i2w[word_id] = w

    s_rate, s_rate_ids, n_rate, n_rate_ids, p_drop = sampling_rate_and_negative_sample(vocab_d, w2i)

    return vocab_d, targets, w2i, i2w, s_rate, n_rate, s_rate_ids, n_rate_ids, p_drop

In [11]:
vocab, targets, w2i, i2w, s_rate, n_rate, s_rate_ids, n_rate_ids, p_drop = get_vocab_adjs(fur_path)

In [12]:
def get_target_words(vocab_path, context_words_path):
    """
    Build skip-gram training tables from a vocabulary file and a context-window file.

    Args:
      - vocab_path: JSON file mapping word -> count.
      - context_words_path: JSON file holding a list of windows. Each window is a
        list whose elements are either a dict like
        {'word': 'furniture', 'tag': 'NN', 'pos': 'NOUN'} (a context word) or a
        list like [{'word': 'well-priced', ...}, 'root'] (the center word).

    Returns (in this order):
      - vocab dict (only words that ended up with at least one context, plus '<pad>')
      - targets dict (center word -> de-duplicated list of context words)
      - w2i and i2w dicts ('<pad>' is always id 0, other ids are contiguous from 1)
      - sampling rate (by word), negative sampling (by word)
      - sampling rate ids, negative sampling ids
      - p drop of words
    """
    cw = open_json(context_words_path)
    vocab = open_json(vocab_path)

    # Clean vocab: the 60 most frequent words are treated as stop words and
    # words seen two times or fewer are discarded.
    vocab_list_sorted = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
    stop_words = {w for w, _ in vocab_list_sorted[:60]}
    vocab = {word: count for word, count in vocab.items()
             if count > 2 and word not in stop_words}
    # The previous version inserted a '<path>' entry here (typo for '<pad>').
    # It was always pruned again below, so it is simply not added any more;
    # the real '<pad>' entry is guaranteed further down.

    # Get targets
    targets = dict()
    for w_context in cw:
        context = []
        center_w = None
        # Extract center word and context of each window
        for elem in w_context:
            if isinstance(elem, list):  # list containing the center word
                center_w = elem[0]['word']
                if center_w not in vocab:
                    # Window is unusable: drop it entirely.
                    center_w = None
                    break
            elif elem['word'] in vocab:
                context.append(elem['word'])

        # Keep the window only if both center word and context survived
        if center_w is not None and context:
            targets.setdefault(center_w, []).extend(context)

    # Remove repeated context words
    targets_c = {center: list(set(context)) for center, context in targets.items()}

    # Drop vocabulary words that never appear as a center word with context
    new_vocab = {w: count for w, count in vocab.items() if w in targets_c}

    if '<pad>' not in targets_c:
        targets_c['<pad>'] = ['<pad>']
    if '<pad>' not in new_vocab:
        new_vocab['<pad>'] = 1

    # Both dicts have same length
    assert(len(new_vocab) == len(targets_c))

    # '<pad>' is excluded *before* numbering so the ids 1..N stay contiguous
    # wherever '<pad>' sits in the vocabulary (the old positional numbering
    # left a gap and produced an id >= len(w2i) when '<pad>' was not last).
    all_words = [w for w in new_vocab if w != '<pad>']

    w2i = {'<pad>': 0}
    i2w = {0: '<pad>'}
    for word_id, w in enumerate(all_words, start=1):
        w2i[w] = word_id
        i2w[word_id] = w

    s_rate, s_rate_ids, n_rate, n_rate_ids, p_drop = sampling_rate_and_negative_sample(new_vocab, w2i)

    return new_vocab, targets_c, w2i, i2w, s_rate, n_rate, s_rate_ids, n_rate_ids, p_drop

In [13]:
#vocab, targets_c, w2i, i2w, s_rate, n_rate, s_rate_ids, n_rate_ids, p_drop = get_target_words(fur_vocab_path, context_w_path)

In [12]:
len(targets['family-run']), len(targets['italian'])

(320, 12250)

In [13]:
targets['well-priced'], i2w[3160], s_rate['italian'], s_rate['studio-designed']

(['country-friendly',
  'old',
  'international',
  'three-piece',
  'mutual',
  'practical',
  'low',
  'circular',
  'cute',
  'back',
  'simple',
  'contemporary',
  'modern',
  'occasional',
  'spacious',
  'high-backed',
  'physical',
  'front',
  'well-made',
  'useful',
  'three-legged',
  'small',
  'comfortable',
  'gorgeous',
  'woolly',
  'passionate',
  'lovely',
  'studio-designed',
  'gritty',
  'coherent',
  'british-made',
  'about',
  'continental',
  'least',
  'chubby',
  'in-house',
  'unusual',
  'american',
  'family-run',
  'cultural',
  'wonderful',
  'general',
  'mixed',
  'lifestyle-oriented',
  'collaborative',
  'special',
  'single',
  'fantastic',
  'italian',
  'original',
  'three-seat',
  'strong',
  'hand-finished',
  'hand-made',
  'small',
  'different',
  'expensive',
  'particular',
  'major',
  'real',
  'natural',
  'inspiring',
  'nice',
  'colourful',
  'diluted',
  'interesting',
  'involved',
  'great',
  'japanese',
  'spacious',
  'full',


In [100]:
len(vocab), len(targets)

(7492, 7492)

In [101]:
print(sum(s_rate.values()))
print(sum(n_rate.values()))
print(sum(p_drop.values()))

1.0000000000000995
0.99999999999989
2510.8669174032625


**Data set**

In [14]:
class Skipgram_data(Dataset):
    """Dataset yielding (center id, positive context ids, negative ids) triples.

    Args:
        vocab: mapping word -> count; only its length is used (dataset size).
        targets_w: mapping word -> list of context words.
        sampling_r: stored as ``self.sampling_r``; not used by ``__getitem__``.
        neg_sampling: mapping word -> negative-sampling probability
            (probabilities must sum to 1, as required by ``np.random.choice``).
        p_drop: stored as ``self.pdrop``; not used by ``__getitem__``.
        w2i / i2w: word <-> id maps; id 0 is used as the padding id.
        number_context: number of positive and of negative ids per item.
    """

    def __init__(self, vocab, targets_w, sampling_r, neg_sampling, p_drop, w2i, i2w, number_context):
        self.vocab = vocab
        self.targets_w = targets_w
        self.sampling_r = sampling_r
        self.neg_sampling = neg_sampling
        self.pdrop = p_drop
        self.w2i = w2i
        self.i2w = i2w
        self.number_context = number_context

        # The negative-sampling table never changes, so build it once here
        # instead of re-materialising two vocabulary-sized lists per item.
        self._neg_ids = np.asarray([w2i[w] for w in neg_sampling], dtype=np.int64)
        self._neg_probs = np.asarray(list(neg_sampling.values()), dtype=float)

    def __len__(self):
        return len(self.vocab)

    def __getitem__(self, index):
        """Return ``(index, context_ids, negative_ids)`` for the word with id ``index``.

        Both tensors are int64 with shape ``(number_context,)``. Positive ids
        are drawn uniformly with replacement from the word's context list.
        Negative ids are drawn from the noise distribution; draws that are
        context words of this center word are discarded and replaced by the
        padding id 0 at the end of the tensor.

        Raises:
            IndexError: if ``index`` is not a known word id (lets plain
                ``for item in dataset`` iteration terminate).
        """
        try:
            word = self.i2w[index]
        except KeyError:
            raise IndexError('word id {} out of range'.format(index)) from None

        context = self.targets_w[word]                      # words (strings)
        context_ids = np.asarray([self.w2i[w] for w in context])
        # Set of context ids for O(1) rejection of negative candidates.
        context_id_set = set(context_ids.tolist())

        # Positive samples (drawn first, to keep the random stream order).
        sampled_context = np.random.choice(context_ids, size = self.number_context)
        context_ids_t = torch.from_numpy(sampled_context).long()

        # Negative samples: draw, reject context words, pad with 0.
        candidates = np.random.choice(self._neg_ids, size = self.number_context, p = self._neg_probs)
        neg_samples_ids = [int(c) for c in candidates if int(c) not in context_id_set]
        neg_samples_ids += [0] * (self.number_context - len(neg_samples_ids))
        neg_samples_t = torch.tensor(neg_samples_ids, dtype=torch.long)

        assert(context_ids_t.shape == neg_samples_t.shape)
        return index, context_ids_t, neg_samples_t
    

In [15]:
s = Skipgram_data(vocab, targets, s_rate_ids, n_rate, p_drop, w2i, i2w, 5)

In [136]:
# to test dataloader
it = iter(s)
for i in range(10):
    next(it)


In [64]:
# Load the pre-trained GloVe vectors into `glove_vocab` (word -> numpy vector).
glove_path = '/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/data/embeddings/glove.6B.50d.txt'
# The whole file is read into memory at once before being split into lines.
with io.open(glove_path, 'r', encoding='utf8') as f:    
    glove_file = f.read()
    
# Each line is "<word> <v1> <v2> ..."; the first token is the word, the rest the vector.
glove_sentences = glove_file.splitlines()
glove_vocab = {}
for sentence in glove_sentences:
    word = sentence.split()[0]
    embedding = np.array(sentence.split()[1:], dtype = float)
    glove_vocab[word] = embedding

In [65]:
# Wrap the skip-gram dataset `s` in a shuffling DataLoader; the last, possibly
# smaller, batch is kept (drop_last=False).
batch_size = 64
train_dataset = s
train_loader = DataLoader(train_dataset, shuffle = True, batch_size = batch_size, drop_last=False)

# test_dataset = EmbeddingDataset(test_data)
# test_loader = DataLoader(test_dataset, shuffle = False, batch_size = batch_size)

In [66]:
def create_embedding_layer(weights_matrix, non_trainable=False):
    """Create an ``nn.Embedding`` initialised with a copy of ``weights_matrix``.

    Args:
        weights_matrix: 2-D tensor of shape (num_embeddings, embedding_dim).
        non_trainable: when True the embedding weights are frozen.

    Returns:
        The initialised ``nn.Embedding`` layer.
    """
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        # requires_grad lives on the parameter; setting it on the Module (as the
        # previous code did) only created an unrelated attribute and froze nothing.
        emb_layer.weight.requires_grad = False

    return emb_layer #, num_embeddings, embedding_dim

# Build the initial embedding table: one 50-d row per vocabulary id. Words found
# in GloVe get their pre-trained vector, all others a random normal vector.
# The width (50) has to match the GloVe file loaded above.
weights_matrix = torch.zeros((len(w2i), 50))
# `word_id` instead of `id`: a module-level loop variable named `id` would
# shadow the builtin for every later cell of the kernel.
for w, word_id in w2i.items():
    try:
        weights_matrix[word_id] = torch.from_numpy(glove_vocab[w])
    except KeyError:
        weights_matrix[word_id] = torch.from_numpy(np.random.normal(scale=0.6, size=(50,)))

In [67]:
class SkipGramModel(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Attributes:
        emb_size: Number of rows of the context embedding table.
        emb_dimension: Width of each context embedding vector.
        u_embeddings: Center-word embeddings, initialised from the
            module-level ``weights_matrix`` (GloVe-seeded).
        v_embeddings: Context-word embeddings, all zeros after construction.
    """
    def __init__(self, emb_size, emb_dimension):
        super().__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension

        # Center embeddings come from the pre-built weights matrix (trainable).
        self.u_embeddings = create_embedding_layer(weights_matrix, False)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=False).to(device)
        self.init_emb()

    def init_emb(self):
        """Reset the context embeddings: uniform over [0, 0], i.e. all zeros."""
        self.v_embeddings.weight.data.uniform_(-0, 0)

    def forward(self, pos_u, pos_v, neg_v):
        """Return the negative-sampling loss averaged over the batch.

        Args:
            pos_u: center word ids, [batch].
            pos_v: positive context ids, [batch, n_context].
            neg_v: negative sample ids, [batch, n_context].
        """
        # Center vectors as columns: [batch, dim, 1]
        center = self.u_embeddings(pos_u).unsqueeze(2)

        # Dot product of every context / negative vector with its center vector:
        # [batch, n_context, dim] x [batch, dim, 1] -> [batch, n_context, 1]
        pos_score = torch.bmm(self.v_embeddings(pos_v), center).squeeze()
        neg_score = torch.bmm(self.v_embeddings(neg_v), center).squeeze()

        # log sigma(u.v) for true pairs, log sigma(-u.v) for sampled pairs
        log_likelihood = F.logsigmoid(pos_score) + F.logsigmoid(-1*neg_score)

        # Return average batch loss
        return -1*log_likelihood.sum()/center.shape[0]



In [21]:
def train_epoch_s(model, train_loader, criterion, optimizer):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: module whose ``forward(target, pos, neg)`` returns a scalar loss.
        train_loader: sized iterable of ``(target, pos, neg)`` tensor batches.
        criterion: unused (the model computes its own loss); kept so existing
            callers do not break.
        optimizer: optimizer stepping ``model``'s parameters.

    Returns:
        Sum of the batch losses divided by the number of batches
        (0.0 for an empty loader).
    """
    model.train()

    # Send batches wherever the model lives instead of hard-coding .cuda(),
    # so the same loop runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0

    start_time = time.time()
    for batch_idx, (target, pos, neg) in enumerate(train_loader):
        optimizer.zero_grad()   # .backward() accumulates gradients

        pos = pos.to(run_device)        # [batch size, n_context]
        target = target.to(run_device)  # [batch size]
        neg = neg.to(run_device)        # [batch size, n_context]

        loss = model(target, pos, neg)  # scalar
        running_loss += loss.item()

        if batch_idx%50 == 0:
            print('loss: ', loss.item())

        loss.backward()
        optimizer.step()

    end_time = time.time()

    print('------ Training -----')
    # max(..., 1) avoids a ZeroDivisionError on an empty loader.
    running_loss /= max(len(train_loader), 1)
    print('Training Loss: ', running_loss, 'Time: ',end_time - start_time, 's')
    return running_loss



In [68]:
# Hyper-parameters, model and optimizer.
# NOTE(review): `criterion` is handed to train_epoch_s, but the model returns its
# own loss and the visible training loop never calls it — confirm it is still needed.
criterion = nn.CrossEntropyLoss(ignore_index=0)
vocab_size = len(w2i)
embedding_dimension = 50
# NOTE(review): `context_size` is not read by any cell visible here; the dataset
# is built with its own number_context argument.
context_size = 10

model = SkipGramModel(vocab_size, embedding_dimension)
# Use the device selected at the top of the notebook rather than a hard-coded
# .cuda(), so the cell also runs when no GPU is available.
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.003)
best_loss = 1e30
epochs = 200

In [69]:
def create_run_id(name, experiments_dir='/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/experiments'):
    """Create (if needed) the output directory for a run and return its path.

    Args:
        name: sub-directory name of the run.
        experiments_dir: parent directory holding all runs; defaults to the
            notebook's Drive experiments folder.

    Returns:
        ``'<experiments_dir>/<name>'``.

    The directory (and any missing parent) is created with ``exist_ok=True`` so
    re-running the cell does not raise ``FileExistsError``. The timestamp
    ``run_id`` computed by the previous version was never used and is dropped.
    """
    path_name = '{}/{}'.format(experiments_dir, name)
    os.makedirs(path_name, exist_ok=True)
    print("Saving models, and predictions to {}".format(path_name))
    return path_name

In [70]:
save_path = create_run_id('skipgram_adari_v2_adjs_{}d_{}'.format(embedding_dimension, epochs))

Saving models, and predictions to /content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/experiments/skipgram_adari_v2_adjs_50d_200


In [71]:
# Train for `epochs` epochs; whenever the mean epoch loss improves on the best
# seen so far, the whole model object is saved under `save_path`.
for epoch in range(epochs):
    print('\nEpoch: ', epoch)
    train_loss = train_epoch_s(model, train_loader, criterion, optimizer)
    # test_loss = validate_model_s(model, test_loader, criterion)
    print('='*20)
 
    if train_loss < best_loss:
        best_loss = train_loss
        print("Saving model, predictions and generated output for epoch " + str(epoch) + " with Loss: " + str(best_loss))
            
        # NOTE(review): the checkpoint name uses epoch+200 while the message above
        # reports `epoch`; this looks like a leftover from resuming an earlier
        # 200-epoch run, and the load path used later points at nlp_embed_199.pth — confirm.
        torch.save(model, save_path + '/nlp_embed_' + str(epoch+200) + '.pth')


Epoch:  0
loss:  6.931471824645996
loss:  6.796605587005615
loss:  6.6711835861206055
------ Training -----
Training Loss:  6.7054078578948975 Time:  14.658074617385864 s
Saving model, predictions and generated output for epoch 0 with Loss: 6.7054078578948975

Epoch:  1
loss:  6.687984466552734
loss:  6.32261848449707
loss:  6.586603164672852
------ Training -----
Training Loss:  6.489506155757581 Time:  14.573624849319458 s
Saving model, predictions and generated output for epoch 1 with Loss: 6.489506155757581

Epoch:  2
loss:  6.217268943786621
loss:  6.5249433517456055
loss:  6.3776631355285645
------ Training -----
Training Loss:  6.364770990307048 Time:  14.615103006362915 s
Saving model, predictions and generated output for epoch 2 with Loss: 6.364770990307048

Epoch:  3
loss:  6.09870719909668
loss:  6.247710227966309
loss:  6.488022327423096
------ Training -----
Training Loss:  6.265937033346144 Time:  14.646153926849365 s
Saving model, predictions and generated output for ep

In [72]:
# Save embeddings

load_path = '/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/experiments/skipgram_adari_v2_adjs_50d_200/nlp_embed_199.pth'
save_path = '/content/gdrive/My Drive/Colab Notebooks/ieee_slt_2021/data/embeddings/fur_v2_5c_50d_adjs.json'
class Embedding(object):
    """Export the center-word embedding table of a skip-gram model to JSON.

    Args:
        file_name: path of the JSON file to write (word -> list of floats).
        load_file: path of a saved model to export, or ``None`` to export the
            module-level ``model`` currently in memory.

    Constructing the object builds ``self.embedding`` and writes the file.
    Words and ids are taken from the module-level ``i2w`` map.
    """
    def __init__(self, file_name, load_file):
        if load_file is not None:
            # Previously the loaded model was bound to a local variable and
            # ignored, so the in-memory global model was always exported.
            # NOTE: torch.load unpickles arbitrary objects — only load
            # checkpoints from a trusted source.
            self.model = torch.load(load_file)
        else:
            self.model = model
        self.file_name = file_name
        self.create_embedding()
        self.save_json()


    def create_embedding(self):
        """Fill ``self.embedding`` with one ``u_embeddings`` row per word of ``i2w``."""
        embeddings = self.model.u_embeddings.weight.cpu().data.numpy()
        self.embedding = {w: embeddings[word_id].tolist() for word_id, w in i2w.items()}

    def save_json(self):
        """Write ``self.embedding`` to ``self.file_name`` as JSON."""
        with open(self.file_name, "w") as out_file:
            json.dump(self.embedding, out_file)
    
E = Embedding(save_path, None)

In [73]:
def open_json(path):
    """Load and return the JSON document stored at ``path``.

    Re-definition of the helper declared near the top of the notebook. The
    file is opened in a ``with`` block so the handle is released even when
    ``json.load`` raises.
    """
    with open(path) as f:
        return json.load(f)

emb = open_json(save_path)

In [74]:
# Word pair 'organic' / 'curvy': a, b come from the embeddings exported above,
# ga, gb from the original GloVe vectors, for a side-by-side comparison.
a = torch.Tensor(emb['organic'])
b = torch.Tensor(emb['curvy'])

ga = torch.Tensor(glove_vocab['organic'])
gb = torch.Tensor(glove_vocab['curvy'])

In [75]:
F.cosine_similarity(a, b, dim=0), F.cosine_similarity(ga, gb, dim=0)

(tensor(0.3775), tensor(0.0226))

In [76]:
# Word pair 'futuristic' / 'white': a, b come from the exported embeddings,
# ga, gb from the original GloVe vectors.
a = torch.Tensor(emb['futuristic'])
b = torch.Tensor(emb['white'])

ga = torch.Tensor(glove_vocab['futuristic'])
gb = torch.Tensor(glove_vocab['white'])

In [77]:
F.cosine_similarity(a, b, dim=0), F.cosine_similarity(ga, gb, dim=0)

(tensor(0.5099), tensor(0.1415))

In [84]:
# Word pair 'dynamic' / 'curvy': a, b come from the exported embeddings,
# ga, gb from the original GloVe vectors.
a = torch.Tensor(emb['dynamic'])
b = torch.Tensor(emb['curvy'])

ga = torch.Tensor(glove_vocab['dynamic'])
gb = torch.Tensor(glove_vocab['curvy'])

In [85]:
F.cosine_similarity(a, b, dim=0), F.cosine_similarity(ga, gb, dim=0)

(tensor(0.2791), tensor(0.1650))