In [None]:
# Mount Google Drive into the Colab runtime; every Drive path used in this
# notebook is expected to resolve under the mount point given here.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import sys
# Put the Drive project folder first on the import path so that the local
# modules imported by this notebook are found (presumably vectorizer.py lives
# in this folder -- verify against the Drive contents).
sys.path.insert(0,'/content/drive/MyDrive/GloveCode')
from vectorizer import Vectorizer

In [None]:
import os
# Disabled corpus-building step, kept for reference: it concatenates the
# whitespace-separated tokens of every file in `parsed_data` and writes them
# one token per line to a single `corpus` file.
# NOTE(review): the paths below use '/content/gdrive/...', but Drive is mounted
# at '/content/drive/' in the first cell -- adjust before re-enabling.
# corpus=[]
# i=0
# path = '/content/gdrive/MyDrive/parsed_data'
# for file in os.listdir(path):
#    with open(path+'/'+file) as f:
#     print(file)
#     data = f.read()
#     data=data.split()
#     corpus.extend(data)
# print(len(corpus)) 
# with open('/content/gdrive/MyDrive/corpus', 'w') as writefile:
#       for c in corpus:
#         writefile.write("%s\n" % c)


In [None]:
# Read the corpus file and split it on newlines into a list of entries.
# A trailing newline in the file produces a final empty-string entry.
with open('/content/drive/MyDrive/corpus') as f:
  newC=f.read().split("\n")



In [None]:
import torch
import torch.nn as nn


class GloVe(nn.Module):
    """GloVe objective with two embedding tables and two bias vectors.

    Args:
        vocab_size: number of rows in each embedding table and bias vector.
        embedding_size: dimensionality of each embedding vector.
        x_max: scale used by the weighting function, ``(x / x_max) ** alpha``.
        alpha: exponent used by the weighting function.
    """

    def __init__(self, vocab_size, embedding_size, x_max, alpha):
        super().__init__()

        # Both tables share the same construction arguments; sparse=True makes
        # the embedding layers produce sparse gradients.
        table_options = dict(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size,
            sparse=True,
        )
        # Creation order (weight, weight_tilde, bias, bias_tilde) is kept fixed
        # so that seeded initialisation stays reproducible.
        self.weight = nn.Embedding(**table_options)
        self.weight_tilde = nn.Embedding(**table_options)
        self.bias = nn.Parameter(torch.randn(vocab_size, dtype=torch.float))
        self.bias_tilde = nn.Parameter(torch.randn(vocab_size, dtype=torch.float))

        def weighting_func(x):
            # (x / x_max) ** alpha, limited to the range [0, 1].
            scaled = x / x_max
            return scaled.float_power(alpha).clamp(0, 1)

        self.weighting_func = weighting_func

    def forward(self, i, j, x):
        """Return the mean weighted squared error for a batch of entries.

        Args:
            i: indices into ``weight`` / ``bias``
                (assumes a 1-D index tensor -- TODO confirm against the loader).
            j: indices into ``weight_tilde`` / ``bias_tilde`` (same assumption).
            x: values whose logarithm is the regression target; must be
                positive for the loss to be finite.

        Returns:
            A scalar tensor: mean over the batch of
            ``weighting_func(x) * (w_i . w~_j + b_i + b~_j - log x) ** 2``.
        """
        dot_product = (self.weight(i) * self.weight_tilde(j)).sum(dim=1)
        residual = dot_product + self.bias[i] + self.bias_tilde[j] - x.log()
        weighted_error = self.weighting_func(x) * residual.square()
        return weighted_error.mean()


In [None]:
import argparse
import pickle
import os
from pathlib import Path

import yaml
import matplotlib.pyplot as plt
import torch
import torch.optim
from tqdm import tqdm

from vectorizer import Vectorizer
from cooccurrence_entries import CooccurrenceEntries
#from glove import GloVe
from hdf5_dataloader import HDF5DataLoader


def parse_args():
    """Parse the command-line flags that select which pipeline stage to run.

    Returns:
        argparse.Namespace with boolean attributes ``first_step_only`` and
        ``second_step_only`` (both False unless the flag is given).
    """
    stage_flags = (
        ("--first-step-only", "only calculate the cooccurrence matrix"),
        ("--second-step-only", "train the word vectors given the cooccurrence matrix"),
    )
    cli = argparse.ArgumentParser()
    for flag, description in stage_flags:
        cli.add_argument(flag, help=description, action="store_true")
    return cli.parse_args()


def load_config(config_filepath='/content/drive/MyDrive/GloveCode/config.yaml'):
    """Load the YAML configuration file into an attribute-style namespace.

    The default path points at the Drive mount used by this notebook
    ('/content/drive/') and the 'GloveCode' project folder; the previous
    '/content/gdrive/.../gloveCode/' path did not exist under that mount.

    Args:
        config_filepath: path of the YAML file to read.

    Returns:
        argparse.Namespace with one attribute per top-level key of the YAML
        mapping (empty if the file contains no document).

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(config_filepath) as parameters:
        # safe_load returns None for an empty file; treat that as "no settings".
        config_dict = yaml.safe_load(parameters) or {}
    return argparse.Namespace(**config_dict)


def calculate_cooccurrence(config):
    """Build the co-occurrence entries for the corpus named in ``config``.

    Reads ``config.input_filepath``, splits it on newlines (a trailing newline
    yields a final empty-string entry), fits a ``Vectorizer`` limited to
    ``config.vocab_size``, and asks ``CooccurrenceEntries`` to build its output
    in ``config.cooccurrence_dir``.

    Args:
        config: object exposing ``input_filepath``, ``vocab_size``,
            ``cooccurrence_dir``, ``window_size``, ``num_partitions`` and
            ``chunk_size`` attributes.
    """
    with open(config.input_filepath, "r") as corpus_file:
        tokens = corpus_file.read().split("\n")

    vocab_vectorizer = Vectorizer.from_corpus(
        corpus=tokens,
        vocab_size=config.vocab_size,
    )
    entries = CooccurrenceEntries.setup(
        corpus=tokens,
        vectorizer=vocab_vectorizer,
    )

    # Echo the target directory before the build starts.
    print(config.cooccurrence_dir)

    build_options = {
        "window_size": config.window_size,
        "num_partitions": config.num_partitions,
        "chunk_size": config.chunk_size,
        "output_directory": config.cooccurrence_dir,
    }
    entries.build(**build_options)


def train_glove(config):
    """Train a GloVe model on the stored co-occurrence data and plot the loss.

    Batches come from ``cooccurrence.hdf5`` inside ``config.cooccurrence_dir``.
    Each batch is indexed as ``batch[0][:, 0]``, ``batch[0][:, 1]`` and
    ``batch[1]``, which are passed to the model as ``i``, ``j`` and ``x``.
    The model's state dict is written to ``config.output_filepath`` after every
    epoch, and the per-epoch summed batch losses are plotted at the end.

    Args:
        config: object exposing ``cooccurrence_dir``, ``batch_size``,
            ``device``, ``vocab_size``, ``embedding_size``, ``x_max``,
            ``alpha``, ``learning_rate``, ``num_epochs`` and
            ``output_filepath`` attributes.
    """
    cooccurrence_path = os.path.join(config.cooccurrence_dir, "cooccurrence.hdf5")
    dataloader = HDF5DataLoader(
        filepath=cooccurrence_path,
        dataset_name="cooccurrence",
        batch_size=config.batch_size,
        device=config.device,
    )

    model = GloVe(
        vocab_size=config.vocab_size,
        embedding_size=config.embedding_size,
        x_max=config.x_max,
        alpha=config.alpha,
    )
    model.to(config.device)
    optimizer = torch.optim.Adagrad(model.parameters(), lr=config.learning_rate)

    losses = []
    with dataloader.open():
        model.train()
        for epoch in tqdm(range(config.num_epochs)):
            epoch_loss = 0
            for batch in tqdm(dataloader.iter_batches()):
                index_pairs = batch[0]
                values = batch[1]
                loss = model(index_pairs[:, 0], index_pairs[:, 1], values)
                epoch_loss += loss.detach().item()

                # Update, then clear gradients ready for the next batch.
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            losses.append(epoch_loss)
            print(f"Epoch {epoch}: loss = {epoch_loss}")
            # Checkpoint after every epoch (overwrites the previous file).
            torch.save(model.state_dict(), config.output_filepath)

    plt.plot(losses)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()


def main():
    """Run both pipeline stages with the settings from the config file.

    Command-line stage selection via ``parse_args()`` (``--first-step-only`` /
    ``--second-step-only``) is switched off here, so the co-occurrence build
    and the training run always execute one after the other.
    """
    pipeline_config = load_config()
    # Stage 1: build the co-occurrence data.
    calculate_cooccurrence(pipeline_config)
    # Stage 2: train the embeddings on that data.
    train_glove(pipeline_config)


main()

OSError: ignored

In [None]:
# These helpers are not imported anywhere else in the notebook; without this
# line the forward pass raises NameError on a fresh kernel.
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class Network(nn.Module):
    """Sequence classifier: pointwise Conv1d embedding -> BiLSTM -> linear head.

    Takes 13 input features per step and produces log-probabilities over
    41 classes per step.
    """

    def __init__(self):
        super(Network, self).__init__()
        # Kernel size 1, so every step is projected independently: 13 -> 64 -> 256.
        self.embedding = nn.Sequential(
            nn.Conv1d(13, 64, 1),
            nn.Conv1d(64, 256, 1)
        )

        # 4-layer bidirectional LSTM with hidden size 256 and dropout 0.2
        # between layers; each step's output therefore has 2 * 256 = 512 features.
        self.lstm = nn.LSTM(256, 256, num_layers=4, bidirectional=True, dropout=0.2)

        self.classification = nn.Sequential(
            nn.Linear(512, 2048),
            nn.Linear(2048, 41))

    def forward(self, x, lengths_x):
        """Compute per-step log-probabilities for a padded batch.

        Args:
            x: padded input whose last dimension is 13. Because packing below
                uses ``batch_first=False``, dim 0 is treated as time and dim 1
                as batch, i.e. ``(T, B, 13)``.
                NOTE(review): confirm the data loader pads time-major.
            lengths_x: true length of each sequence in the batch (a list or a
                CPU tensor, as ``pack_padded_sequence`` requires).

        Returns:
            Tuple ``(out, lengths)``: ``out`` is ``(T_max, B, 41)``
            log-probabilities (log-softmax over the last dim) and ``lengths``
            is the per-sequence length tensor returned by
            ``pad_packed_sequence``.
        """
        # Conv1d expects channels in dim 1, so move the 13 features there and
        # move the resulting 256 features back to the last dim afterwards.
        x = x.permute(0, 2, 1)
        embedded = self.embedding(x)
        embedded = embedded.permute(0, 2, 1)

        # Pack so the LSTM skips the padded steps; input need not be sorted by length.
        packed_input = pack_padded_sequence(
            embedded, lengths_x, enforce_sorted=False, batch_first=False
        )
        packed_output, _ = self.lstm(packed_input)
        out, lengths = pad_packed_sequence(packed_output)

        out = self.classification(out)
        out = nn.functional.log_softmax(out, dim=2)

        return out, lengths
# Instantiate the network and move it to the target device.
# NOTE(review): `device` is not defined in any cell shown in this notebook --
# define it (e.g. in a config cell) before running this line.
model = Network().to(device)

In [None]:
!pip3 install --upgrade gensim --user

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 44.8 MB/s 
Installing collected packages: gensim
Successfully installed gensim-4.1.2


In [None]:
from pathlib import Path
import os
import argparse
import pickle

import torch
import yaml
from gensim.models.keyedvectors import KeyedVectors
import gensim.models
#from glove import GloVe
import h5py


def load_config():
    """Read the project's YAML config from Drive as an attribute namespace.

    Returns:
        argparse.Namespace with one attribute per top-level key of the YAML
        mapping.
    """
    config_path = '/content/drive/MyDrive/GloveCode/config.yaml'
    with open(config_path) as parameters:
        settings = yaml.safe_load(parameters)
    # Expand the key/value pairs into namespace attributes.
    return argparse.Namespace(**dict(settings.items()))


def main():
    """Load the trained GloVe weights and expose them as gensim KeyedVectors.

    Steps: read the config, unpickle the vocabulary, rebuild the GloVe model
    and load its saved state dict, then register one vector per vocabulary
    index, formed as the sum of the two embedding tables
    (``weight + weight_tilde``). Finally prints a few sanity checks.
    """
    config = load_config()
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load a vocab.pkl that this project produced itself.
    with open(os.path.join('/content/drive/MyDrive/', "vocab.pkl"), "rb") as f:
        vocab = pickle.load(f)

    model = GloVe(
        vocab_size=config.vocab_size,
        embedding_size=config.embedding_size,
        x_max=config.x_max,
        alpha=config.alpha
    )
    # map_location="cpu": the checkpoint may have been written from a CUDA
    # device; without remapping, loading fails on a runtime with no GPU. The
    # model stays on the CPU here because .numpy() is called on its weights.
    state_dict = torch.load('/content/drive/MyDrive/glove-output', map_location="cpu")
    model.load_state_dict(state_dict)

    keyed_vectors = gensim.models.keyedvectors.KeyedVectors(vector_size=config.embedding_size)
    #print(type(keyed_vectors))
    keyed_vectors.add_vectors(
        keys=[vocab.get_token(index) for index in range(len(vocab))],
        weights=(model.weight.weight.detach()
            + model.weight_tilde.weight.detach()).numpy()
    )

    print("get vector")
    # Vocabulary lookup of a word; the recorded run printed -1 here
    # (presumably the out-of-vocabulary index -- verify against the vocab class).
    print(vocab["bhumika"])
    print(len(keyed_vectors.get_vector("shall")))

    # print("How similar is company and shall:")
    # print(keyed_vectors.similarity("company", "shall"))
    # print("How similar is million and stock:")
    # print(keyed_vectors.similarity("million", "stock"))
    # print("How similar is financial and date:")
    # print(keyed_vectors.similarity("financial", "date"))
    # for word in ["agreement", "dollar", "stock"]:
    #     print(f"Most similar words of {word}:")
    #     most_similar_words = [word for word, _ in keyed_vectors.similar_by_word(word)]
    #     print(most_similar_words)

main()

get vector
-1
100
