In [7]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/'My Drive'/Assignment1/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Assignment1


In [8]:
import gensim.downloader as api
import gensim
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
from torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split

In [9]:
pretrained_model="glove-wiki-gigaword-300"
wv = api.load(pretrained_model)

In [10]:
def load_data():
    data = list()
    with open('Train.neg', mode='rb') as f:
      Content = f.read()
    neg_samples = Content.splitlines()
    label = 1
    for s in neg_samples:
      s_text = gensim.utils.simple_preprocess(s)
      data.append((s_text, label))
    with open('Train.pos', mode='rb') as f:
      Content = f.read()
    pos_samples = Content.splitlines()
    label = 0
    for s in pos_samples:
      s_text = gensim.utils.simple_preprocess(s)
      data.append((s_text, label))
    return data

In [11]:
def load_words():
    words = set()
    UNK = '<unk>'
    word2ind = {UNK: 0}
    ind2word = {0: UNK}
    for w in wv.vocab.keys():
      words.add(w)
    words = sorted(words)
    for w in words:
        idx = len(word2ind)
        word2ind[w] = idx
        ind2word[idx] = w
    words = [UNK] + words
    return words, word2ind, ind2word


def vectorize_sample(s, word2ind):
    sample_text, sample_label = s
    vec_text = [0] * len(sample_text)

    for i in range(len(sample_text)):
        try:
            vec_text[i] = word2ind[sample_text[i]]
        except KeyError:
            vec_text[i] = word2ind['<unk>']

    return vec_text, sample_label

class Sample_Dataset(Dataset):

    def __init__(self, samples, word2ind):
        self.samples = samples
        self.word2ind = word2ind

    def __getitem__(self, index):
        return vectorize_sample(self.samples[index], self.word2ind)

    def __len__(self):
        return len(self.samples)

def create_batch(batch):
    sample_len = list()
    label_list = list()
    for s in batch:
        sample_len.append(len(s[0]))
        label_list.append(s[1])
    target_labels = torch.LongTensor(label_list)
    x1 = torch.LongTensor(len(sample_len), max(sample_len)).zero_()
    for i in range(len(sample_len)):
        sample_text = batch[i][0]
        vec = torch.LongTensor(sample_text)
        x1[i, :len(sample_text)].copy_(vec)
    s_batch = {'text': x1, 'length': torch.FloatTensor(sample_len), 'labels': target_labels}
    return s_batch


def evaluate(data_loader, model, device):
    model.eval()
    num_examples = 0
    error = 0

    for idx, batch in enumerate(data_loader):
        sample_text = batch['text'].to(device)
        sample_len = batch['length']
        labels = batch['labels']
        
        logits = model(torch.LongTensor(sample_text), torch.tensor(sample_len))
        top_n, top_i = logits.topk(1)

        error += torch.nonzero(top_i.squeeze() - torch.LongTensor(labels)).size(0)
        num_examples += sample_text.size(0)
    accuracy = 1 - error / num_examples
    print('Accuracy : ', accuracy)
    return accuracy


def train(args, model, train_loader, test_loader, accuracy, device):
    model.train()
    optimizer = torch.optim.Adamax(model.parameters())
    criterion = nn.CrossEntropyLoss()

    for idx, batch in enumerate(train_loader):
        sample_text = batch['text'].to(device)
        sample_length = batch['length']
        labels = batch['labels']

        model.zero_grad()
        out = model(torch.LongTensor(sample_text), torch.tensor(sample_length))
        loss = criterion(out, batch['labels'])
        loss.backward()
        optimizer.step()

        clip_grad_norm_(model.parameters(), int(args['grad_clipping']))

        if idx % int(args['checkpoint']) == 0 and idx > 0:
            print('Iteration No: %d' % (idx))
            curr_accuracy = evaluate(test_loader, model, device)
            if accuracy < curr_accuracy:
                torch.save(model, args['save_model'])
                accuracy = curr_accuracy
    return accuracy

def create_emb_layer(weights_matrix):
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim


class DAN_Model(nn.Module):

    def __init__(self, n_classes, weights_matrix,
                 hidden_dim=300, dropout=.5):
        super(DAN_Model, self).__init__()
        self.n_classes = n_classes
        self.hidden_dim = hidden_dim
        self.dropout = dropout
        
        self.embeddings, num_embeddings, embedding_dim = create_emb_layer(weights_matrix)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, n_classes)
        self.classifier = nn.Sequential(self.linear1, nn.ReLU(), self.linear2)
        self._softmax = nn.Softmax()

    def forward(self, input_text, text_len):
        logits = None
        text_embed = self.embeddings(input_text)
        encoded = text_embed.sum(1)
        encoded /= text_len.view(text_embed.size(0), -1)
        logits = self.classifier(encoded)
        return self._softmax(logits)

In [12]:
args = {
'batch_size': 16,
'num_epochs': 20,
'grad_clipping': 5,
'save_model': 'sentiment.pt',
'load_model': 'sentiment.pt',
'num_classes': 2,
'checkpoint': 50,
}

args['cuda'] = torch.cuda.is_available()
device = torch.device("cuda" if args['cuda'] else "cpu")

data = load_data()

voc, word2ind, ind2word = load_words()

weights_matrix = np.random.random((len(voc),300))
i=1
for word in voc[1:]:
  weights_matrix[i,:] = wv[word]
  i += 1

weights_matrix = torch.from_numpy(weights_matrix)
model = DAN_Model(args['num_classes'], weights_matrix)
model.to(device)
print(model)
train_data, val_data = train_test_split(data, test_size=0.3, random_state=42)

train_dataset = Sample_Dataset(train_data, word2ind)
train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)

val_dataset = Sample_Dataset(val_data, word2ind)
val_sampler = torch.utils.data.sampler.SequentialSampler(val_dataset)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args['batch_size'],
                                        sampler=val_sampler, num_workers=0,
                                        collate_fn=create_batch)
        
accuracy = 0
for epoch in range(args['num_epochs']):
    print('---------------------------------------------------------------------------------------------')
    print('Epoch %d' % epoch)
    print('---------------------------------------------------------------------------------------------')
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args['batch_size'],
                                        sampler=train_sampler, num_workers=0,
                                        collate_fn=create_batch)
    accuracy = train(args, model, train_loader, val_loader, accuracy, device)
# print('start testing:\n')

# test_dataset = Sample_Dataset(test_exs, word2ind)
# test_sampler = torch.utils.data.sampler.SequentialSampler(test_dataset)
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args['batch_size'],
#                                         sampler=test_sampler, num_workers=0,
#                                         collate_fn=create_batch)
# evaluate(test_loader, model, device)

DAN_Model(
  (embeddings): Embedding(400001, 300)
  (linear1): Linear(in_features=300, out_features=300, bias=True)
  (linear2): Linear(in_features=300, out_features=2, bias=True)
  (classifier): Sequential(
    (0): Linear(in_features=300, out_features=300, bias=True)
    (1): ReLU()
    (2): Linear(in_features=300, out_features=2, bias=True)
  )
  (_softmax): Softmax(dim=None)
)
Epoch 0
---------------------------------------------------------------------------------------------
Iteration No: 50




Accuracy :  0.5093333333333334
Iteration No: 100
Accuracy :  0.5093333333333334
Iteration No: 150
Accuracy :  0.5246666666666666
Iteration No: 200
Accuracy :  0.534
Iteration No: 250
Accuracy :  0.5783333333333334
Iteration No: 300
Accuracy :  0.5366666666666666
Iteration No: 350
Accuracy :  0.5836666666666667
Iteration No: 400
Accuracy :  0.4993333333333333
Epoch 1
---------------------------------------------------------------------------------------------
Iteration No: 50
Accuracy :  0.6659999999999999
Iteration No: 100
Accuracy :  0.6166666666666667
Iteration No: 150
Accuracy :  0.6126666666666667
Iteration No: 200
Accuracy :  0.5853333333333333
Iteration No: 250
Accuracy :  0.6253333333333333
Iteration No: 300
Accuracy :  0.637
Iteration No: 350
Accuracy :  0.5489999999999999
Iteration No: 400
Accuracy :  0.6376666666666666
Epoch 2
---------------------------------------------------------------------------------------------
Iteration No: 50
Accuracy :  0.5509999999999999
Iteration