In [1]:
import numpy as np
import torch
import torch.nn as nn
import pickle
from tqdm import tqdm
import pandas as pd

## Preprocessing data

### Extract vocabulary

In [2]:
vocab = set()
labels = []
raw_sents = []
for line in open('yelp_labelled.txt'):
    line = line.strip()
    sent, label = line.split('\t')
    words = sent.split(' ')
    raw_sents.append(words)
    labels.append(int(label))
    for word in words:
        vocab.add(word.lower())

In [3]:
for i in range(len(raw_sents)):
    for j in range(len(raw_sents[i])):
        raw_sents[i][j]=raw_sents[i][j].lower()

In [4]:
raw_sents[:5]

[['wow...', 'loved', 'this', 'place.'],
 ['crust', 'is', 'not', 'good.'],
 ['not', 'tasty', 'and', 'the', 'texture', 'was', 'just', 'nasty.'],
 ['stopped',
  'by',
  'during',
  'the',
  'late',
  'may',
  'bank',
  'holiday',
  'off',
  'rick',
  'steve',
  'recommendation',
  'and',
  'loved',
  'it.'],
 ['the',
  'selection',
  'on',
  'the',
  'menu',
  'was',
  'great',
  'and',
  'so',
  'were',
  'the',
  'prices.']]

In [5]:
vocab = ['<PAD>'] + sorted(list(vocab))
vocab[100:110]

['acknowledged,',
 'acknowledged.',
 'across.',
 'actual',
 'actually',
 'actually.',
 'added',
 'affordable',
 'after',
 'afternoon!']

### Convert words to their index

In [6]:
indexed_sents = []
for cur_raw_sent in raw_sents:
    cur_indexed_sent = []
    for word in cur_raw_sent:
        cur_indexed_sent.append(vocab.index(word))
    indexed_sents.append(cur_indexed_sent)

In [7]:
indexed_sents[:5]

[[2697, 1415, 2418, 1805],
 [594, 1286, 1613, 1054],
 [1613, 2355, 151, 2389, 2378, 2602, 1320, 1576],
 [2273,
  414,
  769,
  2389,
  1349,
  1466,
  251,
  1182,
  1632,
  1995,
  2263,
  1941,
  151,
  1415,
  1293],
 [2389, 2094, 1649, 2389, 1502, 2602, 1068, 151, 2184, 2637, 2389, 1866]]

### Find max length / Assign max length

In [8]:
max_len = -1
for cur_indexed_sent in indexed_sents:
    max_len = max(max_len, len(cur_indexed_sent))

In [9]:
max_len

32

### Padding

In [10]:
# padded_sents = []
# for cur_indexed_sent in indexed_sents:
#     cur_padded_sent = cur_indexed_sent
#     cur_sent_len = len(cur_indexed_sent)
#     for i in range(cur_sent_len, max_len):
#         cur_padded_sent.append(0)
#     padded_sents.append(cur_padded_sent)
    
padded_sents = []
for cur_indexed_sent in indexed_sents:
    cur_padded_sent=[]
    for x in range(max_len-len(cur_indexed_sent)):
        cur_padded_sent.append(0)
    for words in cur_indexed_sent:
        cur_padded_sent.append(words)
    padded_sents.append(cur_padded_sent)

In [11]:
padded_sents[:5]

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2697,
  1415,
  2418,
  1805],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  594,
  1286,
  1613,
  1054],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1613,
  2355,
  151,
  2389,
  2378,
  2602,
  1320,
  1576],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2273,
  414,
  769,
  2389,
  1349,
  1466,
  251,
  1182,
  1632,
  1995,
  2263,
  1941,
  151,
  1415,
  1293],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2389,
  2094,
  1649,
  2389,
  1502,
  2602,
  1068,
  151,
  2184,
  2637,
  2389,
  1866]]

In [12]:
print(len(raw_sents[0]))
print(len(padded_sents[0]))

4
32


### Convert data to numpy and torch tensors

In [13]:
X = np.array(padded_sents)
Y = np.array(labels)

In [14]:
X.shape

(1000, 32)

In [15]:
Y.shape

(1000,)

In [16]:
X = torch.from_numpy(X).cuda(device=1)
Y = torch.from_numpy(Y).long().cuda(device=1)

In [17]:
data_size = X.size(0)

In [18]:
train_size = int(data_size * 0.7)

In [19]:
trainX = X[:train_size]
trainY = Y[:train_size]
testX = X[train_size:]
testY = Y[train_size:]

### Load embeddings from glove 50d 

In [20]:
from gensim.models import KeyedVectors

In [21]:
embeddings = KeyedVectors.load_word2vec_format('/home/mtinterndec18/parth/nematus_system/glove.6B/glove_word2vec_50d.txt', binary=False)

In [22]:
unk_vector = embeddings.vectors.mean(axis=0)

In [23]:
vectors = [np.zeros(embeddings.vector_size)]
for word in vocab[1:]:
    if word in embeddings.vocab:
        vectors.append(embeddings[word])
      
    else:
        vectors.append(unk_vector)

vectors = np.array(vectors, dtype=np.float32)
vectors = torch.from_numpy(vectors).cuda(device=1)

In [24]:
vectors.size()

torch.Size([2729, 50])

## Network

In [25]:
class Network(nn.Module):
    
    def __init__(self, max_seq_len, embed_dim, hidden_dim, output_dim):
        super(Network, self).__init__()
        self.max_seq_len = max_seq_len
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.output_dim = output_dim
        self.embedder = nn.Embedding(len(vocab), self.embed_dim)
        self.embedder.weight.data.copy_(vectors)
        self.fc1 = nn.Linear(self.max_seq_len * self.embed_dim, self.hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(self.hidden_dim, self.output_dim)
        
    def embed(self, x):
        return self.embedder(x)
    
    def forward(self, x):
        x = self.embedder(x)
#         print(x.size())
        x = x.view(-1, self.max_seq_len * self.embed_dim)
#         print(x.size())
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        return x

In [26]:
model = Network(max_len, 50, 200, 2)
model.cuda(device=1)

Network(
  (embedder): Embedding(2729, 50)
  (fc1): Linear(in_features=1600, out_features=200, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=200, out_features=2, bias=True)
)

In [27]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 200
for epoch_cntr in range(num_epochs):
    optimizer.zero_grad()
    trainO = model(trainX)
    loss = criterion(trainO, trainY)
    loss.backward()
    optimizer.step()
    if epoch_cntr % 10 == 0:
        print('Epoch ', epoch_cntr, ' loss = ', loss.item())

Epoch  0  loss =  0.696826696395874
Epoch  10  loss =  0.4058898091316223
Epoch  20  loss =  0.20926892757415771
Epoch  30  loss =  0.08637973666191101
Epoch  40  loss =  0.02816195599734783
Epoch  50  loss =  0.008785173296928406
Epoch  60  loss =  0.003401532769203186
Epoch  70  loss =  0.0018086537020280957
Epoch  80  loss =  0.0011982298456132412
Epoch  90  loss =  0.0009034023969434202
Epoch  100  loss =  0.0007315491093322635
Epoch  110  loss =  0.0006160555640235543
Epoch  120  loss =  0.0005308306426741183
Epoch  130  loss =  0.00046429241774603724
Epoch  140  loss =  0.00041044660611078143
Epoch  150  loss =  0.0003659244976006448
Epoch  160  loss =  0.00032849106355570257
Epoch  170  loss =  0.0002966121246572584
Epoch  180  loss =  0.00026921884273178875
Epoch  190  loss =  0.0002454982604831457


In [28]:
trainO = model(trainX)
trainP = torch.argmax(trainO, dim=1)

print("training score: ",(trainY == trainP).sum().item() / trainY.size(0))

testO = model(testX)
testP = torch.argmax(testO, dim=1)

print("test score: ",(testY == testP).sum().item() / testY.size(0))

training score:  1.0
test score:  0.6966666666666667


In [29]:
def encode_sent(sent):
    sent = sent.strip()
    words = sent.split(' ')
    padded_sents = []
    indexed_sent = []
    for word in words:
        indexed_sent.append(vocab.index(word.lower()))
    indexed_len = len(indexed_sent)
    for x in range(max_len-len(indexed_sent)):
        padded_sents.append(0)
    for word in indexed_sent:
        padded_sents.append(word)
    x = np.array(padded_sents)
    x = torch.from_numpy(x).cuda(device=1)
    return x


In [30]:
test_sent = encode_sent('food quality is good')

In [31]:
test_sent

tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,  957, 1899, 1286, 1050], device='cuda:1')

In [32]:
test_sent_output = model(test_sent)

In [33]:
test_sent_pred = torch.argmax(test_sent_output, dim=1)

In [34]:
test_sent_output

tensor([[-1.2057,  1.3180]], device='cuda:1', grad_fn=<AddmmBackward>)

In [35]:
test_sent_pred

tensor([1], device='cuda:1')

In [36]:
model.embed(test_sent).size()

torch.Size([32, 50])

## RNN

In [37]:
class RecurrentNetwork(nn.Module):
    
    def __init__(self, max_seq_len, embed_dim, hidden_dim, hidden_state_dim, output_dim):
        super(RecurrentNetwork, self).__init__()
        self.max_seq_len = max_seq_len
        self.hidden_dim = hidden_dim
        self.hidden_state_dim = hidden_state_dim
        self.embed_dim = embed_dim
        self.output_dim = output_dim
        self.embedder = nn.Embedding(len(vocab), self.embed_dim)
        self.embedder.weight.data.copy_(vectors)
        
        self.U = nn.Linear(self.hidden_state_dim, self.hidden_state_dim)
        self.W = nn.Linear(self.embed_dim, self.hidden_state_dim)
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.fc1 = nn.Linear(self.hidden_state_dim, self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, self.output_dim)
        
    def embed(self, x):
        return self.embedder(x)
    
    def forward(self, x):
        
        x = self.embedder(x)
        h = torch.zeros(x.size(0), self.hidden_state_dim).cuda(device=1)
        print(x.size())
        print(h.size())
        for i in range(x.size(1)):
            h = self.relu1(self.U(h) + self.W(x[:, i, :]))
        h = self.fc1(h)
        h = self.relu2(h)
        h = self.fc2(h)
        return h

In [38]:
rec_model = RecurrentNetwork(max_len, 50, 100, 200, 2)
rec_model.cuda(device=1)

RecurrentNetwork(
  (embedder): Embedding(2729, 50)
  (U): Linear(in_features=200, out_features=200, bias=True)
  (W): Linear(in_features=50, out_features=200, bias=True)
  (relu1): ReLU()
  (relu2): ReLU()
  (fc1): Linear(in_features=200, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=2, bias=True)
)

In [39]:
rec_model(trainX)

torch.Size([700, 32, 50])
torch.Size([700, 200])


tensor([[-0.0383, -0.0237],
        [-0.0305, -0.0409],
        [-0.0476, -0.0326],
        ...,
        [-0.0678, -0.0436],
        [-0.0696, -0.0088],
        [-0.0144, -0.0199]], device='cuda:1', grad_fn=<AddmmBackward>)

In [40]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 200
for epoch_cntr in range(num_epochs):
    optimizer.zero_grad()
    trainO = model(trainX)
    loss = criterion(trainO, trainY)
    loss.backward()
    optimizer.step()
    if epoch_cntr % 10 == 0:
        print('Epoch ', epoch_cntr, ' loss = ', loss.item())

Epoch  0  loss =  0.00022482668282464147
Epoch  10  loss =  2.4252960429294035e-05
Epoch  20  loss =  5.497251095221145e-06
Epoch  30  loss =  2.034051021837513e-06
Epoch  40  loss =  1.2949534493600368e-06
Epoch  50  loss =  9.768349400474108e-07
Epoch  60  loss =  8.181163480003306e-07
Epoch  70  loss =  7.172993150561524e-07
Epoch  80  loss =  6.437301749429025e-07
Epoch  90  loss =  5.742481903325825e-07
Epoch  100  loss =  5.211148845774005e-07
Epoch  110  loss =  4.7206879116856726e-07
Epoch  120  loss =  4.352841926902329e-07
Epoch  130  loss =  3.9645604488214303e-07
Epoch  140  loss =  3.6852699736300565e-07
Epoch  150  loss =  3.3719197745085694e-07
Epoch  160  loss =  3.1335014227806823e-07
Epoch  170  loss =  2.9359546260820935e-07
Epoch  180  loss =  2.711159936552576e-07
Epoch  190  loss =  2.5136131398539874e-07


In [41]:
trainO = model(trainX)
trainP = torch.argmax(trainO, dim=1)

print('training score: ',(trainY == trainP).sum().item() / trainY.size(0))

testO = model(testX)
testP = torch.argmax(testO, dim=1)

print("test score: ",(testY == testP).sum().item() / testY.size(0))

training score:  1.0
test score:  0.6866666666666666
