In [1]:
import os
import torch
import torch.nn as nn
from gensim.models import Word2Vec,KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Read the lemmatised titles and descriptions: one document per line,
# tokens separated by single spaces.
with open("title_lemma.txt","r") as title_file:
    title_lemma = [row.strip().split(" ") for row in title_file]
with open("desc_lemma.txt","r") as desc_file:
    desc_lemma = [row.strip().split(" ") for row in desc_file]
print(len(title_lemma),len(desc_lemma))

6704 6704


In [3]:
# doc_corpus = title_lemma + desc_lemma
# model = Word2Vec(doc_corpus, min_count=1,size=100,window=3,workers=32,sg=0,iter=30)

In [4]:
# Embedding source switch read by the model-loading cell: True loads the GloVe
# vectors in word2vec text format, False loads the saved Word2Vec model.
load_glove = False

In [5]:
# Load the embedding model selected by `load_glove`.
if not load_glove:
    model = Word2Vec.load('modelW2V_100iter.bin')
else:
    word2vec_file = 'pre_trained_glove_100d.txt'
    model = KeyedVectors.load_word2vec_format(word2vec_file, binary=False)

In [6]:
def get_word_vec_matrix(model,sentence,number_of_words,pad_token="."):
    """Convert a tokenised sentence into a fixed-length list of word vectors.

    The sentence is truncated to ``number_of_words`` tokens, or padded at the
    end with ``pad_token`` until it has that many. Each token is then looked
    up in the embedding table; tokens missing from the vocabulary are
    recorded and replaced by the vector of ``pad_token``.

    Args:
        model: Either an object exposing the lookup table as ``model.wv``
            (e.g. a trained Word2Vec model) or the lookup table itself
            (anything supporting ``table[word]`` and raising ``KeyError``
            for unknown words).
        sentence: List of string tokens.
        number_of_words: Exact number of vectors to return.
        pad_token: Token used both for padding and as the stand-in for
            out-of-vocabulary words. Defaults to ``"."``.

    Returns:
        A tuple ``(not_found_words, data_point)`` where ``not_found_words``
        lists the out-of-vocabulary tokens in order of appearance and
        ``data_point`` is a list of ``number_of_words`` vectors.

    Raises:
        KeyError: If ``pad_token`` itself is needed but is not in the
            vocabulary.
    """
    # A bare keyed-vector table has no `.wv` attribute; fall back to the
    # object itself so both kinds of model are accepted.
    vectors = getattr(model, "wv", model)

    tokens = list(sentence[:number_of_words])
    tokens += [pad_token]*(number_of_words - len(tokens))

    not_found_words = []
    data_point = []
    for word in tokens:
        try:
            data_point.append(vectors[word])
        except KeyError:
            # Only a genuine vocabulary miss is handled; any other error
            # (wrong model type, interrupt, ...) propagates to the caller.
            not_found_words.append(word)
            data_point.append(vectors[pad_token])
    return not_found_words,data_point

In [37]:
# Build the features: each document is its title tokens followed by its
# description tokens, padded/truncated to 60 word vectors.
X,documents,not_found_words = [],[],[]
for title,desc in zip(title_lemma,desc_lemma):
    doc = title + desc
    missing_in_doc, data_point = get_word_vec_matrix(model,doc,60)
    not_found_words.extend(missing_in_doc)
    documents.append(doc)
    X.append(data_point)
# One integer label per line (presumably in the same order as the documents).
with open("rating.txt","r") as rating_file:
    y = [int(row.strip()) for row in rating_file]
print(len(X),len(y),len(documents),len(not_found_words),not_found_words[:10])
print(sum(y)/len(y))

6704 6704 6704 1 ['']
0.6539379474940334


In [38]:
# Hold out 10% of the samples for validation; the fixed random_state makes the
# split reproducible, and `documents` is split alongside so the token lists stay
# matched to their feature rows.
X_tr, X_val, y_tr, y_val, doc_tr, doc_val = train_test_split(X,y,documents,test_size=0.1, random_state=2)

In [39]:
# Validation size, training size, and number of word vectors per sample.
len(X_val),len(X_tr),len(X_tr[0])

(671, 6033, 60)

In [40]:
# Mean label in each split (the share of 1-labels if ratings are binary 0/1 —
# TODO confirm); the two values should be close if the split is balanced.
print(sum(y_tr)/len(y_tr), sum(y_val)/len(y_val))

0.6534062655395325 0.6587183308494784


In [49]:
class TextLoader:
    """Serve consecutive mini-batches from in-memory features and labels.

    Args:
        X: Sliceable sequence of samples; each sample must contain
            ``embedding_dim * seq_len`` numbers.
        y: Sliceable sequence of labels, same length as ``X``.
        batch_size: Number of samples per batch.
        embedding_dim: Size of the second dimension of each returned batch
            (default 100).
        seq_len: Size of the third dimension of each returned batch
            (default 60).
    """
    def __init__(self,X,y,batch_size,embedding_dim=100,seq_len=60):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.embedding_dim = embedding_dim
        self.seq_len = seq_len

    def get_batch(self,batch_index):
        """Return ``(X, y)`` tensors for the ``batch_index``-th batch.

        ``X`` is a float tensor of shape ``(n, embedding_dim, seq_len)`` and
        ``y`` a tensor of ``n`` labels, where ``n`` is ``batch_size`` except
        for a final partial batch (or 0 past the end of the data).
        """
        start = batch_index*self.batch_size
        stop = start + self.batch_size
        X_chunk = self.X[start:stop]
        X = torch.tensor(X_chunk,dtype=torch.float)
        # Size the batch from the slice itself rather than from a global
        # `batch_size`, so partial batches and other batch sizes work.
        # NOTE(review): reshape reinterprets the flat buffer; if samples arrive
        # as (seq_len, embedding_dim) this is not a transpose. Left as is
        # because the validation and inference cells reshape the same way.
        X = X.reshape(len(X_chunk),self.embedding_dim,self.seq_len)
        y = torch.tensor(self.y[start:stop])
        return X,y

In [50]:
# Training hyper-parameters and the loader that feeds the training loops.
batch_size = 1000
no_of_epochs = 20
# Whole batches only: a remainder smaller than `batch_size` falls outside
# range(no_of_batches).
no_of_batches = len(X_tr) // batch_size
# Device handle; no later cell shown here moves the model or data onto it.
device = torch.device("cuda:0")
text_loader = TextLoader(X_tr,y_tr,batch_size)

In [51]:
# Sanity check: fetch one batch and inspect the feature/label tensor shapes.
Xb, yb = text_loader.get_batch(1)
print(Xb.shape,yb.shape)

torch.Size([1000, 100, 60]) torch.Size([1000])


In [52]:
class TextClassificationModelLinear(nn.Module):
    """Two-layer fully connected classifier over a flattened input.

    Each sample is flattened to 100*60 features, passed through a 100-unit
    hidden layer with SELU activation, and projected to 2 outputs that are
    returned as log-probabilities (LogSoftmax over dim 1).

    Args:
        debug_mode: When True, ``forward`` prints the tensor shape after
            every stage.
    """
    def __init__(self, debug_mode=False):
        super().__init__()
        self.debug = debug_mode
        self.selu = nn.SELU()
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(in_features=100*60, out_features=100)
        self.linear2 = nn.Linear(in_features=100, out_features=2)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def _trace(self, tag, tensor):
        """Print ``tag`` and the tensor's shape in debug mode; return the tensor."""
        if self.debug: print(tag, tensor.shape)
        return tensor

    def forward(self,x):
        """Return per-class log-probabilities of shape ``(batch, 2)``."""
        self._trace("ip", x)
        flat = self._trace("flattened_op", self.flatten(x))

        # Hidden layer: affine map followed by SELU.
        hidden_z = self._trace('l1_z', self.linear1(flat))
        hidden_a = self._trace('l1_a', self.selu(hidden_z))

        # Output layer: affine map followed by log-softmax.
        logits = self._trace('l2_z', self.linear2(hidden_a))
        return self._trace('l2_a', self.log_softmax(logits))

In [53]:
# Instantiate with debug printing on so the next forward pass shows the tensor
# shape after every layer.
classifier = TextClassificationModelLinear(debug_mode=True)
# model.to(device)

In [54]:
# Forward the first training batch through the freshly created classifier and
# report how many argmax predictions match the labels.
Xb,yb = text_loader.get_batch(0)
op = classifier(Xb)
y_pred = torch.argmax(op,axis=1)
count = (y_pred == yb).sum().item()
print("Accuracy:",count/len(y_pred))

ip torch.Size([1000, 100, 60])
flattened_op torch.Size([1000, 6000])
l1_z torch.Size([1000, 100])
l1_a torch.Size([1000, 100])
l2_z torch.Size([1000, 2])
l2_a torch.Size([1000, 2])
Accuracy: 0.642


In [56]:
# Convert the validation features once. Sizing the reshape from the data (not a
# literal 671) keeps this cell valid if the split size changes, and as_tensor
# lets the cell be re-run after X_val is already a tensor.
X_val = torch.as_tensor(X_val,dtype=torch.float).reshape(len(X_val),100,60)
def get_val_score(model):
    """Print the accuracy of ``model`` on the global validation set.

    Runs ``model`` on the global ``X_val`` tensor, takes the argmax over
    dim 1 as the predicted class, and compares it with the global ``y_val``
    labels.

    Args:
        model: Callable mapping the ``X_val`` tensor to per-class scores of
            shape ``(len(X_val), n_classes)``.
    """
    # Evaluation needs no gradients; skipping autograd bookkeeping saves
    # memory and time on the full validation tensor.
    with torch.no_grad():
        y_op = model(X_val)
    y_pred = torch.argmax(y_op,axis=1)
    count = (y_pred == torch.tensor(y_val)).sum().item()
    print("Validation accuracy:",count/len(y_pred))

In [60]:
# Fresh model (debug prints off) for the actual training run.
classifier = TextClassificationModelLinear(debug_mode=False)
# NLLLoss expects log-probabilities, which the model's LogSoftmax output provides.
loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.005)

In [61]:
# Train the linear classifier: one optimiser step per batch, logging the batch
# loss and accuracy, then the validation accuracy at the end of every epoch.
for epoch in range(no_of_epochs):
    for i in range(no_of_batches):
        Xb,yb = text_loader.get_batch(i)

        optimizer.zero_grad()
        op = classifier(Xb)
        loss = loss_fn(op,yb)
        loss.backward()
        optimizer.step()

        # Accuracy of the predictions made before this step's weight update.
        y_pred = torch.argmax(op,axis=1)
        count = (y_pred == yb).sum().item()
        print("Epoch:",epoch,"Batch:",i,"Loss:",loss.item(),"Batch Accuracy:",count/len(y_pred))
    get_val_score(classifier)

Epoch: 0 Batch: 0 Loss: 0.7506355047225952 Batch Accuracy: 0.451
Epoch: 0 Batch: 1 Loss: 11.182941436767578 Batch Accuracy: 0.658
Epoch: 0 Batch: 2 Loss: 8.07805061340332 Batch Accuracy: 0.633
Epoch: 0 Batch: 3 Loss: 2.410994052886963 Batch Accuracy: 0.622
Epoch: 0 Batch: 4 Loss: 1.7781628370285034 Batch Accuracy: 0.358
Epoch: 0 Batch: 5 Loss: 1.853316068649292 Batch Accuracy: 0.319
Validation accuracy: 0.35469448584202684
Epoch: 1 Batch: 0 Loss: 0.9905802607536316 Batch Accuracy: 0.342
Epoch: 1 Batch: 1 Loss: 0.6540819406509399 Batch Accuracy: 0.658
Epoch: 1 Batch: 2 Loss: 0.8795104026794434 Batch Accuracy: 0.633
Epoch: 1 Batch: 3 Loss: 1.102521538734436 Batch Accuracy: 0.622
Epoch: 1 Batch: 4 Loss: 1.115944504737854 Batch Accuracy: 0.655
Epoch: 1 Batch: 5 Loss: 1.029209852218628 Batch Accuracy: 0.682
Validation accuracy: 0.6572280178837556
Epoch: 2 Batch: 0 Loss: 1.0175026655197144 Batch Accuracy: 0.67
Epoch: 2 Batch: 1 Loss: 0.949316143989563 Batch Accuracy: 0.658
Epoch: 2 Batch: 2 

Epoch: 19 Batch: 0 Loss: 0.060349270701408386 Batch Accuracy: 0.988
Epoch: 19 Batch: 1 Loss: 0.056859225034713745 Batch Accuracy: 0.989
Epoch: 19 Batch: 2 Loss: 0.047608040273189545 Batch Accuracy: 0.995
Epoch: 19 Batch: 3 Loss: 0.058458585292100906 Batch Accuracy: 0.989
Epoch: 19 Batch: 4 Loss: 0.052664000540971756 Batch Accuracy: 0.992
Epoch: 19 Batch: 5 Loss: 0.05518941953778267 Batch Accuracy: 0.99
Validation accuracy: 0.8926974664679582


In [65]:
# Encode a hand-written review with the same padding and reshape used for the
# training batches.
review_text = "i am so waiting to visit the hotel again"
my_review = review_text.strip().split(" ")
not_found_words,data_point = get_word_vec_matrix(model,my_review,60)
print(not_found_words)
data_point = torch.tensor(data_point,dtype=torch.float)
data_point = data_point.reshape(1,100,60)

[]


In [66]:
# Run the trained linear classifier on the encoded review; the argmax over the
# class dimension is the predicted label index.
my_op = classifier(data_point)
y_pred = torch.argmax(my_op,axis=1)
y_pred

tensor([0])

### Ignore below

In [69]:
class TextClassificationModelConv(nn.Module):
    """1-D convolutional classifier returning log-probabilities for 2 classes.

    ``forward`` applies two Conv1d+SELU stages (100 -> 120 -> 240 channels,
    kernel size 3), max-pools over a window of 56 positions, flattens, and
    finishes with a 50-unit SELU hidden layer and a 2-way LogSoftmax output.
    A third convolution (``conv3``) is constructed but not applied in
    ``forward``.

    Args:
        debug_mode: When True, ``forward`` prints the tensor shape after
            every stage.
    """
    def __init__(self, debug_mode=False):
        super().__init__()
        self.debug = debug_mode
        self.conv1 = nn.Conv1d(in_channels=100, kernel_size=3,out_channels=120)
        self.conv2 = nn.Conv1d(in_channels=120, kernel_size=3,out_channels=240)
        # Registered (and therefore trained-over/saved) but unused by forward.
        self.conv3 = nn.Conv1d(in_channels=240, kernel_size=3,out_channels=480)
        self.selu = nn.SELU()
        self.maxpool = nn.MaxPool1d(kernel_size=56)
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(in_features=240*1, out_features=50)
        self.linear2 = nn.Linear(in_features=50, out_features=2)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def _trace(self, tag, tensor):
        """Print ``tag`` and the tensor's shape in debug mode; return the tensor."""
        if self.debug: print(tag, tensor.shape)
        return tensor

    def forward(self,x):
        """Return per-class log-probabilities of shape ``(batch, 2)``."""
        self._trace("ip", x)

        # 1st conv stage (no pooling after it)
        x = self._trace('conv1_op', self.conv1(x))
        x = self._trace('selu_op', self.selu(x))

        # 2nd conv stage, followed by the max-pool
        x = self._trace('conv2_op', self.conv2(x))
        x = self._trace('selu_op', self.selu(x))
        x = self._trace('pool2_op', self.maxpool(x))

        # (The 3rd conv stage is currently disabled.)

        x = self._trace("flattened_op", self.flatten(x))

        # Hidden layer: affine map followed by SELU.
        x = self._trace('l1_z', self.linear1(x))
        x = self._trace('l1_a', self.selu(x))

        # Output layer: affine map followed by log-softmax.
        x = self._trace('l2_z', self.linear2(x))
        return self._trace('l2_a', self.log_softmax(x))

In [70]:
# Trace tensor shapes through the conv model on the first training batch and
# report how many argmax predictions of the freshly created model match.
classifier_conv = TextClassificationModelConv(debug_mode=True)
Xb,yb = text_loader.get_batch(0)
op = classifier_conv(Xb)
y_pred = torch.argmax(op,axis=1)
count = (y_pred == yb).sum().item()
print("Accuracy:",count/len(y_pred))

ip torch.Size([1000, 100, 60])
conv1_op torch.Size([1000, 120, 58])
selu_op torch.Size([1000, 120, 58])
conv2_op torch.Size([1000, 240, 56])
selu_op torch.Size([1000, 240, 56])
pool2_op torch.Size([1000, 240, 1])
flattened_op torch.Size([1000, 240])
l1_z torch.Size([1000, 50])
l1_a torch.Size([1000, 50])
l2_z torch.Size([1000, 2])
l2_a torch.Size([1000, 2])
Accuracy: 0.584


In [71]:
# Fresh conv model (debug prints off) for the actual training run.
classifier_conv = TextClassificationModelConv(debug_mode=False)
# NLLLoss expects log-probabilities, which the model's LogSoftmax output provides.
# Note: this rebinds `loss_fn` and `optimizer` used by the linear model's cells.
loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(classifier_conv.parameters(), lr=0.01)

In [72]:
# Train the conv classifier: one optimiser step per batch, logging the batch
# loss and accuracy, then the validation accuracy at the end of every epoch.
for epoch in range(no_of_epochs):
    for i in range(no_of_batches):
        Xb,yb = text_loader.get_batch(i)

        optimizer.zero_grad()
        op = classifier_conv(Xb)
        loss = loss_fn(op,yb)
        loss.backward()
        optimizer.step()

        # Accuracy of the predictions made before this step's weight update.
        y_pred = torch.argmax(op,axis=1)
        count = (y_pred == yb).sum().item()
        print("Epoch:",epoch,"Batch:",i,"Loss:",loss.item(),"Batch Accuracy:",count/len(y_pred))
    get_val_score(classifier_conv)

Epoch: 0 Batch: 0 Loss: 0.7320619225502014 Batch Accuracy: 0.334
Epoch: 0 Batch: 1 Loss: 3.9010252952575684 Batch Accuracy: 0.657
Epoch: 0 Batch: 2 Loss: 2.1872470378875732 Batch Accuracy: 0.633
Epoch: 0 Batch: 3 Loss: 0.746225118637085 Batch Accuracy: 0.622
Epoch: 0 Batch: 4 Loss: 1.6548269987106323 Batch Accuracy: 0.344
Epoch: 0 Batch: 5 Loss: 1.1714307069778442 Batch Accuracy: 0.318
Validation accuracy: 0.40536512667660207
Epoch: 1 Batch: 0 Loss: 0.7045931220054626 Batch Accuracy: 0.405
Epoch: 1 Batch: 1 Loss: 0.6449160575866699 Batch Accuracy: 0.657
Epoch: 1 Batch: 2 Loss: 0.7529591917991638 Batch Accuracy: 0.633
Epoch: 1 Batch: 3 Loss: 0.8573747873306274 Batch Accuracy: 0.622
Epoch: 1 Batch: 4 Loss: 0.7753465175628662 Batch Accuracy: 0.656
Epoch: 1 Batch: 5 Loss: 0.6709977388381958 Batch Accuracy: 0.682
Validation accuracy: 0.6572280178837556
Epoch: 2 Batch: 0 Loss: 0.6443325877189636 Batch Accuracy: 0.671
Epoch: 2 Batch: 1 Loss: 0.637850284576416 Batch Accuracy: 0.66
Epoch: 2 Bat

Epoch: 19 Batch: 0 Loss: 0.10712306946516037 Batch Accuracy: 0.964
Epoch: 19 Batch: 1 Loss: 0.08305250108242035 Batch Accuracy: 0.976
Epoch: 19 Batch: 2 Loss: 0.07678007334470749 Batch Accuracy: 0.976
Epoch: 19 Batch: 3 Loss: 0.06640906631946564 Batch Accuracy: 0.982
Epoch: 19 Batch: 4 Loss: 0.08200797438621521 Batch Accuracy: 0.97
Epoch: 19 Batch: 5 Loss: 0.0782724991440773 Batch Accuracy: 0.976
Validation accuracy: 0.8479880774962743


In [82]:
# Encode a hand-written review like the training data and classify it with the
# trained conv model; the argmax over the class dimension is the predicted label.
review_text = "i dislike the hotel"
my_review = review_text.strip().split(" ")
not_found_words,data_point = get_word_vec_matrix(model,my_review,60)
print(not_found_words)
data_point = torch.tensor(data_point,dtype=torch.float)
data_point = data_point.reshape(1,100,60)
my_op = classifier_conv(data_point)
y_pred = torch.argmax(my_op,axis=1)
y_pred

[]


tensor([1])