In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
import nltk
import time
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn import metrics

import gensim.downloader as api
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

import random

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\xmh91\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
## Reload After restart
#w2v_google_model = api.load('word2vec-google-news-300')
model = Word2Vec.load("tained_word2vec.model")
#model = Word2Vec.load("tained_word2vec_random1.model")
w2v_review_model = model.wv

In [3]:
## Reload After restart
### Raw review data in df
df = pd.read_csv("250k_classified_reviews.csv", sep='\t')

# Calculate Accuracy
def getAccuracy(out, labels):
    _, predict = torch.max(out.data, 1)
    total = labels.shape[0]*1.0
    correct = (labels == predict).sum().item()
    return correct/total

In [4]:
# create 50 vec for each review
def paddedVec(w2v_model, train_input):
    res = []

    for review in train_input:
        docVec = []
        words = review.split()
        if len(words) >= 50 :
            words = words[:50]     
        else:
            #words = ["0" for _ in range(50 - len(words))] + words
            docVec = [np.zeros(300,) for _ in range(50 - len(words))]
        
        for word in words:
            if word in w2v_model:
                docVec.append(w2v_model[word])
            else:
                docVec.append(np.zeros(300,))
        
        res.append(docVec)

    return np.array(res, dtype="float64")

In [6]:
device = None
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

device = torch.device("cpu")
    
class SeqRNN(nn.Module):

    def __init__(self,vocab_size,hidden_size,output_size):
        super(SeqRNN,self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.rnn = nn.RNN(self.vocab_size,self.hidden_size,batch_first=True)
        self.linear = nn.Linear(self.hidden_size,self.output_size)
        
    def forward(self,input): 
        batch_size = len(input)
        h0 = torch.zeros(1,batch_size,self.hidden_size).to(device)
        output , hidden = self.rnn(input,h0)
        output = output[ : ,-1, : ]
        output = self.linear(output)
        output = torch.nn.functional.softmax(output,dim=1)
        return output

In [46]:
# Utilities
###### For for each loop ######
# Use Model to compute accracy from test_set and test_labels
def test_accuracy(model, test_set, test_labels):
    total = len(test_set)
    correct = 0
    
    for i in range(total):
        review = test_set[i]
        pred = predict_review(review, model)
        if pred == test_labels[i]:
            correct += 1
    return correct * 1.0 / total

# Ouput the predicted sentiment of a sentence
def predict_review(review, model):
    input_tensor = torch.tensor([review]).float().to(device)
    with torch.no_grad():
        out = model(input_tensor)
        out = torch.argmax(out).item()
    # Delete tensor from GPU
    del input_tensor
    return out


###### For dataloader enumerate loop ######

def test_accuracy_loader(model, test_loader):
    total = 0
    correct = 0
    for _, (review, label) in enumerate(test_loader):
        total += len(label)
        pred = predict_review_tensor(review, model)
        correct += (pred == label).sum()
    return correct.item() * 1.0 / total

def predict_review_tensor(review, model):
    with torch.no_grad():
        out = model(review)
        out = torch.argmax(out).item()
    return out



In [38]:
## Keep only class 1 and 2 for Binary Classification
df_sm = pd.concat([df[df['class'] == 1], df[df['class'] == 2]])
## Drop Null reviews
df_sm = df_sm.dropna(subset=['review'])

# Sample less data due to memory constraint
df_sm = df_sm.sample(2000)


# Split train and test data
reviews = df_sm["review"]
labels = df_sm["class"]
train_set, test_set, train_labels, test_labels = train_test_split(reviews, labels, test_size = 0.2)

## Turn Train and Test set and labels to numpy array
train_set = paddedVec(w2v_review_model, train_set)
test_set = paddedVec(w2v_review_model, test_set)

## Substract 1 from labels, so the classes starts from 0
train_labels = train_labels.to_numpy() - 1
test_labels = test_labels.to_numpy() - 1

In [39]:
device = None
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
    
## Data Loader

## Keep only class 1 and 2 for Binary Classification
df_sm = pd.concat([df[df['class'] == 1], df[df['class'] == 2]])
## Drop Null reviews
df_sm = df_sm.dropna(subset=['review'])

# Sample less data due to memory constraint
df_sm = df_sm.sample(2000)


# Split train and test data
reviews = df_sm["review"]
labels = df_sm["class"]
train_set, test_set, train_labels, test_labels = train_test_split(reviews, labels, test_size = 0.2)

## Turn Train and Test set and labels to numpy array
train_set = paddedVec(w2v_review_model, train_set)
test_set = paddedVec(w2v_review_model, test_set)

## Substract 1 from labels, so the classes starts from 0
train_labels = train_labels.to_numpy() - 1
test_labels = test_labels.to_numpy() - 1

## Train & test tensors
train_set = torch.from_numpy(train_set).float().to(device)
test_set = torch.from_numpy(test_set).float().to(device)
train_labels = torch.from_numpy(train_labels).type(torch.LongTensor).to(device)
test_labels = torch.from_numpy(test_labels).type(torch.LongTensor).to(device)


## Prepare TensorDataset and DataLoader
training_set = TensorDataset(train_set, train_labels)
testing_set = TensorDataset(test_set, test_labels)

train_loader = DataLoader(training_set, batch_size=1, shuffle=True)
test_loader = DataLoader(testing_set, batch_size=1, shuffle=True)

In [None]:
for (batch_idx, (reviews, labels)) in enumerate(train_loader):
    print("")
    print(reviews.size())
    print(len(labels))
    input()

In [48]:
model = SeqRNN(300,50,2)
model.to(device)
epoches = 2
every_epoch = 1
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0003)

start_time = time.time()

for e in range(epoches):
    # Train all sentences from train set
    for i, (review, label) in enumerate(train_loader):

        optimizer.zero_grad()
        pred = model(review)
        loss = loss_func(pred,label)
        loss.backward()
        optimizer.step()
        
    if e % every_epoch == 0:
        accuracy = test_accuracy_loader(model, test_loader)
        timetaken = time.time() - start_time
        print("Epoch: {} Timetaken: {:.2f}s train_loss: {:.4f} accuracy: {:.4f}".format(e, timetaken, loss, accuracy))
        start_time = time.time()

torch.cuda.empty_cache()

Epoch: 0 Timetaken: 3.60s train_loss: 0.3771 accuracy: 0.6225
Epoch: 1 Timetaken: 3.57s train_loss: 0.3356 accuracy: 0.6750


In [8]:
model = SeqRNN(300,50,2)
model.to(device)
epoches = 3
every_epoch = 1
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0003)

start_time = time.time()

for e in range(epoches):
    # Train all sentences from train set
    for i in range(len(train_set)):
        #index = random.randint(0, len(train_labels)-1)
        review = train_set[i]
        label = train_labels[i]
        input_tensor = torch.tensor([review]).float().to(device)
        label_tensor = torch.tensor([label]).type(torch.LongTensor).to(device)

        optimizer.zero_grad()
        pred = model(input_tensor)
        loss = loss_func(pred,label_tensor)
        loss.backward()
        optimizer.step()
        
        del input_tensor
        del label_tensor
        torch.cuda.empty_cache()
    # Report accuracy on test set every_epoch
    if e % every_epoch == 0:
        accuracy = test_accuracy(model, test_set, test_labels)
        timetaken = time.time() - start_time
        print("Epoch: {} Timetaken: {:.2f}s train_loss: {:.4f} accuracy: {:.4f}".format(e, timetaken, loss, accuracy))
        start_time = time.time()

Epoch: 0 Timetaken: 70.93s train_loss: 0.3152 accuracy: 0.7368
Epoch: 1 Timetaken: 70.89s train_loss: 0.3162 accuracy: 0.7260
Epoch: 2 Timetaken: 73.01s train_loss: 0.3161 accuracy: 0.7352


Cuda - For Loop
Epoch: 0 Timetaken: 53.23s train_loss: 0.3510 accuracy: 0.7465
Epoch: 1 Timetaken: 56.13s train_loss: 0.3261 accuracy: 0.7432
Epoch: 2 Timetaken: 53.85s train_loss: 0.3234 accuracy: 0.7458
Epoch: 3 Timetaken: 52.89s train_loss: 0.3162 accuracy: 0.7415
Epoch: 4 Timetaken: 52.95s train_loss: 0.3143 accuracy: 0.7532
Epoch: 5 Timetaken: 52.50s train_loss: 0.3139 accuracy: 0.7635
Epoch: 6 Timetaken: 52.86s train_loss: 0.3139 accuracy: 0.7620
Epoch: 7 Timetaken: 53.07s train_loss: 0.3138 accuracy: 0.7575
Epoch: 8 Timetaken: 57.89s train_loss: 0.3134 accuracy: 0.7692
Epoch: 9 Timetaken: 53.21s train_loss: 0.3161 accuracy: 0.7668

CPU - For Loop
Epoch: 0 Timetaken: 70.93s train_loss: 0.3152 accuracy: 0.7368
Epoch: 1 Timetaken: 70.89s train_loss: 0.3162 accuracy: 0.7260
Epoch: 2 Timetaken: 73.01s train_loss: 0.3161 accuracy: 0.7352

Num of Reviews = Batch
Num of words = Sequence
input_size = 300
hidden_size = 50
output_size = 3 or 2 accordingly