In [1]:
from torch import nn , optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch

In [2]:
# Load the Yelp review table; the preview below shows `rating`, `review` and `split` columns.
reviews = pd.read_csv("./data/yelp/reviews_with_splits_lite.csv")

In [3]:
# Bare expression: the notebook renders a (truncated) preview of the loaded frame.
reviews

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train
...,...,...,...
55995,positive,"great food . wonderful , friendly service . i ...",test
55996,positive,charlotte should be the new standard for moder...,test
55997,positive,get the encore sandwich ! ! make sure to get i...,test
55998,positive,i m a pretty big ice cream gelato fan . pretty...,test


In [4]:
class Vectorizer(object):
    """Turn a whitespace-tokenised review into a binary bag-of-words vector.

    Attributes:
        vocab (list[str]): unique vocabulary tokens in sorted order.
        word_to_ix (dict[str, int]): token -> position in ``vocab``.
        ix_to_word (dict[int, str]): position in ``vocab`` -> token.
    """

    def __init__(self,vocab):
        """
        Args:
            vocab (iterable of str): tokens to keep; duplicates are collapsed.
        """
        # Index the *sorted* vocabulary so that vocab[ix], ix_to_word[ix] and
        # word_to_ix all describe the same ordering.
        self.vocab = sorted(set(vocab))

        self.word_to_ix = {word: ix for ix, word in enumerate(self.vocab)}
        self.ix_to_word = {ix: word for word, ix in self.word_to_ix.items()}

    @classmethod
    def create_vocab_and_vectorize_from_df(cls,reviews_df,cutoff=25):
        """Build a Vectorizer from the ``review`` column of a DataFrame.

        Args:
            reviews_df (pandas.DataFrame): frame with a string ``review`` column.
            cutoff (int): a token is kept only if it occurs strictly more
                than ``cutoff`` times across all reviews.
        Returns:
            Vectorizer: instance whose vocabulary is the frequent tokens.
        """
        occurrences = {}
        for review in reviews_df.review.values:
            for token in review.split(" "):
                occurrences[token] = occurrences.get(token, 0) + 1

        frequent_tokens = [token for token, count in occurrences.items()
                           if count > cutoff]
        return cls(frequent_tokens)

    def vectorize(self,review):
        """Encode a review as a presence vector over the vocabulary.

        Args:
            review (str): review text; tokens are separated by single spaces.
        Returns:
            numpy.ndarray: float array of length ``len(self.vocab)`` with 1 at
            the position of every vocabulary token found in the review.
        """
        one_hot = np.zeros(len(self.vocab))

        for word in review.split(" "):
            # Dict lookup instead of scanning the vocabulary list per token.
            ix = self.word_to_ix.get(word)
            if ix is not None:
                one_hot[ix] = 1

        return one_hot
            

In [5]:
# Build the vocabulary from every row of `reviews` and wrap it in a Vectorizer.
# NOTE(review): the whole frame is passed in, so test-split text also shapes the vocabulary.
vect = Vectorizer.create_vocab_and_vectorize_from_df(reviews)

In [6]:
# print(list(vect.vectorize(reviews.review.values[1])))

In [7]:
class ReviewDataset(Dataset):
    """Dataset over a review DataFrame that serves one split at a time."""

    def __init__(self,review_df,vectorizer):
        """
        Args:
            review_df (pandas.DataFrame): frame with `review`, `rating` and
                `split` columns
            vectorizer: object exposing `vectorize(review)`
        """
        self.df = review_df
        self.vectorizer = vectorizer

        split_column = review_df["split"]
        self.train = review_df[split_column == "train"]
        self.test = review_df[split_column == "test"]
        self.valid = review_df[split_column == "valid"]

        # Start out serving training rows.
        self.set_split('train')

    def set_split(self, split):
        """Select which rows `__len__` and `__getitem__` operate on.

        Args:
            split (str): value to match against the `split` column
        """
        in_split = self.df["split"] == split
        self.target_df = self.df[in_split]
        self.target_size = len(self.target_df)

    def __len__(self):
        """Number of rows in the currently selected split."""
        return self.target_size

    def __getitem__(self, index):
        """Fetch one vectorized example from the active split.

        Args:
            index (int): positional index into the active split
        Returns:
            dict with the review vector under 'x_data' and the label under
            'y_target' (0 for a "negative" rating, 1 for anything else)
        """
        record = self.target_df.iloc[index]
        features = self.vectorizer.vectorize(record.review)
        label = 0 if record.rating == "negative" else 1
        return {'x_data': features, 'y_target': label}

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    Generator wrapping a PyTorch DataLoader: every tensor of each collated
      batch dict is moved to `device` before the batch is yielded.

    Args:
        dataset: dataset whose items are dicts
        batch_size (int): number of items per batch
        shuffle (bool): forwarded to the DataLoader
        drop_last (bool): forwarded to the DataLoader
        device: target passed to `Tensor.to`
    Yields:
        dict with the same keys as the dataset items, holding batched tensors
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for collated in loader:
        yield {field: values.to(device) for field, values in collated.items()}

In [8]:
class RatingClassifier(nn.Module):
    """Single linear layer that maps a review vector to raw logits."""

    def __init__(self,in_size,out_size):
        """
        Args:
            in_size (int): width of the input feature vector
            out_size (int): number of output logits per example
        """
        super(RatingClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=in_size,
                             out_features=out_size)

    def forward(self,x_in):
        """Compute logits for a batch.

        Args:
            x_in (torch.Tensor): input whose last dimension is `in_size`
        Returns:
            torch.Tensor: logits (no sigmoid applied); when `out_size` is 1
            the trailing singleton dimension is removed.
        """
        # Squeeze only the last (feature) axis. A bare squeeze() would also
        # drop the batch axis for a batch of one, yielding a 0-d tensor that
        # no longer lines up with a (1,)-shaped target.
        return self.fc1(x_in).squeeze(dim=-1)
            

In [9]:
# Wrap the frame and vectorizer in a Dataset; its constructor selects the 'train' split.
dataset = ReviewDataset(reviews,vect)
def compute_accuracy(y_pred, y_target):
    """Percentage of examples whose thresholded prediction equals the target.

    Args:
        y_pred (torch.Tensor): raw logits, one per example
        y_target (torch.Tensor): integer labels, same shape as `y_pred`
    Returns:
        float: accuracy in the range 0-100
    """
    targets = y_target.cpu()
    probabilities = torch.sigmoid(y_pred)
    # A probability above 0.5 counts as class 1, everything else as class 0.
    predicted = (probabilities > 0.5).cpu().long()
    hits = torch.eq(predicted, targets).sum().item()
    return hits / len(predicted) * 100

In [10]:
# Stand-alone generator for inspecting batches by hand; the training cell below
# rebinds `batch_generator` at the start of every epoch.
batch_generator = generate_batches(dataset, batch_size=64, 
                                           device='cpu')

In [11]:
# list(batch_generator)

In [None]:
# Model: one logit per review, input width equal to the vocabulary size.
classifier = RatingClassifier(len(vect.vocab),1).to('cpu')

# The classifier returns raw logits (no sigmoid in forward), matching BCEWithLogitsLoss.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=.001)
# NOTE(review): scheduler.step(...) is never called in this cell, so this
# scheduler currently has no effect on the learning rate.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)


for epoch in range(10):
    
    # Fresh generator each epoch: a generator is exhausted after one pass.
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=64, 
                                           device='cpu')
    
    # Per-epoch running means, reset at the start of every epoch.
    running_acc = 0
    running_loss = 0
    classifier.train()
    
    
    for batch_index,batch in enumerate(batch_generator):
        
        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()
        
        # The vectorizer yields numpy float64 arrays; cast to float32 for the linear layer.
        y_pred = classifier(x_in=batch["x_data"].float())
        
        # Integer labels are cast to float because the loss expects float targets.
        loss = loss_fn(y_pred,batch["y_target"].float())
        loss_t = loss.item()

        loss.backward()
        optimizer.step()
        # Incremental mean: after k batches this equals the average of the k batch losses.
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # Same incremental-mean update for the per-batch accuracy (in percent).
        acc_t = compute_accuracy(y_pred, batch['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
        
    # Epoch index, mean training loss, mean training accuracy.
    print(epoch,running_loss,running_acc)
        
    

0 0.4257361304526233 86.11621732026134
1 0.27875973237982793 91.28880718954245
2 0.23338933732190162 92.46578839869285


In [None]:
# Evaluate the trained classifier on the held-out test split.
dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=32, 
                                   device='cpu')
running_loss = 0.
running_acc = 0.
classifier.eval()

# Nothing is optimised here, so skip autograd bookkeeping.
with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        # Read from `batch_dict` (this loop's variable). The previous code read
        # `batch`, the last *training* batch left in the kernel, so the reported
        # metrics were not computed on test data.
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # compute the loss
        loss = loss_fn(y_pred, batch_dict['y_target'].float())
        loss_t = loss.item()
        # Incremental mean over the batches seen so far.
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    

In [None]:
# Display the running mean of per-batch accuracy (in percent) left by the evaluation loop.
running_acc