# YelpClassifier: Sentiment prediction using PyTorch


In [1]:
import torch
from torch import nn
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import numpy as np
from argparse import Namespace
import os
import pandas as pd
import string

## Building vocabulary for the review text

Our dataset is in csv format which we need to convert into integers in order to use it with neural networks.

In [2]:
class Vocabulary(object):
    def __init__(self, token_to_idx = None, add_unk=True, unk_token='<UNK>'):
        """
        params:
            token_to_idx (dict): mapping from token to index
            add_unk (bool): flag to add a special token to the vocabulary for unknowns tokens
            unk_token (str): Token used as special token
        
        """
        if token_to_idx is None:
            token_to_idx ={}
        
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx:token for token,idx in token_to_idx}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
            
    def to_serialize(self):
        """ function to serialize the content of vocabulary
        """
        return {'idx_to_token':self._idx_to_token,
               'add_unk':self._add_unk,
               'unk_token':self._unk_token}
    
    @classmethod
    def from_serializable(cls,contents):
        """
        class function to create a vocabulary from serialized content
        """
        return cls(**contents)
    
    def add_token(self,token):
        """
        Add token to the vocabulary
        
        params:
            token (str): token to add to the vocabulary
            
        returns:
            idx (int): index of token
        
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]
        else:
            idx = len(self)
            self._token_to_idx[token] = idx
            self._idx_to_token[idx] = token
        return idx
    
    
    def lookup_idx(self,idx):
        """
        Lookup vocabulary to fetch  token at idx
        
        params:
            idx(int) : index of token to be fetched
            
        returns:
            token (str): token stored at idx
        """
        if index not in self._idx_to_token:
            raise KeyError("Vocabulary does not have token with specified index:"%idx)
        return self._idx_to_token[idx]
    

    def lookup_token(self,token):
        """
        Lookup vocabulary to fetch index of a token
        
        params:
            token(str): token to lookup
            
        returns:
            idx (int): index of token
        """
        
        if token not in self._token_to_idx:
            return self.unk_index
        else:
            return self._token_to_idx[token]
    
    def __len__(self):
        return len(self._idx_to_token)
    
    
    def __str__(self):
        return "Vocabulary (size = %d)" % len(self)
    
    

## Examples of Vocabulary class

Let's see some demo examples of using Vocabulary class for the following text.

text = """This is a good example of illustrating the use of pytorch for natural language processing. The example shows how to build a vocabulary which is a collection of words and their mapping to their corresponding indices. """

In [3]:
# raw text
text = """This is a good example of illustrating the use of pytorch for natural language processing. The example shows how to build a vocabulary which is a collection of words and their mapping to their corresponding indices. """

#preparing a vocabulary object
voc = Vocabulary(add_unk=True)

#adding token to the vocabulary
for word in text.strip().split(' '):
    voc.add_token(word)

#printing vocabulary mapping
print(voc._token_to_idx)


{'<UNK>': 0, 'This': 1, 'is': 2, 'a': 3, 'good': 4, 'example': 5, 'of': 6, 'illustrating': 7, 'the': 8, 'use': 9, 'pytorch': 10, 'for': 11, 'natural': 12, 'language': 13, 'processing.': 14, 'The': 15, 'shows': 16, 'how': 17, 'to': 18, 'build': 19, 'vocabulary': 20, 'which': 21, 'collection': 22, 'words': 23, 'and': 24, 'their': 25, 'mapping': 26, 'corresponding': 27, 'indices.': 28}


## Vectorizer
Now, let's move to vectorization process. This is the process where we will transform review text into vectors. These vectors will contain indices of each word in the review text.

For example, let's say we have a text "how to build" which we want to vectorize. For that we need a mapping (token to index) or vocabulary where we have information about indices of tokens.

If we use our demo vocabulary from the above example then "how to build" will be transformed into a vector.


In [4]:
vector = [0]* len(voc)

for token in "how to build".split(' '):
    index = voc.lookup_token(token)
    vector[index] = 1
print('Vectorized version:',vector)

Vectorized version: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]


Now, as we have an idea of what vectorizer does, we will move to create a vectorizer class offering text to vector transformation functionality.

In [5]:
class ReviewVectorizer(object):
    """
    Vectorizer class to transform review text into vectors
    """
    def __init__(self,review_vocab, rating_vocab):
        """
        params:
            review_vocab (Vocabulary): vocabulary object for review text
            rating_vocab (Vocabulary): vocabulary obejct for rating 
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab 
        
    def vectorize(self,text):
        """
        perform vectorization of given text
        
        params:
            text (str): review text to transform into vector
            
        returns:
            one_hot (array): returns one-hot encoding of text
        """
        one_hot = np.zeros(len(self.review_vocab),dtype=np.float32)
        
        # iterate over each word in the  text
        for word in text.strip().split():
            # avoid if the word is a punctuation
            if word not in string.punctuation:
                # fetching index of the word 
                idx = self.review_vocab.lookup_token(word)
                
                # setting 1 at idx index
                one_hot[idx] = 1
                
        return one_hot
        
    @classmethod
    def from_dataframe(cls,review_df,cutoff=25):
        """
        This function builds vocabulary for review text and rating.
        
        params:
            review_df (pandas.DataFrame): dataframe containing yelp dataset
            cutoff (int): a threshold to store words into vocabulary
            
        returns:
            ReviewVectorizer object
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)
        
        # adding all unique rating to the rating_vocubulary
        for rating in review_df['rating'].unique():
            rating_vocab.add_token(rating)
            
        word_count = {}
        
        # counting frequency of each word which appeared in the review text
        for review in review_df.review:
            for word in review.strip().split(' '):
                if word not in string.punctuation:
                    if word in word_count.keys():
                        word_count[word] += 1
                    else:
                        word_count[word] = 1
        
        # adding tokens from review text to the review vocabulary
        for word,count in word_count.items():
            if count > cutoff:
                review_vocab.add_token(word)
        
        return cls(review_vocab,rating_vocab)
    
    @classmethod
    def from_serializable(cls, contents):
        """
        class function to create ReviewVectorizer from serialzed contents
        
        params:
            contents(dict): a dictionary containing contents for review and rating vacabulary
        
        returns:
            ReviewVectorizer object
        """
        
        return cls(review_vocab = Vocabulary.from_serialiazable(contents['reivew_vocab']),
                  rating_voca = Vocabulary.from_serializable(contents['rating_vocab']))
    
    def to_serializable(self):
        """
        To serialize vocabularies 
        
        returns:
            contents (dict): contents of review and rating vocabularies
        
        """
        return {'review_vocab':self.review_vocab.to_serializable(),
               'rating_vocab':self.rating_vocab.to_serializable()}

Now, we have vocabulary and vectorizer classes ready. Next, we need to create a dataset class inherting pytorch's `Dataset` class. This class enables uses of pytorch functionality with out dataset.

In [6]:
class YelpDataset(Dataset):
    """
    Dataset class utilizing pytorch Dataset template
    
    """
    def __init__(self, review_df, vectorizer):
        """
        review_df (pandas.DataFrame): dataframe containing yelp data records
        vectorizer (ReviewVectorizer): ReviewVectorizer object
        """
        self.review_df = review_df
        self._vectorizer =  vectorizer 
        
        self.train_df = self.review_df[self.review_df.split=='train']
        self.train_size = self.train_df.shape[0]
        
        self.val_df = self.review_df[self.review_df.split=='val']
        self.val_size = self.val_df.shape[0]
        
        self.test_df = self.review_df[self.review_df.split=='test']
        self.test_size = self.test_df.shape[0]
        
        self._lookup_dict = {'train':(self.train_df,self.train_size),
                             'val':(self.val_df,self.val_size),
                             'test':(self.test_df,self.test_size)}
        
        self.set_split('train')
        
    def get_vectorizer(self):
        return self._vectorizer
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls,review_csv):
        """class function to load dataset and initialize the vectorizer
        
        params: 
            review_csv (str): file_name
        
        returns:
            ReviewDataset object
        """
        
        review_df = pd.read_csv(review_csv)
        train_review_df = review_df[review_df.split=='train']
        
        return cls(review_df,ReviewVectorizer.from_dataframe(train_review_df))
        
    def set_split(self,split='train'):
        """
        function to set the current active dataset
        
        params:
            split (str): specify part of dataset to be used
        
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        return self._target_size
        
    def __getitem__(self,idx):
        """
        function to fetch record at index idx
        
        params:
            idx (int): index of record to be fetched
        """
        row = self._target_df.iloc[idx]
        review_vector = self._vectorizer.vectorize(row.review)
        review_rating = self._vectorizer.rating_vocab.lookup_token(row.rating)
        return {'x_data':review_vector,
               'y_target':review_rating}
    
    def get_num_batches(self,batch_size):
        return self._target_size//batch_size

In [7]:
# checking dataset
yelpDB = YelpDataset.load_dataset_and_make_vectorizer('./yelp/reviews_with_splits_lite.csv')

In [8]:
for split in ['train','test','val']:
    yelpDB.set_split(split)
    print(f'Split:{split}     Size:{len(yelpDB)}')

Split:train     Size:39200
Split:test     Size:8400
Split:val     Size:8400


We have our dataset ready. We now make use of DataLoader from pytorch which will allow us to train our classifier with batches of data.

## DataLoader 

In [9]:
def generate_batches(dataset, batch_size, shuffle=True,drop_last=True):
    """
    function to get data batches
    @ This code is targetted for cpu machines.
    
    params:
        dataset (YelpDataset): YelpDataset object
        batch_size (int): data batch size
        shuffle (bool): Whether to shuffle data
        drop_last (bool): flag to drop the last batch if it is less than the batch size
        
    """
    
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        
        for name,tensor in data_dict.items():
            out_data_dict[name] = data_dict[name]
        yield out_data_dict

Let's see now how one batch look like.

In [14]:
batch = next(iter(generate_batches(yelpDB,batch_size=100)))

In [17]:
batch['x_data'].size()

torch.Size([100, 7356])

In [15]:
batch

{'x_data': tensor([[1., 1., 1.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [1., 0., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 1.,  ..., 0., 0., 0.],
         [1., 0., 1.,  ..., 0., 0., 0.]]),
 'y_target': tensor([1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
         0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
         1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
         1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
         0, 1, 1, 1])}

# Building Review Classifier
Now, we will move to build our neural network to classify review text into positive or negative categories of rating.

In [10]:
class ReviewClassifier(nn.Module):
    """
    ReviewClassifier class
    """
    def __init__(self,input_size):
        """
        Initialize the classifier
        params:
            input_size (int): number of features
        """
        super(ReviewClassifier,self).__init__()
        self.fc1 = nn.Linear(in_features = input_size,
                             out_features = 1)
        
    def forward(self,inputs,apply_sigmoid = False):
        """
        function performing forward pass
        params:
            inputs (tensor): input vectors
            apply_sigmoid (bool): flag whether to apply sigmoid function or not
            
        returns:
            y_out (tensors): shape (batch_size,)
        """
        y_out = self.fc1(inputs).squeeze()
        if apply_sigmoid:
            y_out = F.sigmoid(y_out)
        return y_out

# Training neural network


In [19]:
# loading the dataset
yelpDB = YelpDataset.load_dataset_and_make_vectorizer('./yelp/reviews_with_splits_lite.csv')

# getting the vectorizer
vectorizer = yelpDB.get_vectorizer()

# initializing the classifier
classifier = ReviewClassifier(len(vectorizer.review_vocab))

In [22]:
def compute_accuracy(y_pred,y_target):
    
    y_pred_indices = (torch.sigmoid(y_pred) > 0.5).long()
    #print(y_pred_indices.dim(),y_target.dim())
    n_correct = (y_pred_indices == y_target).sum().item()
    #print(len(y_pred),len(y_target),n_correct)
    return n_correct / len(y_pred_indices) * 100

In [23]:
vectorizer = yelpDB.get_vectorizer()
classifier = ReviewClassifier(len(vectorizer.review_vocab))

In [25]:
from torch import optim
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(),lr=.0001)

In [31]:

num_epochs = 20
batch_size = 100

for epoch in range(num_epochs):
    
    yelpDB.set_split('train')
    
    batch_generator = generate_batches(yelpDB,batch_size=batch_size)
    
    train_running_loss = 0.0
    train_running_acc = 0.0
    
    classifier.train()

    
    for batch_index, batch_dict in enumerate(batch_generator):
        
        optimizer.zero_grad()
        
        y_pred = classifier(batch_dict['x_data'].float())
        
        loss = loss_func(y_pred,batch_dict['y_target'].float())
        
        loss_t = loss.item()
        
        train_running_loss += (loss_t - train_running_loss)/ (batch_index + 1)
        
        acc_t = compute_accuracy(y_pred,batch_dict['y_target'])
        train_running_acc += (acc_t - train_running_acc) / (batch_index + 1)
        
        loss.backward()
        
        optimizer.step()

        

    
    yelpDB.set_split('val')
    
    batch_generator = generate_batches(yelpDB,batch_size=batch_size)
    
    val_running_loss = 0.0
    val_running_acc = 0.0
    
    classifier.eval()

    
    for batch_index, batch_dict in enumerate(batch_generator):
        
        y_pred = classifier(batch_dict['x_data'].float())
        
        loss = loss_func(y_pred,batch_dict['y_target'].float())
        
        loss_t = loss.item()
        
        val_running_loss += (loss_t - val_running_loss)/ (batch_index + 1)
        
        acc_t = compute_accuracy(y_pred,batch_dict['y_target'])
        val_running_acc += (acc_t - val_running_acc) / (batch_index + 1)
        
    print('\nEpoch :{}'.format(epoch))
    print('  Training   ==> Loss: {:.2f} Accuracy: {:.2f}'.format(train_running_loss,train_running_acc))
    print('  Validation ==> Loss: {:.2f} Accuracy: {:.2f}'.format(val_running_loss,val_running_acc))

    


Epoch :0
  Training   ==> Loss: 0.25 Accuracy: 91.22
  Validation ==> Loss: 0.18 Accuracy: 95.61

Epoch :1
  Training   ==> Loss: 0.25 Accuracy: 91.24
  Validation ==> Loss: 0.18 Accuracy: 95.76

Epoch :2
  Training   ==> Loss: 0.25 Accuracy: 91.28
  Validation ==> Loss: 0.17 Accuracy: 95.89

Epoch :3
  Training   ==> Loss: 0.24 Accuracy: 91.34
  Validation ==> Loss: 0.17 Accuracy: 96.00

Epoch :4
  Training   ==> Loss: 0.24 Accuracy: 91.18
  Validation ==> Loss: 0.16 Accuracy: 96.32

Epoch :5
  Training   ==> Loss: 0.24 Accuracy: 91.26
  Validation ==> Loss: 0.16 Accuracy: 96.32

Epoch :6
  Training   ==> Loss: 0.24 Accuracy: 91.39
  Validation ==> Loss: 0.15 Accuracy: 96.63

Epoch :7
  Training   ==> Loss: 0.24 Accuracy: 91.40
  Validation ==> Loss: 0.15 Accuracy: 96.71

Epoch :8
  Training   ==> Loss: 0.23 Accuracy: 91.41
  Validation ==> Loss: 0.14 Accuracy: 96.81

Epoch :9
  Training   ==> Loss: 0.23 Accuracy: 91.46
  Validation ==> Loss: 0.14 Accuracy: 96.96


## Evaluation on test data


In [32]:
yelpDB.set_split('val')
    
batch_generator = generate_batches(yelpDB,batch_size=batch_size)
    
test_running_loss = 0.0
test_running_acc = 0.0
    
classifier.eval()

    
for batch_index, batch_dict in enumerate(batch_generator):
    y_pred = classifier(batch_dict['x_data'].float())
    loss = loss_func(y_pred,batch_dict['y_target'].float())
    loss_t = loss.item()
    test_running_loss += (loss_t - test_running_loss)/ (batch_index + 1)
    acc_t = compute_accuracy(y_pred,batch_dict['y_target'])
    test_running_acc += (acc_t - test_running_acc) / (batch_index + 1)
    

print('  Test   ==> Loss: {:.2f} Accuracy: {:.2f}'.format(test_running_loss,test_running_acc))
   

  Test   ==> Loss: 0.14 Accuracy: 97.09


# References
1. https://github.com/delip/PyTorchNLPBook
2. https://pytorch.org/tutorials/beginner/basics/data_tutorial.html