# Yelp Review - Sentiment Classification
- [Book: NLP with Pytorch](https://nbviewer.jupyter.org/github/joosthub/PyTorchNLPBook/blob/master/chapters/chapter_3/3_5_Classifying_Yelp_Review_Sentiment.ipynb)

## Import Packages

In [37]:
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from typing import Dict, Tuple, List

## General Utility Functions

In [2]:
def set_seed_everywhere(seed, cuda):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed (int): the seed value passed to every generator
        cuda (bool): when truthy, the CUDA generators are seeded as well
    """
    for seed_fn in (np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not cuda:
        return
    torch.cuda.manual_seed_all(seed)
        
def handle_dirs(dirpath):
    """Create `dirpath` (including missing parent directories) if needed.

    Args:
        dirpath (str): the directory to create
    """
    # exist_ok=True replaces the separate os.path.exists check, which could
    # race with another process creating the directory in between.
    os.makedirs(dirpath, exist_ok=True)

## Hyperparameter and other meta information setting

In [3]:
# All tunable settings of the experiment live in this single namespace.
args = Namespace(
    # vocabulary: word-frequency threshold
    frequency_cutoff=25,
    # data and artefact locations
    model_state_file='model_yelp_classification.pth',
    file_csv='data/yelp/reviews_with_splits_lite.csv',
    save_dir='model_storage/ch3/yelp/',
    vectorizer_file='vectorizer_yelp_review.json',
    # training hyperparameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=3,
    seed=2019,
    # runtime options
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

In [4]:
# Optionally place the vectorizer and model files inside the save directory
if args.expand_filepaths_to_save_dir:
    expanded_vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    expanded_model_state_file = os.path.join(args.save_dir, args.model_state_file)
    args.vectorizer_file = expanded_vectorizer_file
    args.model_state_file = expanded_model_state_file

    print("Expanded filepaths: ")
    for expanded_path in (args.vectorizer_file, args.model_state_file):
        print("\t{}".format(expanded_path))

# Fall back to the CPU when CUDA is not available on this machine
if not torch.cuda.is_available():
    args.cuda = False
print("Using CUDA: {}".format(args.cuda))

device_name = "cuda" if args.cuda else "cpu"
args.device = torch.device(device_name)

# Seed the random number generators for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# Make sure the save directory exists
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/ch3/yelp/vectorizer_yelp_review.json
	model_storage/ch3/yelp/model_yelp_classification.pth
Using CUDA: False


# Data Vectorization Class

## The Vocabulary

In [5]:
class Vocabulary(object):
    """Bidirectional mapping between tokens and integer indices."""
    def __init__(self, 
                 token_to_idx:Dict=None, 
                 add_unk:bool=True, 
                 unk_token="<UNK>"):
        """
        Args:
            token_to_idx: a pre-existing map of token to indices
            add_unk: a flag to indicate whether to add the unknown token
            unk_token: the UNK token to add into the vocabulary
        """
        # Copy the mapping so that adding tokens never mutates the caller's dict
        self._token_to_idx = dict(token_to_idx) if token_to_idx is not None else {}
        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        
        # -1 marks "no UNK token"; lookup_token then raises on unknown tokens
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
    
    def add_token(self, token:str):
        """Add a token to the mapping if it is not already present
        
        Args:
            token (str): the token to add
        Returns:
            the integer index corresponding to the token
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]
        
        # New tokens receive the next free index
        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index
        
    def to_serializable(self):
        """Return a dict that can be serialized"""
        return {'token_to_idx': self._token_to_idx, 
                'add_unk': self._add_unk, 
                'unk_token': self._unk_token}
    
    @classmethod
    def from_serializable(cls, content):
        """Instantiate the vocabulary from a serialized dict
        
        Args:
            content (dict): a dict as produced by `to_serializable`
        Returns:
            an instance of the Vocabulary
        """
        return cls(**content)
    
    def add_many(self, tokens:List[str])->List[int]:
        """Add a list of tokens into the Vocabulary
        
        Args:
            tokens (list): the tokens to add
        Returns:
            indices (list): the indices corresponding to the tokens
        """
        return [self.add_token(tok) for tok in tokens]
    
    def lookup_token(self, token:str)->int:
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is unknown and no UNK token was added
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]
    
    def lookup_index(self, index:int)->str:
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError(f"index: {index} not in the Vocabulary")
        
        return self._idx_to_token[index]
    
    def __repr__(self):
        return f"Vocabulary Size:{len(self)}"
    
    def __len__(self):
        return len(self._token_to_idx)

## The Vectorizer
        

In [6]:
class ReviewVectorizer(object):
    """Pairs a review-word vocabulary with a rating vocabulary and uses them
    to turn raw reviews into numeric vectors.
    """

    def __init__(self,
                 review_vocab: Vocabulary,
                 rating_vocab: Vocabulary):
        """
        Args:
            review_vocab: maps words to integers
            rating_vocab: maps class labels to integers
        """
        self.review_vocab, self.rating_vocab = review_vocab, rating_vocab

    def vectorizer(self, review:str):
        """Encode a review as a collapsed one-hot vector

        Args:
            review (str): the review text, tokens separated by single spaces
        Returns:
            one_hot: float32 array with one slot per vocabulary entry, set
              to 1 for every entry looked up for a token of the review
        """
        vocab = self.review_vocab
        encoding = np.zeros(len(vocab), dtype=np.float32)

        # punctuation tokens are skipped entirely
        kept_tokens = (tok for tok in review.split(" ")
                       if tok not in string.punctuation)
        for tok in kept_tokens:
            encoding[vocab.lookup_token(tok)] = 1

        return encoding

    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        """Build a vectorizer from the dataset dataframe

        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): words must occur more than this many times to
              enter the review vocabulary
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # ratings are added in sorted order so their indices are stable
        rating_vocab.add_many(sorted(set(review_df.rating)))

        # count every non-punctuation token across all reviews
        word_counts = Counter(
            word
            for review in review_df.review
            for word in review.split(" ")
            if word not in string.punctuation
        )

        frequent_words = [word for word, count in word_counts.items()
                          if count > cutoff]
        review_vocab.add_many(frequent_words)

        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents:Dict):
        """Rebuild a ReviewVectorizer from its serializable dictionary

        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the ReviewVectorizer class
        """
        vocabs = [Vocabulary.from_serializable(contents[key])
                  for key in ('review_vocab', 'rating_vocab')]
        return cls(*vocabs)

    def to_serializable(self):
        """Produce the serializable dictionary used for caching

        Returns:
            contents (dict): the serializable dictionary
        """
        contents = {}
        contents['review_vocab'] = self.review_vocab.to_serializable()
        contents['rating_vocab'] = self.rating_vocab.to_serializable()
        return contents


## The Dataset

In [7]:
class ReviewDataset(Dataset):
    """PyTorch dataset over the review dataframe with train/val/test splits."""
    def __init__(self, 
                 review_df:pd.DataFrame, 
                 vectorizer: ReviewVectorizer):
        """
        Args:
            review_df (pandas.DataFrame): the dataset
            vectorizer (ReviewVectorizer): vectorizer instantiated from dataset
        """
        self.review_df = review_df
        self._vectorizer = vectorizer
        
        # Partition the rows by the value of the `split` column
        self.train_df = self.review_df[self.review_df.split == "train"]
        self.train_size = len(self.train_df)
        
        self.val_df = self.review_df[self.review_df.split == "val"]
        self.validation_size = len(self.val_df)
        
        self.test_df = self.review_df[self.review_df.split == "test"]
        self.test_size = len(self.test_df)
        
        self._lookup_split = {"train": (self.train_df, self.train_size), 
                              "val": (self.val_df, self.validation_size), 
                              "test": (self.test_df, self.test_size)
                             }
        
        self.set_split("train")
    
    def set_split(self, 
                  split:str="train"):
        """Select the split that `__len__` and `__getitem__` operate on
        
        Args:
            split (str): one of "train", "val", or "test"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_split[split]
    
    
    @classmethod
    def load_dataset_and_make_vectorizer(cls, 
                                         file_review:str, 
                                         cutoff:int=25):
        """Load dataset and make a new vectorizer from scratch
        
        The vectorizer is built from the training split only.
        
        Args:
            file_review (str): location of the dataset
            cutoff (int): frequency cutoff forwarded to
              `ReviewVectorizer.from_dataframe`
        Returns:
            an instance of ReviewDataset
        """
        review_df = pd.read_csv(file_review)
        train_review_df = review_df[review_df.split == "train"]
        
        return cls(review_df, 
                   ReviewVectorizer.from_dataframe(train_review_df, cutoff=cutoff))
    
    
    @classmethod
    def load_dataset_and_load_vectorizer(cls, 
                                         file_review:str, 
                                         file_vectorizer:str):
        """Load dataset and the corresponding vectorizer. 
        Used in the case the vectorizer has been cached for re-use
        
        Args:
            file_review (str): location of the dataset
            file_vectorizer (str): location of the saved vectorizer
        Returns:
            an instance of ReviewDataset
        """
        review_df = pd.read_csv(file_review)
        vectorizer = cls.load_vectorizer_only(file_vectorizer)
        return cls(review_df, vectorizer)
    
    
    def save_vectorizer(self, file_vectorizer:str):
        """Save the vectorizer to disk using json
        
        Args:
            file_vectorizer (str): the location to save the vectorizer
        """
        with open(file_vectorizer, "w") as fout:
            json.dump(self._vectorizer.to_serializable(), fout)
    
    @staticmethod
    def load_vectorizer_only(file_vectorizer:str):
        """A static method for loading the vectorizer from file
        
        Args:
            file_vectorizer (str): the location of the serialized vectorizer
        Returns:
            an instance of ReviewVectorizer
        """
        with open(file_vectorizer, "r") as fin:
            return ReviewVectorizer.from_serializable(json.load(fin))
    
    
    def get_vectorizer(self):
        """Returns the vectorizer"""
        return self._vectorizer
    

    def __len__(self):
        return self._target_size
    
    def __getitem__(self, index:int):
        """The primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]
        review_vector = self._vectorizer.vectorizer(row.review)
        
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.rating)
    
        return {'x_data': review_vector, 
                "y_target": rating_index}
    
    def get_num_batches(self, batch_size:int):
        """Given a batch size, return the number of full batches in the current split
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

In [8]:
def generate_batches(datase:Dataset, 
                     batch_size:int, 
                     shuffle:bool=True, 
                     drop_last:bool=True, 
                     device:str="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the correct device location.
    
    Args:
        datase (Dataset): the dataset to draw batches from (the parameter 
          name is kept as-is so existing callers keep working)
        batch_size (int): number of data points per batch
        shuffle (bool): whether the DataLoader shuffles the data
        drop_last (bool): whether an incomplete final batch is dropped
        device (str): device each batched tensor is moved to
    Yields:
        a dict with the same keys as the dataset items, holding the 
          batched tensors on `device`
    """
    # Use the argument itself; the body previously read a global named
    # `dataset`, silently ignoring whatever was passed in.
    dataloader = DataLoader(dataset=datase, 
                            batch_size=batch_size,
                            shuffle=shuffle,
                            drop_last=drop_last)
    
    for data_dict in dataloader:
        yield {name: tensor.to(device) 
               for name, tensor in data_dict.items()}

## The Model: Classifier

In [9]:
class ReviewClassifier(nn.Module):
    """A simple perceptron based classifier
    """
    def __init__(self, num_features:int):
        """
        Args:
            num_features (int): size of the input features
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=num_features, 
                             out_features=1)
        
    def forward(self, 
                x_in:torch.Tensor, 
                apply_sigmoid:bool = False):
        
        """The forward pass of the classifier
        
        Args:
            x_in (torch.Tensor): an input data tensor
                                x_in.shape = (batch_size, num_features)
            apply_sigmoid (bool): a flag for sigmoid activation
                                should be False if used with cross entropy
                                losses
        Returns:
            the resulting tensor.
            shape: (batch_size,)
        
        """
        # Squeeze only the trailing output dimension. A bare squeeze() would
        # also remove the batch dimension when batch_size == 1.
        y_out = self.fc1(x_in).squeeze(dim=-1)
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        
        return y_out

In [10]:
# Toy input for the squeeze demonstration: 5 rows with 2 features each
a = torch.randn(5, 2, dtype=torch.float32)
a

tensor([[-0.1187,  0.2110],
        [ 0.7463, -0.6136],
        [-0.1186,  1.5565],
        [ 1.3662,  1.0199],
        [ 2.4644,  1.1630]])

In [11]:
# A freshly initialised Linear(2, 1) layer maps each row to a single value,
# so the output shown below has size (5, 1)
o = nn.Linear(2,1)(a)
o, o.size()

(tensor([[-0.3541],
         [ 0.4962],
         [-1.0901],
         [-0.1113],
         [ 0.3172]], grad_fn=<AddmmBackward>), torch.Size([5, 1]))

In [12]:
# squeeze() removes the size-1 dimension, leaving a 1-D tensor of 5 values
o.squeeze()

tensor([-0.3541,  0.4962, -1.0901, -0.1113,  0.3172],
       grad_fn=<SqueezeBackward0>)

## Notes

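### Q. Why don't we apply sigmoid when using the cross-entropy loss?

`nn.BCEWithLogitsLoss` applies the sigmoid itself (fused with the log term for numerical stability), so the model must return raw logits; applying `torch.sigmoid` in the forward pass as well would squash the outputs twice.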


### Q. What does `torch.squeeze()` do?

Returns a tensor with all the dimensions of input of size 1 removed.

For example, if input is of shape: $(A \times 1 \times B \times C \times 1 \times D)$ then the out tensor will be of shape: $(A \times B \times C \times D)$.

When `dim` is given, a `squeeze` operation is done only in the given dimension. If input is of shape: $(A \times 1 \times B)$, `squeeze(input, 0)` leaves the tensor unchanged, but `squeeze(input, 1)` will squeeze the tensor to the shape $(A \times B)$.

```py
>>> x = torch.zeros(2, 1, 2, 1, 2)
>>> x.size()
torch.Size([2, 1, 2, 1, 2])
>>> y = torch.squeeze(x)
>>> y.size()
torch.Size([2, 2, 2])
>>> y = torch.squeeze(x, 0)
>>> y.size()
torch.Size([2, 1, 2, 1, 2])
>>> y = torch.squeeze(x, 1)
>>> y.size()
torch.Size([2, 2, 1, 2])
```

## Training Routine

### Helper Function

In [13]:
def make_train_state(args):
    """Build the initial training-state dictionary.

    Args:
        args: main arguments; `learning_rate` and `model_state_file` are read
    Returns:
        a dict with the early-stopping bookkeeping, empty per-epoch
        loss/accuracy histories, placeholder test metrics and the
        model filename
    """
    state = {'stop_early': False,
             'early_stopping_step': 0,
             'early_stopping_best_val': 1e8}
    state['learning_rate'] = args.learning_rate
    state['epoch_index'] = 0

    # one independent history list per tracked metric
    for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[history_key] = []

    state['test_loss'] = -1
    state['test_acc'] = -1
    state['model_filename'] = args.model_state_file
    return state

def update_train_state(args, 
                       model: nn.Module, 
                       train_state: Dict):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments (`early_stopping_criteria` is read)
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

        # Remember the first validation loss so later epochs are compared
        # against it instead of the 1e8 placeholder
        if train_state['val_loss']:
            train_state['early_stopping_best_val'] = min(
                train_state['early_stopping_best_val'],
                train_state['val_loss'][-1])

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_t = train_state['val_loss'][-1]

        # If loss did not improve on the best value seen so far
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model and record its loss; without recording it
            # the best value never changes and early stopping cannot trigger
            torch.save(model.state_dict(), train_state['model_filename'])
            train_state['early_stopping_best_val'] = loss_t

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    """Compute the binary classification accuracy in percent.

    Args:
        y_pred (torch.Tensor): raw logits; a prediction counts as class 1
          when its sigmoid exceeds 0.5
        y_target (torch.Tensor): the target labels, one per prediction
    Returns:
        the percentage of predictions that equal their target
        (0.0 when there are no predictions)
    """
    # Flatten both sides so 0-d and (batch, 1) predictions are compared
    # element-wise with the targets instead of failing or broadcasting
    y_target = y_target.cpu().reshape(-1)
    y_pred_indices = (torch.sigmoid(y_pred) > 0.5).cpu().long().reshape(-1)

    n_total = y_pred_indices.numel()
    if n_total == 0:
        return 0.0

    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / n_total * 100

## Initialization

In [14]:
# Build the dataset either from a previously saved vectorizer or from scratch,
# depending on args.reload_from_files
if args.reload_from_files:
    # training from a checkpoint
    print("Loading dataset and vectorizer")
    dataset = ReviewDataset.load_dataset_and_load_vectorizer(args.file_csv,
                                                            args.vectorizer_file)
else:
    print("Loading dataset and creating vectorizer")
    # create dataset and vectorizer
    # NOTE(review): args.frequency_cutoff is not passed here, so the
    # vectorizer is built with from_dataframe's default cutoff
    dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.file_csv)
    # cache the freshly built vectorizer as JSON for later reloads
    dataset.save_vectorizer(args.vectorizer_file) 

Loading dataset and creating vectorizer


In [15]:
# Keep a handle on the vectorizer owned by the dataset
vectorizer = dataset.get_vectorizer()

In [16]:
# One input feature per entry of the review vocabulary
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))

## Training Loop

In [17]:
# Move the model to the selected device before creating the optimizer
classifier = classifier.to(args.device)

# Binary cross entropy on raw logits (the training loop calls the classifier
# without apply_sigmoid)
loss_func = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(classifier.parameters(), 
                       lr=args.learning_rate)

# Scales the learning rate by `factor` when the monitored value (the training
# loop passes the validation loss) stops decreasing
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, 
                                                 mode = 'min', 
                                                 factor = 0.5, 
                                                 patience = 1)

train_state = make_train_state(args)

# Progress bar over epochs
epoch_bar = tqdm(desc='training routine', 
                  total=args.num_epochs,
                  position=0)


# The split is selected first so get_num_batches reports the size of that split
dataset.set_split("train")

train_bar = tqdm(desc='split=train',
                  total=dataset.get_num_batches(args.batch_size), 
                  position=1, 
                  leave=True)

# NOTE(review): val_bar uses the same position (1) as train_bar
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                total=dataset.get_num_batches(args.batch_size), 
                position=1, 
                leave=True)

training routine:   0%|          | 0/3 [00:00<?, ?it/s]
split=train:   0%|          | 0/306 [00:00<?, ?it/s][A
split=val:   0%|          | 0/65 [00:00<?, ?it/s][A

In [18]:
# NOTE(review): repeats the loss setup from the combined training-setup cell above
loss_func = nn.BCEWithLogitsLoss()

In [19]:
# NOTE(review): repeats the optimizer setup from the combined training-setup cell above
optimizer = optim.Adam(classifier.parameters(), 
                       lr=args.learning_rate)

In [20]:
# NOTE(review): repeats the scheduler setup from the combined training-setup cell above
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, 
                                                 mode = 'min', 
                                                 factor = 0.5, 
                                                 patience = 1)

In [21]:
# NOTE(review): repeats the train-state setup from the combined training-setup cell above
train_state = make_train_state(args)

In [22]:
# NOTE(review): repeats the epoch progress bar from the combined training-setup
# cell above; the saved output below shows a total of 100 while
# args.num_epochs is 3, so that output looks stale
epoch_bar = tqdm(desc='training routine', 
                  total=args.num_epochs,
                  position=0)


training routine:   0%|          | 0/100 [00:00<?, ?it/s]

In [23]:
# NOTE(review): repeats the split selection from the combined training-setup cell above
dataset.set_split("train")

In [24]:
# NOTE(review): repeats the train progress bar from the combined training-setup cell above
train_bar = tqdm(desc='split=train',
                  total=dataset.get_num_batches(args.batch_size), 
                  position=1, 
                  leave=True)


split=train:   0%|          | 0/306 [00:00<?, ?it/s][A

In [25]:
# NOTE(review): repeats the validation progress bar from the combined training-setup cell above
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                total=dataset.get_num_batches(args.batch_size), 
                position=1, 
                leave=True)


split=val:   0%|          | 0/65 [00:00<?, ?it/s][A

In [18]:
# Main training routine: one pass over the train split followed by one pass
# over the val split per epoch, then checkpointing / early stopping and a
# learning-rate scheduler step driven by the validation loss.
try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index
        
        # Iterate over training dataset
        # setup: batch generator, set loss and acc to 0, set train mode on
        
        dataset.set_split("train")
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device = args.device)

        running_loss = 0.0
        running_acc = 0.0
        classifier.train()
        
        for batch_index, batch_dict in enumerate(batch_generator):
            
            # step 1. zero the gradients
            optimizer.zero_grad()
            
            # step 2. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'].float())
            
            # step 3. compute the loss and fold it into the running mean
            loss = loss_func(y_pred, batch_dict['y_target'].float())
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)
            
            # step 4. backpropagate and take an optimizer step
            loss.backward()
            optimizer.step()
            
            # compute accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            status = f"train\t epoch:{epoch_index}/{args.num_epochs}\t batch_index:{batch_index}\t train_acc: {running_acc:.4}\t train_loss: {running_loss:.4}"
            print(status)
            
        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        # No gradients are needed for validation; disabling autograd avoids
        # building a graph for every validation batch
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):

                # compute the output
                y_pred = classifier(x_in=batch_dict['x_data'].float())

                # compute the loss
                loss = loss_func(y_pred, batch_dict['y_target'].float())
                loss_t = loss.item()
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                # compute the accuracy
                acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
                running_acc += (acc_t - running_acc) / (batch_index + 1)

                status = f"val\t epoch:{epoch_index}/{args.num_epochs}\t batch_index:{batch_index}\t val_acc: {running_acc:.4}\t val_loss: {running_loss:.4}"
                print(status)

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        # checkpointing and early-stopping bookkeeping
        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        # the scheduler monitors the latest validation loss
        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break
        
except KeyboardInterrupt:
    # allow the run to be interrupted manually without losing train_state
    print("Exiting loop")

train	 epoch:0/3	 batch_index:0	 train_acc: 44.53	 train_loss: 0.696
train	 epoch:0/3	 batch_index:1	 train_acc: 47.27	 train_loss: 0.6935
train	 epoch:0/3	 batch_index:2	 train_acc: 50.0	 train_loss: 0.6928
train	 epoch:0/3	 batch_index:3	 train_acc: 49.61	 train_loss: 0.6916
train	 epoch:0/3	 batch_index:4	 train_acc: 50.31	 train_loss: 0.6886
train	 epoch:0/3	 batch_index:5	 train_acc: 50.91	 train_loss: 0.6871
train	 epoch:0/3	 batch_index:6	 train_acc: 50.89	 train_loss: 0.6845
train	 epoch:0/3	 batch_index:7	 train_acc: 51.17	 train_loss: 0.6835
train	 epoch:0/3	 batch_index:8	 train_acc: 51.74	 train_loss: 0.6819
train	 epoch:0/3	 batch_index:9	 train_acc: 51.8	 train_loss: 0.6815
train	 epoch:0/3	 batch_index:10	 train_acc: 52.34	 train_loss: 0.6797
train	 epoch:0/3	 batch_index:11	 train_acc: 53.58	 train_loss: 0.6771
train	 epoch:0/3	 batch_index:12	 train_acc: 53.37	 train_loss: 0.6774
train	 epoch:0/3	 batch_index:13	 train_acc: 54.19	 train_loss: 0.6766
train	 epoch:0/3	 b

train	 epoch:0/3	 batch_index:117	 train_acc: 78.15	 train_loss: 0.57
train	 epoch:0/3	 batch_index:118	 train_acc: 78.23	 train_loss: 0.5694
train	 epoch:0/3	 batch_index:119	 train_acc: 78.27	 train_loss: 0.5689
train	 epoch:0/3	 batch_index:120	 train_acc: 78.34	 train_loss: 0.5681
train	 epoch:0/3	 batch_index:121	 train_acc: 78.38	 train_loss: 0.5674
train	 epoch:0/3	 batch_index:122	 train_acc: 78.46	 train_loss: 0.5666
train	 epoch:0/3	 batch_index:123	 train_acc: 78.51	 train_loss: 0.5658
train	 epoch:0/3	 batch_index:124	 train_acc: 78.54	 train_loss: 0.5653
train	 epoch:0/3	 batch_index:125	 train_acc: 78.62	 train_loss: 0.5645
train	 epoch:0/3	 batch_index:126	 train_acc: 78.7	 train_loss: 0.5639
train	 epoch:0/3	 batch_index:127	 train_acc: 78.79	 train_loss: 0.5632
train	 epoch:0/3	 batch_index:128	 train_acc: 78.82	 train_loss: 0.5628
train	 epoch:0/3	 batch_index:129	 train_acc: 78.89	 train_loss: 0.5621
train	 epoch:0/3	 batch_index:130	 train_acc: 78.97	 train_loss: 0.

train	 epoch:0/3	 batch_index:232	 train_acc: 81.98	 train_loss: 0.5101
train	 epoch:0/3	 batch_index:233	 train_acc: 82.0	 train_loss: 0.5095
train	 epoch:0/3	 batch_index:234	 train_acc: 82.01	 train_loss: 0.5091
train	 epoch:0/3	 batch_index:235	 train_acc: 82.02	 train_loss: 0.5088
train	 epoch:0/3	 batch_index:236	 train_acc: 82.04	 train_loss: 0.5082
train	 epoch:0/3	 batch_index:237	 train_acc: 82.07	 train_loss: 0.5078
train	 epoch:0/3	 batch_index:238	 train_acc: 82.09	 train_loss: 0.5075
train	 epoch:0/3	 batch_index:239	 train_acc: 82.12	 train_loss: 0.5071
train	 epoch:0/3	 batch_index:240	 train_acc: 82.13	 train_loss: 0.5067
train	 epoch:0/3	 batch_index:241	 train_acc: 82.15	 train_loss: 0.5063
train	 epoch:0/3	 batch_index:242	 train_acc: 82.17	 train_loss: 0.506
train	 epoch:0/3	 batch_index:243	 train_acc: 82.18	 train_loss: 0.5057
train	 epoch:0/3	 batch_index:244	 train_acc: 82.2	 train_loss: 0.5052
train	 epoch:0/3	 batch_index:245	 train_acc: 82.22	 train_loss: 0.

val	 epoch:0/3	 batch_index:45	 val_acc: 88.4	 val_loss: 0.3844
val	 epoch:0/3	 batch_index:46	 val_acc: 88.35	 val_loss: 0.3845
val	 epoch:0/3	 batch_index:47	 val_acc: 88.31	 val_loss: 0.3848
val	 epoch:0/3	 batch_index:48	 val_acc: 88.33	 val_loss: 0.3845
val	 epoch:0/3	 batch_index:49	 val_acc: 88.3	 val_loss: 0.3844
val	 epoch:0/3	 batch_index:50	 val_acc: 88.27	 val_loss: 0.3843
val	 epoch:0/3	 batch_index:51	 val_acc: 88.16	 val_loss: 0.3854
val	 epoch:0/3	 batch_index:52	 val_acc: 88.18	 val_loss: 0.3857
val	 epoch:0/3	 batch_index:53	 val_acc: 88.18	 val_loss: 0.3859
val	 epoch:0/3	 batch_index:54	 val_acc: 88.21	 val_loss: 0.3858
val	 epoch:0/3	 batch_index:55	 val_acc: 88.23	 val_loss: 0.3849
val	 epoch:0/3	 batch_index:56	 val_acc: 88.25	 val_loss: 0.3844
val	 epoch:0/3	 batch_index:57	 val_acc: 88.23	 val_loss: 0.3843
val	 epoch:0/3	 batch_index:58	 val_acc: 88.19	 val_loss: 0.3848
val	 epoch:0/3	 batch_index:59	 val_acc: 88.14	 val_loss: 0.3853
val	 epoch:0/3	 batch_index

train	 epoch:1/3	 batch_index:102	 train_acc: 89.49	 train_loss: 0.3557
train	 epoch:1/3	 batch_index:103	 train_acc: 89.5	 train_loss: 0.3559
train	 epoch:1/3	 batch_index:104	 train_acc: 89.52	 train_loss: 0.3563
train	 epoch:1/3	 batch_index:105	 train_acc: 89.53	 train_loss: 0.3562
train	 epoch:1/3	 batch_index:106	 train_acc: 89.49	 train_loss: 0.3564
train	 epoch:1/3	 batch_index:107	 train_acc: 89.5	 train_loss: 0.3563
train	 epoch:1/3	 batch_index:108	 train_acc: 89.47	 train_loss: 0.3562
train	 epoch:1/3	 batch_index:109	 train_acc: 89.45	 train_loss: 0.3564
train	 epoch:1/3	 batch_index:110	 train_acc: 89.42	 train_loss: 0.3564
train	 epoch:1/3	 batch_index:111	 train_acc: 89.44	 train_loss: 0.3559
train	 epoch:1/3	 batch_index:112	 train_acc: 89.48	 train_loss: 0.3555
train	 epoch:1/3	 batch_index:113	 train_acc: 89.47	 train_loss: 0.3555
train	 epoch:1/3	 batch_index:114	 train_acc: 89.47	 train_loss: 0.3555
train	 epoch:1/3	 batch_index:115	 train_acc: 89.48	 train_loss: 0

train	 epoch:1/3	 batch_index:222	 train_acc: 89.92	 train_loss: 0.3402
train	 epoch:1/3	 batch_index:223	 train_acc: 89.9	 train_loss: 0.3402
train	 epoch:1/3	 batch_index:224	 train_acc: 89.91	 train_loss: 0.3402
train	 epoch:1/3	 batch_index:225	 train_acc: 89.9	 train_loss: 0.3403
train	 epoch:1/3	 batch_index:226	 train_acc: 89.9	 train_loss: 0.3403
train	 epoch:1/3	 batch_index:227	 train_acc: 89.93	 train_loss: 0.34
train	 epoch:1/3	 batch_index:228	 train_acc: 89.91	 train_loss: 0.34
train	 epoch:1/3	 batch_index:229	 train_acc: 89.91	 train_loss: 0.34
train	 epoch:1/3	 batch_index:230	 train_acc: 89.9	 train_loss: 0.34
train	 epoch:1/3	 batch_index:231	 train_acc: 89.89	 train_loss: 0.34
train	 epoch:1/3	 batch_index:232	 train_acc: 89.9	 train_loss: 0.3398
train	 epoch:1/3	 batch_index:233	 train_acc: 89.9	 train_loss: 0.3396
train	 epoch:1/3	 batch_index:234	 train_acc: 89.9	 train_loss: 0.3397
train	 epoch:1/3	 batch_index:235	 train_acc: 89.88	 train_loss: 0.3398
train	 ep

val	 epoch:1/3	 batch_index:38	 val_acc: 90.1	 val_loss: 0.3128
val	 epoch:1/3	 batch_index:39	 val_acc: 90.14	 val_loss: 0.3131
val	 epoch:1/3	 batch_index:40	 val_acc: 90.19	 val_loss: 0.3126
val	 epoch:1/3	 batch_index:41	 val_acc: 90.18	 val_loss: 0.3124
val	 epoch:1/3	 batch_index:42	 val_acc: 90.19	 val_loss: 0.312
val	 epoch:1/3	 batch_index:43	 val_acc: 90.22	 val_loss: 0.3113
val	 epoch:1/3	 batch_index:44	 val_acc: 90.26	 val_loss: 0.3118
val	 epoch:1/3	 batch_index:45	 val_acc: 90.32	 val_loss: 0.3103
val	 epoch:1/3	 batch_index:46	 val_acc: 90.31	 val_loss: 0.3095
val	 epoch:1/3	 batch_index:47	 val_acc: 90.32	 val_loss: 0.3099
val	 epoch:1/3	 batch_index:48	 val_acc: 90.29	 val_loss: 0.31
val	 epoch:1/3	 batch_index:49	 val_acc: 90.25	 val_loss: 0.3098
val	 epoch:1/3	 batch_index:50	 val_acc: 90.21	 val_loss: 0.31
val	 epoch:1/3	 batch_index:51	 val_acc: 90.19	 val_loss: 0.31
val	 epoch:1/3	 batch_index:52	 val_acc: 90.14	 val_loss: 0.3106
val	 epoch:1/3	 batch_index:53	 v

train	 epoch:2/3	 batch_index:92	 train_acc: 91.63	 train_loss: 0.2857
train	 epoch:2/3	 batch_index:93	 train_acc: 91.62	 train_loss: 0.2859
train	 epoch:2/3	 batch_index:94	 train_acc: 91.64	 train_loss: 0.2855
train	 epoch:2/3	 batch_index:95	 train_acc: 91.65	 train_loss: 0.2853
train	 epoch:2/3	 batch_index:96	 train_acc: 91.69	 train_loss: 0.2848
train	 epoch:2/3	 batch_index:97	 train_acc: 91.68	 train_loss: 0.2849
train	 epoch:2/3	 batch_index:98	 train_acc: 91.67	 train_loss: 0.285
train	 epoch:2/3	 batch_index:99	 train_acc: 91.65	 train_loss: 0.2848
train	 epoch:2/3	 batch_index:100	 train_acc: 91.68	 train_loss: 0.2846
train	 epoch:2/3	 batch_index:101	 train_acc: 91.65	 train_loss: 0.2844
train	 epoch:2/3	 batch_index:102	 train_acc: 91.66	 train_loss: 0.2844
train	 epoch:2/3	 batch_index:103	 train_acc: 91.66	 train_loss: 0.2844
train	 epoch:2/3	 batch_index:104	 train_acc: 91.64	 train_loss: 0.2845
train	 epoch:2/3	 batch_index:105	 train_acc: 91.64	 train_loss: 0.2842
t

train	 epoch:2/3	 batch_index:207	 train_acc: 91.59	 train_loss: 0.2794
train	 epoch:2/3	 batch_index:208	 train_acc: 91.6	 train_loss: 0.2793
train	 epoch:2/3	 batch_index:209	 train_acc: 91.61	 train_loss: 0.2793
train	 epoch:2/3	 batch_index:210	 train_acc: 91.6	 train_loss: 0.2793
train	 epoch:2/3	 batch_index:211	 train_acc: 91.59	 train_loss: 0.2792
train	 epoch:2/3	 batch_index:212	 train_acc: 91.6	 train_loss: 0.2791
train	 epoch:2/3	 batch_index:213	 train_acc: 91.58	 train_loss: 0.2792
train	 epoch:2/3	 batch_index:214	 train_acc: 91.57	 train_loss: 0.2794
train	 epoch:2/3	 batch_index:215	 train_acc: 91.58	 train_loss: 0.2792
train	 epoch:2/3	 batch_index:216	 train_acc: 91.57	 train_loss: 0.2793
train	 epoch:2/3	 batch_index:217	 train_acc: 91.59	 train_loss: 0.2792
train	 epoch:2/3	 batch_index:218	 train_acc: 91.6	 train_loss: 0.279
train	 epoch:2/3	 batch_index:219	 train_acc: 91.58	 train_loss: 0.2792
train	 epoch:2/3	 batch_index:220	 train_acc: 91.58	 train_loss: 0.27

val	 epoch:2/3	 batch_index:21	 val_acc: 90.45	 val_loss: 0.2744
val	 epoch:2/3	 batch_index:22	 val_acc: 90.56	 val_loss: 0.2724
val	 epoch:2/3	 batch_index:23	 val_acc: 90.69	 val_loss: 0.2724
val	 epoch:2/3	 batch_index:24	 val_acc: 90.53	 val_loss: 0.2742
val	 epoch:2/3	 batch_index:25	 val_acc: 90.41	 val_loss: 0.2761
val	 epoch:2/3	 batch_index:26	 val_acc: 90.57	 val_loss: 0.2756
val	 epoch:2/3	 batch_index:27	 val_acc: 90.62	 val_loss: 0.2747
val	 epoch:2/3	 batch_index:28	 val_acc: 90.52	 val_loss: 0.2766
val	 epoch:2/3	 batch_index:29	 val_acc: 90.49	 val_loss: 0.2767
val	 epoch:2/3	 batch_index:30	 val_acc: 90.57	 val_loss: 0.276
val	 epoch:2/3	 batch_index:31	 val_acc: 90.62	 val_loss: 0.2763
val	 epoch:2/3	 batch_index:32	 val_acc: 90.62	 val_loss: 0.2761
val	 epoch:2/3	 batch_index:33	 val_acc: 90.67	 val_loss: 0.2749
val	 epoch:2/3	 batch_index:34	 val_acc: 90.6	 val_loss: 0.2748
val	 epoch:2/3	 batch_index:35	 val_acc: 90.76	 val_loss: 0.2731
val	 epoch:2/3	 batch_index

## Compute the loss & accuracy on the test set using the best available model

In [21]:
# Evaluate the saved checkpoint on the held-out test split and record the
# running-mean loss and accuracy in `train_state`.

# map_location lets a checkpoint written on one device (e.g. GPU) be restored
# on whatever device this session is configured to use.
classifier.load_state_dict(torch.load(train_state['model_filename'],
                                      map_location=args.device))
classifier = classifier.to(args.device)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

# Pure evaluation: no backward pass is performed, so disable autograd to avoid
# building a computation graph for every batch.
with torch.no_grad():
    for batch_index, batch_dict in tqdm(enumerate(batch_generator)):
        # compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_t = loss.item()
        # incremental mean: new_mean = old_mean + (x - old_mean) / n
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc



0it [00:00, ?it/s][A[A

2it [00:00, 16.72it/s][A[A

4it [00:00, 18.15it/s][A[A

6it [00:00, 16.98it/s][A[A

8it [00:00, 17.65it/s][A[A

11it [00:00, 18.86it/s][A[A

14it [00:00, 19.53it/s][A[A

17it [00:00, 20.06it/s][A[A

19it [00:00, 19.99it/s][A[A

21it [00:01, 19.80it/s][A[A

23it [00:01, 19.78it/s][A[A

25it [00:01, 19.68it/s][A[A

27it [00:01, 19.62it/s][A[A

29it [00:01, 19.15it/s][A[A

31it [00:01, 18.28it/s][A[A

33it [00:01, 18.33it/s][A[A

35it [00:01, 18.37it/s][A[A

37it [00:02, 18.42it/s][A[A

39it [00:02, 18.22it/s][A[A

41it [00:02, 17.64it/s][A[A

43it [00:02, 17.63it/s][A[A

45it [00:02, 17.68it/s][A[A

47it [00:02, 17.67it/s][A[A

49it [00:02, 17.24it/s][A[A

51it [00:03, 16.85it/s][A[A

53it [00:03, 16.51it/s][A[A

55it [00:03, 16.37it/s][A[A

57it [00:03, 16.46it/s][A[A

60it [00:03, 16.67it/s][A[A

62it [00:03, 16.74it/s][A[A

64it [00:03, 16.82it/s][A[A

65it [00:03, 16.83it/s][A[A

In [22]:
# Report the test-set metrics stored in `train_state`, one line per metric.
for template, key in (("Test loss: {:.3f}", 'test_loss'),
                      ("Test Accuracy: {:.2f}", 'test_acc')):
    print(template.format(train_state[key]))


Test loss: 0.275
Test Accuracy: 91.14


## Prediction (Inference)

In [23]:
def preprocess_text(text):
    """Normalize raw review text.

    The text is lowercased, every occurrence of ``.``, ``,``, ``!`` or ``?``
    is surrounded by spaces, and each run of any other non-letter characters
    is collapsed into a single space.

    Args:
        text (str): the raw text
    Returns:
        str: the normalized text (leading/trailing spaces are not stripped)
    """
    spaced_punctuation = re.sub(r"([.,!?])", r" \1 ", text.lower())
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced_punctuation)

In [26]:
def predict_rating(review, classifier, vectorizer, decision_threshold=0.5):
    """Predict the rating of a review
    
    Args:
        review (str): the text of the review
        classifier (ReviewClassifier): the trained model
        vectorizer (ReviewVectorizer): the corresponding vectorizer
        decision_threshold (float): The numerical boundary which separates the rating classes
    Returns:
        the entry that ``vectorizer.rating_vocab.lookup_index`` returns for the
        predicted class index (1 when the sigmoid probability is at or above
        ``decision_threshold``, otherwise 0)
    """
    review = preprocess_text(review)
    
    # NOTE(review): relies on the vectorizer object exposing a method named
    # `vectorizer`; confirm against the ReviewVectorizer definition.
    vectorized_review = torch.tensor(vectorizer.vectorizer(review))

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        # view(1, -1) turns the single vector into a batch of size one
        result = classifier(vectorized_review.view(1, -1))
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid
        probability_value = torch.sigmoid(result).item()

    index = 0 if probability_value < decision_threshold else 1

    return vectorizer.rating_vocab.lookup_index(index)

In [28]:
import warnings

In [29]:
warnings.filterwarnings("ignore")

In [30]:
# Classify one hand-written sentence with the trained model, on the CPU.
classifier = classifier.cpu()

test_review = "Sudipa is a beautiful girl"
prediction = predict_rating(
    review=test_review,
    classifier=classifier,
    vectorizer=vectorizer,
    decision_threshold=0.5,
)
print("{} -> {}".format(test_review, prediction))

Sudipa is a beautiful girl -> positive


## Interpretability

In [31]:
# Shape of the `fc1` weight tensor; the recorded output below is torch.Size([1, 7326]).
classifier.fc1.weight.shape

torch.Size([1, 7326])

### Sort weights

In [32]:
# Take the single row of the `fc1` weight matrix (detached from autograd) and
# record the positions of its entries ordered from largest to smallest weight.
fc1_weights = classifier.fc1.weight.detach()[0]
descending_sort = torch.sort(fc1_weights, dim=0, descending=True)
indices = descending_sort.indices.numpy().tolist()

### Top 20 positive words

In [33]:
# `indices` is ordered by descending weight, so its first 20 entries are the
# vocabulary positions with the largest weights.
print("Influential words in Positive Reviews:")
print("--------------------------------------")
for word_index in (indices[rank] for rank in range(20)):
    print(vectorizer.review_vocab.lookup_index(word_index))

Influential words in Positive Reviews:
--------------------------------------
great
delicious
amazing
awesome
excellent
love
vegas
fantastic
perfect
best
favorite
friendly
definitely
wonderful
loved
always
yummy
helpful
highly
yum


### Top 20 negative words

In [34]:
# `indices` is ordered by descending weight, so the 20 smallest weights are
# its last 20 entries, visited here from the very smallest upward.
print("Influential words in Negative Reviews:")
print("--------------------------------------")
# Iterate over a reversed *copy* rather than calling indices.reverse():
# reversing in place made this cell non-idempotent (a second run would list
# the positive words under this heading) and silently changed `indices` for
# every other cell that reads it.
for word_index in indices[::-1][:20]:
    print(vectorizer.review_vocab.lookup_index(word_index))

Influential words in Negative Reviews:
--------------------------------------
worst
bland
horrible
mediocre
rude
terrible
overpriced
not
awful
ok
poor
meh
dirty
nothing
disgusting
tasteless
worse
disappointing
poorly
gross
