<a href="https://colab.research.google.com/github/mappls/Playfield/blob/master/%5Bpytorch%5D%201_classify_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [PyTorch] Feed-forward neural network on classifying movie reviews

I've redone this small project in PyTorch; the earlier version was written with Keras and can be found at this [GitHub link](https://github.com/mappls/Playfield/blob/master/keras/1_classify_movie_reviews.ipynb). Look at the Keras notebook for a comparison and for more details.

In short, the idea here is to take movie reviews from the IMDB dataset and classify each one as positive or negative (binary classification). This is one of the simplest approaches possible: encode each review as a multi-hot vector, where 1s are placed at the positions of words that are present in the review and 0s everywhere else. The length of each vector is the same as the size of the word dictionary (I've used 10k). The model I use is a simple feed-forward neural network with 3 layers in total.


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
from keras.datasets import imdb
import plotly.express as px

In [None]:
# Length of every multi-hot review vector (one slot per dictionary word).
ONE_HOT_DIM = 10_000

# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Prepare data

In [None]:
# Dictionary (word -> index) and reverse dictionary (index -> word)
word_index = imdb.get_word_index()
# Invert the mapping that is already in memory instead of loading the word
# index a second time.
reverse_word_index = {index: word for word, index in word_index.items()}

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [None]:
def decode_review(review_in_numbers):
    """Turn a review given as word indices back into a space-separated string.

    Every index is shifted down by 3 before it is looked up in
    ``reverse_word_index``; an index with no entry there is shown as '?'.
    """
    decoded_words = (
        reverse_word_index.get(index - 3, '?') for index in review_in_numbers
    )
    return ' '.join(decoded_words)

In [None]:
# Limit the vocabulary to ONE_HOT_DIM words so that every word index fits
# inside the multi-hot vectors of length ONE_HOT_DIM built by MovieDataset.
imdb_data = imdb.load_data(num_words=ONE_HOT_DIM)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
class MovieDataset(Dataset):
    """Multi-hot encoded movie reviews for one of the 'train', 'dev' or 'test' splits.

    Args:
        data: ``((train_x, train_y), (test_x, test_y))``; each ``x`` is a
            sequence of word-index lists and each ``y`` holds the labels.
        group: ``'train'`` (training samples after the first ``DEV_SIZE``),
            ``'dev'`` (the first ``DEV_SIZE`` training samples) or ``'test'``.
        one_hot_dim: length of each encoded vector; it must be larger than
            every word index that appears in the sequences.
        transform: optional callable applied to a single encoded sample when
            it is fetched.
        target_transform: optional callable applied to a single label when it
            is fetched.

    Raises:
        ValueError: if ``group`` is not one of the three known splits.
    """

    # Number of leading training samples that are held out as the 'dev' split.
    DEV_SIZE = 10000

    def __init__(self, data, group, one_hot_dim=ONE_HOT_DIM, transform=None, target_transform=None):
        (train_x_orig, train_y_orig), (test_x_orig, test_y_orig) = data

        if group == 'train':
            sequences, labels = train_x_orig[self.DEV_SIZE:], train_y_orig[self.DEV_SIZE:]
        elif group == 'dev':
            sequences, labels = train_x_orig[:self.DEV_SIZE], train_y_orig[:self.DEV_SIZE]
        elif group == 'test':
            sequences, labels = test_x_orig, test_y_orig
        else:
            # Fail here rather than later with an AttributeError on self.X.
            raise ValueError(
                f"group must be 'train', 'dev' or 'test', got {group!r}"
            )

        # Allocate the matrix as float32 from the start instead of building a
        # float64 matrix and converting a full copy afterwards.
        self.X = self.vectorize_sequences(sequences, one_hot_dim, dtype='float32')
        self.y = np.asarray(labels, dtype='float32')

        self.one_hot_dim = one_hot_dim
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of samples in this split."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(encoded_review, label)`` for position ``idx``."""
        sample, target = self.X[idx, :], self.y[idx]
        # Transform only the fetched item and leave the stored arrays alone;
        # otherwise the transform would be re-applied to the whole dataset on
        # every access.
        if self.transform:
            sample = self.transform(sample)
        if self.target_transform:
            target = self.target_transform(target)
        return sample, target

    def vectorize_sequences(self, sequences, dimension=ONE_HOT_DIM, dtype='float64'):
        """Multi-hot encode ``sequences`` into a ``(len(sequences), dimension)`` matrix.

        Args:
            sequences: iterable of word-index lists, one per review.
            dimension: number of columns of the result.
            dtype: NumPy dtype of the result (float64 by default).

        Returns:
            Matrix with 1 in row ``i`` at every index listed in
            ``sequences[i]`` and 0 elsewhere.
        """
        # Start from an all-zero matrix
        results = np.zeros((len(sequences), dimension), dtype=dtype)
        for i, sequence in enumerate(sequences):
            # `sequence` lists the word indices of the review whose encoding is 1
            results[i, sequence] = 1
        return results

In [None]:
# One dataset per split, all built from the same loaded IMDB data.
ds_train, ds_dev, ds_test = (
    MovieDataset(imdb_data, split) for split in ('train', 'dev', 'test')
)
type(ds_train)

__main__.MovieDataset

In [None]:
BATCH_SIZE = 64

# Reshuffle the training samples at every epoch so that SGD does not see the
# same mini-batches in the same order each time. The evaluation loaders keep
# a fixed order because order does not affect their averaged metrics.
train_dataloader = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = DataLoader(ds_dev, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(ds_test, batch_size=BATCH_SIZE)
print(type(train_dataloader))

<class 'torch.utils.data.dataloader.DataLoader'>


In [None]:
# Look at the first training batch only: shapes, container types and dtypes.
for x, y in train_dataloader:
    report = [
        f"Shapes {x.shape} {y.shape}",
        f"{type(x)} {type(y)}",
        f"{x.dtype} {y.dtype}",
    ]
    print("\n".join(report))
    break

Shapes torch.Size([64, 10000]) torch.Size([64])
<class 'torch.Tensor'> <class 'torch.Tensor'>
torch.float32 torch.float32


In [None]:
# Batches per epoch, number of training samples, and samples / batch size.
# The batch count is that ratio rounded up, since the last batch is partial.
print(len(train_dataloader))
print(len(train_dataloader.dataset))
print(len(train_dataloader.dataset) / BATCH_SIZE)

235
15000
234.375


# Model

In [None]:
class NeuralNet(nn.Module):
    """Feed-forward binary classifier over multi-hot review vectors.

    Three linear layers (ONE_HOT_DIM -> 16 -> 16 -> 1) with a ReLU after each
    of the first two and a sigmoid on the final output.
    """

    def __init__(self):
        super().__init__()
        stack = [
            nn.Linear(ONE_HOT_DIM, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid(),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return a 1-D tensor."""
        scores = self.layers(x)
        # Reshape to a vector with one entry per element of the first dimension.
        batch_size = scores.shape[0]
        return scores.reshape(batch_size)

In [None]:
def train(dataloader, model_, loss_fn, optimizer):
    """Run one training epoch and return ``(mean batch loss, accuracy)``.

    Args:
        dataloader: yields ``(X, y)`` batches; ``len(dataloader.dataset)`` is
            used as the sample count for the accuracy.
        model_: module that maps a batch ``X`` to one prediction per sample.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: optimizer that updates the parameters of ``model_``.

    Returns:
        Tuple of the loss averaged over the batches and the fraction of
        samples whose thresholded prediction equals the label.
    """
    num_samples = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss, correct = 0, 0

    # Send batches to wherever the model lives; fall back to the global DEVICE
    # only for a model that has no parameters.
    first_param = next(model_.parameters(), None)
    device = DEVICE if first_param is None else first_param.device

    model_.train()
    for X, y in dataloader:
        y = y.float()
        X, y = X.to(device), y.to(device)

        pred = model_(X)
        loss = loss_fn(pred, y)
        train_loss += loss.item()

        # Inclusive 0.5 threshold, the same rule test() applies, so that the
        # train and test accuracies are computed identically.
        pred_bin = pred >= 0.5
        correct += (pred_bin == y).type(torch.float).sum().item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= num_batches
    correct /= num_samples

    # Report the epoch average (the value that is returned), not the loss of
    # whichever batch happened to come last.
    print(f"Train loss: {train_loss:>7f}, Accuracy: {(100*correct):>0.1f}%")
    return (train_loss, correct)

In [None]:
def test(dataloader, model_, loss_fn):
    """Evaluate ``model_`` on ``dataloader`` and return ``(mean batch loss, accuracy)``.

    Args:
        dataloader: yields ``(X, y)`` batches; ``len(dataloader.dataset)`` is
            used as the sample count for the accuracy.
        model_: module that maps a batch ``X`` to one prediction per sample.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar loss tensor.

    Returns:
        Tuple of the loss averaged over the batches and the fraction of
        samples whose thresholded prediction equals the label.
    """
    num_samples = len(dataloader.dataset)
    num_batches = len(dataloader)

    # Send batches to wherever the model lives; fall back to the global DEVICE
    # only for a model that has no parameters.
    first_param = next(model_.parameters(), None)
    device = DEVICE if first_param is None else first_param.device

    model_.eval()
    test_loss, correct = 0, 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for X, y in dataloader:
            # Cast the targets to float, as train() does, so the loss gets
            # float targets whatever dtype the dataset yields.
            y = y.float()
            X, y = X.to(device), y.to(device)
            pred = model_(X)
            # Reshape to one value per sample along the first dimension.
            pred = pred.reshape(pred.shape[0])
            batch_loss = loss_fn(pred, y)
            test_loss += batch_loss.item()
            pred_bin = pred >= 0.5
            correct += (pred_bin == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= num_samples

    print(f"Test loss: {test_loss:>7f}, Accuracy: {(100*correct):>0.1f}%\n")
    return (test_loss, correct)

In [None]:
# BCELoss is calculated in this way (element by element, before averaging). We can use this NumPy version to check loss values by hand.
import numpy as np

def my_loss(pred_, y_):
    """Element-wise binary cross-entropy, mirroring what ``nn.BCELoss`` computes.

    Args:
        pred_: predicted probabilities (array-like).
        y_: target labels with the same shape as ``pred_`` (array-like).

    Returns:
        NumPy array with one loss value per element (not averaged); take
        ``np.mean`` of it to get a single number for the batch.
    """
    pred_ = np.asarray(pred_)
    y_ = np.asarray(y_)
    # log(0) is -inf and -inf * 0 is nan. Like BCELoss, clamp each log term at
    # -100 so that predictions of exactly 0 or 1 give a finite loss.
    with np.errstate(divide="ignore"):
        log_p = np.maximum(np.log(pred_), -100.0)
        log_not_p = np.maximum(np.log(1 - pred_), -100.0)
    return -(log_p * y_ + log_not_p * (1 - y_))

In [None]:
# Mean loss for two hand-written batches of (predictions, targets).
for sample_pred, sample_y in (
    ([0.4864, 0.4886, 0.4903], [0., 0., 0.]),
    ([0.4876, 0.4855, 0.4862], [0., 1., 0.]),
):
    l = my_loss(sample_pred, sample_y)
    print(np.mean(l))

0.6702822347891925
0.6857156315055564


In [None]:
# Model, loss and optimizer for the run; `history` gets one record per epoch.
m = NeuralNet().to(DEVICE)
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(m.parameters(), lr=1e-2)
history = []

# NOTE(review): the per-epoch evaluation below runs on test_dataloader;
# dev_dataloader is not used in the cells shown here - confirm this is intended.
EPOCHS = 30
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}\n----------------------")
    train_loss, train_acc = train(train_dataloader, m, loss_fn, optimizer)
    test_loss, test_acc = test(test_dataloader, m, loss_fn)
    history.append(dict(
        epoch=epoch,
        train_loss=train_loss,
        train_acc=train_acc,
        test_loss=test_loss,
        test_acc=test_acc,
    ))

Epoch 1
----------------------
Train loss: 0.696440, Accuracy: 50.4%
Test loss: 0.690059, Accuracy: 50.0%

Epoch 2
----------------------
Train loss: 0.685386, Accuracy: 52.8%
Test loss: 0.680348, Accuracy: 60.1%

Epoch 3
----------------------
Train loss: 0.666859, Accuracy: 73.6%
Test loss: 0.662459, Accuracy: 77.6%

Epoch 4
----------------------
Train loss: 0.628641, Accuracy: 80.1%
Test loss: 0.624223, Accuracy: 80.0%

Epoch 5
----------------------
Train loss: 0.553751, Accuracy: 81.3%
Test loss: 0.552815, Accuracy: 81.3%

Epoch 6
----------------------
Train loss: 0.443629, Accuracy: 83.1%
Test loss: 0.462246, Accuracy: 82.9%

Epoch 7
----------------------
Train loss: 0.339840, Accuracy: 84.9%
Test loss: 0.395120, Accuracy: 84.4%

Epoch 8
----------------------
Train loss: 0.265431, Accuracy: 86.5%
Test loss: 0.357096, Accuracy: 85.5%

Epoch 9
----------------------
Train loss: 0.214887, Accuracy: 87.5%
Test loss: 0.334942, Accuracy: 86.2%

Epoch 10
----------------------
Train

In [None]:
# Reshape the per-epoch history into long form: one row per (epoch, set) with
# shared "loss" / "acc" columns, so that a plot can be colored by "set".
df = pd.DataFrame(history, columns=history[0].keys())

# rename() and assign() return new frames, so nothing is written into a slice
# of `df` and pandas has no reason to emit SettingWithCopyWarning.
df_plot = (
    df[["epoch", "train_loss", "train_acc"]]
    .rename(columns={"train_loss": "loss", "train_acc": "acc"})
    .assign(set="train")
)

df_test = (
    df[["epoch", "test_loss", "test_acc"]]
    .rename(columns={"test_loss": "loss", "test_acc": "acc"})
    .assign(set="test")
)
df_plot = pd.concat([df_plot, df_test])
df_plot.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,epoch,loss,acc,set
0,1,0.693516,0.503533,train
1,2,0.685073,0.527733,train
2,3,0.671614,0.736333,train
3,4,0.643678,0.8006,train
4,5,0.587245,0.813133,train


In [None]:
# Loss per epoch, drawn as one line per value of the "set" column.
px.line(df_plot, x="epoch", y="loss", color="set")

In [None]:
# Accuracy per epoch, drawn as one line per value of the "set" column.
px.line(df_plot, x="epoch", y="acc", color="set")