# Machine Learning with PyTorch and Scikit-Learn  
# -- Code Examples

## Package version checks

Add folder to path in order to load from the check_packages.py script:

In [1]:
import sys
sys.path.insert(0, '..')

Check recommended package versions:

In [2]:
from python_environment_check import check_packages


d = {
    'torch': '1.8.0',
    'torchtext': '0.10.0'
}
check_packages(d)

[OK] Your Python version is 3.9.23 (main, Sep 18 2025, 19:32:12) 
[Clang 20.1.4 ]


OSError: dlopen(/Users/mihailmihaylov/Desktop/Study/DL/.venv/lib/python3.9/site-packages/torchtext/lib/libtorchtext.so, 0x0006): Symbol not found: __ZN3c105ErrorC1ENSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEES7_PKv
  Referenced from: <DF3ABA20-1A43-316C-A7EC-B234DC9E3163> /Users/mihailmihaylov/Desktop/Study/DL/.venv/lib/python3.9/site-packages/torchtext/lib/libtorchtext.so
  Expected in:     <D1057909-9D16-345A-8B8C-73FA6C3DB62C> /Users/mihailmihaylov/Desktop/Study/DL/.venv/lib/python3.9/site-packages/torch/lib/libc10.dylib

# Chapter 15: Modeling Sequential Data Using Recurrent Neural Networks (Part 2/3)

**Outline**

- [Implementing RNNs for sequence modeling in PyTorch](#Implementing-RNNs-for-sequence-modeling-in-PyTorch)
  - [Project one -- predicting the sentiment of IMDb movie reviews](#Project-one----predicting-the-sentiment-of-IMDb-movie-reviews)
    - [Preparing the movie review data](#Preparing-the-movie-review-data)
    - [Embedding layers for sentence encoding](#Embedding-layers-for-sentence-encoding)
    - [Building an RNN model](#Building-an-RNN-model)
    - [Building an RNN model for the sentiment analysis task](#Building-an-RNN-model-for-the-sentiment-analysis-task)
      - [More on the bidirectional RNN](#More-on-the-bidirectional-RNN)

In [2]:
from IPython.display import Image
%matplotlib inline

# Implementing RNNs for sequence modeling in PyTorch

## Project one: predicting the sentiment of IMDb movie reviews

### Preparing the movie review data



In [3]:
import torch
import torch.nn as nn

In [None]:
# pip install torchtext==0.10.0

**Attention**: To reproduce the code in the books, please make sure to use torchtext 0.10.0 (see https://pypi.org/project/torchtext/0.10.0/), which is the package I used for this chapter. 

There are a few adjustments in this notebooks that also make it compatible to newer versions of torchtext.

For newer versions of torchtext, installing portalocker may be necessary:

In [None]:
# pip install torchtext

In [None]:
# pip install portalocker

In [4]:
import os

def load_data(path, file_list, dataset, encoding='utf8'):
    """Read set of files from given directory and save returned lines to list.
    
    Parameters
    ----------
    path : str
        Absolute or relative path to given file (or set of files).
    file_list: list
        List of files names to read.
    dataset: list
        List that stores read lines.
    encoding: str, optional (default='utf8')
        File encoding.
        
    """
    for file in file_list:
        with open(os.path.join(path, file), 'r', encoding=encoding) as text:
            dataset.append(text.read())

In [5]:
# Path to dataset location
path = 'aclImdb/'

# Create lists that will contain read lines
train_pos, train_neg, test_pos, test_neg = [], [], [], []

# Create a dictionary of paths and lists that store lines (key: value = path: list)
sets_dict = {'train/pos/': train_pos, 'train/neg/': train_neg,
             'test/pos/': test_pos, 'test/neg/': test_neg}

# Load the data
for dataset in sets_dict:
	file_list = [f for f in os.listdir(os.path.join(path, dataset)) if f.endswith('.txt')]
	load_data(os.path.join(path, dataset), file_list, sets_dict[dataset])



In [6]:
import pandas as pd

# Concatenate training and testing examples into one dataset
dataset_train_pd = pd.concat([pd.DataFrame({'review': train_pos, 'label':1}),
                              pd.DataFrame({'review': train_neg, 'label':0})])
dataset_test_pd = pd.concat(
[                     pd.DataFrame({'review': test_pos, 'label':1}),
                     pd.DataFrame({'review': test_neg, 'label':0})],
                     axis=0, ignore_index=True)

dataset_train_pd.head()

Unnamed: 0,review,label
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1


In [7]:
from torch.utils.data import Dataset

In [8]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __getitem__(self, index):
        row = self.dataframe.iloc[index].to_numpy()
        features = row[0]
        label = row[1]
        return features, label

    def __len__(self):
        return len(self.dataframe)

In [9]:
from torch.utils.data.dataset import random_split

# Step 1: load and create the datasets

train_dataset = CustomDataset(dataset_train_pd)
test_dataset = CustomDataset(dataset_test_pd)

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(
    train_dataset, [20000, 5000])


In [10]:
## Step 2: find unique tokens (words)
import re
from collections import Counter, OrderedDict

token_counts = Counter()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for idx in range(len(train_dataset)):
    line, label = train_dataset[idx]
    tokens = tokenizer(line)
    token_counts.update(tokens)

    
print('Vocab-size:', len(token_counts))

Vocab-size: 69396


In [14]:
tokenizer("fisheris")

['fisheris']

In [11]:
## Step 3: encoding each unique token into integers
# from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
tokens = ["<pad>", "<unk>"]
# Save room for special tokens
for token, freq in ordered_dict.items():
	if freq >= 1:
		tokens.append(token)

# vocab = vocab(ordered_dict)

# vocab.insert_token("<pad>", 0)
# vocab.insert_token("<unk>", 1)
# vocab.set_default_index(1)

print([tokens.index(token) for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 459]


In [12]:
if not torch.cuda.is_available():
    print("Warning: this code may be very slow on CPU")



In [None]:
## Step 3-A: define the functions for transformation

device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
print(device)

def text_pipeline(x):
    return [tokens.index(token) if token in tokens else tokens.index('<unk>') for token in tokenizer(x)]

# from torchtext import __version__ as torchtext_version

# if parse_version(torchtext.__version__) > parse_version("0.10"):
#     label_pipeline = lambda x: 1. if x == 2 else 0.         # 1 ~ negative, 2 ~ positive review
# else:
#     label_pipeline = lambda x: 1. if x == 'pos' else 0.


## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _text, _label in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), 
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.float32)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

mps


In [21]:
## Take a small batch

from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([[  11,    7,  684,  ..., 1787,  328, 1227],
        [  10,   68,  113,  ...,    0,    0,    0],
        [  10,  276,   21,  ...,    0,    0,    0],
        [  10,  212,   11,  ...,    0,    0,    0]], device='mps:0')
tensor([0., 0., 0., 1.], device='mps:0')
tensor([473, 288, 204, 252], device='mps:0')
torch.Size([4, 473])


In [22]:
## Step 4: batching the datasets

batch_size = 32  

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

### Embedding layers for sentence encoding


 * `input_dim`: number of words, i.e. maximum integer index + 1.
 * `output_dim`: 
 * `input_length`: the length of (padded) sequence
    * for example, `'This is an example' -> [0, 0, 0, 0, 0, 0, 3, 1, 8, 9]`   
    => input_lenght is 10
 
 

 * When calling the layer, takes integr values as input,   
 the embedding layer convert each interger into float vector of size `[output_dim]`
   * If input shape is `[BATCH_SIZE]`, output shape will be `[BATCH_SIZE, output_dim]`
   * If input shape is `[BATCH_SIZE, 10]`, output shape will be `[BATCH_SIZE, 10, output_dim]`

In [None]:
Image(filename='figures/15_10.png', width=600)

In [23]:
embedding = nn.Embedding(num_embeddings=10, 
                         embedding_dim=3, 
                         padding_idx=0)
 
# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))


tensor([[[ 0.7040, -0.5137, -0.4007],
         [-0.6880,  0.0967, -0.9361],
         [-0.3003,  0.3909, -0.3700],
         [-0.3637,  0.6608, -0.8047]],

        [[-0.3003,  0.3909, -0.3700],
         [-0.2845, -0.9217, -1.1328],
         [-0.6880,  0.0967, -0.9361],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


### Building an RNN model

* **RNN layers:**
  * `nn.RNN(input_size, hidden_size, num_layers=1)`
  * `nn.LSTM(..)`
  * `nn.GRU(..)`
  * `nn.RNN(input_size, hidden_size, num_layers=1, bidirectional=True)`
 
 

In [19]:
## An example of building a RNN model
## with simple RNN layer

# Fully connected neural network with one hidden layer
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, 
                          hidden_size, 
                          num_layers=2, 
                          batch_first=True)
        #self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        #self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]
        out = self.fc(out)
        return out

model = RNN(64, 32) 

print(model) 
 
model(torch.randn(5, 3, 64)) 

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


tensor([[ 0.3183],
        [ 0.1230],
        [ 0.1772],
        [-0.1052],
        [-0.1259]], grad_fn=<AddmmBackward0>)

### Building an RNN model for the sentiment analysis task

In [24]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
         
vocab_size = len(tokens)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

In [25]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [26]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10 

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')


Epoch 0 accuracy: 0.5988 val_accuracy: 0.6400
Epoch 1 accuracy: 0.7103 val_accuracy: 0.6214
Epoch 2 accuracy: 0.7403 val_accuracy: 0.7342
Epoch 3 accuracy: 0.8066 val_accuracy: 0.8184
Epoch 4 accuracy: 0.8655 val_accuracy: 0.8358
Epoch 5 accuracy: 0.8917 val_accuracy: 0.8492
Epoch 6 accuracy: 0.8650 val_accuracy: 0.7790
Epoch 7 accuracy: 0.8976 val_accuracy: 0.8638
Epoch 8 accuracy: 0.9352 val_accuracy: 0.8718
Epoch 9 accuracy: 0.9533 val_accuracy: 0.8684


In [27]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}') 

test_accuracy: 0.8594


#### More on the bidirectional RNN

 * **Trying bidirectional recurrent layer**

In [28]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
    
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

In [29]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

num_epochs = 10 

torch.manual_seed(1)
 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.6602 val_accuracy: 0.7598
Epoch 1 accuracy: 0.7685 val_accuracy: 0.5862
Epoch 2 accuracy: 0.7177 val_accuracy: 0.8282
Epoch 3 accuracy: 0.8851 val_accuracy: 0.8692
Epoch 4 accuracy: 0.9302 val_accuracy: 0.8802
Epoch 5 accuracy: 0.9587 val_accuracy: 0.8798
Epoch 6 accuracy: 0.9774 val_accuracy: 0.8754
Epoch 7 accuracy: 0.9872 val_accuracy: 0.8746
Epoch 8 accuracy: 0.9925 val_accuracy: 0.8744
Epoch 9 accuracy: 0.9961 val_accuracy: 0.8724


In [None]:
# test_dataset = IMDB(split='test')
# test_dl = DataLoader(test_dataset, batch_size=batch_size,
#                      shuffle=False, collate_fn=collate_batch)

In [30]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}') 

test_accuracy: 0.8594


## Optional exercise: 

### Uni-directional SimpleRNN with full-length sequences

<br>
<br>

---



Readers may ignore the next cell.


In [None]:
! python ../.convert_notebook_to_script.py --input ch15_part2.ipynb --output ch15_part2.py