In [2]:
#%pip install wget
#%pip install torch
import data_rnn
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [3]:
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = data_rnn.load_imdb(final=False)

# Part 1: Classification: data loading

In [4]:
x_train[0], y_train[0] # 0 is positive, 1 is negative

([14, 19, 9, 379, 22, 11, 50, 52, 53, 290], 1)

In [5]:
[i2w[word] for word in x_train[0]] # what is vocab?
len(i2w) # what is vocab size?

99430

In [6]:
lengths = [len(l) for l in x_train]
print(np.min(lengths), np.max(lengths), np.average(lengths)) # stats on sequences

10 2514 240.6318


In [7]:
w2i['.pad'], w2i['.start'], w2i['.end'], w2i['.unk'] # special tokens and their ids

(0, 1, 2, 3)

## Question 1: Padding and Conversion

In [8]:
# defining a pad  function
def pad(seq, pad_length):
    padded = np.zeros(pad_length) # 0 is for padding
    padded[0:len(seq)] = seq
    return torch.tensor(padded, dtype=torch.long)

In [9]:
padded = pad(x_train[0], 12).reshape(1,-1)
padded.shape

torch.Size([1, 12])

# Part 2: Classification, baseline model

## Question 2: Baseline model

In [10]:
# Take a tensor x, and return max across time dimension
def MaxPoolTime(x):
    return torch.amax(x, dim=1)

class MLPModel(torch.nn.Module):
    def __init__(self, batch_size=1):
        super().__init__()
        timestep = 12
        numcls = 2
        hidden = 300
        embedding_size = 300
        n_embeddings = len(i2w)
        self.emb = torch.nn.Embedding(n_embeddings, embedding_size, padding_idx=0)
        self.fc1 = torch.nn.Linear(embedding_size, hidden)
        self.fc2 = torch.nn.Linear(hidden, numcls)
    
    def forward(self, x):
        x = self.emb(x)
        x = self.fc1(x)
        x = torch.nn.functional.relu(x)
        x = MaxPoolTime(x)
        x = self.fc2(x)

        return x

In [11]:
# quick test
baseline_model = MLPModel()
y = baseline_model.forward(padded)
torch.nn.functional.softmax(y)

  torch.nn.functional.softmax(y)


tensor([[0.5154, 0.4846]], grad_fn=<SoftmaxBackward0>)

## Question 3: Training Loop

Pad train and validation

In [12]:
# pad all
padding_size = np.max(lengths) # how big should the padding be? --> max seq length

# pad train and validation set
padded_train = torch.stack([pad(x, padding_size) for x in x_train])
padded_val = torch.stack([pad(x, padding_size) for x in x_val])

In [13]:
padded_train.shape # check shape

torch.Size([20000, 2514])

Create train and validation datalaoders

In [22]:
batch_size = 1024

# set to device
padded_train = padded_train.to(device)
padded_val = padded_val.to(device)
y_train_tensor = torch.tensor(y_train).to(device)
y_val_tensor = torch.tensor(y_val).to(device)

# create train and val datasets with instance and label pairs
train_dataset = TensorDataset(padded_train, torch.tensor(y_train))
validation_dataset = TensorDataset(padded_val, torch.tensor(y_val))

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
testloader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)


In [16]:
def validate(model, validation_loader):
    val_acc = 0
    val_correct = 0
    total_samples = 0
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(validation_loader):
            instances, labels = data
            fwd = model(instances)
            predictions = torch.argmax(fwd, dim=1)
            val_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)

    # Compute and return accuracy
    val_acc = val_correct / total_samples
    return val_acc

Training loop

In [26]:
def train_model(model, trainloader, testloader, optimizer, nr_epochs=5):
    model.train()
    for epoch in range(nr_epochs):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, batch in enumerate(trainloader, 0):
            instances, labels = batch

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(instances)
            loss = torch.nn.functional.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % len(trainloader)/len(batch)*2 == 0:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 200:.3f}')
                running_loss = 0.0
        val_acc = validate(model, testloader)
        print(f'Epoch {epoch}, validation acc.: {val_acc}')
    print('Finished Training')
    return model

[1,   200] loss: 0.713
[1,   400] loss: 0.708
[1,   600] loss: 0.713
Epoch 0, validation acc.: 0.4926
[2,   200] loss: 0.714


Train baseline model

In [None]:
optimizer = optim.Adam(baseline_model.parameters(), lr=0.001)
baseline_model = MLPModel(batch_size).to(device)
final_model = train_model(baseline_model, trainloader, testloader, optimizer, 5)

# Part 3: Writing your own Elman RNN

This only answers "complete the missing parts" question. The full implementation on the dataset is in q4.py

In [None]:
class Elman(torch.nn.Module):
    def __init__(self, insize=300, outsize=300, hsize=300):
        super().__init__()
        self.lin1 = torch.nn.Linear(insize+hsize, hsize)
        self.lin2 = torch.nn.Linear(hsize, outsize)

    def forward(self, x, hidden=None): 
        '''
        b: batch size
        t: time steps (ie. sequence length)
        e: dimension of each input vector
        '''

        b, t, e = x.size() 
        if hidden is None:
            hidden = torch.zeros(b, e, dtype=torch.float) #make a tensor of inputs (bxe) 
            
        outs = []
        for i in range(t): #iterate through each value of the sequence
            inp = torch.cat([x[:, i, :], hidden], dim=1) #take only the value being iterated 
            inp = self.lin1(inp)
            hidden = torch.nn.functional.sigmoid(inp)
            out = self.lin2(hidden)
            outs.append(out[:, None, :])

In [None]:
class ElmanModel(nn.Module):
    def __init__(self):
        super().__init__()
        hidden = 300
        embedding_size = 300
        num_classes = 2
        vocab_size = len(i2w)

        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.fc1 = Elman(embedding_size, hidden)
        self.fc2 = nn.Linear(hidden, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = nn.functional.relu(x[0])
        x = torch.amax(x, dim=1)  # Max pooling across the time dimension
        x = self.fc2(x)
        return x

Question 5

In [None]:
from question5.hypertuning import main as run_hypertuning

run_hypertuning()