In [2]:
#%pip install wget
#%pip install torch
import data_rnn
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [7]:
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = data_rnn.load_imdb(final=False)

# Part 1: Classification: data loading

In [8]:
x_train[0], y_train[0] # 0 is positive, 1 is negative

([14, 19, 9, 379, 22, 11, 50, 52, 53, 290], 1)

In [9]:
[i2w[word] for word in x_train[0]] # what is vocab?
len(i2w) # what is vocab size?

99430

In [10]:
lengths = [len(l) for l in x_train]
print(np.min(lengths), np.max(lengths), np.average(lengths)) # stats on sequences

10 2514 240.6318


In [11]:
w2i['.pad'], w2i['.start'], w2i['.end'], w2i['.unk'] # special tokens and their ids

(0, 1, 2, 3)

## Question 1: Padding and Conversion

In [12]:
# defining a pad  function
def pad(seq, pad_length):
    padded = np.zeros(pad_length) # 0 is for padding
    padded[0:len(seq)] = seq
    return torch.tensor(padded, dtype=torch.long)

In [13]:
padded = pad(x_train[0], 12).reshape(1,-1)
padded.shape

torch.Size([1, 12])

# Part 2: Classification, baseline model

## Question 2: Baseline model

In [14]:
# Take a tensor x, and return max across time dimension
def MaxPoolTime(x):
    return torch.amax(x, dim=1)

class MlpModel(torch.nn.Module):
    def __init__(self, batch_size=1):
        super().__init__()
        timestep = 12
        numcls = 2
        hidden = 300
        embedding_size = 300
        n_embeddings = len(i2w)
        self.emb = torch.nn.Embedding(n_embeddings, embedding_size, padding_idx=0)
        self.fc1 = torch.nn.Linear(embedding_size, hidden)
        self.fc2 = torch.nn.Linear(hidden, numcls)
    
    def forward(self, x):
        x = self.emb(x)
        x = self.fc1(x)
        x = torch.nn.functional.relu(x)
        x = MaxPoolTime(x)
        x = self.fc2(x)

        return x

In [15]:
# quick test
baseline_model = MlpModel()
y = baseline_model.forward(padded)
torch.nn.functional.softmax(y)

  torch.nn.functional.softmax(y)


tensor([[0.3995, 0.6005]], grad_fn=<SoftmaxBackward0>)

## Question 3: Training Loop

Pad train and validation

In [16]:
# pad all
padding_size = np.max(lengths) # how big should the padding be? --> max seq length

# pad train and validation set
padded_train = torch.stack([pad(x, padding_size) for x in x_train])
padded_val = torch.stack([pad(x, padding_size) for x in x_val])

In [17]:
padded_train.shape # check shape

torch.Size([20000, 2514])

Create train and validation datalaoders

In [18]:
batch_size = 1024

# set to device
padded_train = padded_train.to(device)
padded_val = padded_val.to(device)
y_train_tensor = torch.tensor(y_train).to(device)
y_val_tensor = torch.tensor(y_val).to(device)

# create train and val datasets with instance and label pairs
train_dataset = TensorDataset(padded_train, torch.tensor(y_train))
validation_dataset = TensorDataset(padded_val, torch.tensor(y_val))

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
testloader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)


In [19]:
def validate(model, validation_loader):
    val_acc = 0
    val_correct = 0
    total_samples = 0
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(validation_loader):
            instances, labels = data
            fwd = model(instances)
            predictions = torch.argmax(fwd, dim=1)
            val_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)

    # Compute and return accuracy
    val_acc = val_correct / total_samples
    return val_acc

Training loop

In [20]:
def train_model(model, trainloader, testloader, optimizer, nr_epochs=5):
    model.train()
    for epoch in range(nr_epochs):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, batch in enumerate(trainloader, 0):
            instances, labels = batch

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(instances)
            loss = torch.nn.functional.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % len(trainloader)/len(batch)*2 == 0:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 200:.3f}')
                running_loss = 0.0
        val_acc = validate(model, testloader)
        print(f'Epoch {epoch}, validation acc.: {val_acc}')
    print('Finished Training')
    return model

Train baseline model

In [None]:
optimizer = optim.Adam(baseline_model.parameters(), lr=0.001)
baseline_model = MlpModel(batch_size).to(device)
final_model = train_model(baseline_model, trainloader, testloader, optimizer, 5)

# Part 3: Writing your own Elman RNN

This only answers "complete the missing parts" question. The full implementation on the dataset is in q4.py

In [None]:
class Elman(torch.nn.Module):
    def __init__(self, insize=300, outsize=300, hsize=300):
        super().__init__()
        self.lin1 = torch.nn.Linear(insize+hsize, hsize)
        self.lin2 = torch.nn.Linear(hsize, outsize)

    def forward(self, x, hidden=None): 
        '''
        b: batch size
        t: time steps (ie. sequence length)
        e: dimension of each input vector
        '''

        b, t, e = x.size() 
        if hidden is None:
            hidden = torch.zeros(b, e, dtype=torch.float) #make a tensor of inputs (bxe) 
            
        outs = []
        for i in range(t): #iterate through each value of the sequence
            inp = torch.cat([x[:, i, :], hidden], dim=1) #take only the value being iterated 
            inp = self.lin1(inp)
            hidden = torch.nn.functional.sigmoid(inp)
            out = self.lin2(hidden)
            outs.append(out[:, None, :])

In [None]:
class ElmanModel(nn.Module):
    def __init__(self):
        super(self).__init__()
        hidden = 300
        embedding_size = 300
        num_classes = 2
        vocab_size = len(i2w)

        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.fc1 = Elman(embedding_size, hidden)
        self.fc2 = nn.Linear(hidden, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = nn.functional.relu(x[0])
        x = torch.amax(x, dim=1)  # Max pooling across the time dimension
        x = self.fc2(x)
        return x

Question 5:

In [None]:
from question5.hypertuning import main as run_hypertuning
run_hypertuning()

# Part 4: Autoregressive Models

In [4]:
x_train, (i2w, w2i) = data_rnn.load_ndfa(n=150_000)

In [24]:
x_train, (i2w, w2i) = data_rnn.load_brackets(n=150_000)

In [25]:
x_train[0], i2w, w2i, len(i2w)

([5, 4],
 ['.pad', '.start', '.end', '.unk', ')', '('],
 {'.pad': 0, '.start': 1, '.end': 2, '.unk': 3, ')': 4, '(': 5},
 6)

In [26]:
seq = [i2w[x] for x in x_train[100_000-2]]
print(seq)
print(len(seq))
16-14

['(', '(', ')', ')']
4


2

## Question 6

In [27]:
# Padding for the autoregressive task
def pad_ar(seq, pad_length):
    assert len(seq) <= pad_length-2, f"pad length {pad_length} too short for sequence of length {len(seq)}"

    padded = np.zeros(pad_length) # 0 is for '.pad'
    padded[0] = 1 # 1 is for '.start'
    padded[1:len(seq)+1] = seq # insert sequence
    padded[len(seq)+1] = 2 # 2 is for '.end'
    
    return torch.tensor(padded, dtype=torch.long)

In [28]:
print(x_train[100_000-2])
pad_ar(x_train[100_000-2], 16)

[5, 5, 4, 4]


tensor([1, 5, 5, 4, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [29]:
lengths = [len(x) for x in x_train]
np.min(lengths), np.average(lengths), np.max(lengths)

(np.int64(2), np.float64(9.01628), np.int64(1022))

In [30]:
padding_size = np.max(lengths)+2 # accounts for start and end tokens
padded_train = torch.stack([pad_ar(x, padding_size) for x in x_train])

In [31]:
len(padded_train), padded_train.shape
padded_train[0:10].shape

torch.Size([10, 1024])

In [11]:
padded_train[:, 1:160].shape
# torch.select(padded_train, 1, 1:-1)

torch.Size([150000, 159])

In [32]:
# shifts tensor values by 1 to the left
def create_target(tensor):
    shifted = tensor[:, 1:tensor.shape[1]]
    shifted = torch.cat((shifted, torch.zeros((shifted.shape[0], 1))), dim=1)
    return shifted

In [33]:
padded_test = create_target(padded_train)

In [14]:
padded_train[0].reshape(1,-1).shape

torch.Size([1, 1024])

In [15]:
#assert(all(padded_train[:,-1]) == 0)
max(padded_train[:,-1])

tensor(2)

In [16]:
# verify the last entry in shifted is always 0
torch.unique(padded_test[:,-1])

tensor([0.])

In [17]:
#torch.cat((shifted, torch.zeros(shifted.shape[0]).reshape(1,-1)), dim=1).shape

Create the train and target datasets

In [91]:
class ARModel(torch.nn.Module):
    def __init__(self, batch_size=1, e = 32, h = 16):
        super().__init__()
        num_chars = len(i2w) # num chars given by i2w
        embedding_size = 32
        hidden = 16
        n_embeddings = len(i2w)
        self.emb = torch.nn.Embedding(num_chars, embedding_size)
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden,
                            num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(hidden, num_chars)
    
    def forward(self, x):
        e = self.emb(x)
        #print(f"e {type(e)} shape: {e.shape}")
        h = self.lstm(e)[0]
        #print(f"h {type(h)} shape: {h.shape}")
        y = self.fc1(h)
        #print(f"y {type(y)} shape: {y.shape}")
        
        return y


In [36]:
batch_size = 32

# set to device
padded_x = padded_train.to(device)
padded_y = padded_test.to(device)

# create train dataset with instance and label pairs
train_dataset = TensorDataset(padded_x, padded_y)

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [37]:
def train_model(model, trainloader, optimizer, nr_epochs = 3):
    for epoch in range(nr_epochs):
        running_loss = 0.0
        for i, batch in enumerate(trainloader, 0):
            instances, targets = batch
            targets = targets.long()
            optimizer.zero_grad()

            outputs = model(instances)
            outputs = torch.transpose(outputs, 1, 2)
            #print(f'outputs: {outputs.shape}')
            #print(f'targets: {targets.shape}')
            #print(outputs.dtype)
            #print(targets.dtype)
            loss = torch.nn.functional.cross_entropy(outputs, targets, reduction='sum')#/len(instances)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch {epoch+1}, loss: {running_loss/len(trainloader)}")
    print('Finished Training')
    return model

In [39]:
q6model = ARModel(batch_size=32).to(device)
optimizer = optim.Adam(q6model.parameters(), lr=0.001)
final_model = train_model(q6model, trainloader, optimizer, 20)

KeyboardInterrupt: 

## Question 7

In [67]:
import torch.distributions as dist
def sample(lnprobs, temperature=1.0):
    """
    Sample an element from a categorical distribution
    :param lnprobs: Outcome logits
    :param temperature: Sampling temperature. 1.0 follows the given
    distribution, 0.0 returns the maximum probability element.
    :return: The index of the sampled element.
    """
    if temperature == 0.0:
        return lnprobs.argmax()
    p = torch.nn.functional.softmax(lnprobs / temperature, dim=1)
    cd = dist.Categorical(p)
    return cd.sample()

Sequence completion

In [57]:
seq = [w2i['.start'], w2i['('], w2i['('], w2i[')']]

In [127]:
def complete_sequence(seq, model, temp = 0.5, max_len = 25):
    generated = seq.copy()
    seq_tensor = torch.tensor(generated).reshape(1,-1).to(device)
    for i in range(max_len):
        next_index = sample(model(seq_tensor).select(dim=1, index=-1), temp)
        next_char = i2w[next_index]
        if next_char == '.end':
            print(next_char)
            break
        else:
            print(next_char)
            generated.append(next_index)
            seq_tensor = torch.tensor(generated).reshape(1,-1).to(device)
            
    # correctness check
    generated_chars = [i2w[idx] for idx in generated]
    left = generated_chars.count("(")
    right = generated_chars.count(")")
    print(f"left {left}, right {right}")

    return True if left==right else False

In [158]:
seq = [w2i['.start'], w2i['('], w2i['('], w2i[')']]
complete_sequence(seq, final_model, temp=0.5)

(
)
)
.end
left 3, right 3


True

In [159]:
def train_model_sampling(model, trainloader, optimizer, seq, temp = 0.5, nr_epochs = 3):
    for epoch in range(nr_epochs):
        running_loss = 0.0
        for i, batch in enumerate(trainloader, 0):
            instances, targets = batch
            targets = targets.long()
            optimizer.zero_grad()

            outputs = model(instances)
            outputs = torch.transpose(outputs, 1, 2)
            #print(f'outputs: {outputs.shape}')
            #print(f'targets: {targets.shape}')
            #print(outputs.dtype)
            #print(targets.dtype)
            loss = torch.nn.functional.cross_entropy(outputs, targets, reduction='sum')
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch {epoch+1}, loss: {running_loss/len(trainloader)}")
        for j in range(10):
            print(f"Completion of sequence (E{epoch}S{j}):")
            complete_sequence(seq, model)
    print('Finished Training')
    return model

In [160]:
seq = [w2i['.start'], w2i['('], w2i['('], w2i[')']]
q6model = ARModel(batch_size=1).to(device)
optimizer = optim.Adam(q6model.parameters(), lr=0.003)
final_model = train_model_sampling(q6model, trainloader, optimizer, seq, 0.25, 20)

Epoch 1, loss: 420.47772384666337
Completion of sequence (E0S0):
)
.end
left 2, right 2
Completion of sequence (E0S1):
)
.end
left 2, right 2
Completion of sequence (E0S2):
)
.end
left 2, right 2
Completion of sequence (E0S3):
(
)
(
)
(
)
)
.end
left 5, right 5
Completion of sequence (E0S4):
(
)
(
(
)
)
)
.end
left 5, right 5
Completion of sequence (E0S5):
)
.end
left 2, right 2
Completion of sequence (E0S6):
)
.end
left 2, right 2
Completion of sequence (E0S7):
(
)
)
.end
left 3, right 3
Completion of sequence (E0S8):
)
.end
left 2, right 2
Completion of sequence (E0S9):
(
)
)
.end
left 3, right 3
Epoch 2, loss: 176.77880371471315
Completion of sequence (E1S0):
)
.end
left 2, right 2
Completion of sequence (E1S1):
)
.end
left 2, right 2
Completion of sequence (E1S2):
(
)
)
.end
left 3, right 3
Completion of sequence (E1S3):
)
.end
left 2, right 2
Completion of sequence (E1S4):
(
(
)
)
)
.end
left 4, right 4
Completion of sequence (E1S5):
(
)
)
.end
left 3, right 3
Completion of sequen