In [1]:
from __future__ import unicode_literals, print_function, division
import torch
# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device  # bare last expression so the notebook echoes the selected device

device(type='cuda')

In [2]:
from preprocessing import get_dataframes
# Raw task files of the SCAN "simple split", read from the upstream GitHub repository.
(train_url, test_url) = (
    "https://raw.githubusercontent.com/brendenlake/SCAN/master/simple_split/tasks_train_simple.txt",
    "https://raw.githubusercontent.com/brendenlake/SCAN/master/simple_split/tasks_test_simple.txt",
)

# get_dataframes hands back one frame per URL; the preview below shows they
# carry tokenised 'IN' (command) and 'OUT' (action) columns.
train_df, test_df = get_dataframes(train_url, test_url)

train_df.head()

Unnamed: 0,IN,OUT
0,"[jump, opposite, right, twice, and, turn, oppo...","[I_TURN_RIGHT, I_TURN_RIGHT, I_JUMP, I_TURN_RI..."
1,"[run, opposite, left, after, walk, right, <eos...","[I_TURN_RIGHT, I_WALK, I_TURN_LEFT, I_TURN_LEF..."
2,"[walk, after, run, around, right, twice, <eos>...","[I_TURN_RIGHT, I_RUN, I_TURN_RIGHT, I_RUN, I_T..."
3,"[look, around, right, thrice, and, turn, left,...","[I_TURN_RIGHT, I_LOOK, I_TURN_RIGHT, I_LOOK, I..."
4,"[walk, opposite, left, twice, and, walk, oppos...","[I_TURN_LEFT, I_TURN_LEFT, I_WALK, I_TURN_LEFT..."


In [3]:
from preprocessing import get_vocab

# Both vocabularies are built from the training split only: one for the
# command tokens ('IN'), one for the action tokens ('OUT').
train_vocab_in = get_vocab(train_df['IN'], 'in')
train_vocab_out = get_vocab(train_df['OUT'], 'out')

# Show the token -> index mappings (input vocabulary first, then output).
for _vocab in (train_vocab_in, train_vocab_out):
    print(_vocab.word2index)

def series2idx(row):
    """Convert one dataframe row into its (input, output) index sequences.

    The 'IN' column is encoded with ``train_vocab_in`` and the 'OUT' column
    with ``train_vocab_out``; both vocabularies are module-level globals.

    Returns:
        A 2-tuple ``(in_indices, out_indices)`` as produced by ``col2idx``.
    """
    return (
        train_vocab_in.col2idx(row, 'IN'),
        train_vocab_out.col2idx(row, 'OUT'),
    )
    
# Add the index-encoded columns to both splits. The test split is encoded with
# the *training* vocabularies too, because series2idx always uses those.
for _frame in (train_df, test_df):
    _frame[['IN_idx','OUT_idx']] = _frame.apply(series2idx, axis=1, result_type='expand')

train_df.head()

{'<pad>': 0, '<sos>': 1, '<eos>': 2, 'jump': 3, 'opposite': 4, 'right': 5, 'twice': 6, 'and': 7, 'turn': 8, 'thrice': 9, 'run': 10, 'left': 11, 'after': 12, 'walk': 13, 'around': 14, 'look': 15}
{'<pad>': 0, '<sos>': 1, '<eos>': 2, 'I_TURN_RIGHT': 3, 'I_JUMP': 4, 'I_WALK': 5, 'I_TURN_LEFT': 6, 'I_RUN': 7, 'I_LOOK': 8}


Unnamed: 0,IN,OUT,IN_idx,OUT_idx
0,"[jump, opposite, right, twice, and, turn, oppo...","[I_TURN_RIGHT, I_TURN_RIGHT, I_JUMP, I_TURN_RI...","[tensor(3), tensor(4), tensor(5), tensor(6), t...","[tensor(3), tensor(3), tensor(4), tensor(3), t..."
1,"[run, opposite, left, after, walk, right, <eos...","[I_TURN_RIGHT, I_WALK, I_TURN_LEFT, I_TURN_LEF...","[tensor(10), tensor(4), tensor(11), tensor(12)...","[tensor(3), tensor(5), tensor(6), tensor(6), t..."
2,"[walk, after, run, around, right, twice, <eos>...","[I_TURN_RIGHT, I_RUN, I_TURN_RIGHT, I_RUN, I_T...","[tensor(13), tensor(12), tensor(10), tensor(14...","[tensor(3), tensor(7), tensor(3), tensor(7), t..."
3,"[look, around, right, thrice, and, turn, left,...","[I_TURN_RIGHT, I_LOOK, I_TURN_RIGHT, I_LOOK, I...","[tensor(15), tensor(14), tensor(5), tensor(9),...","[tensor(3), tensor(8), tensor(3), tensor(8), t..."
4,"[walk, opposite, left, twice, and, walk, oppos...","[I_TURN_LEFT, I_TURN_LEFT, I_WALK, I_TURN_LEFT...","[tensor(13), tensor(4), tensor(11), tensor(6),...","[tensor(6), tensor(6), tensor(5), tensor(6), t..."


In [4]:
from dataloading import Text_dataset
from torch.utils.data import DataLoader
from utilities import set_seed

# Seed first, so anything random in the dataset construction is reproducible.
set_seed(42)

bs = 1 # Batch size

# Datasets are built from the index-encoded columns only.
# NOTE(review): the extra (True, 100000) arguments are presumably a sampling
# flag and a sample count (the training progress bars report 100000 steps at
# batch size 1) - confirm against Text_dataset.
training_data = Text_dataset(
    train_df[['IN_idx', 'OUT_idx']],
    True,
    100000,
)
test_data = Text_dataset(
    test_df[['IN_idx', 'OUT_idx']],
    False,
)

# Loaders iterate in dataset order (no shuffling at the DataLoader level).
train_dataloader = DataLoader(training_data, batch_size=bs, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=bs, shuffle=False)

Random seed set as 42


In [5]:
import models
from training import train
# Length of the longest encoded target sequence in the training split;
# handed to DecoderRNN as its final constructor argument.
max_len = train_df['OUT_idx'].map(len).max()

# Best in experiment 1
layers, hidden_size, dropout = 2, 200, 0

# One independent training run per seed (2..5); each run is saved under its own name.
for i in range(2, 6):
    set_seed(i)
    encoder = models.EncoderRNN(
        'lstm', train_vocab_in.n_words, hidden_size, layers, dropout
    ).to(device)
    decoder = models.DecoderRNN(
        'lstm', hidden_size, train_vocab_out.n_words, layers, device, max_len
    ).to(device)
    train(train_dataloader, encoder, decoder, device, save_name=f'ex1_best_{i}')

Random seed set as 2


100%|██████████| 100000/100000 [1:05:51<00:00, 25.30it/s]


65m 52s (- 0m 0s) (1 100%) 0.0121
Random seed set as 3


100%|██████████| 100000/100000 [1:05:52<00:00, 25.30it/s]


65m 52s (- 0m 0s) (1 100%) 0.0116
Random seed set as 4


100%|██████████| 100000/100000 [1:05:37<00:00, 25.40it/s]


65m 37s (- 0m 0s) (1 100%) 0.0130
Random seed set as 5


100%|██████████| 100000/100000 [1:06:00<00:00, 25.25it/s]

66m 0s (- 0m 0s) (1 100%) 0.0124





In [None]:
# Overall best
layers, hidden_size, dropout = 2, 200, 0.5

# Five independent runs, seeded 0..4. Note that `dropout` is only passed to
# the encoder constructor here; the decoder call does not receive it.
for i in range(5):
    set_seed(i)
    encoder = models.EncoderRNN(
        'lstm', train_vocab_in.n_words, hidden_size, layers, dropout
    ).to(device)
    decoder = models.DecoderRNN(
        'lstm', hidden_size, train_vocab_out.n_words, layers, device, max_len
    ).to(device)
    train(train_dataloader, encoder, decoder, device, save_name=f'ex1_overall_{i}')

In [None]:
# Rebuild the architecture the checkpoint was trained with, then restore its weights.
layers = 2
hidden_size = 200
dropout = 0

encoder = models.EncoderRNN('lstm', train_vocab_in.n_words, hidden_size, layers, dropout).to(device)
decoder = models.DecoderRNN('lstm', hidden_size, train_vocab_out.n_words, layers, device, max_len).to(device)

# map_location makes the load work on whichever device was selected above:
# without it, a checkpoint saved from a CUDA run cannot be deserialised on a
# CPU-only machine.
# NOTE(review): 'ex1_1' does not match any save_name used in this notebook
# ('ex1_best_<seed>', 'ex1_overall_<seed>') - confirm this is the intended checkpoint.
encoder.load_state_dict(torch.load('models/encoder_ex1_1.pth', map_location=device))
decoder.load_state_dict(torch.load('models/decoder_ex1_1.pth', map_location=device))


In [None]:
from training import evaluate

# Score the encoder/decoder pair currently in memory on the test-split loader.
acc = evaluate(
    encoder,
    decoder,
    test_dataloader,
    device,
)

Second part of the experiment: training on increasing fractions of the training set

In [None]:
# best overall
max_len = train_df.OUT_idx.apply(len).max()
layers = 2
hidden_size = 200
dropout = 0.5

bs = 1 # Batch size

# Fractions of train_df (taken from the top of the frame) to train on.
percentages = [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64]
replications = 5

# accurcies[i, j] holds the test accuracy of replication j trained on the first
# percentages[i] share of train_df. The (misspelled) name is kept on purpose so
# that any other cell referring to it keeps working.
accurcies = torch.zeros(len(percentages), replications)
for i, p in enumerate(percentages):
    # Re-seed before building the dataset so its construction is reproducible
    # and identical for every replication of this percentage.
    set_seed(42)

    idx = int(len(train_df) * p)
    training_data = Text_dataset(train_df[['IN_idx', 'OUT_idx']][:idx], True, 100000)
    # The loader only depends on the dataset, so build it once per percentage.
    train_dataloader = DataLoader(training_data, batch_size=bs, shuffle=False)

    for j in range(replications):
        # Seed with the replication index. Seeding with `i` (the percentage
        # index) gave every replication of a percentage the same seed, so the
        # "replications" were repeats of one run rather than independent ones.
        set_seed(j)
        encoder = models.EncoderRNN('lstm', train_vocab_in.n_words, hidden_size, layers, dropout).to(device)
        decoder = models.DecoderRNN('lstm', hidden_size, train_vocab_out.n_words, layers, device, max_len).to(device)
        # str(p)[2:] drops the leading "0." (0.01 -> "01"), giving e.g. 'ex1_pt2_p01_0'.
        name = 'ex1_pt2_p'+str(p)[2:]+'_'+str(j)
        train(train_dataloader, encoder, decoder, device, save_name=name)
        acc = evaluate(encoder, decoder, test_dataloader, device)
        accurcies[i,j] = acc

# Append the accuracy table to the results file; the context manager
# guarantees the handle is closed even if the write fails.
txt = str(accurcies)
with open("ex1_pt2.txt", "a") as f:
    f.write(txt)