In [1]:
#import sys
#!{sys.executable} -m pip install torchtext

In [2]:
import torch
from torch.utils.data import DataLoader

from dataloading import Text_dataset, get_dataloaders
from preprocessing import get_dataframes
from utilities import set_seed
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device: ", device)

# Enforce reproducibility: seed before any data is loaded or sampled.
set_seed(42)

# Build the train/test dataframes and the input/output vocabularies from the
# SCAN simple-split task files.
train_url = "https://raw.githubusercontent.com/brendenlake/SCAN/master/simple_split/tasks_train_simple.txt"
test_url = "https://raw.githubusercontent.com/brendenlake/SCAN/master/simple_split/tasks_test_simple.txt"
train_df, test_df, voc_in, voc_out = get_dataframes(train_url, test_url)

# Wrap the index-encoded columns in datasets; only the training set is sampled.
index_columns = ['IN_idx', 'OUT_idx']
train_data = Text_dataset(train_df[index_columns], sample=True, size=1000)
test_data = Text_dataset(test_df[index_columns], sample=False)

# Create the dataloaders used by the training and evaluation cells below.
dataloaders = get_dataloaders(train_data, test_data)
train_dataloader, test_dataloader = dataloaders

  from .autonotebook import tqdm as notebook_tqdm


Device:  cpu
Random seed set as 42
Loading data...


RuntimeError: Token jump not found and default index is not set

In [None]:
import models
from training import train


# Hyper-parameters of the best configuration found in experiment 1.
layers = 2
hidden_size = 200
dropout = 0
# Longest target sequence in the training set; passed to the decoder.
max_len = train_df['OUT_idx'].map(len).max()

# Vocabulary sizes do not change between runs, so look them up once.
input_size = len(voc_in)
output_size = len(voc_out)

# Train five independently seeded replications; each is saved under its seed.
for seed in range(5):
    set_seed(seed)
    encoder = models.EncoderRNN('lstm', input_size, hidden_size, layers, dropout)
    encoder = encoder.to(device)
    decoder = models.DecoderRNN('lstm', hidden_size, output_size, layers, device, max_len)
    decoder = decoder.to(device)
    train(train_dataloader, encoder, decoder, device, save_name=f'ex1_best_{seed}')

Random seed set as 0


KeyboardInterrupt: 

In [None]:
# Overall best configuration: same architecture as the experiment-1 best,
# but with dropout 0.5 in the encoder.
layers = 2
hidden_size = 200
dropout = 0.5
# Longest target sequence in the training set; passed to the decoder.
max_len = train_df.OUT_idx.apply(len).max()

# Train five independently seeded replications; each is saved under its seed.
for i in range(5):
    set_seed(i)
    # Vocabulary sizes come from voc_in / voc_out as returned by
    # get_dataframes in the setup cell; the previously used
    # train_vocab_in / train_vocab_out names are not defined in this notebook.
    encoder = models.EncoderRNN('lstm', len(voc_in), hidden_size, layers, dropout).to(device)
    decoder = models.DecoderRNN('lstm', hidden_size, len(voc_out), layers, device, max_len).to(device)
    train(train_dataloader, encoder, decoder, device, save_name='ex1_overall_'+str(i))

In [None]:
import models
from training import evaluate
# Evaluate the saved replications on the test set and print the mean score.
# NOTE(review): the header and dropout value say "overall best", but the
# checkpoints loaded below are the 'ex1_best_*' ones (trained with dropout 0).
# Switch the file names to 'ex1_overall_*' if those are the intended models.
layers = 2
hidden_size = 200
dropout = 0.5
max_len = train_df.OUT_idx.apply(len).max()

# The training cells save their replications with suffixes 0..4, so iterate
# over the same suffixes (range(1, 6) skipped run 0 and asked for run 5).
run_ids = range(5)

acc = 0
for i in run_ids:
    # Vocabulary sizes come from voc_in / voc_out (see the setup cell);
    # train_vocab_in / train_vocab_out are not defined in this notebook.
    encoder = models.EncoderRNN('lstm', len(voc_in), hidden_size, layers, dropout).to(device)
    decoder = models.DecoderRNN('lstm', hidden_size, len(voc_out), layers, device, max_len).to(device)
    # map_location lets checkpoints written on a GPU be restored on a
    # CPU-only machine (and vice versa).
    encoder.load_state_dict(torch.load('models/encoder_ex1_best_'+str(i)+'.pth', map_location=device))
    decoder.load_state_dict(torch.load('models/decoder_ex1_best_'+str(i)+'.pth', map_location=device))
    acc += evaluate(encoder, decoder, test_dataloader, device)

# Average over however many runs were actually evaluated.
print(acc/len(run_ids))

In [None]:


# Evaluate the encoder/decoder pair currently bound in the kernel on the test
# dataloader. Neither model is created in this cell, so the result depends on
# which earlier cell ran last (hidden state).
# NOTE(review): evaluate presumably returns an accuracy - confirm in training.py.
acc = evaluate(encoder, decoder, test_dataloader, device)

Second part of the experiment: training on increasing fractions of the data

In [None]:
# Experiment 1, part 2: train the overall-best architecture on growing
# fractions of the data and record the test score of every replication.
max_len = train_df.OUT_idx.apply(len).max()
layers = 2
hidden_size = 200
dropout = 0.5

bs = 1 # Batch size

percentages = [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64]
replications = 5
# One row per percentage, one column per replication.
accurcies = torch.zeros(len(percentages), replications)
for i, p in enumerate(percentages):
    # Fixed seed while the training subset is built, so every percentage
    # starts from the same random state.
    set_seed(42)

    # p is a fraction of the whole corpus (train + test); the subset itself is
    # taken from the head of the training dataframe.
    idx = int((len(train_df) + len(test_df)) * p)
    # Positional arguments kept as in the original call; presumably
    # sample=True, size=100000 as in the setup cell - confirm in dataloading.py.
    training_data = Text_dataset(train_df[['IN_idx', 'OUT_idx']][:idx], True, 100000)
    # Built once per percentage (it does not depend on the replication) and
    # under its own name so the global train_dataloader is not overwritten.
    subset_dataloader = DataLoader(training_data, batch_size=bs, shuffle=False)

    for j in range(replications):
        # Seed with the replication index. Seeding with the percentage index
        # i gave all replications of one percentage the same seed.
        set_seed(j)
        # Vocabulary sizes come from voc_in / voc_out (see the setup cell);
        # train_vocab_in / train_vocab_out are not defined in this notebook.
        encoder = models.EncoderRNN('lstm', len(voc_in), hidden_size, layers, dropout).to(device)
        decoder = models.DecoderRNN('lstm', hidden_size, len(voc_out), layers, device, max_len).to(device)
        # str(p)[2:] drops the leading "0." so that e.g. 0.01 becomes "01".
        name = 'ex1_pt2_p'+str(p)[2:]+'_'+str(j)
        train(subset_dataloader, encoder, decoder, device, save_name=name)
        acc = evaluate(encoder, decoder, test_dataloader, device)
        accurcies[i,j] = acc

# Append the result table to the log file; the context manager closes the
# file even if the write fails.
txt = str(accurcies)
with open("ex1_pt2.txt", "a") as f:
    f.write(txt)