In [1]:
import torch
import random
import numpy as np  # numpy
import torch.nn as nn  # nn objects
import torch.optim as optim  # nn optimizers
import matplotlib.pyplot as plt
%matplotlib notebook

## custom packages ##
from networkUtils import recurrentNet as rn
from taskUtils import generate as gen
from trainUtils import trainer as tn
from testUtils import test, plot

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [2]:
# Hyperparameters
input_size = 5  # number of features per time step
num_classes = 2  # the number of units in the output layer
hidden_size = 4  # the number of units in the recurrent layer - kinda arbitrary
batch_size = 1  # batch size = # of samples to average when computing gradient
num_layers = 1  # number of stacked RNN layers
eta = 0.001  # learning rate
epochs = 500  # epochs = # of full passes through dataset

# Reproducibility: seed every random number generator this notebook imports so
# that a fresh "Restart & Run All" produces the same random draws each time.
seed = 42
random.seed(seed)  # Python's random module (used to pick a test sample)
np.random.seed(seed)  # NumPy's legacy global generator
torch.manual_seed(seed)  # torch's default generator

In [19]:
# creating the networks
# Each network is built by its own constructor call (rather than chained
# `net1 = net2 = ...` assignment) so net1 and net2 are distinct objects.
net1 = rn.RecurrentXORNet(input_size, hidden_size, num_layers, num_classes, batch_size, random_h0=True).to(device)
net2 = rn.RecurrentXORNet(input_size, hidden_size, num_layers, num_classes, batch_size, random_h0=True).to(device)

# Loss function, optimizer, and schedule (for decaying learning rate)
criterion = nn.CrossEntropyLoss()  # loss function, shared by both training runs

# One Adam optimizer per network, each over that network's parameters with learning rate eta
optimizer1 = optim.Adam(net1.parameters(), eta)
optimizer2 = optim.Adam(net2.parameters(), eta)

# Plateau schedulers with patience=5, one per optimizer.
# `verbose` is deliberately not passed: False was already its default and the
# argument is deprecated since PyTorch 2.2.
# NOTE(review): presumably tn.train_network calls .step(loss) once per epoch - confirm.
# The `sheduler` spelling is kept because the training cell refers to these names.
sheduler1 = optim.lr_scheduler.ReduceLROnPlateau(optimizer1, patience=5)
sheduler2 = optim.lr_scheduler.ReduceLROnPlateau(optimizer2, patience=5)

# generate takes in inputs of: same_distractions, input_size, seqlen1, seqlen2, seqlen3
dataset1, targets1, sequence_length1 = gen.generate_dataset(False, input_size, 0, 5, 0)  # small train in the middle
dataset2, targets2, sequence_length2 = gen.generate_dataset(False, input_size, 0, 50, 0)  # large train in the middle

In [20]:
# tn.train_network positional arguments, in order:
#   network, dataset, targets, sequence_length, input_size, batch_size, epochs,
#   optimizer, criterion, sheduler
# Its return value is kept as the per-network loss record used for plotting.
print("First network: \n")
loss1 = tn.train_network(
    net1, dataset1, targets1, sequence_length1,
    input_size, batch_size, epochs,
    optimizer1, criterion, sheduler1,
)

print("\nSecond network: \n")
loss2 = tn.train_network(
    net2, dataset2, targets2, sequence_length2,
    input_size, batch_size, epochs,
    optimizer2, criterion, sheduler2,
)

First network: 

Cost at epoch 0 is 0.8262879848480225
Cost at epoch 125 is 0.5394719839096069
Cost at epoch 250 is 0.07368704676628113
Cost at epoch 375 is 0.022944718599319458
Cost at epoch 499 is 0.011416414752602577

Second network: 

Cost at epoch 0 is 0.726830005645752
Cost at epoch 125 is 0.4575463533401489
Cost at epoch 250 is 0.0788794681429863
Cost at epoch 375 is 0.03208708390593529
Cost at epoch 499 is 0.01741422899067402


In [21]:
# Plot the two loss records from the training cell on a fresh figure
fig1, ax1 = plt.subplots()
loss_title = "Effect of Distraction Train Length on Network Loss"
plot.plot_four_losses(loss_title, loss1, loss2)
# NOTE(review): assumes plot_four_losses draws onto ax1 (the current axes) - confirm
ax1.legend(["Smallest middle train", "Large middle train"])
plt.show()

<IPython.core.display.Javascript object>

In [22]:
# creating the networks (second run: net1/net2 and everything derived from
# them are rebuilt from scratch, replacing the objects from the previous run)
# Each network is built by its own constructor call (rather than chained
# `net1 = net2 = ...` assignment) so net1 and net2 are distinct objects.
net1 = rn.RecurrentXORNet(input_size, hidden_size, num_layers, num_classes, batch_size, random_h0=True).to(device)
net2 = rn.RecurrentXORNet(input_size, hidden_size, num_layers, num_classes, batch_size, random_h0=True).to(device)

# Loss function, optimizer, and schedule (for decaying learning rate)
criterion = nn.CrossEntropyLoss()  # loss function, shared by both training runs

# One Adam optimizer per network, each over that network's parameters with learning rate eta
optimizer1 = optim.Adam(net1.parameters(), eta)
optimizer2 = optim.Adam(net2.parameters(), eta)

# Plateau schedulers with patience=5, one per optimizer.
# `verbose` is deliberately not passed: False was already its default and the
# argument is deprecated since PyTorch 2.2.
# NOTE(review): presumably tn.train_network calls .step(loss) once per epoch - confirm.
# The `sheduler` spelling is kept because the training cell refers to these names.
sheduler1 = optim.lr_scheduler.ReduceLROnPlateau(optimizer1, patience=5)
sheduler2 = optim.lr_scheduler.ReduceLROnPlateau(optimizer2, patience=5)

# generate takes in inputs of: same_distractions, input_size, seqlen1, seqlen2, seqlen3
dataset1, targets1, sequence_length1 = gen.generate_dataset(False, input_size, 0, 5, 0)  # small train in the middle
dataset2, targets2, sequence_length2 = gen.generate_dataset(False, input_size, 0, 50, 0)  # large train in the middle

In [23]:
# tn.train_network positional arguments, in order:
#   network, dataset, targets, sequence_length, input_size, batch_size, epochs,
#   optimizer, criterion, sheduler
# The three settings below are identical for both runs, so they are passed via one tuple.
shared_settings = (input_size, batch_size, epochs)

print("First network: \n")
loss1 = tn.train_network(net1, dataset1, targets1, sequence_length1, *shared_settings,
                         optimizer1, criterion, sheduler1)

print("\nSecond network: \n")
loss2 = tn.train_network(net2, dataset2, targets2, sequence_length2, *shared_settings,
                         optimizer2, criterion, sheduler2)

First network: 

Cost at epoch 0 is 0.6273472309112549
Cost at epoch 125 is 0.18291807174682617
Cost at epoch 250 is 0.0487830676138401
Cost at epoch 375 is 0.02072319947183132
Cost at epoch 499 is 0.010899985209107399

Second network: 

Cost at epoch 0 is 0.7089681029319763
Cost at epoch 125 is 0.10386445373296738
Cost at epoch 250 is 0.026694128289818764
Cost at epoch 375 is 0.012188887223601341
Cost at epoch 499 is 0.006892910692840815


In [24]:
# Plot the loss records of the second training run on a fresh figure
fig2, ax2 = plt.subplots()
legend_labels = ["Smallest middle train", "Large middle train"]
plot.plot_four_losses(
    "Effect of Distraction Train Length on Network Loss",
    loss1,
    loss2,
)
# Legend goes through the pyplot state machine here, i.e. onto whatever axes are current
plt.legend(legend_labels)
plt.show()

<IPython.core.display.Javascript object>

In [90]:
# Show the shape and full contents of both training sets.
small_train, large_train = dataset1, dataset2

# the data net1 was trained on
print("Smallest middle train dataset shape: ", small_train.shape)
print("\nSmallest middle train dataset \n \n", small_train)

# the data net2 was trained on
print("\nLargest middle train dataset shape: ", large_train.shape)
print("\nLargest middle train dataset \n \n", large_train)

Smallest middle train dataset shape:  torch.Size([4, 7, 5])

Smallest middle train dataset 
 
 tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4745, 0.0441, 0.5339, 0.0947, 0.8921],
         [0.7621, 0.5036, 0.5100, 0.8712, 0.0343],
         [0.7578, 0.7490, 0.5420, 0.3158, 0.0268],
         [0.9041, 0.5860, 0.6316, 0.9248, 0.6058],
         [0.3826, 0.7560, 0.5525, 0.5208, 0.4223],
         [1.0000, 0.0000, 0.0000, 0.0000, 0.0000]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.8135, 0.5184, 0.6018, 0.3311, 0.8252],
         [0.0593, 0.0645, 0.6626, 0.5672, 0.4977],
         [0.3649, 0.3292, 0.4874, 0.8284, 0.9390],
         [0.4638, 0.3407, 0.1816, 0.3718, 0.7842],
         [0.2094, 0.2971, 0.4798, 0.2001, 0.7441],
         [0.0000, 1.0000, 0.0000, 0.0000, 0.0000]],

        [[0.0000, 1.0000, 0.0000, 0.0000, 0.0000],
         [0.4607, 0.3046, 0.5164, 0.1151, 0.8737],
         [0.6603, 0.9128, 0.6701, 0.8019, 0.2146],
         [0.2760, 0.5946, 0.9696, 

In [97]:
# Take the first sample and show it before and after adding the batch dimension
first_sample = dataset1[0]
print(first_sample)
print(first_sample.shape)

# Target layout for the network input: (batch_size, sequence_length1, input_size)
batched_shape = (batch_size, sequence_length1, input_size)
print(*batched_shape)

sample1 = first_sample.view(*batched_shape)
print(sample1.shape)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4745, 0.0441, 0.5339, 0.0947, 0.8921],
        [0.7621, 0.5036, 0.5100, 0.8712, 0.0343],
        [0.7578, 0.7490, 0.5420, 0.3158, 0.0268],
        [0.9041, 0.5860, 0.6316, 0.9248, 0.6058],
        [0.3826, 0.7560, 0.5525, 0.5208, 0.4223],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000]])
torch.Size([7, 5])
1 7 5
torch.Size([1, 7, 5])


In [74]:
# Tests rounded network outputs against correct network outputs based on sample
# test takes in inputs of: sample_number, dataset, targets, network, input_size, batch_size, sequence_length
# The sample index is drawn from the dataset's own length instead of a
# hard-coded 0..3, so it stays in range if the number of samples changes.
sample_number = random.randrange(len(dataset1))
test.test_network(sample_number, dataset1, targets1, net1, input_size, batch_size, sequence_length1)



Test of network: 
input is [[[0.         1.         0.         0.        ]
  [0.8150105  0.8084732  0.44883597 0.6583052 ]
  [0.32915318 0.7853104  0.8065853  0.233213  ]
  [0.992198   0.62383765 0.33365077 0.2642792 ]
  [0.85932815 0.4986812  0.9997319  0.6256024 ]
  [0.89234453 0.57245517 0.5912131  0.09162319]
  [1.         0.         0.         0.        ]]]
out is [[0. 1.]]
expected out is [1. 0.]
