In [29]:
import torch
import pandas as pd


"""Select device (GPU)"""
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Selected device:', device)

"""Load Tokens from CSV"""
df = pd.read_csv(
    '../data-sets/Zinparen in Engels-Nederlands - 2024-10-21.csv',
    names=['ENG_TOKENS', 'NLD_TOKENS'])


Selected device: cpu


In [30]:
TRAIN_SPLIT = 0.9
MAX_SIZE = 20
FORBIDDEN_CHARS = ['€', '$']

"""Filtering"""
def filter(row):
	eng_tokens = row['ENG_TOKENS'].split()
	nld_tokens = row['NLD_TOKENS'].split()

	if (len(eng_tokens) > MAX_SIZE):
		return False
    
	if (len(nld_tokens) > MAX_SIZE):
		return False
	
	if any(char in eng_tokens or char in nld_tokens for char in FORBIDDEN_CHARS):
		return False

	return True

df = df[df.apply(filter, axis=1)]

print('Total rows from file:', len(df))
df

Total rows from file: 152194


Unnamed: 0,ENG_TOKENS,NLD_TOKENS
0,ENG_TOKENS,NLD_TOKENS
1,<SOS> Let 's try something . <EOS>,<SOS> Laten we iets proberen ! <EOS>
2,<SOS> Let 's try something . <EOS>,<SOS> Laat ons iets proberen . <EOS>
3,<SOS> I have to go to sleep . <EOS>,<SOS> Ik moet gaan slapen . <EOS>
4,<SOS> Today is June 18th and it is Muiriel 's ...,<SOS> Vandaag is het 18 juni en het is de verj...
...,...,...
153805,<SOS> Cotton candy is usually sold and made at...,<SOS> Suikerspinnen worden gewoonlijk verkocht...
153806,<SOS> At the moment I am looking for a job . <...,<SOS> Op het moment ben ik op zoek naar werk ....
153807,<SOS> The unthinkable happened . <EOS>,<SOS> Het ondenkbare is gebeurd . <EOS>
153808,<SOS> Let 's wait until she rings . <EOS>,<SOS> Laten we wachten tot ze belt ! <EOS>


In [31]:
from utils.Vocabulary import Vocabulary

"""Create a vocabulary to lookup indices"""
eng_vocab = Vocabulary('ENG')
nld_vocab = Vocabulary('NLD')

for sentence in df['ENG_TOKENS']:
    eng_vocab.add_sentence(sentence)

for sentence in df['NLD_TOKENS']:
    nld_vocab.add_sentence(sentence)

eng_vocab.trim()
nld_vocab.trim()


print('English vocab size:', len(eng_vocab))
print('Dutch vocab size:', len(nld_vocab))

English vocab size: 8390
Dutch vocab size: 9715


In [32]:
pad_index = 0 # Same for both ENG and NLD

"""Vectorize tokens"""
def build_dataset(dataset):
	X, Y = [], []
        
	for _, row in dataset.iterrows():
		eng_ixs = eng_vocab.lookup_indices(row['ENG_TOKENS'].split())
		eng_ixs = eng_ixs + [pad_index] * (MAX_SIZE - len(eng_ixs))
		X.append(eng_ixs)

		nld_ixs = nld_vocab.lookup_indices(row['NLD_TOKENS'].split())
		nld_ixs = nld_ixs + [pad_index] * (MAX_SIZE - len(nld_ixs))
		Y.append(nld_ixs)

	# Convert python arrays to PyTorch tensors
	return torch.tensor(X, dtype=torch.long), torch.tensor(Y, dtype=torch.long)

X, Y = build_dataset(df)

import random

print('Xtr shape:', X.shape)
print('Random vector:')
ix = random.randint(0, len(X))
print(X[ix].tolist())
print(Y[ix].tolist())
print()
print(eng_vocab.lookup_tokens(X[ix].tolist()))
print(nld_vocab.lookup_tokens(Y[ix].tolist()))


Xtr shape: torch.Size([152194, 20])
Random vector:
[1, 233, 2920, 149, 557, 205, 738, 48, 49, 1991, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 505, 3255, 155, 169, 18, 126, 1161, 55, 22, 2235, 11, 2, 0, 0, 0, 0, 0, 0, 0]

['<SOS>', 'One', 'million', 'people', 'lost', 'their', 'lives', 'in', 'the', 'war', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<SOS>', 'Een', 'miljoen', 'mensen', 'hebben', 'het', 'leven', 'gelaten', 'in', 'de', 'oorlog', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [33]:
""" Hyperparameters: """

# Model parameters
emb_dim       = 300
hidden_size   = 250 # The same hidden size for encoder and decoder
num_layers    =   2
dropout       = 0.1

# Training parameters
batch_size    = 64
iterations    = 50000
learning_rate = 0.0001 # The lower the batch size, the lower the learning rate
weight_decay  = 0 # Penalize complexity by couting weights into the loss function
step_size     = 50000 # Period of learning rate decay
gamma         = 0.1 # Multiplicative factor of learning rate decay
ft_start_ratio = 0.9
ft_final_ratio = 0

In [34]:
from RNN.Seq2Seq import Seq2Seq

""" Construct the model """
model = Seq2Seq(
    len(eng_vocab),
    len(nld_vocab),
    emb_dim,
    hidden_size,
    hidden_size,
    num_layers,
    dropout,
).to(device)


print("Total parameters:", sum(p.numel() for p in model.parameters()))

Total parameters: 11367465


In [35]:
import torch.nn as nn


criterion = nn.CrossEntropyLoss() # Loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) # Optimizer

In [36]:
"""
Functions for the training loop
"""

""" Calculate a teaching ratio, starts with a high ratio and lowers throughout training """
def get_forced_teaching_ratio(current_epoch, total_epochs):
    progress = current_epoch / total_epochs
    return ft_start_ratio - (ft_start_ratio - ft_final_ratio) * progress

""" Calculate loss on train and test data """
def log_statistics(eval_batch_size=64):

    inputs = torch.zeros((eval_batch_size, MAX_SIZE), dtype=torch.long).to(device)
    inputs[:, 0] = 1 # <SOS> has index of 1

    model.eval()
    with torch.inference_mode():
    
        # Sample the models performence on a subset of training data
        ix = torch.randint(0, Xtr.shape[0], (eval_batch_size,))
        Xb, Yb = Xtr[ix].to(device), Ytr[ix].to(device)
        
        outputs = model(Xb, inputs)
        train_loss = criterion(outputs, Yb)


        # Sample the models performence on a subset of testing data
        ix = torch.randint(0, Xte.shape[0], (eval_batch_size,))
        Xb, Yb = Xte[ix].to(device), Yte[ix].to(device)

        outputs = model(Xb, inputs)
        test_loss = criterion(outputs, Yb)
        
        
        return train_loss.item(), test_loss.item()

In [38]:
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from ray import tune

from RNN.Seq2Seq import Seq2Seq

iterations = 50000 # Non-tunable for now

""" Training function for hyper parameter tuning """
def train_model(config):
    # Step 1: Create TensorDataset from X and Y
    dataset = TensorDataset(X, Y)

    # Step 2: Split dataset into training and testing sets (80% train, 20% test)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    # Step 3: Create DataLoaders for training and testing
    batch_size = 64
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = Seq2Seq(
        len(eng_vocab),
        len(nld_vocab),
        emb_dim,
        hidden_size,
        hidden_size,
        num_layers,
        dropout=config["dropout"],
    ).to(device)
    
    # Initialize loss function, optimizer, and learning rate scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config["step_size"], gamma=config["step_size_gamma"])

    train_loader_iter = iter(train_loader) # Iterator for the DataLoader

    # Step a pre-defined amount of times
    for i in range(iterations):
        print(next(train_loader_iter))
        Xb, Yb = next(train_loader_iter)
        Xb, Yb = Xb.to(device), Yb.to(device)
        ft_ratio = get_forced_teaching_ratio(i, iterations)
        
        model.train()
        optimizer.zero_grad()
        output = model(Xb, Yb, ft_ratio)
        loss = criterion(output, Yb)
        loss.backward()
        optimizer.step()
        # scheduler.step()

        # Report the average MSE to Ray Tune every so often
        if ((i + 1) % (iterations / 50) == 0):
            model.eval()
            total_mse = 0
            with torch.inference_mode():
                for batch in test_loader:
                    Xb, Yb = batch
                    Xb, Yb = Xb.to(device), Yb.to(device)

                    output = model(Xb, Yb)
                    mse = nn.MSELoss(output, Yb).item()
                    total_mse += mse
            
            avg_mse = total_mse / len(test_loader)
            tune.report(iteration=i, mse=avg_mse)



In [14]:
from ray.tune.schedulers import ASHAScheduler

search_space = {
    "lr": tune.loguniform(1e-4, 1e-1),  # Learning rate between 0.0001 and 0.1
    "batch_size": tune.choice([32, 64, 128]),  # Discrete batch sizes
    "dropout": tune.uniform(0.2, 0.5),         # Dropout rate between 0.2 and 0.5
    "weight_decay": tune.loguniform(1e-5, 1e03)
}

scheduler = ASHAScheduler(
    metric="mse",  # We are now optimizing based on MSE
    mode="min",    # Minimize MSE
    max_t=50000,  # Maximum number of iterations
    grace_period=100,
    reduction_factor=2
)

# Run Ray Tune with the new iteration-based training
tuner = tune.run(
    train_model,
    resources_per_trial={"cpu": 1, "gpu": 0},  # Adjust for GPU if needed
    config=search_space,
    num_samples=10,  # Number of hyperparameter configurations to try
    scheduler=scheduler
)

# Get the best result (lowest MSE)
best_trial = tuner.get_best_trial("mse", "min", "last")
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation MSE: {best_trial.last_result['mse']}")


2024-10-24 19:12:59,982	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-10-24 20:52:44
Running for:,01:39:44.33
Memory:,17.5/31.0 GiB

Trial name,# failures,error file
train_model_389a8_00000,1,"/tmp/ray/session_2024-10-24_18-53-56_886860_42744/artifacts/2024-10-24_19-13-00/train_model_2024-10-24_19-13-00/driver_artifacts/train_model_389a8_00000_0_batch_size=32,dropout=0.2115,lr=0.0003,weight_decay=0.0283_2024-10-24_19-13-02/error.txt"
train_model_389a8_00002,1,"/tmp/ray/session_2024-10-24_18-53-56_886860_42744/artifacts/2024-10-24_19-13-00/train_model_2024-10-24_19-13-00/driver_artifacts/train_model_389a8_00002_2_batch_size=128,dropout=0.2756,lr=0.0008,weight_decay=0.0391_2024-10-24_19-13-02/error.txt"
train_model_389a8_00003,1,"/tmp/ray/session_2024-10-24_18-53-56_886860_42744/artifacts/2024-10-24_19-13-00/train_model_2024-10-24_19-13-00/driver_artifacts/train_model_389a8_00003_3_batch_size=32,dropout=0.2013,lr=0.0015,weight_decay=0.0003_2024-10-24_19-13-03/error.txt"
train_model_389a8_00006,1,"/tmp/ray/session_2024-10-24_18-53-56_886860_42744/artifacts/2024-10-24_19-13-00/train_model_2024-10-24_19-13-00/driver_artifacts/train_model_389a8_00006_6_batch_size=32,dropout=0.3472,lr=0.0001,weight_decay=0.0043_2024-10-24_19-13-03/error.txt"
train_model_389a8_00008,1,"/tmp/ray/session_2024-10-24_18-53-56_886860_42744/artifacts/2024-10-24_19-13-00/train_model_2024-10-24_19-13-00/driver_artifacts/train_model_389a8_00008_8_batch_size=64,dropout=0.4475,lr=0.0028,weight_decay=0.0001_2024-10-24_19-13-03/error.txt"

Trial name,status,loc,batch_size,dropout,lr,weight_decay
train_model_389a8_00001,RUNNING,192.168.1.121:54836,128,0.445411,0.0792894,0.00206416
train_model_389a8_00004,RUNNING,192.168.1.121:54837,32,0.495315,0.0672401,3.74708e-05
train_model_389a8_00005,RUNNING,192.168.1.121:54841,64,0.296397,0.00136544,0.00156452
train_model_389a8_00007,RUNNING,192.168.1.121:54844,64,0.358363,0.0100613,14.0259
train_model_389a8_00009,RUNNING,192.168.1.121:54842,128,0.257771,0.0436454,124.991
train_model_389a8_00000,ERROR,192.168.1.121:54838,32,0.211473,0.000289081,0.0282967
train_model_389a8_00002,ERROR,192.168.1.121:54845,128,0.275578,0.000798602,0.039131
train_model_389a8_00003,ERROR,192.168.1.121:54839,32,0.201269,0.00146492,0.000297425
train_model_389a8_00006,ERROR,192.168.1.121:54843,32,0.347245,0.000120381,0.00433787
train_model_389a8_00008,ERROR,192.168.1.121:54840,64,0.447494,0.00281628,7.08784e-05


2024-10-24 19:45:06,171	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_389a8_00003
Traceback (most recent call last):
  File "/home/marijn/Projecten/vscode_notebooks/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/marijn/Projecten/vscode_notebooks/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/marijn/Projecten/vscode_notebooks/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/marijn/Projecten/vscode_notebooks/.venv/lib/python3.12/site-packages/ray/_private/worker.py", line 2745, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
                  

Trial name
train_model_389a8_00000
train_model_389a8_00002
train_model_389a8_00003
train_model_389a8_00006
train_model_389a8_00008


2024-10-24 19:45:54,371	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_389a8_00006
Traceback (most recent call last):
  File "/home/marijn/Projecten/vscode_notebooks/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/marijn/Projecten/vscode_notebooks/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/marijn/Projecten/vscode_notebooks/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/marijn/Projecten/vscode_notebooks/.venv/lib/python3.12/site-packages/ray/_private/worker.py", line 2745, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
                  

In [None]:
import matplotlib.pyplot as plt

# Create the figure and the primary y-axis
fig, ax1 = plt.subplots(1, 1, figsize=(12, 6))
plt.grid()

# Plot the loss on the primary y-axis
ax1.plot(epoch_count, train_loss_values, label='Training Loss')
ax1.plot(epoch_count, test_loss_values, label='Test Loss', linestyle='--')
ax1.set_xlabel('Iterations')
ax1.set_ylabel('Loss')
ax1.set_title('Loss Progression')

# Create the secondary y-axis for the learning rate
ax2 = ax1.twinx()
ax2.plot(epoch_count, learning_rates, label='Learning Rate', color='green')
ax2.set_ylabel('Learning Rate', color='green')
ax2.tick_params(axis='y', labelcolor='green')

# Add legends to each y-axis
ax1.legend(loc='upper right')
ax2.legend(loc='upper left')

# Show the plot
plt.show()

In [13]:
torch.save(model.state_dict(), '../models/RNN-Attention_24-10-2024.pt')

In [None]:
import os.path

if os.path.isfile('../models/RNN-Attention_24-10-2024.pt'):
    print('Found saved state dictionary!')
    model.load_state_dict(torch.load('../models/RNN-Attention_24-10-2024.pt'))

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download the tokenizer models from nltk
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize_sentence(sentence: str) -> list:
    """Tokenize a single sentence."""
    return ['<SOS>'] + word_tokenize(sentence) + ['<EOS>']


In [16]:
import re

def translate(sentence, max_output_length=20):
	input_tokens = tokenize_sentence(sentence)
	input_indices = eng_vocab.lookup_indices(input_tokens)

	output_indices, _ = model.evaluate(input_indices, device, max_output_length)
	output_tokens = nld_vocab.lookup_tokens(output_indices)

	new_sentence = ' '.join(output_tokens).capitalize()
	new_sentence = re.sub(r'\s+([.,!?])', r'\1', new_sentence)

	return new_sentence

In [None]:
### Use model ###

print(translate('Go!'))
print(translate('It\'s not Fine.'))
print(translate('I\'m very happy today.'))
print(translate('He\'s very sad.'))
print(translate('That man is wearing a white shirt.'))
print(translate('He\'s very afraid of spiders.'))
print(translate('He\'s going home to his wife.'))
print(translate('Those guys are walking to work.'))
print(translate('I\'m not going with you today.'))
print(translate('My girlfriend will not come over tomorrow.'))
print(translate('He bought his friends a present for christmas.'))


In [18]:
import matplotlib.pyplot as plt

def evaluateAndShowAttention():

    ''' Sample a random sentence from the test data '''
    input_tokens = test_data.sample()['ENG_TOKENS'].iloc[0].split()
    input_indices = eng_vocab.lookup_indices(input_tokens)

    output_indices, attentions = model.evaluate(input_indices, device)
    output_indices = output_indices[1:] # Remove single batch dimension
    output_tokens = nld_vocab.lookup_tokens(output_indices)

    attentions = attentions.cpu().numpy()
    
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions, cmap='plasma')
    fig.colorbar(cax)
                       
    # Set up axes
    ax.set_xticks(range(len(input_tokens)))
    ax.set_yticks(range(len(output_tokens)))

    ax.set_xticklabels(input_tokens, rotation=90)
    ax.set_yticklabels(output_tokens)

    plt.show()

In [None]:
evaluateAndShowAttention()