In [1]:
# Suppress warnings

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Check torchtext version

import torchtext
print(torchtext.__version__)

In [7]:
# Import necessary libraries

import pandas as pd
from torch.utils.data import Dataset,DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k,Multi30k
from typing import Iterable,List
from torch.nn.utils.rnn import pad_sequence
from torchdata.datapipes.iter import IterableWrapper,Mapper
import torchtext

import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np
import random

In [8]:
# Create custom dataset and dataloader

sentences = [
    "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.",
    "Fame's a fickle friend, Harry.",
    "It is our choices, Harry, that show what we truly are, far more than our abilities.",
    "Soon we must all face the choice between what is right and what is easy.",
    "Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.",
    "You are awesome!"
]

class CustomDataset(Dataset):
  def __init__(self,sentences):
    self.sentences = sentences

  def __len__(self):
    return len(self.sentences)

  def __getitem__(self,idx):
    return self.sentences[idx]

custom_dataset = CustomDataset(sentences)

batch_size = 2

dataloader = DataLoader(custom_dataset,batch_size=batch_size,shuffle=True)

for batch in dataloader:
  print(batch)

["Fame's a fickle friend, Harry.", 'Soon we must all face the choice between what is right and what is easy.']
["If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.", 'Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.']
['You are awesome!', 'It is our choices, Harry, that show what we truly are, far more than our abilities.']


In [9]:
# Different custom dataset and dataloader

class CustomDataset(Dataset):
  def __init__(self,sentences,tokenizer,vocab):
    self.sentences = sentences
    self.tokenizer = tokenizer
    self.vocab = vocab

  def __len__(self):
    return len(self.sentences)

  def __getitem__(self,idx):
    tokens = self.tokenizer(self.sentences[idx])
    tensor_indices = [self.vocab[token] for token in tokens]
    return torch.tensor(tensor_indices)

tokenizer = get_tokenizer('basic_english')

vocab = build_vocab_from_iterator(map(tokenizer,sentences))

custom_dataset = CustomDataset(sentences,tokenizer,vocab)

print('Custom Dataset Length:', len(custom_dataset))
print('Sample Items:')
for i in range(6):
  sample_item = custom_dataset[i]
  print(f'Item {i + 1}: {sample_item}')

Custom Dataset Length: 6
Sample Items:
Item 1: tensor([11, 19, 63, 17, 13,  2,  3, 47,  6, 16, 45,  0, 55,  3, 41, 46, 24, 10,
        43, 61,  9, 44,  0, 14,  9, 33,  1])
Item 2: tensor([35,  6, 16,  3, 38, 40,  0,  8,  1])
Item 3: tensor([12,  5, 15, 31,  0,  8,  0, 57, 53,  2, 18, 62,  4,  0, 36, 49, 56, 15,
        21,  1])
Item 4: tensor([54, 18, 50, 23, 34, 58, 30, 27,  2,  5, 52,  7,  2,  5, 32,  1])
Item 5: tensor([66, 29, 14, 13, 10, 22, 60,  7, 37,  1, 28, 51, 48,  4, 42, 11, 59, 39,
         2, 12, 64, 17, 26, 65,  1])
Item 6: tensor([19,  4, 25, 20])


In [10]:
# Custom collate function

def collate_fn(batch):
  padded_batch = pad_sequence(batch,batch_first=True,padding_value=0)
  return padded_batch

vocab.insert_token("<pad>", 0)

In [11]:
# dataloader with custom collate function

dataloader = DataLoader(custom_dataset,batch_size=batch_size,collate_fn=collate_fn)

for batch in dataloader:
    for row in batch:
        words = [vocab.get_itos()[idx] for idx in row]
    print(words)

['fame', "'", 's', 'a', 'fickle', 'friend', ',', 'harry', '.', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['soon', 'we', 'must', 'all', 'face', 'the', 'choice', 'between', 'what', 'is', 'right', 'and', 'what', 'is', 'easy', '.', '<pad>', '<pad>', '<pad>', '<pad>']
['you', 'are', 'awesome', '!', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [12]:
# collate function with batch_first=False

def collate_fn_bfFalse(batch):
  padded_batch = pad_sequence(batch,padding_value=0)
  return padded_batch

In [13]:
# dataloader with custom collate function

dataloader_bfFalse = DataLoader(custom_dataset,batch_size=batch_size,collate_fn=collate_fn_bfFalse)

for seq in dataloader_bfFalse:
  for row in seq:
    #print(row)
    words = [vocab.get_itos()[idx] for idx in row]
    print(words)

['if', 'fame']
['you', "'"]
['want', 's']
['to', 'a']
['know', 'fickle']
['what', 'friend']
['a', ',']
['man', 'harry']
["'", '.']
['s', '<pad>']
['like', '<pad>']
[',', '<pad>']
['take', '<pad>']
['a', '<pad>']
['good', '<pad>']
['look', '<pad>']
['at', '<pad>']
['how', '<pad>']
['he', '<pad>']
['treats', '<pad>']
['his', '<pad>']
['inferiors', '<pad>']
[',', '<pad>']
['not', '<pad>']
['his', '<pad>']
['equals', '<pad>']
['.', '<pad>']
['it', 'soon']
['is', 'we']
['our', 'must']
['choices', 'all']
[',', 'face']
['harry', 'the']
[',', 'choice']
['that', 'between']
['show', 'what']
['what', 'is']
['we', 'right']
['truly', 'and']
['are', 'what']
[',', 'is']
['far', 'easy']
['more', '.']
['than', '<pad>']
['our', '<pad>']
['abilities', '<pad>']
['.', '<pad>']
['youth', 'you']
['can', 'are']
['not', 'awesome']
['know', '!']
['how', '<pad>']
['age', '<pad>']
['thinks', '<pad>']
['and', '<pad>']
['feels', '<pad>']
['.', '<pad>']
['but', '<pad>']
['old', '<pad>']
['men', '<pad>']
['are', '<pa

In [14]:
# check batches

for batch in dataloader:
  print(batch)
  print('Length of sequences in the batch:',batch.shape[1])

tensor([[12, 20, 64, 18, 14,  3,  4, 48,  7, 17, 46,  1, 56,  4, 42, 47, 25, 11,
         44, 62, 10, 45,  1, 15, 10, 34,  2],
        [36,  7, 17,  4, 39, 41,  1,  9,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0]])
Length of sequences in the batch: 27
tensor([[13,  6, 16, 32,  1,  9,  1, 58, 54,  3, 19, 63,  5,  1, 37, 50, 57, 16,
         22,  2],
        [55, 19, 51, 24, 35, 59, 31, 28,  3,  6, 53,  8,  3,  6, 33,  2,  0,  0,
          0,  0]])
Length of sequences in the batch: 20
tensor([[67, 30, 15, 14, 11, 23, 61,  8, 38,  2, 29, 52, 49,  5, 43, 12, 60, 40,
          3, 13, 65, 18, 27, 66,  2],
        [20,  5, 26, 21,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0]])
Length of sequences in the batch: 25


In [15]:
# different custom dataset

class CustomDataset(Dataset):
  def __init__(self,sentences):
    self.sentences = sentences

  def __len__(self):
    return len(self.sentences)

  def __getitem__(self,idx):
    return self.sentences[idx]

In [16]:
custom_dataset = CustomDataset(sentences)

In [17]:
custom_dataset[0]

"If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals."

In [18]:
# different collate function

def collate_fn(batch):
  tensor_batch = []
  for sample in batch:
    tokens = tokenizer(sample)
    tensor_batch.append(torch.tensor([vocab[token] for token in tokens]))

  padded_batch = pad_sequence(tensor_batch,batch_first=True,padding_value=0)

  return padded_batch

In [19]:
dataloader = DataLoader(
    dataset=custom_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)

In [20]:
for batch in dataloader:
  print(batch)
  print('shape of sample:', batch.shape)

tensor([[13,  6, 16, 32,  1,  9,  1, 58, 54,  3, 19, 63,  5,  1, 37, 50, 57, 16,
         22,  2],
        [20,  5, 26, 21,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
shape of sample: torch.Size([2, 20])
tensor([[67, 30, 15, 14, 11, 23, 61,  8, 38,  2, 29, 52, 49,  5, 43, 12, 60, 40,
          3, 13, 65, 18, 27, 66,  2],
        [36,  7, 17,  4, 39, 41,  1,  9,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0]])
shape of sample: torch.Size([2, 25])
tensor([[55, 19, 51, 24, 35, 59, 31, 28,  3,  6, 53,  8,  3,  6, 33,  2,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0],
        [12, 20, 64, 18, 14,  3,  4, 48,  7, 17, 46,  1, 56,  4, 42, 47, 25, 11,
         44, 62, 10, 45,  1, 15, 10, 34,  2]])
shape of sample: torch.Size([2, 27])


In [21]:
# french sentences

corpus = [
    "Ceci est une phrase.",
    "C'est un autre exemple de phrase.",
    "Voici une troisième phrase.",
    "Il fait beau aujourd'hui.",
    "J'aime beaucoup la cuisine française.",
    "Quel est ton plat préféré ?",
    "Je t'adore.",
    "Bon appétit !",
    "Je suis en train d'apprendre le français.",
    "Nous devons partir tôt demain matin.",
    "Je suis heureux.",
    "Le film était vraiment captivant !",
    "Je suis là.",
    "Je ne sais pas.",
    "Je suis fatigué après une longue journée de travail.",
    "Est-ce que tu as des projets pour le week-end ?",
    "Je vais chez le médecin cet après-midi.",
    "La musique adoucit les mœurs.",
    "Je dois acheter du pain et du lait.",
    "Il y a beaucoup de monde dans cette ville.",
    "Merci beaucoup !",
    "Au revoir !",
    "Je suis ravi de vous rencontrer enfin !",
    "Les vacances sont toujours trop courtes.",
    "Je suis en retard.",
    "Félicitations pour ton nouveau travail !",
    "Je suis désolé, je ne peux pas venir à la réunion.",
    "À quelle heure est le prochain train ?",
    "Bonjour !",
    "C'est génial !"
]

In [22]:
# another custom collate function

def collate_fn_fr(batch):
  tensor_batch = []
  for sample in batch:
    tokens = tokenizer(sample)
    tensor_batch.append(torch.tensor([vocab[token] for token in tokens]))

  padded_batch = pad_sequence(tensor_batch,batch_first=True)
  return padded_batch

tokenizer = get_tokenizer('spacy',language='fr_core_news_sm')

vocab = build_vocab_from_iterator(map(tokenizer,corpus))

sorted_data = sorted(corpus, key=lambda x: len(tokenizer(x)))

dataloader = DataLoader(sorted_data,batch_size=4,shuffle=False,collate_fn=collate_fn_fr)

In [23]:
for batch in dataloader:
  print(batch)

tensor([[ 27,   2,   0],
        [ 26,  45,   2],
        [ 35,   8,   2],
        [ 25, 101,   2]])
tensor([[  1, 105,  41,   0],
        [  1,   3,  76,   0],
        [  1,   3,  82,   0],
        [ 11,   4,  74,   2]])
tensor([[ 28,   4,  10,   9,   0],
        [ 38,  10, 107,   9,   0],
        [ 12,  69,  51,  49,   0],
        [  1,  16, 103,  17,   0]])
tensor([[  1,   3,  14, 100,   0,   0],
        [ 37,   4,  19,  92,  95,   7],
        [ 33,  71, 122, 117,  52,   2],
        [ 32,  85,  42,  80,  87,   0]])
tensor([[ 30,  18,  19,  88,  21,   2,   0],
        [ 31,  43,   8,  15,  57,  73,   0],
        [ 36,  62,  90, 110,  60,  83,   0],
        [ 34, 112, 104, 106, 108,  56,   0]])
tensor([[ 11,   4, 111,  50,  68,   5,   9,   0],
        [  1, 113,  55,   6,  86,  53,  47,   0],
        [  1,   3,  98,   5, 116,  99,  66,   2],
        [120,  97,  75,   4,   6,  93,  20,   7]])
tensor([[  1,   3,  14,  20,  58,  44,   6,  72,   0,   0],
        [  1,  63,  40,  13,  89, 

In [24]:
multi30k.URL["train"] = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0205EN-SkillsNetwork/training.tar.gz"
multi30k.URL["valid"] = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0205EN-SkillsNetwork/validation.tar.gz"

In [25]:
# define source language and target language

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

In [29]:
train_iter = Multi30k(split='train',language_pair=(SRC_LANGUAGE,TGT_LANGUAGE))

In [31]:
data_set = iter(train_iter)

for n in range(5):
  src,tgt = next(data_set)

  print(f'sample: {str(n+1)}')
  print(f'Source ({SRC_LANGUAGE}): {src}\nTarget ({TGT_LANGUAGE}): {tgt}')

sample: 1
Source (de): Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
Target (en): Two young, White males are outside near many bushes.
sample: 2
Source (de): Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.
Target (en): Several men in hard hats are operating a giant pulley system.
sample: 3
Source (de): Ein kleines Mädchen klettert in ein Spielhaus aus Holz.
Target (en): A little girl climbing into a wooden playhouse.
sample: 4
Source (de): Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster.
Target (en): A man in a blue shirt is standing on a ladder cleaning a window.
sample: 5
Source (de): Zwei Männer stehen am Herd und bereiten Essen zu.
Target (en): Two men are at the stove preparing food.


In [32]:
german,english = next(data_set)
print(f"Source German ({SRC_LANGUAGE}): {german}\nTarget English  ({TGT_LANGUAGE}): { english }")

Source German (de): Ein Mann in grün hält eine Gitarre, während der andere Mann sein Hemd ansieht.
Target English  (en): A man in green holds a guitar while the other man observes his shirt.


In [33]:
# tokenizers

token_transform = {}

token_transform[SRC_LANGUAGE] = get_tokenizer('spacy',language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy',language='en_core_web_sm')

In [34]:
token_transform['de'](german)
token_transform['en'](english)

['A',
 'man',
 'in',
 'green',
 'holds',
 'a',
 'guitar',
 'while',
 'the',
 'other',
 'man',
 'observes',
 'his',
 'shirt',
 '.']

In [35]:
# special token indices

UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0,1,2,3

special_symbols = ['<unk>','<pad>','<bos>','<eos>']

In [36]:
vocab_transform = {}

def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
  language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

  for data_sample in data_iter:
    yield token_transform[language](data_sample[language_index[language]])

In [37]:
# vocab numericalizer

for ln in [SRC_LANGUAGE,TGT_LANGUAGE]:
  train_iterator = Multi30k(split='train',language_pair=(SRC_LANGUAGE,TGT_LANGUAGE))

  sorted_dataset = sorted(train_iterator,key=lambda x: len(x[0].split()))

  vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(sorted_dataset,ln),
                                                  min_freq=1,
                                                  specials=special_symbols,
                                                  special_first=True)

In [38]:
for ln in [SRC_LANGUAGE,TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [39]:
seq_en = vocab_transform['en'](token_transform['en'](english))
print(f'English text string: {english}\nEnglish Sequence: {seq_en}')

seq_de = vocab_transform['de'](token_transform['de'](german))
print(f'German text string: {german}\nGerman Sequence: {seq_de}')

English text string: A man in green holds a guitar while the other man observes his shirt.
English Sequence: [6, 12, 7, 51, 144, 4, 126, 29, 8, 75, 12, 1748, 27, 23, 5]
German text string: Ein Mann in grün hält eine Gitarre, während der andere Mann sein Hemd ansieht.
German Sequence: [5, 12, 7, 657, 39, 18, 133, 8, 37, 16, 105, 12, 136, 41, 1779, 4]


In [40]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [41]:
# add BOS token in front and EOS token at the end

def tensor_transform_s(token_ids: List[int]):
  return torch.cat((torch.tensor([BOS_IDX]),
                    torch.flip(torch.tensor(token_ids),dims=(0,)),
                    torch.tensor([EOS_IDX])))

def tensor_transform_t(token_ids: List[int]):
  return torch.cat((torch.tensor([BOS_IDX]),
                    torch.tensor(token_ids),
                    torch.tensor([EOS_IDX])))

In [42]:
seq_en1 = tensor_transform_t(seq_en)
seq_en1

tensor([   2,    6,   12,    7,   51,  144,    4,  126,   29,    8,   75,   12,
        1748,   27,   23,    5,    3])

In [43]:
seq_en2 = tensor_transform_s(seq_en)
seq_en2

tensor([   2,    5,   23,   27, 1748,   12,   75,    8,   29,  126,    4,  144,
          51,    7,   12,    6,    3])

In [44]:
# transform pipeline

def sequential_transforms(*transforms):
  def func(text_input):
    for transform in transforms:
      text_input = transform(text_input)
    return text_input
  return func

text_transform = {}

text_transform[SRC_LANGUAGE] = sequential_transforms(token_transform[SRC_LANGUAGE],
                                                     vocab_transform[SRC_LANGUAGE],
                                                     tensor_transform_s)

text_transform[TGT_LANGUAGE] = sequential_transforms(token_transform[TGT_LANGUAGE],
                                                     vocab_transform[TGT_LANGUAGE],
                                                     tensor_transform_t)

In [45]:
# custom collate function

def collate_fn(batch):
  src_batch, tgt_batch = [], []
  for src_sample,tgt_sample in batch:
    src_sequences = text_transform[SRC_LANGUAGE](src_sample.rstrip('\n'))
    src_sequences = torch.tensor(src_sequences,dtype=torch.int64)
    tgt_sequences = text_transform[TGT_LANGUAGE](tgt_sample.rstrip('\n'))
    tgt_sequences = torch.tensor(tgt_sequences,dtype=torch.int64)
    src_batch.append(src_sequences)
    tgt_batch.append(tgt_sequences)

  src_batch = pad_sequence(src_batch,padding_value=PAD_IDX,batch_first=True)
  tgt_batch = pad_sequence(tgt_batch,padding_value=PAD_IDX,batch_first=True)

  return src_batch.to(device), tgt_batch.to(device)

In [46]:
# training and validation dataloaders

BATCH_SIZE = 4

train_iterator = Multi30k(split='train', language_pair=(SRC_LANGUAGE,TGT_LANGUAGE))
sorted_train_iterator = sorted(train_iterator,key = lambda x: len(x[0].split()))
train_dataloader = DataLoader(sorted_train_iterator,batch_size=BATCH_SIZE,collate_fn=collate_fn,drop_last=True)

valid_iterator = Multi30k(split='valid',language_pair=(SRC_LANGUAGE,TGT_LANGUAGE))
sorted_valid_dataloader = sorted(valid_iterator,key = lambda x: len(x[0].split()))
valid_dataloader = DataLoader(sorted_valid_dataloader,batch_size=BATCH_SIZE,collate_fn=collate_fn,drop_last=True)

src,trg = next(iter(train_dataloader))
src,trg

(tensor([[    2,     3,     1,     1,     1],
         [    2,  5510,     3,     1,     1],
         [    2,  5510,     3,     1,     1],
         [    2,  1701,     8, 12642,     3]]),
 tensor([[   2,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1],
         [   2, 6650, 4623,  259,  172, 9953,  115,  692, 3428,    5,    3],
         [   2,  216,  110, 3913, 1650, 3823,   71, 2808, 2187,    5,    3],
         [   2,    6, 3398,  202,  109,   37,    3,    1,    1,    1,    1]]))