<a href="https://colab.research.google.com/github/maktaurus/ML-Work/blob/main/Torch_Notebooks/Translation_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install -q pytorch_lightning

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m866.2/866.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import pytorch_lightning as pl
# from torchinfo import summary
import pandas as pd
import zipfile
import sentencepiece as spm
import numpy as np

In [3]:
!kaggle datasets download devicharith/language-translation-englishfrench

Dataset URL: https://www.kaggle.com/datasets/devicharith/language-translation-englishfrench
License(s): CC0-1.0
Downloading language-translation-englishfrench.zip to /content
  0% 0.00/3.51M [00:00<?, ?B/s]
100% 3.51M/3.51M [00:00<00:00, 104MB/s]


In [4]:
data = zipfile.ZipFile('language-translation-englishfrench.zip').extractall()

In [5]:
data = pd.read_csv('/content/eng_-french.csv')
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [6]:
# data["English words/sentences"] = data["English words/sentences"]
data["target"] = data["French words/sentences"] + "<eos>"
data["French words/sentences"] = "<sos>" + data["French words/sentences"]
data.head()

Unnamed: 0,English words/sentences,French words/sentences,target
0,Hi.,<sos>Salut!,Salut!<eos>
1,Run!,<sos>Cours !,Cours !<eos>
2,Run!,<sos>Courez !,Courez !<eos>
3,Who?,<sos>Qui ?,Qui ?<eos>
4,Wow!,<sos>Ça alors !,Ça alors !<eos>


In [7]:
class MyDataSet(torch.utils.data.Dataset):
  def __init__(self,data):
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self,idx):
    en = self.data.iloc[idx,0]
    fr = self.data.iloc[idx,1]
    tar = self.data.iloc[idx,2]
    return en,fr,tar

df = MyDataSet(data)

In [8]:
for x in df:
  print(x)
  break

('Hi.', '<sos>Salut!', 'Salut!<eos>')


In [9]:
with open("eng.txt","w") as f:
  for x in df:
    f.write(x[0]+"\n")

with open("fre.txt","w") as f:
  for x in df:
    f.write(x[0]+"\n")

In [10]:
en_options = dict(
    input = "eng.txt",
    model_prefix = "eng",
    vocab_size = 4000,
    model_type = "bpe",
    user_defined_symbols = ["<sos>","<eos>"],
    split_digits = True,
    max_sentencepiece_length = 64,
    byte_fallback = True,
    character_coverage = 0.9995,
    split_by_whitespace = True,
    pad_id = 0,
    unk_id = 1,
    bos_id = 2,
    eos_id = 3
)

fr_options = dict(
    input = "fre.txt",
    model_prefix = "fre",
    vocab_size = 4000,
    model_type = "bpe",
    user_defined_symbols = ["<sos>","<eos>"],
    split_digits = True,
    max_sentencepiece_length = 64,
    byte_fallback = True,
    character_coverage = 0.9995,
    split_by_whitespace = True,
    pad_id = 0,
    unk_id = 1,
    bos_id = 2,
    eos_id = 3
)

In [11]:
spm.SentencePieceTrainer.train(**en_options)
spm.SentencePieceTrainer.train(**fr_options)

In [12]:
en_encoder = spm.SentencePieceProcessor()
en_encoder.load("eng.model")

fr_encoder = spm.SentencePieceProcessor()
fr_encoder.load("fre.model")

True

In [13]:
en_encoder.encode("Hello there")

[336, 3618, 520]

In [14]:
en_encoder.id_to_piece([3333, 3624, 511,0])

['lies', 'lect', '▁too', '<pad>']

In [15]:
en_encoder.decode(4)

'<sos>'

In [16]:
pad = torch.nn.ConstantPad1d((0,100),0)

In [17]:
def collate_fn(batch):
  en = torch.stack([pad(torch.tensor(en_encoder.encode(x[0])))[:64] for x in batch])
  fr = torch.stack([pad(torch.tensor(fr_encoder.encode(x[1])))[:64] for x in batch])
  tgt = torch.stack([pad(torch.tensor(fr_encoder.encode(x[2])))[:64] for x in batch])
  return en,fr,tgt

In [18]:
len(df)

175621

In [19]:
train, val = torch.utils.data.random_split(df,[170000,5621])

In [20]:
train_df = torch.utils.data.DataLoader(df,batch_size=128,shuffle=True,collate_fn=collate_fn)
val_df = torch.utils.data.DataLoader(val,batch_size=128,collate_fn=collate_fn)

In [89]:
for x in train_df:
  print(en_encoder.decode(x[0][0].tolist()))
  print(x[0][0])
  print(x[1][0])
  print(fr_encoder.decode(x[2][0].tolist()))
  break

I'm not even sure if this is my key.
tensor([ 266, 3965, 3958,  363, 1094,  654,  585,  355,  308,  368, 1698, 3956,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])
tensor([3943,    4, 3993, 3944,  357,  407,  277,  278,  201,  176, 3828,  298,
         301,  270,  201,  193, 3952,  561, 3944, 2171,  429,  291, 2403,  598,
         201,  175, 3956,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])
Je ne suis même pas sûr que ce soit ma clé.<eos>


In [22]:
en_encoder.decode([1527,  527, 2875, 3085,  320,  273,  974, 3970,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])

'Which search engine do you use?'

In [23]:
en_encoder.pad_id()

0

In [24]:
device = ("cuda" if torch.cuda.is_available() else "cpu")

**Positional encoding** is a technique used in transformer models to inject information about the position of tokens in a sequence. Positional encoding helps the model understand the order of tokens, which is crucial for tasks like language translation and text generation.

In [25]:
def positional_encoding(seq_len,emb_dim):
  pos = torch.arange(seq_len,dtype=torch.float).unsqueeze(1)
  div = torch.exp(torch.arange(0,emb_dim,2).float() * (-torch.log(torch.tensor(10000.0)) / emb_dim))
  pos_enc = torch.zeros(seq_len,emb_dim)
  pos_enc[:,0::2] = torch.sin(pos * div)
  pos_enc[:,1::2] = torch.cos(pos * div)
  return pos_enc

**src_mask:** This is used to mask out certain positions in the source sequence. It’s typically a square matrix (size (N \times N), where (N) is the sequence length) that prevents the model from attending to future tokens in autoregressive models. For example, in a sequence-to-sequence model, this mask ensures that the model only attends to previous tokens and not future ones.

**tgt_mask:** Similar to src_mask, but for the target sequence. It ensures that the model only attends to previous tokens in the target sequence during training. This is crucial for tasks like language modeling where the model should not see future tokens.

**src_key_padding_mask:** This mask is used to ignore padding tokens in the source sequence. It’s a binary mask (size ( \text{batch_size} \times N )) where padding positions are marked, ensuring that the model does not attend to these positions.

**tgt_key_padding_mask:** Similar to src_key_padding_mask, but for the target sequence. It masks out padding tokens in the target sequence to prevent the model from attending to them.


In [26]:
def create_mask(src, tgt):
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]

    tgt_mask = torch.nn.Transformer.generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=device).type(torch.bool)

    src_padding_mask = (src == 0)
    tgt_padding_mask = (tgt == 0)
    return src_mask.to(device), tgt_mask.to(device), src_padding_mask.to(device), tgt_padding_mask.to(device)

In [27]:
pos = positional_encoding(64,256)
emb = torch.nn.Embedding(4000,256)
for x in train_df:
  en,fr,tgt = x
  ss = emb(en) + pos
  src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(x[0],x[1])
  print(src_mask[2])
  print(tgt_mask[2])
  print(src_padding_mask[2])
  print(tgt_padding_mask.shape)
  print(x[0][0])
  break

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False], device='cuda:0')
tensor([0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
       device='cuda:0')
tensor([False, False, False, False, False, False, F

In [28]:
class Transformer(torch.nn.Module):
  def __init__(self,emb_dim, vocab_size, num_layers, num_heads, dropout, pos_enc):
    super().__init__()
    self.src_emb = torch.nn.Embedding(vocab_size,emb_dim)
    self.tgt_emb = torch.nn.Embedding(vocab_size,emb_dim)
    self.pos_layer = pos_enc(64,emb_dim).to(device)
    self.transformer = torch.nn.Transformer(d_model=emb_dim,nhead=num_heads,num_encoder_layers=num_layers,num_decoder_layers=num_layers,dim_feedforward=1048,dropout=dropout,batch_first=True)
    self.lin = torch.nn.Linear(emb_dim,1048)
    self.relu = torch.nn.ReLU()
    self.out_layer = torch.nn.Linear(1048,vocab_size)

  def forward(self,src,tgt):
    # src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src,tgt)
    # src_mask = torch.zeros((src.shape[1], src.shape[1]),device=device).type(torch.bool)
    tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.shape[1]).to(device)
    src = self.src_emb(src).to(device) + self.pos_layer
    tgt = self.tgt_emb(tgt).to(device) + self.pos_layer
    out = self.transformer(src,tgt, tgt_mask =tgt_mask, tgt_is_causal=True)
    out = self.lin(out)
    out = self.relu(out)
    out = self.out_layer(out)
    return out

In [29]:
model = Transformer(256,4000,4,4,0.1, positional_encoding)
model.to(device)

Transformer(
  (src_emb): Embedding(4000, 256)
  (tgt_emb): Embedding(4000, 256)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=1048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1048, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-3): 4 x 

In [30]:
class TransformerModel(pl.LightningModule):
  def __init__(self, model):
    super().__init__()
    self.model = model.to(device)
    self.loss = torch.nn.CrossEntropyLoss()

  def forward(self,en,fr):
    return self.model(en,fr)

  def training_step(self,batch,batch_idx):
    en,fr,tgt = batch
    out = self.model(en.to(device),fr.to(device))
    loss = self.loss(out.view(-1,out.shape[-1]),tgt.view(-1))
    self.log("train_loss",loss, prog_bar=True,on_step=True,on_epoch=True)
    return loss

  def validation_step(self,batch,batch_idx):
    en,fr,tgt = batch
    out = self.model(en.to(device),fr.to(device))
    loss = self.loss(out.view(-1,out.shape[-1]),tgt.view(-1))
    self.log("val_loss",loss, prog_bar=True,on_step=True,on_epoch=True)
    return loss

  def configure_optimizers(self):
    return torch.optim.AdamW(self.model.parameters(),lr=0.0001)

In [32]:
pl_model = TransformerModel(model).to(device).train()
trainer = pl.Trainer(max_epochs=4)
trainer.fit(pl_model,train_df,val_df)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type             | Params | Mode 
---------------------------------------------------
0 | model | Transformer      | 14.0 M | train
1 | loss  | CrossEntropyLoss | 0      | train
---------------------------------------------------
14.0 M    Trainable params
0         Non-trainable params
14.0 M    Total params
55.943    Total estimated model params size (MB)
110       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


In [33]:
pl_model.eval().to(device)

TransformerModel(
  (model): Transformer(
    (src_emb): Embedding(4000, 256)
    (tgt_emb): Embedding(4000, 256)
    (transformer): Transformer(
      (encoder): TransformerEncoder(
        (layers): ModuleList(
          (0-3): 4 x TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
            )
            (linear1): Linear(in_features=256, out_features=1048, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear2): Linear(in_features=1048, out_features=256, bias=True)
            (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.1, inplace=False)
            (dropout2): Dropout(p=0.1, inplace=False)
          )
        )
        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      )
      (decode

In [114]:
def translate(sentence):
  en_tokens = (pad(torch.tensor(en_encoder.encode(sentence)))[:64]).unsqueeze(0).to(device)
  fr_start = "<sos>"
  for i in range(64):
    fr_tokens = (pad(torch.tensor(fr_encoder.encode(fr_start)))[:64]).unsqueeze(0).to(device)
    out = pl_model(en_tokens,fr_tokens)
    soft_out = torch.nn.Softmax(dim=-1)(out)
    word_index = torch.argmax(soft_out,dim=-1)[0].detach().cpu().numpy()
    fr_word = fr_encoder.decode(word_index.tolist())
    fr_start += fr_word
  print(fr_start)

In [115]:
translate("I'm not even sure if this is my key.")

<sos>JeJe ne suJe ne su ne suis pJe ne su ne suis p��is pis pas sJe ne su ne suis p��is pis pas s�asis pas pas mon� mon� mon fr clJe ne su ne suis p��is pis pas s�asis pas pas mon� mon� mon fr cl���.�.�.�.<eos>.<eos>..<eos>.<eos>.<eos>���éJe ne su ne suis p��is pis pas s�asis pas pas mon� mon� mon fr cl���.�.�.�.<eos>.<eos>..<eos>.<eos>.<eos>���é.<eos><eos>.<eos>.<eos>.<eos>Je ne su ne suis p��is pis pas s�asis pas pas mon� mon� mon fr cl���.�.�.�.<eos>.<eos>..<eos>.<eos>.<eos>���é.<eos><eos>.<eos>.<eos>.<eos>Je ne su ne suis p��is pis pas s�asis pas pas mon� mon� mon fr cl���.�.�.�.<eos>.<eos>..<eos>.<eos>.<eos>���é.<eos><eos>.<eos>.<eos>.<eos>Je ne su ne suis p��is pis pas s�asis pas pas mon� mon� mon fr cl���.�.�.�.<eos>.<eos>..<eos>.<eos>.<eos>���é.<eos><eos>.<eos>.<eos>.<eos>Je ne su ne suis p��is pis pas s�asis pas pas mon� mon� mon fr cl���.�.�.�.<eos>.<eos>..<eos>.<eos>.<eos>���é.<eos><eos>.<eos>.<eos>.<eos>Je ne su ne suis p��is pis pas s�asis pas pas mon� mon� mon fr cl���.�.

In [None]:
Que devez-vous faire maintenant ?

In [163]:
text = "How about going out for dinner"
en_tokens = (pad(torch.tensor(en_encoder.encode(text)))[:64]).to(device)
fr_start = "<sos>Que va sortez d�"

fr_tokens = (pad(torch.tensor(fr_encoder.encode(fr_start)))[:64]).to(device)
out = pl_model(en_tokens.unsqueeze(0),fr_tokens.unsqueeze(0))
soft_out = torch.nn.Softmax(dim=-1)(out)
word_index = torch.argmax(soft_out,dim=-1)[0].detach().cpu().numpy()
word_index
fr_word = fr_encoder.id_to_piece(word_index.tolist())
fr_word = fr_encoder.decode(fr_word)

fr_word

'Que va sortez d�'

In [164]:
text = "How about going out for dinner"
en_tokens = (pad(torch.tensor(en_encoder.encode(text)))[:64]).to(device)
fr_start = "<sos>"
for x in range(10):
  fr_tokens = (pad(torch.tensor(fr_encoder.encode(fr_start)))[:64]).to(device)
  out = pl_model(en_tokens.unsqueeze(0),fr_tokens.unsqueeze(0))
  soft_out = torch.nn.Softmax(dim=-1)(out)
  word_index = torch.argmax(soft_out,dim=-1)[0].detach().cpu().numpy()
  word_index
  fr_word = fr_encoder.decode(word_index.tolist())
  fr_start += fr_word
print(fr_word)
fr_word

Queue vue v vaue d da daa dit d d� d�� d d����î��..�.���.�î����������î�!�..�.


'Queue vue v vaue d da daa dit d d� d�� d d����î��..�.���.�î����������î�!�..�.'

In [71]:
fr_encoder.decode([503,])

'L'

In [None]:
torch.cuda.empty_cache()