In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.optim as optim
import re
import math
import time
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"using device :{device}")


using device :cuda


In [2]:
df = pd. read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")
df = df.rename(columns={'English words/sentences': 'english', 'French words/sentences': 'french'})

print(f"Dataset Size: {len(df)}")
print(df.head())


Dataset Size: 175621
  english      french
0     Hi.      Salut!
1    Run!     Cours !
2    Run!    Courez !
3    Who?       Qui ?
4    Wow!  Ça alors !


In [3]:
class Vocabulary:
    def __init__(self):
        self.word2idx = {'<pad>':0,'<sos>':1, '<eos>':2,'<unk>':3}
        self.idx2word = {0:'<pad>',1:'<sos>',2:'<eos>',3:'<unk>'}
        self.vocab_size = 4

    def build_vocab(self, sentences, min_freq = 2):
        word_freq={}

        for sentence in sentences:
            for word in sentence.split():
                word_freq[word]= word_freq.get(word,0)+1

        for word,freq in word_freq.items():
            if freq >= min_freq:
                self.word2idx[word]= self.vocab_size
                self.idx2word[self.vocab_size]=word
                self.vocab_size+=1
                
    def sentence_to_indices(self,sentence):
        indices = [self.word2idx.get(word, self.word2idx['<unk>']) for word in sentence.split()]
        return indices

    def indices_to_sentence(self,indices):
        word = [self.idx2word.get(idx, self.idx2word['<unk>'])for idx in indices]
        return ' '.join(word)

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    return text.strip()

df['english'] = df['english'].apply(preprocess_text)
df['french'] = df['french'].apply(preprocess_text)

eng_vocab = Vocabulary()
fr_vocab = Vocabulary()

eng_vocab.build_vocab(df['english'])
fr_vocab.build_vocab(df['french'])

print(f"English Vocabulary size : {eng_vocab.vocab_size}")
print(f"French Vocabulary size : {fr_vocab.vocab_size}")       

English Vocabulary size : 9680
French Vocabulary size : 13376


In [87]:
class MultiheadAttention(nn.Module):
    def __init__(self,d_model,num_head, dropout = 0.1):
        super(MultiheadAttention,self).__init__()
        assert d_model%num_head == 0
        
        self.d_model = d_model
        self.num_head = num_head
        self.dim_head = d_model//num_head
        
        self.w_q = nn.Linear(d_model,d_model)
        self.w_k = nn.Linear(d_model,d_model)
        self.w_v = nn.Linear(d_model,d_model)
        self.w_o = nn.Linear(d_model,d_model)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([self.dim_head])).to(device)

    def forward(self,query,key,value,mask=None):
        batch_size = query.shape[0]
        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)

        Q = Q.view(batch_size, -1, self.num_head, self.dim_head).transpose(1,2)
        K = K.view(batch_size , -1, self.num_head,self.dim_head).transpose(1,2)
        V = V.view(batch_size,-1,self.num_head,self.dim_head).transpose(1,2)

        scores = torch.matmul(Q,K.transpose(-2,-1))/self.scale
        
        if mask is not None:
            scores = scores.masked_fill(mask==0, -1e9)

        attention = torch.softmax(scores,dim=-1)
        attention = self.dropout(attention)

        x = torch.matmul(attention,V)
        
        x = x.transpose(1,2).contiguous().view(batch_size,-1,self.d_model)
        x = self.w_o(x)
        return x, attention

In [88]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self,d_model,dff,dropout = 0.1):
        super(PositionwiseFeedForward,self).__init__()
        self.linear1 = nn.Linear(d_model,dff)
        self.linear2 = nn.Linear(dff,d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self,x):
        x = self.linear1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [96]:
class EncoderLayer(nn.Module):
    def __init__(self,d_model, num_head,dff, dropout=0.1):
        super(EncoderLayer,self).__init__()
        self.selfAttention = MultiheadAttention(d_model,num_head,dropout)
        self.feed_forward = PositionwiseFeedForward(d_model,dff,dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        atten_out,_ = self.selfAttention(x,x,x,mask)
        x = self.norm1(x+ self.dropout(atten_out))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [97]:
class DecoderLayer(nn.Module):
    def __init__(self,d_model,num_head,dff,dropout=0.1):
        super(DecoderLayer,self).__init__()
        self.selfAttention = MultiheadAttention(d_model,num_head,dropout)
        self.crossAttention = MultiheadAttention(d_model,num_head,dropout)
        self.feed_forward = PositionwiseFeedForward(d_model,dff,dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x,encoder_out,src_mask,tgt_mask):
        atten_out,_ = self.selfAttention(x,x,x,tgt_mask)
        x = self.norm1(x+self.dropout(atten_out))

        atten_out,atten_weight = self.crossAttention(x,encoder_out,encoder_out,src_mask)
        x = self.norm2(x+self.dropout(atten_out))
        
        ff_output = self.feed_forward(x)
        x = self.norm3(x+self.dropout(ff_output))

        return x, atten_weight

In [98]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                           (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        return x + self.pe[:x.size(0), :]

In [103]:
class Transformer(nn.Module):
    def __init__(self,src_vocab_size,tgt_vocab_size,d_model = 512,num_head= 8, num_layers = 6, dff= 2048,max_len = 100,dropout =0.1):
        super(Transformer,self).__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size,d_model)
        self.pos_encoding = PositionalEncoding(d_model,max_len)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model,num_head,dff,dropout) for _ in range(num_layers) ])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model,num_head,dff,dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model,tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def encode(self,src,src_mask):
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src)* math.sqrt(self.d_model)))
        for layer in self.encoder_layers:
            src_embedded = layer(src_embedded,src_mask)
        return src_embedded #Contextualized source representations

    def decode(self,tgt,encoder_output,src_mask,tgt_mask):
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt)*math.sqrt(self.d_model)))

        attention_weights= {}
        for i, layer in enumerate(self.decoder_layers):
            tgt_embedded,atten_weights = layer(tgt_embedded,encoder_output,src_mask,tgt_mask)
            attention_weights[f'decoder_layer_{i+1}'] = atten_weights
        return tgt_embedded, attention_weights


    def forward(self,src,tgt,src_mask,tgt_mask):
        encoder_output = self.encode(src,src_mask)
        decoder_output , attention_weights= self.decode(tgt,encoder_output,src_mask,tgt_mask)
        output = self.fc_out(decoder_output)
        return output,attention_weights
            
        

In [104]:
class translationDataset(Dataset):
    def __init__(self,english_sentences,french_sentences,eng_vocab,fr_vocab,max_len=100):
        self.english_sentences = english_sentences
        self.french_sentences = french_sentences
        self.eng_vocab = eng_vocab
        self.fr_vocab= fr_vocab
        self.max_len = max_len


    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self,idx): # gets one training sample at index idx
        english_sentence = self.english_sentences.iloc[idx]
        french_sentence = self.french_sentences.iloc[idx]

        eng_indices = [self.eng_vocab.word2idx['<sos>']] + \
                     self.eng_vocab.sentence_to_indices(english_sentence) + \
                     [self.eng_vocab.word2idx['<eos>']]
        
        fr_indices = [self.fr_vocab.word2idx['<sos>']] + \
                    self.fr_vocab.sentence_to_indices(french_sentence) + \
                    [self.fr_vocab.word2idx['<eos>']]
        
        eng_indices = self.pad_sequence(eng_indices, self.max_len)
        fr_indices = self.pad_sequence(fr_indices,self.max_len)

        return torch.tensor(eng_indices),torch.tensor(fr_indices)

    def pad_sequence(self,sequence,max_len):
        if len(sequence) < max_len:
             sequence = sequence + [self.eng_vocab.word2idx['<pad>']] * (max_len - len(sequence))
        else :
            sequence = sequence[:max_len-1]+ [self.eng_vocab.word2idx['<eos']]

        return sequence

train_df, val_df = train_test_split(df, test_size = 0.1 , random_state=42) 

train_dataset = translationDataset(train_df['english'],train_df['french'], eng_vocab,fr_vocab)
val_dataset = translationDataset( val_df['english'],val_df['french'],eng_vocab,fr_vocab)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle = True)
test_loader = DataLoader(val_dataset,batch_size = batch_size,shuffle = True)
         

In [105]:
def create_masks(src, tgt):
    src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
    
    tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
    seq_length = tgt.size(1)
    nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
    nopeak_mask = nopeak_mask.to(device)
    tgt_mask = tgt_mask & nopeak_mask
    
    return src_mask, tgt_mask

model = Transformer(
    src_vocab_size = eng_vocab.vocab_size,
    tgt_vocab_size = fr_vocab.vocab_size,
    d_model = 256,
    num_head = 8,
    num_layers = 3 ,
    dff=512,
    max_len = 100,
    dropout = 0.1    
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(),lr = 0.0001, betas =(0.9,0.98),eps = 1e-9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.95)

In [109]:
def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    
    for batch_idx, (src, tgt) in enumerate(dataloader):
        src = src.to(device)
        tgt = tgt.to(device)
        
        # Create masks
        src_mask, tgt_mask = create_masks(src, tgt[:, :-1])
        
        optimizer.zero_grad()
        
        # Forward pass
        output, _ = model(src, tgt[:, :-1], src_mask, tgt_mask)
        
        # Calculate loss
        loss = criterion(output.contiguous().view(-1, output.shape[-1]), 
                        tgt[:, 1:].contiguous().view(-1))
        
        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for src, tgt in dataloader:
            src = src.to(device)
            tgt = tgt.to(device)
            
            src_mask, tgt_mask = create_masks(src, tgt[:, :-1])
            
            output, _ = model(src, tgt[:, :-1], src_mask, tgt_mask)
            
            loss = criterion(output.contiguous().view(-1, output.shape[-1]), 
                            tgt[:, 1:].contiguous().view(-1))
            
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

# Training loop
num_epochs = 10
train_losses = []
val_losses = []

print("Starting Training...")
print("=" * 50)

for epoch in range(num_epochs):
    start_time = time.time()
    
   
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    
   
    val_loss = evaluate(model, test_loader, criterion)
    
  
    scheduler.step()
    
  
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    
    epoch_time = time.time() - start_time
    
  
    print(f'Epoch {epoch+1:02}/{num_epochs} | Time: {epoch_time:.2f}s')
    print(f'  Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
    

print("Training Completed!")


Starting Training...
Epoch 01/10 | Time: 383.10s
  Train Loss: 2.2306 | Val Loss: 2.0065
Epoch 02/10 | Time: 382.66s
  Train Loss: 2.0290 | Val Loss: 1.8700
Epoch 03/10 | Time: 382.80s
  Train Loss: 1.8934 | Val Loss: 1.7560
Epoch 04/10 | Time: 382.76s
  Train Loss: 1.7940 | Val Loss: 1.6984
Epoch 05/10 | Time: 382.53s
  Train Loss: 1.7164 | Val Loss: 1.6368
Epoch 06/10 | Time: 382.38s
  Train Loss: 1.6524 | Val Loss: 1.5953
Epoch 07/10 | Time: 382.69s
  Train Loss: 1.5999 | Val Loss: 1.5529
Epoch 08/10 | Time: 382.20s
  Train Loss: 1.5562 | Val Loss: 1.5238
Epoch 09/10 | Time: 382.27s
  Train Loss: 1.5200 | Val Loss: 1.5006
Epoch 10/10 | Time: 382.51s
  Train Loss: 1.4901 | Val Loss: 1.4918
Training Completed!


In [120]:
def translate_sentence(sentence,model,eng_vocab,fr_vocab,max_len=100):
    model.eval()
    sentence = preprocess_text(sentence)
    tokens = [eng_vocab.word2idx['<sos>']] + eng_vocab.sentence_to_indices(sentence) + [eng_vocab.word2idx['<eos>']]
    
    if len(tokens)<max_len:
        tokens = tokens + [eng_vocab.word2idx['<pad>']] * (max_len - len(tokens))
    else:
        tokens = tokens[:max_len-1]+ [eng_vocab.word2idx['<eos>']]

    src = torch.tensor(tokens).unsqueeze(0).to(device)
    src_mask = (src!=0).unsqueeze(1).unsqueeze(2)

    tgt_tokens = [fr_vocab.word2idx['<sos>']]

    for i in range(max_len):
        tgt = torch.tensor(tgt_tokens).unsqueeze(0).to(device)
        tgt_mask = create_masks(src,tgt)[1]

        with torch.no_grad():
            output,_ = model(src,tgt,src_mask,tgt_mask)
        pred_token = output.argmax(2)[:,-1].item()
        tgt_tokens.append(pred_token)

        if pred_token == fr_vocab.word2idx['<eos>']:
            break

    translated_tokens = [fr_vocab.idx2word[idx] for idx in tgt_tokens if idx not in [fr_vocab.word2idx['<sos>'], fr_vocab.word2idx['<pad>']]]
    if translated_tokens and translated_tokens[-1] == '<eos>':
        translated_tokens = translated_tokens[:-1]
    
    return ' '.join(translated_tokens)


test_sentence = "Good morning"
translation = translate_sentence(test_sentence,model,eng_vocab,fr_vocab)
print(f"English :{test_sentence}")
print(f"French : {translation}")
        
        
    

English :Good morning
French : bon matin
