In [21]:
!pip install sacremoses
!pip install -U nltk
!pip install rouge




In [22]:
import collections
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge



In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [24]:
import os
def load_data(path):
    data_file = os.path.join(path)
    with open(data_file,"r") as f:
        lang_data = f.read()
    return lang_data.split('\n')


In [25]:
# Load English data
english_sentences = load_data('data/small_vocab_en.txt')
# Load French data
french_sentences = load_data('data/small_vocab_fr.txt')
print('Dataset Loaded')

Dataset Loaded


In [26]:
print(len(english_sentences))
print(len(french_sentences))

137861
137861


In [27]:
for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en Line 2:  the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_fr Line 2:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


In [28]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')

print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


In [29]:
from transformers import MarianTokenizer
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')

def tokenize_sentences(sentences):
    """Tokenize a list of sentences using a pre-trained MarianMT tokenizer."""
    return tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Tokenize the English and French sentences
tokenized_english = tokenize_sentences(english_sentences).to(device)
tokenized_french = tokenize_sentences(french_sentences).to(device)
# print(tokenized_english['input_ids'].shape) 
# print(tokenized_french['input_ids'].shape)

In [30]:
print(tokenized_english)
print(tokenized_french)


{'input_ids': tensor([[  191, 44431,    32,  ..., 59513, 59513, 59513],
        [    4, 19710,  2667,  ..., 59513, 59513, 59513],
        [ 7418,   263,  2636,  ..., 59513, 59513, 59513],
        ...,
        [12502,    51,    32,  ..., 59513, 59513, 59513],
        [    4, 12610,    32,  ..., 59513, 59513, 59513],
        [    0, 59513, 59513,  ..., 59513, 59513, 59513]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0]], device='cuda:0')}
{'input_ids': tensor([[  191, 44431,    43,  ..., 59513, 59513, 59513],
        [   16,    49,  2505,  ..., 59513, 59513, 59513],
        [ 7418,   263,  2636,  ..., 59513, 59513, 59513],
        ...,
        [    8, 12502,    51,  ..., 59513, 59513, 59513],
        [   14,     6,   247,  ..., 59513, 59513, 59513],
        [    0, 59513, 59513,  ...,

In [31]:
# Print the tokenized output
# print("Tokenized English input IDs:", tokenized_english['input_ids'])
# print("Tokenized French input IDs:", tokenized_french['input_ids'])
english_tokens_counter = collections.Counter(token for sentence in tokenized_english['input_ids'].cpu() for token in sentence.numpy())
french_tokens_counter = collections.Counter(token for sentence in tokenized_french['input_ids'].cpu() for token in sentence.numpy())

print('{} English tokens.'.format(len([token for sentence in tokenized_english['input_ids'].cpu() for token in sentence.numpy()])))
print('{} unique English tokens.'.format(len(english_tokens_counter)))
print('10 Most common tokens in the English dataset:')
# print('"' + '" "'.join(list(zip(*english_tokens_counter.most_common(10)))[0]) + '"')
counts = list(zip(*english_tokens_counter.most_common(10)))[0]  # Extract the counts
print('"' + '" "'.join(map(str, counts)) + '"')
# num_unique_english_tokens = len(english_tokens_counter)
# input_size = num_unique_english_tokens
# print("Input size",input_size)
print()
print('{} French tokens.'.format(len([token for sentence in tokenized_french['input_ids'].cpu() for token in sentence.numpy()])))
print('{} unique French tokens.'.format(len(french_tokens_counter)))
print('10 Most common tokens in the French dataset:')
counts = list(zip(*french_tokens_counter.most_common(10)))[0]  # Extract the counts
print('"' + '" "'.join(map(str, counts)) + '"')
# num_unique_french_tokens = len(french_tokens_counter)

# output_size = num_unique_french_tokens
# print("Output size",output_size)


3722247 English tokens.
221 unique English tokens.
10 Most common tokens in the English dataset:
"59513" "32" "49" "2" "0" "250" "18" "61" "475" "4"

6479467 French tokens.
386 unique French tokens.
10 Most common tokens in the French dataset:
"59513" "43" "49" "9" "0" "250" "2" "23" "92" "1101"


In [32]:
from transformers import MarianTokenizer

# Load the MarianMT tokenizer (English to French)
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')

# Example token IDs
token_ids = [
    [34536,  6236,     9,     9,  1054,    32,     4,   877,  6548,  8957,
            32,   479,     0],  # Example token IDs
    [7418, 263, 2636, 59513, 59513]  # Another example
]

# Decode the token IDs into readable text
decoded_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in token_ids]
print(decoded_texts)



['hello yassine is the best student Morocco is here', 'califor']


In [33]:
import yaml
with open('configs/config.yaml', 'r') as file:
    config = yaml.safe_load(file)



In [34]:
input_size = tokenizer.vocab_size
hidden_size = config['model']['hidden_size']
num_layers = config['model']['num_layers']
output_size = tokenizer.vocab_size
print('Config Loaded')
print('Input Size: ', input_size)
print('Output Size: ', output_size)
print('Hidden Size: ', hidden_size)
print('Number of Layers: ', num_layers)

Config Loaded
Input Size:  59514
Output Size:  59514
Hidden Size:  64
Number of Layers:  2


In [58]:

class StackedRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size, num_layers=2):
        super(StackedRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Embedding layer: maps vocab indices to embedding vectors
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        
        # RNN layer
        self.rnn = nn.RNN(hidden_size, hidden_size, num_layers, batch_first=True)
        
        # Fully connected layer for output
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden):
        # Embed input word indices to word vectors
        x = self.embedding(x)
        
        # Pass through RNN layer
        x, hidden = self.rnn(x, hidden)
        
        # Reshape output for the fully connected layer
        x = x.contiguous().view(-1, self.hidden_size)
        
        # Pass through the fully connected layer
        x = self.fc(x)
        
        return x, hidden
    
    # Helper method to initialize hidden state
    def init_hidden(self, batch_size, device):
        # Return a zero-initialized hidden state for RNN (num_layers, batch_size, hidden_size)
        return torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)

# Example initialization
model_rnn = StackedRNN(vocab_size=input_size, hidden_size=hidden_size, output_size=output_size, num_layers=num_layers)


In [36]:
class StackedLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(StackedLSTM, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # Pass through stacked LSTM
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out)
        return nn.functional.log_softmax(out, dim=2)

# Example usage
model_lstm = StackedLSTM(input_size, hidden_size, output_size, num_layers)


In [37]:
class StackedGRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(StackedGRU, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # Pass through stacked GRU
        x = self.embedding(x)
        out, _ = self.gru(x)
        out = self.fc(out)
        return nn.functional.log_softmax(out, dim=2)

# Example usage
model_gru = StackedGRU(input_size, hidden_size, output_size, num_layers)


In [65]:
def train(model, tokenized_english, tokenized_french, num_epochs, batch_size, learning_rate, device):
    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.to(device)
    # Initialize the loss function
    criterion = nn.NLLLoss()

    bleu_scores = []
    meteor_scores = []
    rouge_scores = Rouge()

    for epoch in range(num_epochs):
        for i in range(0, len(tokenized_english['input_ids']), batch_size):
            # Get batch inputs and targets
            input_ids = tokenized_english['input_ids'][i:i + batch_size].to(device)
            target_ids = tokenized_french['input_ids'][i:i + batch_size].to(device)
            hidden = model.init_hidden(input_ids.size(0), device)

            # Forward pass
            model.train()
            optimizer.zero_grad()
            
            # Unpack outputs and hidden from the model
            outputs, hidden = model(input_ids, hidden)

            # Reshape target and outputs for loss computation
            target_ids = target_ids.view(-1)  # Flatten target tensor
            outputs = outputs.view(-1, outputs.size(-1))  # Flatten outputs tensor
            
            # Calculate loss
            loss = criterion(outputs, target_ids)
            loss.backward()
            optimizer.step()

        # Calculate metrics after each epoch
        model.eval()
        with torch.no_grad():
            generated_sentences = []  # Store generated sentences for BLEU and METEOR
            reference_sentences = []  # Store reference sentences
            
            for input_id in tokenized_english['input_ids']:
                # Forward pass to get the model output, ignore hidden
                output, _ = model(input_id.unsqueeze(0).to(device), hidden=None)  
                predicted_ids = torch.argmax(output, dim=-1)
                generated_sentences.append(predicted_ids.cpu())  # Detach and append predictions
                reference_sentences.append(target_ids)  # Collect reference sentences

            # Calculate BLEU score
            bleu = corpus_bleu(reference_sentences, generated_sentences)
            bleu_scores.append(bleu)
            
            # Calculate ROUGE score
            rouge_score = rouge_scores.get_scores(generated_sentences, reference_sentences, avg=True)
            rouge_scores.append(rouge_score)

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, BLEU: {bleu:.4f}, ROUGE: {rouge_score:.4f}')

    return bleu_scores, meteor_scores, rouge_scores


In [39]:
num_epochs = config['training']['num_epochs']
batch_size = config['training']['batch_size']
learning_rate = config['training']['learning_rate']


In [66]:
models = [model_rnn, model_lstm, model_gru]
for model in models:
    train(model, tokenized_english, tokenized_french, num_epochs=num_epochs, batch_size=batch_size, learning_rate=learning_rate,device=device)
    print(model.__class__.__name__, 'trained successfully!')

ValueError: Expected input batch_size (864) to match target batch_size (1504).