In [1]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/torch warnings
# (e.g. SettingWithCopyWarning); consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

### Import Libraries

In [2]:
import pandas as pd
import re
import spacy
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

from transformers import AutoTokenizer, AutoModel

### Load Dataset

In [3]:
# Load French Monolingual Dataset
# Each line is split once on a tab and only the part after the tab is kept
# (presumably the file is formatted as "<id>\t<sentence>" -- TODO confirm).
sentences = []

with open('./Dataset/Monolingual/fra_news_2023_1M/fra_news_2023_1M-sentences.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Lines read from a file keep their line ending, so a blank line is
        # "\n" (truthy). Strip the ending first so the emptiness test can fire.
        line = line.rstrip('\r\n')
        if not line:
            continue

        parts = line.split('\t', 1)
        if len(parts) > 1:
            # Strip surrounding whitespace, mirroring the English loader.
            sentences.append(parts[1].strip())

french_df = pd.DataFrame(sentences, columns=['Sentence'])
french_df

Unnamed: 0,Sentence
0,¤ Une coalition (BBY) composée de plus d’une c...
1,$Mais le groupe a aussi annoncé la suppression...
2,"$Toutefois, l’étude menée a également livré de..."
3,"€Dans les locaux d’Atem, à Solliès-Pont, les é..."
4,° 1.000.000 de dirhams hors taxes pour les mar...
...,...
999995,С’est son premier voyage à l’étranger depuis s...
999996,С’est un établissement d'enseignement supérieu...
999997,Сергей ГунеевAccéder à la base multimédiaVladi...
999998,"يا تونِسَ الأُنسِ يا خَضرا المَيادينِ », a écr..."


In [4]:
# Load English Monolingual Dataset
# One sentence per line; surrounding whitespace (including the newline) is removed.
with open('./Dataset/Monolingual/news-commentary-v18.txt', 'r', encoding='utf-8') as file:
    sentences = [raw_line.strip() for raw_line in file]

english_df = pd.DataFrame(sentences, columns=['Sentence'])
english_df

Unnamed: 0,Sentence
0,"$10,000 Gold?"
1,SAN FRANCISCO – It has never been easy to have...
2,"Lately, with gold prices up more than 300% ove..."
3,"Just last December, fellow economists Martin F..."
4,Wouldn’t you know it?
...,...
901593,"At the same time, Zuma’s revolutionary generat..."
901594,"In a region that reveres the elderly, Zuma’s a..."
901595,Three in ten South Africans are younger than 1...
901596,Somehow Zuma must find a way to honor his own ...


In [5]:
# Work on the first `x` rows of each corpus only.
x = 20
french_df = french_df.head(x)
english_df = english_df.head(x)

### Preprocess Data

In [6]:
# Remove empty sentences if any.
# Whitespace-only sentences (e.g. a bare newline) count as empty too.
# .copy() detaches each result from the frame it was filtered from, so the
# column assignments in the following cells write to an independent frame
# instead of a slice.
english_df = english_df[
    english_df['Sentence'].notna() & (english_df['Sentence'].str.strip() != '')
].copy()
french_df = french_df[
    french_df['Sentence'].notna() & (french_df['Sentence'].str.strip() != '')
].copy()

In [7]:
def preprocess_text(sentence):
    """Lower-case a sentence and delete every non-word, non-whitespace character.

    Args:
        sentence: The raw sentence (str).

    Returns:
        The lower-cased sentence with punctuation removed. Whitespace is kept
        as is. Note that apostrophes and hyphens are deleted without inserting
        a separator, so "l’étude" becomes "létude".
    """
    lowered = sentence.lower()
    # [^\w\s] matches anything that is neither a word character nor whitespace.
    return re.sub(r'[^\w\s]', '', lowered)

In [8]:
# Normalize the 'Sentence' column of both corpora in place (English first, then French).
for corpus_df in (english_df, french_df):
    corpus_df.loc[:, 'Sentence'] = corpus_df['Sentence'].apply(preprocess_text)

In [9]:
def remerge_sent(sent):
    """Merge adjacent tokens that are not separated by whitespace, in place.

    After this call every token except possibly the last one is followed by
    whitespace, i.e. the tokens correspond to a whitespace split of the text.

    Args:
        sent: A spaCy ``Doc``-like object. It must support ``len()``, integer
            and slice indexing, ``retokenize()``, and its tokens must expose
            ``whitespace_``.

    Returns:
        The same ``sent`` object, modified in place.
    """
    i = 0
    while i < len(sent) - 1:
        if sent[i].whitespace_:
            # Token is already separated from its right neighbour.
            i += 1
            continue

        # Glue the token to its right neighbour (in-place operation).
        with sent.retokenize() as retokenizer:
            retokenizer.merge(sent[i:i + 2])
        # Do not advance: the merged token now sits at index i and may still be
        # glued to its new right neighbour, so it is re-checked on the next
        # iteration. This replaces rescanning the whole document until stable.
    return sent

In [10]:
# spaCy pipelines used below to tokenize the English and French sentences.
nlp_en = spacy.load('en_core_web_sm')
nlp_fr = spacy.load('fr_core_news_sm')

In [11]:
def tokenize_with_spacy(sentence, nlp):
    """Tokenize a sentence with spaCy, then re-merge tokens not split by whitespace.

    Args:
        sentence: The text to tokenize.
        nlp: A loaded spaCy pipeline; called on ``sentence`` to produce the doc.

    Returns:
        A list with the text of every token left after ``remerge_sent``.
    """
    merged_doc = remerge_sent(nlp(sentence))
    return [tok.text for tok in merged_doc]

# Tokenize each corpus with the spaCy pipeline of its language
english_df['Tokens'] = english_df['Sentence'].apply(tokenize_with_spacy, args=(nlp_en,))
french_df['Tokens'] = french_df['Sentence'].apply(tokenize_with_spacy, args=(nlp_fr,))

In [12]:
# Drop sentences longer than 50 tokens.
# .copy() makes each result an independent frame, so the 'Embeddings' column
# assigned to it later is written to that frame and not to a slice of another.
english_df = english_df[english_df['Tokens'].apply(len) <= 50].copy()
french_df = french_df[french_df['Tokens'].apply(len) <= 50].copy()

### Generate BERT Embeddings

We can use pre-trained transformer models to generate embeddings for the tokens. Language-specific versions of BERT can be used to generate embeddings for French, German, etc.

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
# Load the pre-trained tokenizers (fast versions) and models, and place each
# model on `device`.

# English: bert-base-uncased
tokenizer_en = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)
bert_model_en = AutoModel.from_pretrained('bert-base-uncased').to(device)

# French: camembert-base
tokenizer_fr = AutoTokenizer.from_pretrained('camembert-base', use_fast=True)
bert_model_fr = AutoModel.from_pretrained('camembert-base').to(device)

In [15]:
def get_bert_embeddings(tokens, tokenizer, bert_model):
    """Return one embedding per input token by averaging its sub-word embeddings.

    The tokens are encoded as a single pre-split sequence. Each sub-word's
    last-hidden-state vector is assigned to its original token through the
    fast tokenizer's ``word_ids()`` alignment, and the vectors of a token are
    averaged.

    Args:
        tokens: List of token strings forming one sentence.
        tokenizer: A fast tokenizer; its output must provide ``word_ids()``.
        bert_model: The encoder model; its output must expose
            ``last_hidden_state``.

    Returns:
        A tensor of shape (num_aligned_tokens, hidden_size), on the model's
        device. A token for which the tokenizer emits no sub-word (or whose
        sub-words were truncated away) contributes no row. If no sub-word maps
        to any token, an empty (0, hidden_size) tensor is returned.
    """
    inputs = tokenizer(tokens, return_tensors='pt', is_split_into_words=True, padding=False, truncation=True)

    # Read the alignment first: word_ids() lives on the tokenizer output, not on
    # the plain dict built below.
    word_ids = inputs.word_ids()

    # Move the input tensors to the model's device; without this the forward
    # pass fails with a device mismatch once the model has been moved to a GPU.
    model_device = next(bert_model.parameters()).device
    model_inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    # Get BERT embeddings from the model (inference only, no gradients)
    with torch.no_grad():
        outputs = bert_model(**model_inputs)

    # Single sentence per call, so take batch entry 0.
    token_embeddings = outputs.last_hidden_state[0]  # Shape: (sequence_length, hidden_size)

    # Aggregate subword embeddings back to their original tokens
    aggregated_embeddings = []
    current_token_embeddings = []

    for idx, word_id in enumerate(word_ids):
        if word_id is None:
            # Special tokens have no originating word.
            continue
        if current_token_embeddings and word_id != word_ids[idx - 1]:
            # A new word starts: flush the mean of the previous word's pieces.
            aggregated_embeddings.append(torch.mean(torch.stack(current_token_embeddings), dim=0))
            current_token_embeddings = []
        current_token_embeddings.append(token_embeddings[idx])

    if current_token_embeddings:
        aggregated_embeddings.append(torch.mean(torch.stack(current_token_embeddings), dim=0))

    if not aggregated_embeddings:
        # torch.stack([]) would raise; return an empty matrix of the right width.
        return token_embeddings.new_zeros((0, token_embeddings.shape[-1]))

    return torch.stack(aggregated_embeddings)

In [16]:
def generate_embeddings(df, tokenizer, bert_model):
    """Compute token-level BERT embeddings for every row of a DataFrame.

    Args:
        df: DataFrame with a 'Tokens' column holding one token list per row.
        tokenizer: Tokenizer forwarded to ``get_bert_embeddings``.
        bert_model: Model forwarded to ``get_bert_embeddings``.

    Returns:
        A list with one embedding tensor per row, in row order.
    """
    return [
        get_bert_embeddings(sentence_tokens, tokenizer, bert_model)
        for sentence_tokens in df['Tokens']
    ]

In [17]:
def generate_padded_embeddings(df, embedding_list):
    """Zero-pad per-sentence embeddings to a common length and store them in ``df``.

    Prints the longest sentence length and the shape of the padded batch, and
    writes one padded tensor per row into ``df['Embeddings']`` (``df`` is
    modified in place).

    Args:
        df: DataFrame with one row per entry of ``embedding_list``.
        embedding_list: List of tensors whose first dimension is the sentence
            length.

    Returns:
        None.
    """
    # Length of the longest sentence in the batch
    longest = max(sentence_embedding.size(0) for sentence_embedding in embedding_list)
    print(f"The length of the longest sentence is: {longest}")

    # Uniform length is needed because the model will be trained in batches
    padded_embeddings = pad_sequence(embedding_list, batch_first=True)
    df['Embeddings'] = list(padded_embeddings.unbind(dim=0))
    print(padded_embeddings.shape)

In [18]:
# Embed every English sentence, then show the shape of the first sentence's
# embedding matrix as a quick check.
english_embeddings = generate_embeddings(english_df, tokenizer_en, bert_model_en)
english_embeddings[0].shape

torch.Size([2, 768])

In [19]:
# Pad the English embeddings to a common length and store them in english_df['Embeddings'].
generate_padded_embeddings(english_df, english_embeddings)

The length of the longest sentence is: 31
torch.Size([20, 31, 768])


In [20]:
# Embed every French sentence, then show the shape of the first sentence's
# embedding matrix as a quick check.
french_embeddings = generate_embeddings(french_df, tokenizer_fr, bert_model_fr)
french_embeddings[0].shape

torch.Size([30, 768])

In [21]:
# Pad the French embeddings to a common length and store them in french_df['Embeddings'].
generate_padded_embeddings(french_df, french_embeddings)

The length of the longest sentence is: 35
torch.Size([20, 35, 768])


### Denoising Auto Encoding (DAE)

Noise is added so that the auto-encoder does not simply learn to copy its input word by word, but instead learns the structure of the sentence; without noise it would reproduce any random sequence of words unchanged.

In [22]:
def apply_word_dropout(sentence, pwd=0.1):
    """Randomly drop words from a sentence.

    Args:
        sentence: Iterable of words.
        pwd: Drop threshold; one ``random.random()`` draw is made per word and
            the word is kept only when the draw is greater than ``pwd``.

    Returns:
        A new list with the surviving words in their original order (possibly
        empty).
    """
    # A word survives when its uniform draw exceeds pwd; otherwise it is dropped.
    return [word for word in sentence if random.random() > pwd]

In [23]:
def apply_sentence_shuffling(sentence, k=3, alpha=0.5):
    """Locally shuffle a sentence so that no word moves more than ``k`` positions.

    Each position ``i`` receives the key ``q_i = i + U(0, alpha)`` and the
    words are reordered by sorting these keys. The result is always a
    permutation of the input: no word is duplicated or lost.

    Args:
        sentence: Sequence of words.
        k: Maximum allowed displacement ``|sigma(i) - i|`` of any word.
        alpha: Upper bound of the uniform noise added to each index. Values
            above ``k + 1`` are capped at ``k + 1``, because noise drawn from
            ``[0, k + 1)`` is what keeps every word within ``k`` positions.
            NOTE(review): with ``alpha <= 1`` (including the default 0.5) the
            noise is smaller than the gap between consecutive indices, so the
            order never changes; pass ``alpha = k + 1`` for actual shuffling.

    Returns:
        A new list containing the words of ``sentence`` in shuffled order.
    """
    n = len(sentence)

    # Cap the noise so the displacement bound holds by construction, rather
    # than patching individual positions afterwards (which would overwrite
    # single slots and break the permutation).
    noise_scale = min(alpha, k + 1)

    # Generating random permutation vector q
    q = np.array([i + random.uniform(0, noise_scale) for i in range(n)])

    # Sort the indices based on q, which gives the new positions for the words
    permuted_indices = np.argsort(q)

    # Defensive check of |sigma(i) - i| <= k; on a violation return the whole
    # sentence unshuffled so the output is still a valid permutation.
    if np.any(np.abs(permuted_indices - np.arange(n)) > k):
        return list(sentence)

    return [sentence[i] for i in permuted_indices]

In [24]:
def denoising_autoencoder(x, pwd=0.1, k=3, alpha=0.5):
    """Produce a noisy version of a sentence: word dropout, then local shuffling.

    Args:
        x: Sequence of words.
        pwd: Passed to ``apply_word_dropout``.
        k: Passed to ``apply_sentence_shuffling``.
        alpha: Passed to ``apply_sentence_shuffling``.

    Returns:
        The list of words returned by ``apply_sentence_shuffling`` for the
        dropout-corrupted sentence.
    """
    # Dropout runs first, so the shuffle operates on the surviving words only.
    return apply_sentence_shuffling(apply_word_dropout(x, pwd), k, alpha)

### BiLSTM Encoder

Generates a sequence of hidden (latent) states for a given sequence of input embeddings (src/tgt language).

In [25]:
class Encoder(nn.Module):
    """3-layer bidirectional LSTM followed by an MLP producing per-token latent states.

    Args:
        input_dim: Size of each input embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Size of each latent state vector.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=3,
            batch_first=True,
            bidirectional=True,
        )

        # Projection head: 2 * hidden_dim -> hidden_dim -> hidden_dim // 2 -> output_dim
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, output_dim)

        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        # Normalizes over the 2 * hidden_dim features coming out of the BiLSTM.
        self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)

    def forward(self, inputs):
        """Encode a batch of embedded sentences.

        Args:
            inputs: Tensor of shape (batch_size, seq_len, input_dim).

        Returns:
            Tensor of shape (batch_size, seq_len, output_dim).
        """
        sequence_states, _ = self.lstm(inputs)  # (batch_size, seq_len, hidden_dim * 2)
        sequence_states = self.dropout(sequence_states)

        # BatchNorm1d expects features on dim 1, so swap to
        # (batch_size, hidden_dim * 2, seq_len) and back afterwards.
        normalized = self.batch_norm(sequence_states.permute(0, 2, 1)).permute(0, 2, 1)

        hidden = self.activation(self.fc1(normalized))  # (batch_size, seq_len, hidden_dim)
        hidden = self.activation(self.fc2(hidden))      # (batch_size, seq_len, hidden_dim // 2)
        return self.fc3(hidden)                         # (batch_size, seq_len, output_dim)

# Encoder hyper-parameters
input_dim = 768   # BERT Embedding size
hidden_dim = 300  # LSTM hidden size per direction
output_dim = 100  # Size of the latent state

# Build the encoder
encoder_model = Encoder(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

#### LSTM Decoder

Generates a sequence of words in the src/tgt language based on the previous hidden state, the current word, and a context vector given by a weighted sum over the encoder states.

In [26]:
# class Attention(nn.Module):
#     def __init__(self, latent_dim, embed_dim):
#         super(Attention, self).__init__()
#         # Define the attention layer
#         self.attn = nn.Linear(latent_dim + embed_dim, embed_dim)
#         self.v = nn.Parameter(torch.rand(embed_dim))

#     def forward(self, hidden, encoder_outputs):
#         """
#         Compute the attention weights and the context vector.
        
#         Args:
#             hidden: Previous hidden state of the decoder (batch_size, embed_dim)
#             encoder_outputs: All hidden states from the encoder (batch_size, seq_len, latent_dim)
        
#         Returns:
#             attn_weights: Attention weights (batch_size, seq_len)
#         """
#         batch_size = encoder_outputs.size(0)
#         max_len = encoder_outputs.size(1)

#         # Expand hidden to (batch_size, seq_len, latent_dim) for concatenation
#         hidden = hidden.unsqueeze(1).expand(batch_size, max_len, -1)

#         # Concatenate hidden state and encoder outputs
#         energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # (batch_size, seq_len, embed_dim)
        
#         # Compute attention weights
#         energy = energy.transpose(1, 2)  # (batch_size, embed_dim, seq_len)
#         v = self.v.repeat(batch_size, 1).unsqueeze(1)  # (batch_size, 1, embed_dim)
#         attn_weights = torch.bmm(v, energy).squeeze(1)  # (batch_size, seq_len)
#         attn_weights = F.softmax(attn_weights, dim=1)  # Normalize over seq_len
        
#         return attn_weights

# class AttentionCombine(nn.Module):
#     def __init__(self, latent_dim, embed_dim):
#         super(AttentionCombine, self).__init__()
#         # Combine the attention context vector with the decoder hidden state
#         self.attn_combine = nn.Linear(latent_dim + embed_dim, embed_dim)

#     def forward(self, attn_weights, encoder_outputs, hidden):
#         """
#         Combine the context vector with the hidden state.
        
#         Args:
#             attn_weights: Attention weights (batch_size, seq_len)
#             encoder_outputs: All hidden states from the encoder (batch_size, seq_len, latent_dim)
#             hidden: Previous hidden state of the decoder (batch_size, embed_dim)
        
#         Returns:
#             combined_input: Combined input to pass to the LSTM (batch_size, embed_dim)
#         """
#         # Compute the context vector by applying attention weights to the encoder outputs
#         context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # (batch_size, 1, latent_dim)
#         context = context.squeeze(1)  # (batch_size, latent_dim)

#         # Concatenate the context vector with the previous hidden state
#         combined_input = torch.cat((context, hidden), dim=1)  # (batch_size, latent_dim + embed_dim)

#         # Pass through the attention-combine linear layer
#         combined_input = torch.tanh(self.attn_combine(combined_input))  # (batch_size, embed_dim)
        
#         return combined_input

# class DecoderWithAttention(nn.Module):
#     def __init__(self, output_dim, latent_dim, embed_dim, num_layers=3):
#         super(DecoderWithAttention, self).__init__()
        
#         self.latent_dim = latent_dim
#         self.embed_dim = embed_dim
#         self.num_layers = num_layers
        
#         # LSTM layers for decoding
#         self.lstm = nn.LSTM(embed_dim, embed_dim, num_layers=num_layers, batch_first=True)

#         # Attention mechanism and attention-combine layer
#         self.attn = Attention(latent_dim, embed_dim)
#         self.attn_combine = AttentionCombine(latent_dim, embed_dim)

#         # Fully connected layer to project the latent space to output embedding space
#         self.fc_out = nn.Linear(embed_dim, output_dim)
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, latent_vectors, encoder_outputs, hidden):
#         """
#         Args:
#             latent_vectors: The latent vectors from the encoder (batch_size, seq_len, latent_dim)
#             encoder_outputs: The encoder hidden states for attention (batch_size, seq_len, latent_dim)
#             hidden: Previous hidden state of the LSTM (num_layers, batch_size, embed_dim)
            
#         Returns:
#             output: The predicted output (batch_size, seq_len, output_dim)
#             hidden: The updated hidden state (num_layers, batch_size, embed_dim)
#         """
#         batch_size = latent_vectors.size(0)
#         seq_len = latent_vectors.size(1)
        
#         # Initialize the output list
#         outputs = torch.zeros(batch_size, seq_len, self.fc_out.out_features).to(latent_vectors.device)

#         # Process each time step (sequentially) through the decoder
#         for t in range(seq_len):
#             # Get the attention weights from the previous hidden state and encoder outputs
#             attn_weights = self.attn(hidden[-1], encoder_outputs)

#             # Combine the attention context vector with the current latent vector
#             lstm_input = self.attn_combine(attn_weights, encoder_outputs, latent_vectors[:, t])

#             # LSTM forward pass
#             lstm_out, hidden = self.lstm(lstm_input.unsqueeze(1), hidden)  # lstm_out: (batch_size, 1, embed_dim)
#             lstm_out = self.dropout(lstm_out)

#             # Predict the output at the current time step
#             outputs[:, t] = self.fc_out(lstm_out.squeeze(1))

#         return outputs, hidden

#     def init_hidden(self, batch_size):
#         """ Initialize hidden and cell states for the LSTM layers """
#         h0 = torch.zeros(self.num_layers, batch_size, self.embed_dim).to(next(self.parameters()).device)
#         c0 = torch.zeros(self.num_layers, batch_size, self.embed_dim).to(next(self.parameters()).device)
#         return (h0, c0)