In [1]:
!pip install datasets
!pip install transformers
!pip install seacrowd>=0.2.0




In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import csv

In [3]:
class FloresMultiLangDataset(Dataset):
    def __init__(self, dataset, languages, tokenizer, max_length=512):
        """
        Custom PyTorch Dataset to load multiple languages from FLORES-200.
        Args:
            dataset (datasets.Dataset): The loaded dataset from FLORES-200.
            languages (dict): Dictionary of language names and their codes (e.g., {'English': 'eng_Latn', ...}).
            tokenizer (AutoTokenizer): The tokenizer for encoding text.
            max_length (int): Maximum sequence length for tokenization.
        """
        self.dataset = dataset
        self.languages = languages
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sentences = {}
        for lang_name, lang_code in self.languages.items():
            sentence = self.dataset[idx][f"sentence_{lang_code}"]
            sentences[lang_name] = sentence

            # # Tokenize the sentence
            # tokenized_input = self.tokenizer(
            #     template.format(sentence=sentence),
            #     return_tensors='pt',
            #     padding=True,
            #     truncation=True,
            #     max_length=self.max_length
            # )
            # tokenized_input = {k: v.to(device) for k, v in tokenized_input.items()}
            # sentences[f"{lang_name}_inputs"] = tokenized_input

        return sentences



def compare_languages(embeddings_dict, languages):
    """
    Compare each language embedding with every other language embedding and print cosine similarity scores.
    Args:
        embeddings_dict (dict): Dictionary of embeddings for each language.
        languages (dict): Dictionary of language names and codes.
    """
    lang_names = list(languages.keys())
    for i, lang1 in enumerate(lang_names):
        for lang2 in lang_names[i + 1:]:
            sim_score = cosine_similarity(embeddings_dict[lang1], embeddings_dict[lang2])
            print(f"Similarity between {lang1} and {lang2}: {sim_score[0][0]:.4f}")


In [4]:
class FloresMultiLangDataset_embed(Dataset):
    def __init__(self, dataset, languages, tokenizer, max_length=512):
        """
        Custom PyTorch Dataset to load multiple languages from FLORES-200.
        Args:
            dataset (datasets.Dataset): The loaded dataset from FLORES-200.
            languages (dict): Dictionary of language names and their codes (e.g., {'English': 'eng_Latn', ...}).
            tokenizer (AutoTokenizer): The tokenizer for encoding text.
            max_length (int): Maximum sequence length for tokenization.
        """
        self.dataset = dataset
        self.languages = languages
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sentences = {}
        for lang_name, lang_code in self.languages.items():
            sentence = self.dataset[idx][f"sentence_{lang_code}"]
            sentences[lang_name] = sentence

            # Tokenize the sentence
            tokenized_input = self.tokenizer(
                template.format(sentence=sentence),
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=self.max_length
            )
            tokenized_input = {k: v.to(device) for k, v in tokenized_input.items()}
            sentences[f"{lang_name}_inputs"] = tokenized_input

        return sentences



def compare_languages(embeddings_dict, languages):
    """
    Compare each language embedding with every other language embedding and print cosine similarity scores.
    Args:
        embeddings_dict (dict): Dictionary of embeddings for each language.
        languages (dict): Dictionary of language names and codes.
    """
    lang_names = list(languages.keys())
    for i, lang1 in enumerate(lang_names):
        for lang2 in lang_names[i + 1:]:
            sim_score = cosine_similarity(embeddings_dict[lang1], embeddings_dict[lang2])
            print(f"Similarity between {lang1} and {lang2}: {sim_score[0][0]:.4f}")


In [5]:
def collate_fn(batch):
    """
    Custom collate function for padding the inputs in a batch and including the actual sentences.
    """
    batch_dict = {}

    for lang_name in languages.keys():
        # Extract all the tokenized inputs, attention masks, and actual sentences for this language in the batch
        inputs = [item[f"{lang_name}_inputs"]['input_ids'].squeeze(0) for item in batch]
        attention_masks = [item[f"{lang_name}_inputs"]['attention_mask'].squeeze(0) for item in batch]
        sentences = [item[lang_name] for item in batch]  # Actual sentences

        # Pad the sequences for each language
        padded_inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
        padded_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)

        # Store padded inputs, masks, and sentences
        batch_dict[f"{lang_name}_inputs"] = {
            'input_ids': padded_inputs,
            'attention_mask': padded_masks
        }
        # batch_dict[f"{lang_name}_attention_masks"] = padded_masks
        batch_dict[f"{lang_name}"] = sentences

    return batch_dict


In [6]:
def get_embeddings_orginal(model, inputs, tokenizer, device,avg_pooling=False):
    embeddings = []
    for input in inputs:
        # Tokenize the sentence
        tokenized_input = tokenizer(
                template.format(sentence=input),
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=512
        )
        input = {k: v.to(device) for k, v in tokenized_input.items()}
        # Get raw embeddings
        with torch.no_grad():
            # print(input)
            hidden_states = model(output_hidden_states=True, return_dict=True, **input).hidden_states
            if avg_pooling:
                last_layer = hidden_states[-1]
                attention_mask = input['attention_mask'].unsqueeze(-1).expand(last_layer.shape)
                outputs = (last_layer * attention_mask).mean(1)
            else:
                outputs = hidden_states[-1][:, -1, :]

            if outputs.dtype == torch.bfloat16:
                # bfloat16 not support for .numpy()
                outputs = outputs.float()

            embeddings.append(outputs.cpu().numpy())

    return np.vstack(embeddings)

In [7]:
def get_embeddings(model, tokenizer, sentences, device, target_language,avg_pooling=True):
    embeddings = []
    for sentence in sentences:
        sentence = template[target_language].format(sentence=sentence)
        inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get raw embeddings
        with torch.no_grad():
            hidden_states = model(output_hidden_states=True, return_dict=True, **inputs).hidden_states
            if avg_pooling:
                last_layer = hidden_states[-1]
                attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(last_layer.shape)
                outputs = (last_layer * attention_mask).mean(1)
            else:
                outputs = hidden_states[-1][:, -1, :]

            if outputs.dtype == torch.bfloat16:
                outputs = outputs.float()

            embeddings.append(outputs.cpu().numpy())

    return np.vstack(embeddings)

In [8]:
def find_most_similar(query_embedding, target_embeddings):
    similarities = cosine_similarity(query_embedding.reshape(1, -1), target_embeddings)[0]
    return np.argmax(similarities)

def compare_languages(embeddings_dict, languages):
    """
    Compare each language embedding with every other language embedding.
    Args:
        embeddings_dict (dict): Dictionary of embeddings for each language.
        languages (dict): Dictionary of language names and codes.

    Returns:
        list of dict: List containing cosine similarity scores between languages.
    """
    lang_names = list(languages.keys())
    results = []

    for i, lang1 in enumerate(lang_names):
        for lang2 in lang_names[i + 1:]:
            # sim_score = cosine_similarity(embeddings_dict[lang1], embeddings_dict[lang2])
            sim_score = find_most_similar(embeddings_dict[lang1], embeddings_dict[lang2])
            print("sim_score", sim_score)
            avg_score = sim_score  # Assuming the similarity score is a 2D array
            results.append({'Language 1': lang1, 'Language 2': lang2, 'Cosine Similarity': avg_score})

    return results

In [12]:
def save_embeddings(embeddings_dict, filename):
    # Get all the unique "target_language-language" pairs as columns
    columns = []
    for target_language in embeddings_dict:
        for language in embeddings_dict[target_language]:
            column_name = f"{target_language}-{language}"
            columns.append(column_name)

    # Assuming all embeddings have the same index length
    index_length = len(next(iter(next(iter(embeddings_dict.values())).values())))
    # Prepare the data to be written to CSV
    data = []
    for i in range(index_length):
        row = []
        for target_language in embeddings_dict:
            for language in embeddings_dict[target_language]:
                embedding = embeddings_dict[target_language][language][i]
                row.append(embedding)
        data.append(row)

    # Write to CSV
    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)

        # Write the header (columns)
        csvwriter.writerow(columns)

        # Write the data (rows)
        csvwriter.writerows(data)

In [9]:
import numpy as np
import pandas as pd

# Function to normalize embeddings
def normalize_embeddings(embeddings):
    return (embeddings.T / np.linalg.norm(embeddings, axis=1)).T

# Function to compute cosine similarity between embeddings
def compute_similarity(embedding, embeddings_to_compare):
    return embedding @ embeddings_to_compare.T

# Function for recall@1
def recall_at_1(similarity, targets):
    similarity = np.argsort(similarity, axis=1)[:, ::-1]
    correct = 0
    for i in range(similarity.shape[0]):
        if targets[i] == similarity[i, 0]:
            correct += 1
    recall_at_1 = correct / similarity.shape[0]
    return recall_at_1

# Function for recall@k
def recall_at_k(similarity, targets, k=3):
    similarity = np.argsort(similarity, axis=1)[:, ::-1]
    correct = 0
    for i in range(similarity.shape[0]):
        if targets[i] in similarity[i, :k]:
            correct += 1
    recall_at_k = correct / similarity.shape[0]
    return recall_at_k

# Updated function to evaluate translation accuracy and save average recall
def evaluate_translation_accuracy(embeddings_dict, target_language, k=3):
    total_recall_1_per_language = {lang: 0 for lang in embeddings_dict if lang != target_language}
    total_recall_k_per_language = {lang: 0 for lang in embeddings_dict if lang != target_language}
    total_pairs = len(embeddings_dict[target_language])  # Assuming target language is the reference

    # Target array, where each sentence in the target language should map to its corresponding index in each other language
    targets = list(range(total_pairs))

    print("\nEvaluating translation accuracy using recall@k...")

    # Loop through each sentence embedding in the target language
    for i, target_embedding in enumerate(embeddings_dict[target_language]):

        for lang_name, lang_embeddings in embeddings_dict.items():
            if lang_name == target_language:
                continue  # Skip comparing the target language with itself

            # Compute similarity between the current target sentence embedding and all sentence embeddings in the other language
            similarity = compute_similarity(target_embedding.reshape(1, -1), lang_embeddings)

            # Calculate recall@1 and recall@k for this specific sentence
            recall_1 = recall_at_1(similarity, [i])  # i is the target index for the corresponding sentence
            recall_k = recall_at_k(similarity, [i], k=k)

            # Accumulate total recall values for each language
            total_recall_1_per_language[lang_name] += recall_1
            total_recall_k_per_language[lang_name] += recall_k

    # Calculate average recall@1 and recall@k for each language
    avg_recall_1_per_language = {lang: total_recall_1_per_language[lang] / total_pairs for lang in total_recall_1_per_language}
    avg_recall_k_per_language = {lang: total_recall_k_per_language[lang] / total_pairs for lang in total_recall_k_per_language}

    # Save the average recall scores into a table
    results_table = []
    for lang_name in avg_recall_1_per_language:
        results_table.append({
            'Target Language': target_language,
            'Compared Language': lang_name,
            'Avg Recall@1': avg_recall_1_per_language[lang_name],
            'Avg Recall@k': avg_recall_k_per_language[lang_name]
        })

    # Convert the results to a DataFrame
    results_df = pd.DataFrame(results_table)

    # Save the DataFrame to a CSV file
    results_df.to_csv('f{}average_translation_accuracy_results.csv', index=False)

    print("\nAverage recall results saved to average_translation_accuracy_results.csv")
    print(results_df)
    return results_table


# Experiment

In [10]:
# Define the sentence template for each language
template = {
    'English': 'This sentence: "{sentence}" means in one word:',
    'Chinese_Simplified': '这句话: "{sentence}" 用一个词来表示是:',
    'Russian': 'Это предложение: "{sentence}" означает одним словом:',
    'Dutch': 'Deze zin: "{sentence}" betekent in één woord:',
    'German': 'Dieser Satz: "{sentence}" bedeutet mit einem Wort:'
}

# Language dictionary mapping language names to their FLORES-200 codes
languages = {
    'English': 'eng_Latn',
    'Chinese_Simplified': 'zho_Hans',
    'Russian': 'rus_Cyrl',
    'Dutch': 'nld_Latn',
    'German': 'deu_Latn'
}


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-mini-instruct",
                                                     device_map='auto',
                                                     output_hidden_states=True,
                                                     trust_remote_code=True,
                                                     load_in_8bit= 16 == 8)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token
tokenizer.padding_side = "left"  # Allow batched inference


configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [16]:
# Load FLORES-200 dataset
dataset = load_dataset('Muennighoff/flores200', 'all', split='devtest',trust_remote_code=True)
max_samples = 4
if max_samples:
    dataset = dataset.select(range(max_samples))

flores_dataset = FloresMultiLangDataset(dataset, languages, tokenizer)

data_loader = DataLoader(flores_dataset, batch_size=2, shuffle=False)


print(len(flores_dataset))

4


In [17]:
# Initialize dictionary to store embeddings for each target language and language pair
embeddings_dict = {target_language: {lang_name: np.array([]) for lang_name in languages.keys()} for target_language in languages.keys()}

# Stage 1: Save embeddings for each language and target language
print("Generating embeddings for all languages...")

# Iterate through batches and generate embeddings
for batch in tqdm(data_loader, desc="Batches Progress", leave=True):
    for target_language in languages.keys():

        for lang_name in languages.keys():
            print(f"\nGenerating embeddings for target language: {target_language} and langauge {lang_name}")
            # Get embeddings for the current language and append to the dictionary for the target language
            inputs = batch[f"{lang_name}"]
            embeddings = get_embeddings(model, tokenizer, inputs, device, target_language)

            # Convert list to a NumPy array and append to the dictionary for the target language
            if len(embeddings_dict[target_language][lang_name]) == 0:

                # If it's the first batch, initialize the array
                embeddings_dict[target_language][lang_name] = embeddings

            else:
                # Concatenate the new embeddings with the existing array
                embeddings_dict[target_language][lang_name] = np.concatenate(
                    (embeddings_dict[target_language][lang_name], embeddings),
                    axis=0
                )

# At this point, embeddings_dict contains embeddings for all languages and target languages



Generating embeddings for all languages...


Batches Progress:   0%|          | 0/2 [00:00<?, ?it/s]


Generating embeddings for target language: English and langauge English





Generating embeddings for target language: English and langauge Chinese_Simplified

Generating embeddings for target language: English and langauge Russian

Generating embeddings for target language: English and langauge Dutch

Generating embeddings for target language: English and langauge German

Generating embeddings for target language: Chinese_Simplified and langauge English

Generating embeddings for target language: Chinese_Simplified and langauge Chinese_Simplified

Generating embeddings for target language: Chinese_Simplified and langauge Russian

Generating embeddings for target language: Chinese_Simplified and langauge Dutch

Generating embeddings for target language: Chinese_Simplified and langauge German

Generating embeddings for target language: Russian and langauge English

Generating embeddings for target language: Russian and langauge Chinese_Simplified

Generating embeddings for target language: Russian and langauge Russian

Generating embeddings for target language

Batches Progress:  50%|█████     | 1/2 [00:42<00:42, 42.81s/it]


Generating embeddings for target language: English and langauge English

Generating embeddings for target language: English and langauge Chinese_Simplified

Generating embeddings for target language: English and langauge Russian

Generating embeddings for target language: English and langauge Dutch

Generating embeddings for target language: English and langauge German

Generating embeddings for target language: Chinese_Simplified and langauge English

Generating embeddings for target language: Chinese_Simplified and langauge Chinese_Simplified

Generating embeddings for target language: Chinese_Simplified and langauge Russian

Generating embeddings for target language: Chinese_Simplified and langauge Dutch

Generating embeddings for target language: Chinese_Simplified and langauge German

Generating embeddings for target language: Russian and langauge English

Generating embeddings for target language: Russian and langauge Chinese_Simplified

Generating embeddings for target language

Batches Progress: 100%|██████████| 2/2 [01:18<00:00, 39.33s/it]


In [18]:
# Stage 2: Evaluate translation accuracy using the stored embeddings
print("Evaluating translation accuracy for each target language...")

# Initialize list to store results
all_results = []

# this is the prompt
for target_language in languages.keys():
    print(f"\nEvaluating target language: {target_language}")

    # Evaluate using the embeddings in the dictionary for current target language vs. other languages
    results_table = evaluate_translation_accuracy(embeddings_dict[target_language], target_language, k=3)

    # Store the results for each comparison
    all_results += results_table

# Print a summary of the results
print("\nFinal Results:")
print(all_results)

Evaluating translation accuracy for each target language...

Evaluating target language: English

Evaluating translation accuracy using recall@k...

Average recall results saved to average_translation_accuracy_results.csv
  Target Language   Compared Language  Avg Recall@1  Avg Recall@k
0         English  Chinese_Simplified          0.25          0.75
1         English             Russian          0.25          1.00
2         English               Dutch          0.25          0.75
3         English              German          0.25          0.75

Evaluating target language: Chinese_Simplified

Evaluating translation accuracy using recall@k...

Average recall results saved to average_translation_accuracy_results.csv
      Target Language Compared Language  Avg Recall@1  Avg Recall@k
0  Chinese_Simplified           English          0.25          0.75
1  Chinese_Simplified           Russian          0.25          0.75
2  Chinese_Simplified             Dutch          0.25          0.75
3  

In [19]:
# Convert the list to a DataFrame
df = pd.DataFrame(all_results)
df.to_csv('average_translation_accuracy_results.csv', index=False)

# Dynamically pivot the DataFrame based on target and compared languages

pivot_df = df.pivot(index='Target Language', columns='Compared Language', values='Avg Recall@k')

pivot_df.to_csv('average_translation_accuracy_table.csv', index=True)

# Show the result
print(pivot_df)

Compared Language   Chinese_Simplified  Dutch  English  German  Russian
Target Language                                                        
Chinese_Simplified                 NaN   0.75     0.75    0.75     0.75
Dutch                             0.75    NaN     0.75    0.75     1.00
English                           0.75   0.75      NaN    0.75     1.00
German                            0.75   0.75     1.00     NaN     0.75
Russian                           0.75   1.00     0.75    0.75      NaN


In [20]:
save_embeddings(embeddings_dict, 'embeddings.csv')
