In [1]:
!pip install datasets
!pip install transformers
!pip install seacrowd>=0.2.0


Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset

In [17]:
class FloresMultiLangDataset(Dataset):
    def __init__(self, dataset, languages, tokenizer, max_length=512):
        """
        Custom PyTorch Dataset to load multiple languages from FLORES-200.
        Args:
            dataset (datasets.Dataset): The loaded dataset from FLORES-200.
            languages (dict): Dictionary of language names and their codes (e.g., {'English': 'eng_Latn', ...}).
            tokenizer (AutoTokenizer): The tokenizer for encoding text.
            max_length (int): Maximum sequence length for tokenization.
        """
        self.dataset = dataset
        self.languages = languages
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sentences = {}
        for lang_name, lang_code in self.languages.items():
            sentence = self.dataset[idx][f"sentence_{lang_code}"]
            sentences[lang_name] = sentence

            # # Tokenize the sentence
            # tokenized_input = self.tokenizer(
            #     template.format(sentence=sentence),
            #     return_tensors='pt',
            #     padding=True,
            #     truncation=True,
            #     max_length=self.max_length
            # )
            # tokenized_input = {k: v.to(device) for k, v in tokenized_input.items()}
            # sentences[f"{lang_name}_inputs"] = tokenized_input

        return sentences



def compare_languages(embeddings_dict, languages):
    """
    Compare each language embedding with every other language embedding and print cosine similarity scores.
    Args:
        embeddings_dict (dict): Dictionary of embeddings for each language.
        languages (dict): Dictionary of language names and codes.
    """
    lang_names = list(languages.keys())
    for i, lang1 in enumerate(lang_names):
        for lang2 in lang_names[i + 1:]:
            sim_score = cosine_similarity(embeddings_dict[lang1], embeddings_dict[lang2])
            print(f"Similarity between {lang1} and {lang2}: {sim_score[0][0]:.4f}")


In [18]:
class FloresMultiLangDataset_embed(Dataset):
    def __init__(self, dataset, languages, tokenizer, max_length=512):
        """
        Custom PyTorch Dataset to load multiple languages from FLORES-200.
        Args:
            dataset (datasets.Dataset): The loaded dataset from FLORES-200.
            languages (dict): Dictionary of language names and their codes (e.g., {'English': 'eng_Latn', ...}).
            tokenizer (AutoTokenizer): The tokenizer for encoding text.
            max_length (int): Maximum sequence length for tokenization.
        """
        self.dataset = dataset
        self.languages = languages
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sentences = {}
        for lang_name, lang_code in self.languages.items():
            sentence = self.dataset[idx][f"sentence_{lang_code}"]
            sentences[lang_name] = sentence

            # Tokenize the sentence
            tokenized_input = self.tokenizer(
                template.format(sentence=sentence),
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=self.max_length
            )
            tokenized_input = {k: v.to(device) for k, v in tokenized_input.items()}
            sentences[f"{lang_name}_inputs"] = tokenized_input

        return sentences



def compare_languages(embeddings_dict, languages):
    """
    Compare each language embedding with every other language embedding and print cosine similarity scores.
    Args:
        embeddings_dict (dict): Dictionary of embeddings for each language.
        languages (dict): Dictionary of language names and codes.
    """
    lang_names = list(languages.keys())
    for i, lang1 in enumerate(lang_names):
        for lang2 in lang_names[i + 1:]:
            sim_score = cosine_similarity(embeddings_dict[lang1], embeddings_dict[lang2])
            print(f"Similarity between {lang1} and {lang2}: {sim_score[0][0]:.4f}")


In [4]:
# Define the prompt template
template = 'This sentence: "{sentence}" means in one word:'

# Language dictionary mapping language names to their FLORES-200 codes
languages = {
    'English': 'eng_Latn',
    'Chinese_Simplified': 'zho_Hans',
    'Russian': 'rus_Cyrl',
    'Dutch': 'nld_Latn',
    'German': 'deu_Latn'
}

model_name_or_path='microsoft/Phi-3.5-mini-instruct'
batch_size=4
max_samples=10
avg_pooling=False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='auto', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# Load FLORES-200 dataset
dataset = load_dataset('Muennighoff/flores200', 'all', split='devtest',trust_remote_code=True)

# Subset for faster testing
if max_samples:
    dataset = dataset.select(range(max_samples))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

flores200.py:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.6M [00:00<?, ?B/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

In [12]:
def collate_fn(batch):
    """
    Custom collate function for padding the inputs in a batch and including the actual sentences.
    """
    batch_dict = {}

    for lang_name in languages.keys():
        # Extract all the tokenized inputs, attention masks, and actual sentences for this language in the batch
        inputs = [item[f"{lang_name}_inputs"]['input_ids'].squeeze(0) for item in batch]
        attention_masks = [item[f"{lang_name}_inputs"]['attention_mask'].squeeze(0) for item in batch]
        sentences = [item[lang_name] for item in batch]  # Actual sentences

        # Pad the sequences for each language
        padded_inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
        padded_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)

        # Store padded inputs, masks, and sentences
        batch_dict[f"{lang_name}_inputs"] = {
            'input_ids': padded_inputs,
            'attention_mask': padded_masks
        }
        # batch_dict[f"{lang_name}_attention_masks"] = padded_masks
        batch_dict[f"{lang_name}"] = sentences

    return batch_dict


In [6]:
def get_embeddings_orginal(model, inputs, tokenizer, device,avg_pooling=False):
    embeddings = []
    for input in inputs:
        # Tokenize the sentence
        tokenized_input = tokenizer(
                template.format(sentence=input),
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=512
        )
        input = {k: v.to(device) for k, v in tokenized_input.items()}
        # Get raw embeddings
        with torch.no_grad():
            # print(input)
            hidden_states = model(output_hidden_states=True, return_dict=True, **input).hidden_states
            if avg_pooling:
                last_layer = hidden_states[-1]
                attention_mask = input['attention_mask'].unsqueeze(-1).expand(last_layer.shape)
                outputs = (last_layer * attention_mask).mean(1)
            else:
                outputs = hidden_states[-1][:, -1, :]

            if outputs.dtype == torch.bfloat16:
                # bfloat16 not support for .numpy()
                outputs = outputs.float()

            embeddings.append(outputs.cpu().numpy())

    return np.vstack(embeddings)

In [7]:
def get_embeddings(model, inputs, device, avg_pooling=False):
    """
    Get embeddings for a batch of inputs from the model.

    Args:
        model: The model to use for embedding extraction.
        inputs (dict): A dictionary containing tokenized inputs including input_ids and attention_mask.
        device: The device to move tensors to (CPU or GPU).
        avg_pooling (bool): Whether to use average pooling or not.

    Returns:
        numpy.ndarray: The embeddings for the input sentences.
    """

    # Get embeddings
    with torch.no_grad():
        # Compute hidden states for the entire batch
        hidden_states = model(**inputs, output_hidden_states=True, return_dict=True).hidden_states

        if avg_pooling:
            # Average pooling
            last_layer = hidden_states[-1]
            attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(last_layer.shape)
            outputs = (last_layer * attention_mask).mean(1)
        else:
            # Last token embeddings
            outputs = hidden_states[-1][:, -1, :]

        if outputs.dtype == torch.bfloat16:
            # Convert bfloat16 to float
            outputs = outputs.float()

        return outputs.cpu().numpy()


In [8]:
def find_most_similar(query_embedding, target_embeddings):
    similarities = cosine_similarity(query_embedding.reshape(1, -1), target_embeddings)[0]
    return np.argmax(similarities)

def compare_languages(embeddings_dict, languages):
    """
    Compare each language embedding with every other language embedding.
    Args:
        embeddings_dict (dict): Dictionary of embeddings for each language.
        languages (dict): Dictionary of language names and codes.

    Returns:
        list of dict: List containing cosine similarity scores between languages.
    """
    lang_names = list(languages.keys())
    results = []

    for i, lang1 in enumerate(lang_names):
        for lang2 in lang_names[i + 1:]:
            # sim_score = cosine_similarity(embeddings_dict[lang1], embeddings_dict[lang2])
            sim_score = find_most_similar(embeddings_dict[lang1], embeddings_dict[lang2])
            print("sim_score", sim_score)
            avg_score = sim_score  # Assuming the similarity score is a 2D array
            results.append({'Language 1': lang1, 'Language 2': lang2, 'Cosine Similarity': avg_score})

    return results

### With padding

In [19]:
# Create dataset and dataloader
flores_dataset = FloresMultiLangDataset_embed(dataset, languages, tokenizer)

data_loader = DataLoader(flores_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)
# Iterate through batches
print("\nEvaluating multi-language comparison...")
for batch in data_loader:
    embeddings_dict = {}

    # Get embeddings for each language
    for lang_name in languages.keys():
        inputs = batch[f"{lang_name}_inputs"]

        # inputs = batch[f"{lang_name}"]
        embeddings_dict[lang_name] = get_embeddings(model, inputs, device, avg_pooling)


    # Compare embeddings across all languages
    # compare_languages(embeddings_dict, languages)

print(embeddings_dict)


Evaluating multi-language comparison...
{'English': array([[-0.6810483 ,  0.5636255 ,  1.4985955 , ...,  0.25928235,
        -0.6570488 , -1.6021911 ],
       [-0.8085609 ,  1.542456  ,  1.4518462 , ..., -0.37085328,
        -1.1139747 ,  0.36541122]], dtype=float32), 'Chinese_Simplified': array([[-0.99803936,  2.0504184 ,  0.7251579 , ..., -0.72027   ,
        -0.6299916 ,  0.43884766],
       [-1.9111211 ,  0.7332017 ,  1.7773608 , ..., -0.7536082 ,
        -0.46216893, -0.6455548 ]], dtype=float32), 'Russian': array([[-0.1575512 ,  2.2245595 ,  0.74896723, ..., -0.302447  ,
        -0.5379793 ,  0.7604788 ],
       [-1.1520818 ,  0.82284606,  1.6821033 , ...,  0.10595613,
        -0.7026712 , -0.3532121 ]], dtype=float32), 'Dutch': array([[-0.75713086,  1.9640365 ,  1.0295773 , ..., -0.9675391 ,
        -1.0510389 ,  0.23758511],
       [-1.469815  ,  0.5945768 ,  1.3816404 , ...,  0.39969704,
        -1.0548513 , -0.5089908 ]], dtype=float32), 'German': array([[-0.5297653 ,  2.312

### Without padding

In [15]:
# Create dataset and dataloader
flores_dataset = FloresMultiLangDataset(dataset, languages, tokenizer)

data_loader = DataLoader(flores_dataset, batch_size=2, shuffle=False)
# Iterate through batches
print("\nEvaluating multi-language comparison...")
for batch in data_loader:
    embeddings_dict = {}

    # Get embeddings for each language
    for lang_name in languages.keys():

        inputs = batch[f"{lang_name}"]
        embeddings_dict[lang_name] = get_embeddings_orginal(model, inputs, tokenizer, device, avg_pooling)

    # Compare embeddings across all languages
    # compare_languages(embeddings_dict, languages)
print(embeddings_dict)


Evaluating multi-language comparison...
{'English': array([[-0.681048  ,  0.56362945,  1.4985946 , ...,  0.25928172,
        -0.65704685, -1.6021899 ],
       [-1.4297134 ,  0.6120147 ,  1.1679485 , ..., -0.5086704 ,
        -0.18688923, -0.53805745]], dtype=float32), 'Chinese_Simplified': array([[-0.18980633,  1.1031379 ,  1.3552455 , ...,  0.02886122,
        -0.5978378 , -0.724843  ],
       [-1.9111208 ,  0.73320115,  1.7773612 , ..., -0.7536093 ,
        -0.46217003, -0.64555466]], dtype=float32), 'Russian': array([[-0.7442848 ,  1.2163576 ,  1.7224597 , ...,  0.6914887 ,
        -1.1447638 , -1.1658822 ],
       [-1.1520803 ,  0.8228463 ,  1.6821078 , ...,  0.10596199,
        -0.70267195, -0.35321018]], dtype=float32), 'Dutch': array([[-0.22489499,  0.832567  ,  1.562782  , ...,  0.18727915,
        -0.28485298, -0.7620505 ],
       [-1.4698157 ,  0.5945792 ,  1.3816407 , ...,  0.3996979 ,
        -1.0548487 , -0.50899065]], dtype=float32), 'German': array([[-0.63848245,  0.909