In [1]:
!pip install datasets
!pip install transformers
!pip install seacrowd>=0.2.0


Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset

In [3]:
class FloresMultiLangDataset(Dataset):
    def __init__(self, dataset, languages, tokenizer, max_length=512):
        """
        Custom PyTorch Dataset to load multiple languages from FLORES-200.
        Args:
            dataset (datasets.Dataset): The loaded dataset from FLORES-200.
            languages (dict): Dictionary of language names and their codes (e.g., {'English': 'eng_Latn', ...}).
            tokenizer (AutoTokenizer): The tokenizer for encoding text.
            max_length (int): Maximum sequence length for tokenization.
        """
        self.dataset = dataset
        self.languages = languages
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sentences = {}
        for lang_name, lang_code in self.languages.items():
            sentence = self.dataset[idx][f"sentence_{lang_code}"]
            sentences[lang_name] = sentence

            # # Tokenize the sentence
            # tokenized_input = self.tokenizer(
            #     template.format(sentence=sentence),
            #     return_tensors='pt',
            #     padding=True,
            #     truncation=True,
            #     max_length=self.max_length
            # )
            # tokenized_input = {k: v.to(device) for k, v in tokenized_input.items()}
            # sentences[f"{lang_name}_inputs"] = tokenized_input

        return sentences



def compare_languages(embeddings_dict, languages):
    """
    Compare each language embedding with every other language embedding and print cosine similarity scores.
    Args:
        embeddings_dict (dict): Dictionary of embeddings for each language.
        languages (dict): Dictionary of language names and codes.
    """
    lang_names = list(languages.keys())
    for i, lang1 in enumerate(lang_names):
        for lang2 in lang_names[i + 1:]:
            sim_score = cosine_similarity(embeddings_dict[lang1], embeddings_dict[lang2])
            print(f"Similarity between {lang1} and {lang2}: {sim_score[0][0]:.4f}")


In [4]:
class FloresMultiLangDataset_embed(Dataset):
    def __init__(self, dataset, languages, tokenizer, max_length=512):
        """
        Custom PyTorch Dataset to load multiple languages from FLORES-200.
        Args:
            dataset (datasets.Dataset): The loaded dataset from FLORES-200.
            languages (dict): Dictionary of language names and their codes (e.g., {'English': 'eng_Latn', ...}).
            tokenizer (AutoTokenizer): The tokenizer for encoding text.
            max_length (int): Maximum sequence length for tokenization.
        """
        self.dataset = dataset
        self.languages = languages
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sentences = {}
        for lang_name, lang_code in self.languages.items():
            sentence = self.dataset[idx][f"sentence_{lang_code}"]
            sentences[lang_name] = sentence

            # Tokenize the sentence
            tokenized_input = self.tokenizer(
                template.format(sentence=sentence),
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=self.max_length
            )
            tokenized_input = {k: v.to(device) for k, v in tokenized_input.items()}
            sentences[f"{lang_name}_inputs"] = tokenized_input

        return sentences



def compare_languages(embeddings_dict, languages):
    """
    Compare each language embedding with every other language embedding and print cosine similarity scores.
    Args:
        embeddings_dict (dict): Dictionary of embeddings for each language.
        languages (dict): Dictionary of language names and codes.
    """
    lang_names = list(languages.keys())
    for i, lang1 in enumerate(lang_names):
        for lang2 in lang_names[i + 1:]:
            sim_score = cosine_similarity(embeddings_dict[lang1], embeddings_dict[lang2])
            print(f"Similarity between {lang1} and {lang2}: {sim_score[0][0]:.4f}")


In [5]:
# Define the prompt template
template = 'This sentence: "{sentence}" means in one word:'

# Language dictionary mapping language names to their FLORES-200 codes
languages = {
    'English': 'eng_Latn',
    'Chinese_Simplified': 'zho_Hans',
    'Russian': 'rus_Cyrl',
    'Dutch': 'nld_Latn',
    'German': 'deu_Latn'
}

model_name_or_path='microsoft/Phi-3.5-mini-instruct'
batch_size=4
max_samples=10
avg_pooling=False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='auto', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# Load FLORES-200 dataset
dataset = load_dataset('Muennighoff/flores200', 'all', split='devtest',trust_remote_code=True)

# Subset for faster testing
if max_samples:
    dataset = dataset.select(range(max_samples))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

flores200.py:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.6M [00:00<?, ?B/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

In [6]:
def collate_fn(batch):
    """
    Custom collate function for padding the inputs in a batch and including the actual sentences.
    """
    batch_dict = {}

    for lang_name in languages.keys():
        # Extract all the tokenized inputs, attention masks, and actual sentences for this language in the batch
        inputs = [item[f"{lang_name}_inputs"]['input_ids'].squeeze(0) for item in batch]
        attention_masks = [item[f"{lang_name}_inputs"]['attention_mask'].squeeze(0) for item in batch]
        sentences = [item[lang_name] for item in batch]  # Actual sentences

        # Pad the sequences for each language
        padded_inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
        padded_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)

        # Store padded inputs, masks, and sentences
        batch_dict[f"{lang_name}_inputs"] = {
            'input_ids': padded_inputs,
            'attention_mask': padded_masks
        }
        # batch_dict[f"{lang_name}_attention_masks"] = padded_masks
        batch_dict[f"{lang_name}"] = sentences

    return batch_dict


In [7]:
def get_embeddings_orginal(model, inputs, tokenizer, device,avg_pooling=False):
    embeddings = []
    for input in inputs:
        # Tokenize the sentence
        tokenized_input = tokenizer(
                template.format(sentence=input),
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=512
        )
        input = {k: v.to(device) for k, v in tokenized_input.items()}
        # Get raw embeddings
        with torch.no_grad():
            # print(input)
            hidden_states = model(output_hidden_states=True, return_dict=True, **input).hidden_states
            if avg_pooling:
                last_layer = hidden_states[-1]
                attention_mask = input['attention_mask'].unsqueeze(-1).expand(last_layer.shape)
                outputs = (last_layer * attention_mask).mean(1)
            else:
                outputs = hidden_states[-1][:, -1, :]

            if outputs.dtype == torch.bfloat16:
                # bfloat16 not support for .numpy()
                outputs = outputs.float()

            embeddings.append(outputs.cpu().numpy())

    return np.vstack(embeddings)

In [38]:
def get_embeddings(model, tokenizer, sentences, device, target_language,avg_pooling=True):
    embeddings = []
    for sentence in sentences:
        sentence = template[target_language].format(sentence=sentence)
        inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get raw embeddings
        with torch.no_grad():
            hidden_states = model(output_hidden_states=True, return_dict=True, **inputs).hidden_states
            if avg_pooling:
                last_layer = hidden_states[-1]
                attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(last_layer.shape)
                outputs = (last_layer * attention_mask).mean(1)
            else:
                outputs = hidden_states[-1][:, -1, :]

            if outputs.dtype == torch.bfloat16:
                outputs = outputs.float()

            embeddings.append(outputs.cpu().numpy())

    return np.vstack(embeddings)

In [9]:
def find_most_similar(query_embedding, target_embeddings):
    similarities = cosine_similarity(query_embedding.reshape(1, -1), target_embeddings)[0]
    return np.argmax(similarities)

def compare_languages(embeddings_dict, languages):
    """
    Compare each language embedding with every other language embedding.
    Args:
        embeddings_dict (dict): Dictionary of embeddings for each language.
        languages (dict): Dictionary of language names and codes.

    Returns:
        list of dict: List containing cosine similarity scores between languages.
    """
    lang_names = list(languages.keys())
    results = []

    for i, lang1 in enumerate(lang_names):
        for lang2 in lang_names[i + 1:]:
            # sim_score = cosine_similarity(embeddings_dict[lang1], embeddings_dict[lang2])
            sim_score = find_most_similar(embeddings_dict[lang1], embeddings_dict[lang2])
            print("sim_score", sim_score)
            avg_score = sim_score  # Assuming the similarity score is a 2D array
            results.append({'Language 1': lang1, 'Language 2': lang2, 'Cosine Similarity': avg_score})

    return results

### With padding

In [10]:
# Create dataset and dataloader
flores_dataset = FloresMultiLangDataset_embed(dataset, languages, tokenizer)

data_loader = DataLoader(flores_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)
# Iterate through batches
print("\nEvaluating multi-language comparison...")
for batch in data_loader:
    embeddings_dict = {}

    # Get embeddings for each language
    for lang_name in languages.keys():
        inputs = batch[f"{lang_name}_inputs"]

        # inputs = batch[f"{lang_name}"]
        embeddings_dict[lang_name] = get_embeddings(model, inputs, device, avg_pooling)


    # Compare embeddings across all languages
    # compare_languages(embeddings_dict, languages)

print(embeddings_dict)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)



Evaluating multi-language comparison...


You are not running the flash-attention implementation, expect numerical differences.


{'English': array([[-0.6810483 ,  0.5636255 ,  1.4985955 , ...,  0.25928235,
        -0.6570488 , -1.6021911 ],
       [-0.8085609 ,  1.542456  ,  1.4518462 , ..., -0.37085328,
        -1.1139747 ,  0.36541122]], dtype=float32), 'Chinese_Simplified': array([[-0.99803936,  2.0504184 ,  0.7251579 , ..., -0.72027   ,
        -0.6299916 ,  0.43884766],
       [-1.9111211 ,  0.7332017 ,  1.7773608 , ..., -0.7536082 ,
        -0.46216893, -0.6455548 ]], dtype=float32), 'Russian': array([[-0.1575512 ,  2.2245595 ,  0.74896723, ..., -0.302447  ,
        -0.5379793 ,  0.7604788 ],
       [-1.1520818 ,  0.82284606,  1.6821033 , ...,  0.10595613,
        -0.7026712 , -0.3532121 ]], dtype=float32), 'Dutch': array([[-0.75713086,  1.9640365 ,  1.0295773 , ..., -0.9675391 ,
        -1.0510389 ,  0.23758511],
       [-1.469815  ,  0.5945768 ,  1.3816404 , ...,  0.39969704,
        -1.0548513 , -0.5089908 ]], dtype=float32), 'German': array([[-0.5297653 ,  2.3126752 ,  0.9612089 , ..., -0.63619095,
   

### Without padding

In [29]:
# Create dataset and dataloader
flores_dataset = FloresMultiLangDataset(dataset, languages, tokenizer)

data_loader = DataLoader(flores_dataset, batch_size=64, shuffle=False)
# Iterate through batches
print("\nEvaluating multi-language comparison...")
for batch in data_loader:
    embeddings_dict = {}

    # Get embeddings for each language
    for lang_name in languages.keys():

        inputs = batch[f"{lang_name}"]
        embeddings_dict[lang_name] = get_embeddings_orginal(model, inputs, tokenizer, device, avg_pooling)

    # Compare embeddings across all languages
    # compare_languages(embeddings_dict, languages)
print(embeddings_dict)


Evaluating multi-language comparison...
{'English': array([[-1.1653645e-01,  1.0594211e+00,  1.0164121e+00, ...,
         4.8997301e-01, -1.0327865e+00, -1.5234058e+00],
       [-3.8812381e-01,  3.8650429e-01,  1.3320467e+00, ...,
         5.2960849e-01, -6.8232042e-01, -6.3653588e-01],
       [-1.0909724e+00,  1.2619832e+00,  7.9809719e-01, ...,
         1.3612042e-01, -4.5336342e-01, -9.5113742e-01],
       ...,
       [ 1.5710264e-03,  1.0056028e+00, -5.6417778e-02, ...,
         1.5218052e-01, -1.7696951e-01, -1.3924565e+00],
       [-6.8104798e-01,  5.6362945e-01,  1.4985946e+00, ...,
         2.5928172e-01, -6.5704685e-01, -1.6021899e+00],
       [-1.4297134e+00,  6.1201471e-01,  1.1679485e+00, ...,
        -5.0867039e-01, -1.8688923e-01, -5.3805745e-01]], dtype=float32), 'Chinese_Simplified': array([[-0.7474899 ,  0.8443469 ,  0.58095926, ...,  0.41505593,
        -1.0497061 , -0.5718859 ],
       [-0.51353467,  0.35379994,  0.79364663, ...,  0.86338544,
        -1.9312159 , -0

In [26]:
import numpy as np
import pandas as pd

# Function to normalize embeddings
def normalize_embeddings(embeddings):
    return (embeddings.T / np.linalg.norm(embeddings, axis=1)).T

# Function to compute cosine similarity between embeddings
def compute_similarity(embedding, embeddings_to_compare):
    return embedding @ embeddings_to_compare.T

# Function for recall@1
def recall_at_1(similarity, targets):
    similarity = np.argsort(similarity, axis=1)[:, ::-1]
    correct = 0
    for i in range(similarity.shape[0]):
        if targets[i] == similarity[i, 0]:
            correct += 1
    recall_at_1 = correct / similarity.shape[0]
    return recall_at_1

# Function for recall@k
def recall_at_k(similarity, targets, k=3):
    similarity = np.argsort(similarity, axis=1)[:, ::-1]
    correct = 0
    for i in range(similarity.shape[0]):
        if targets[i] in similarity[i, :k]:
            correct += 1
    recall_at_k = correct / similarity.shape[0]
    return recall_at_k

# Updated function to evaluate translation accuracy and save average recall
def evaluate_translation_accuracy(embeddings_dict, target_language, k=3):
    total_recall_1_per_language = {lang: 0 for lang in embeddings_dict if lang != target_language}
    total_recall_k_per_language = {lang: 0 for lang in embeddings_dict if lang != target_language}
    total_pairs = len(embeddings_dict[target_language])  # Assuming target language is the reference

    # Target array, where each sentence in the target language should map to its corresponding index in each other language
    targets = list(range(total_pairs))

    print("\nEvaluating translation accuracy using recall@k...")

    # Loop through each sentence embedding in the target language
    for i, target_embedding in enumerate(embeddings_dict[target_language]):

        for lang_name, lang_embeddings in embeddings_dict.items():
            if lang_name == target_language:
                continue  # Skip comparing the target language with itself

            # Compute similarity between the current target sentence embedding and all sentence embeddings in the other language
            similarity = compute_similarity(target_embedding.reshape(1, -1), lang_embeddings)

            # Calculate recall@1 and recall@k for this specific sentence
            recall_1 = recall_at_1(similarity, [i])  # i is the target index for the corresponding sentence
            recall_k = recall_at_k(similarity, [i], k=k)

            # Accumulate total recall values for each language
            total_recall_1_per_language[lang_name] += recall_1
            total_recall_k_per_language[lang_name] += recall_k

    # Calculate average recall@1 and recall@k for each language
    avg_recall_1_per_language = {lang: total_recall_1_per_language[lang] / total_pairs for lang in total_recall_1_per_language}
    avg_recall_k_per_language = {lang: total_recall_k_per_language[lang] / total_pairs for lang in total_recall_k_per_language}

    # Save the average recall scores into a table
    results_table = []
    for lang_name in avg_recall_1_per_language:
        results_table.append({
            'Target Language': target_language,
            'Compared Language': lang_name,
            'Avg Recall@1': avg_recall_1_per_language[lang_name],
            'Avg Recall@k': avg_recall_k_per_language[lang_name]
        })

    # Convert the results to a DataFrame
    results_df = pd.DataFrame(results_table)

    # Save the DataFrame to a CSV file
    results_df.to_csv('f{}average_translation_accuracy_results.csv', index=False)

    print("\nAverage recall results saved to average_translation_accuracy_results.csv")
    print(results_df)
    return results_table


In [27]:
results_table = evaluate_translation_accuracy(embeddings_dict,"English")


Evaluating translation accuracy using recall@k...

Average recall results saved to average_translation_accuracy_results.csv
  Target Language   Compared Language  Avg Recall@1  Avg Recall@k
0         English  Chinese_Simplified           0.8           1.0
1         English             Russian           0.2           0.8
2         English               Dutch           0.8           1.0
3         English              German           0.8           0.8


In [28]:
print(results_table)

[{'Target Language': 'English', 'Compared Language': 'Chinese_Simplified', 'Avg Recall@1': 0.8, 'Avg Recall@k': 1.0}, {'Target Language': 'English', 'Compared Language': 'Russian', 'Avg Recall@1': 0.2, 'Avg Recall@k': 0.8}, {'Target Language': 'English', 'Compared Language': 'Dutch', 'Avg Recall@1': 0.8, 'Avg Recall@k': 1.0}, {'Target Language': 'English', 'Compared Language': 'German', 'Avg Recall@1': 0.8, 'Avg Recall@k': 0.8}]


In [30]:
# Define the sentence template for each language
template = {
    'English': 'This sentence: "{sentence}" means in one word:',
    'Chinese_Simplified': '这句话: "{sentence}" 用一个词来表示是:',
    'Russian': 'Это предложение: "{sentence}" означает одним словом:',
    'Dutch': 'Deze zin: "{sentence}" betekent in één woord:',
    'German': 'Dieser Satz: "{sentence}" bedeutet mit einem Wort:'
}

# Language dictionary mapping language names to their FLORES-200 codes
languages = {
    'English': 'eng_Latn',
    'Chinese_Simplified': 'zho_Hans',
    'Russian': 'rus_Cyrl',
    'Dutch': 'nld_Latn',
    'German': 'deu_Latn'
}


In [42]:
# Initialize list to store results
all_results = []

# Iterate through batches and evaluate for each language
for batch in data_loader:
    for target_language in languages.keys():
        print(f"\nEvaluating target language: {target_language}")

        embeddings_dict = {}
        # Get embeddings for each language
        for lang_name in languages.keys():
            inputs = batch[f"{lang_name}"]
            embeddings_dict[lang_name] = get_embeddings(model, tokenizer, inputs, device, lang_name)

        # Evaluate translation accuracy for the current target language
        results_table = evaluate_translation_accuracy(embeddings_dict, target_language, k=3)
        all_results.append(results_table)

# # Combine all results into a single DataFrame and save as CSV
# all_results_df = pd.concat(all_results, ignore_index=True)
# all_results_df.to_csv("multilingual_translation_accuracy.csv", index=False)

# Print a summary
print("\nFinal Results:")
print(all_results)


Evaluating target language: English

Evaluating translation accuracy using recall@k...

Average recall results saved to average_translation_accuracy_results.csv
  Target Language   Compared Language  Avg Recall@1  Avg Recall@k
0         English  Chinese_Simplified           0.1           0.3
1         English             Russian           0.1           0.3
2         English               Dutch           0.2           0.3
3         English              German           0.1           0.3

Evaluating target language: Chinese_Simplified

Evaluating translation accuracy using recall@k...

Average recall results saved to average_translation_accuracy_results.csv
      Target Language Compared Language  Avg Recall@1  Avg Recall@k
0  Chinese_Simplified           English           0.1           0.3
1  Chinese_Simplified           Russian           0.2           0.3
2  Chinese_Simplified             Dutch           0.2           0.3
3  Chinese_Simplified            German           0.1          

NameError: name 'all_results_df' is not defined

In [43]:
print(all_results)

[[{'Target Language': 'English', 'Compared Language': 'Chinese_Simplified', 'Avg Recall@1': 0.1, 'Avg Recall@k': 0.3}, {'Target Language': 'English', 'Compared Language': 'Russian', 'Avg Recall@1': 0.1, 'Avg Recall@k': 0.3}, {'Target Language': 'English', 'Compared Language': 'Dutch', 'Avg Recall@1': 0.2, 'Avg Recall@k': 0.3}, {'Target Language': 'English', 'Compared Language': 'German', 'Avg Recall@1': 0.1, 'Avg Recall@k': 0.3}], [{'Target Language': 'Chinese_Simplified', 'Compared Language': 'English', 'Avg Recall@1': 0.1, 'Avg Recall@k': 0.3}, {'Target Language': 'Chinese_Simplified', 'Compared Language': 'Russian', 'Avg Recall@1': 0.2, 'Avg Recall@k': 0.3}, {'Target Language': 'Chinese_Simplified', 'Compared Language': 'Dutch', 'Avg Recall@1': 0.2, 'Avg Recall@k': 0.3}, {'Target Language': 'Chinese_Simplified', 'Compared Language': 'German', 'Avg Recall@1': 0.1, 'Avg Recall@k': 0.3}], [{'Target Language': 'Russian', 'Compared Language': 'English', 'Avg Recall@1': 0.3, 'Avg Recall@k