We repeat the procedure from the other notebook to import the dataset:

In [1]:
import itertools
import random
from pprint import pprint
from sentence_transformers import util
from transformers import BertTokenizer, BertModel
import torch

# Locations of the two sides of the News-Commentary de-en parallel corpus.
# Both files share one stem and differ only in the language suffix.
_corpus_stem = './dataset2/News-Commentary/News-Commentary.de-en'
english_dataset_path = _corpus_stem + '.en'
german_dataset_path = _corpus_stem + '.de'

# Function to get random sentence pairs
def get_random_sentence_pairs(english_path, chinese_path, num_pairs=1):
    """Sample aligned sentence pairs at random from two line-parallel files.

    Line ``i`` of ``english_path`` is paired with line ``i`` of
    ``chinese_path``. The second parameter keeps its historical name for
    backward compatibility; it is simply the other side of the parallel
    corpus (this notebook passes the German file).

    Args:
        english_path: Path to the UTF-8 file holding one sentence per line.
        chinese_path: Path to the UTF-8 file holding the aligned sentences
            of the other language, one per line.
        num_pairs: How many distinct line indices to sample.

    Returns:
        A list of ``(sentence, aligned_sentence)`` tuples with surrounding
        whitespace stripped, or ``None`` (after printing an error) when the
        two files do not contain the same number of lines.

    Raises:
        ValueError: If ``num_pairs`` is negative or larger than the number
            of lines available.
    """
    # newline='\n' disables universal-newline translation: only '\n' ends a
    # line. Otherwise a stray '\r' inside a sentence would be counted as an
    # extra line break and silently shift the alignment between both files.
    # A trailing '\r' from CRLF files is still removed by .strip() below.
    with open(english_path, 'r', encoding='utf-8', newline='\n') as source_file, \
         open(chinese_path, 'r', encoding='utf-8', newline='\n') as target_file:
        source_lines = source_file.readlines()
        target_lines = target_file.readlines()

    # Index-based pairing is only meaningful when both sides line up.
    if len(source_lines) != len(target_lines):
        print("Error: The files don't have the same number of lines.")
        return None

    total_lines = len(source_lines)
    # Fail with an actionable message instead of random.sample's generic one.
    if not 0 <= num_pairs <= total_lines:
        raise ValueError(
            f"num_pairs must be between 0 and {total_lines} "
            f"(the number of lines available), got {num_pairs}"
        )

    # Sampling without replacement: every returned pair comes from a distinct line.
    random_indices = random.sample(range(total_lines), num_pairs)

    return [
        (source_lines[index].strip(), target_lines[index].strip())
        for index in random_indices
    ]

# Function to perform mean pooling on the model outputs
import torch
def mean_pooling(model_output, attention_mask):
    """Average each sequence's token embeddings, weighted by the attention mask.

    Args:
        model_output: Object exposing ``last_hidden_state``, the per-token
            embeddings to be pooled.
        attention_mask: Mask tensor; it is expanded along a new trailing axis
            to the shape of ``last_hidden_state``, so positions with value 0
            contribute nothing to the average.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total
        over dimension 1.
    """
    hidden_states = model_output.last_hidden_state
    # Stretch the mask over the embedding axis so it can be multiplied in.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_total / mask_total

# Function to compute and print the cosine similarity scores using multilingual BERT
def print_similarity_scores(sentence_pairs, model_name='bert-base-multilingual-cased'):
    """Print the cosine similarity of each sentence pair's mean-pooled BERT embeddings.

    Each side of the pairs is tokenized and encoded as one batch, the token
    embeddings are reduced to one vector per sentence with ``mean_pooling``,
    and the i-th vector of one side is compared with the i-th vector of the
    other.

    Note: the tokenizer and model are loaded from ``model_name`` on every
    call, so repeated calls pay the loading cost each time.

    Args:
        sentence_pairs: Sequence of ``(sentence_a, sentence_b)`` string pairs.
            An empty or ``None`` value prints nothing.
        model_name: Checkpoint identifier passed to
            ``BertTokenizer.from_pretrained`` and ``BertModel.from_pretrained``.
            Defaults to multilingual cased BERT.

    Returns:
        None. One ``sentence / sentence / Score`` block is printed per pair.
    """
    # Nothing to score. Bail out before loading the model; unpacking
    # zip(*[]) into two names below would raise ValueError.
    if not sentence_pairs:
        return

    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Split the pairs into two parallel lists, one per side.
    sentences1, sentences2 = (list(side) for side in zip(*sentence_pairs))

    # Each side is padded to its own longest sentence; the attention masks
    # let the pooling step ignore the padding.
    encoded_input1 = tokenizer(sentences1, padding=True, truncation=True, return_tensors='pt')
    encoded_input2 = tokenizer(sentences2, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients needed.
    with torch.no_grad():
        model_output1 = model(**encoded_input1)
        model_output2 = model(**encoded_input2)

    # Mean pooling to get one vector per sentence
    embeddings1 = mean_pooling(model_output1, encoded_input1['attention_mask'])
    embeddings2 = mean_pooling(model_output2, encoded_input2['attention_mask'])

    # Only matching rows are compared, so compute the row-wise similarity
    # directly instead of a full N x N matrix whose diagonal is all we read.
    pair_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1).tolist()

    # Output the pairs with their score
    for first, second, score in zip(sentences1, sentences2, pair_scores):
        print(f"{first} \n {second} \n Score: {score:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Demo cell: sample random English-German sentence pairs and score them.
# No model is loaded here; print_similarity_scores loads the tokenizer and
# model itself when it is called.

# Example usage:
# Number of random sentence pairs to draw from the corpus
num_of_gen = 5
sentence_pair = get_random_sentence_pairs(english_dataset_path, german_dataset_path, num_of_gen)

# get_random_sentence_pairs returns None when the two files differ in line
# count, so only compute and print the scores when pairs were returned.
if sentence_pair:
    print_similarity_scores(sentence_pair)

A declaration of independence by Kosovo will likely bring a similar declaration from Georgia’s breakaway Abkhazia region, which Russia could well recognize. 
 Sollte Georgien militärische Schritte unternehmen, um das zu verhindern, würde Russlands Militär wahrscheinlich mit Gewalt reagieren und dabei eine Situation schaffen, die außer Kontrolle geraten könnte. 
 Score: 0.6652
You cannot divert the trolley. 
 Der Waggon kann nicht mehr umgeleitet werden. 
 Score: 0.5164
But a lump-sum tax on robots would merely lead robot producers to bundle artificial intelligence within other machinery. 
 Doch eine auf Roboter erhobene Pauschalsteuer würde lediglich dazu führen, dass die Roboterhersteller künstliche Intelligenz mit anderen Maschinen bündeln würden. 
 Score: 0.6602
Many challenges confront India on its path to sustained strong growth, principally that of converting the country’s vast promise into reality. 
 Auf seinem Weg in Richtung nachhaltiges starkes Wachstum ist Indien mit zahlrei