We repeat the procedure from the other notebook to import the dataset:

In [2]:
import itertools
import random
from pprint import pprint
from sentence_transformers import util
from transformers import BertTokenizer, BertModel
import torch

# Paths to the English and Chinese sides of the News-Commentary en-zh dataset
english_dataset_path = "./dataset/News-Commentary/News-Commentary.en-zh.en"
chinese_dataset_path = "./dataset/News-Commentary/News-Commentary.en-zh.zh"

# Function to get random sentence pairs
def get_random_sentence_pairs(english_path, chinese_path, num_pairs=1):
    """Return randomly chosen (English, Chinese) pairs taken from the same line index.

    Both files are read completely as UTF-8 text. Line ``i`` of the English
    file is paired with line ``i`` of the Chinese file, and surrounding
    whitespace (including the trailing newline) is stripped from each side.

    Args:
        english_path: Path to the English text file.
        chinese_path: Path to the Chinese text file.
        num_pairs: Number of pairs requested. Values larger than the number
            of available lines are clamped to that number, and negative
            values are treated as 0, so the call never raises ``ValueError``
            from ``random.sample``.

    Returns:
        A list of ``(english, chinese)`` tuples in random order without
        repeated indices, or ``None`` (after printing an error message) when
        the two files do not have the same number of lines.
    """
    with open(english_path, 'r', encoding='utf-8') as eng_file, \
         open(chinese_path, 'r', encoding='utf-8') as zh_file:
        english_lines = eng_file.readlines()
        chinese_lines = zh_file.readlines()

    # Pairing by index is only meaningful when both files have equal length
    if len(english_lines) != len(chinese_lines):
        print("Error: The files don't have the same number of lines.")
        return None

    # random.sample raises ValueError when asked for more items than exist
    # (or for a negative count), so clamp the request to the valid range.
    sample_size = max(0, min(num_pairs, len(english_lines)))
    random_indices = random.sample(range(len(english_lines)), sample_size)

    return [
        (english_lines[index].strip(), chinese_lines[index].strip())
        for index in random_indices
    ]

# Function to perform mean pooling on the model outputs
import torch
def mean_pooling(model_output, attention_mask):
    """Average ``model_output.last_hidden_state`` over dimension 1, weighted by the mask.

    Each position is multiplied by its ``attention_mask`` value before the
    sum, so positions whose mask value is 0 do not contribute. The divisor is
    the summed mask, clamped to at least 1e-9 so an all-zero mask yields
    zeros instead of a division by zero.
    """
    hidden_states = model_output.last_hidden_state
    # Add a trailing axis and broadcast the mask to the embedding shape
    weights = attention_mask[..., None].expand_as(hidden_states).float()
    masked_total = (hidden_states * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / weight_total

# Function to compute and print the cosine similarity scores using multilingual BERT
def print_similarity_scores(sentence_pairs):
    """Print each sentence pair together with the cosine similarity of its embeddings.

    The 'bert-base-multilingual-cased' tokenizer and model are loaded on every
    call. Each side of the pairs is tokenized and encoded as one batch, the
    token embeddings are mean-pooled into one vector per sentence, and the
    cosine similarity is computed between the two vectors of each pair.

    Args:
        sentence_pairs: Sequence of ``(sentence1, sentence2)`` string pairs.
            An empty sequence or ``None`` prints nothing.

    Returns:
        None. Results are written to standard output.
    """
    # Nothing to score: return before paying the cost of loading the model
    # (and before zip(*...) would fail to unpack an empty input).
    if not sentence_pairs:
        return

    # Initialize the tokenizer and model for multilingual BERT
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = BertModel.from_pretrained('bert-base-multilingual-cased')

    # Separate the sentence pairs; hand the tokenizer plain lists rather than tuples
    sentences1, sentences2 = (list(side) for side in zip(*sentence_pairs))

    # Tokenize the sentences
    encoded_input1 = tokenizer(sentences1, padding=True, truncation=True, return_tensors='pt')
    encoded_input2 = tokenizer(sentences2, padding=True, truncation=True, return_tensors='pt')

    # Compute embeddings for both lists; no gradients are needed for inference
    with torch.no_grad():
        model_output1 = model(**encoded_input1)
        model_output2 = model(**encoded_input2)

    # Mean pooling to get one vector per sentence
    embeddings1 = mean_pooling(model_output1, encoded_input1['attention_mask'])
    embeddings2 = mean_pooling(model_output2, encoded_input2['attention_mask'])

    # Only the score of pair i with itself is reported, so compute the
    # row-wise similarity instead of the full all-against-all matrix.
    pair_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)

    # Output the pairs with their score
    for first, second, score in zip(sentences1, sentences2, pair_scores.tolist()):
        print(f"{first} \n {second} \n Score: {score:.4f}")


In [3]:
# No model is loaded in this cell: print_similarity_scores creates the
# multilingual BERT tokenizer and model itself.

# Example usage:
# Number of random sentence pairs to draw
num_of_gen = 5
sentence_pair = get_random_sentence_pairs(english_dataset_path, chinese_dataset_path, num_of_gen)

# Compute and print the similarity scores for the sentence pairs;
# skipped when nothing usable came back (None and an empty list are both falsy)
if sentence_pair:
    print_similarity_scores(sentence_pair)

Its member states are constantly being evaluated for their economic potential and desirability as a market for investments, goods, and services. At the same time, their effort to forge a community free from external intervention is shaping a new regional order based on common security and shared prosperity. 
 发自雅加达 — — 东南亚联盟如今正迎来一个决定性的时刻。 一直以来，全世界都在评估东盟各成员国作为一个整体性投资、商品和服务市场的潜力和可行性。 而与此同时，东盟各国也正在努力构建一个不受外部干涉的联盟，并以此形成一个基于共同安全和共同繁荣的新秩序。 
 Score: 0.7592
Today, whether owing to “neo-Ottomanism” or just old-fashioned nasty governance under a thin-skinned autocrat (President Recep Tayyip Erdoğan) with Islamist tendencies, Turkey is on many watch lists. Its intervention in Middle East politics has been furtive and lacking in consistency or clarity. 
 今天，无论是因为“新奥特曼主义”抑或仅仅是有伊斯兰倾向的小气的独裁者老掉牙的令人讨厌的治理（总统雷杰普·塔伊普·埃尔多安 ） ， 土耳其已经进入到不止一份观察名单。 土耳其一直偷偷摸摸地干预中东政治，这种干预既缺乏一致性条理也不甚清晰。 
 Score: 0.7575
The conventional wisdom is, however, wrong; worse, it is dangerous, for we have all seen how quickly it can take