In [1]:
import os
import re
import pandas as pd
import torch
import nltk
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

# Load the LaBSE tokenizer and model by their Hugging Face identifier
# (from_pretrained fetches the weights if they are not cached locally).
# Both objects are module-level globals used by get_labse_embedding.
tokenizer_labse = BertTokenizer.from_pretrained("sentence-transformers/LaBSE")
# device_map='cuda' asks transformers to place the model weights on the GPU.
model = BertModel.from_pretrained("sentence-transformers/LaBSE", device_map='cuda')
# Fetch the NLTK 'punkt' resource; presumably the data sent_tokenize relies on.
nltk.download('punkt')

# Function to split text into sentences
def split_into_sentences(text):
    """Split *text* into sentences with NLTK's ``sent_tokenize``.

    Args:
        text: The raw text to segment.

    Returns:
        Whatever ``sent_tokenize`` returns for *text* (a list of sentence
        strings).
    """
    sentences = sent_tokenize(text)
    return sentences

# Function to get the LABSE embeddings for each sentence
def get_labse_embedding(sentences, batch_size=8):
    """Embed sentences with the module-level LaBSE model.

    Each sentence embedding is the mean of the model's last hidden states
    taken over the sentence's real tokens only. Padding positions are
    excluded via the attention mask, so the embedding of a sentence does not
    depend on which other sentences share its batch or on ``batch_size``.

    Args:
        sentences: List of strings to embed.
        batch_size: Number of sentences tokenized and encoded per forward
            pass.

    Returns:
        A 2-D tensor with one row per sentence, located on the model's
        device. An empty input yields a tensor with zero rows.
    """
    device = model.device
    if not sentences:
        # torch.cat raises on an empty list, so build the 0-row result directly.
        return torch.empty((0, model.config.hidden_size), dtype=model.dtype, device=device)

    embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        tokens = tokenizer_labse(batch, return_tensors='pt', padding=True, truncation=True)
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}
        with torch.no_grad():
            outputs = model(**tokens)
        hidden = outputs.last_hidden_state
        # Zero out padded positions and divide by the number of real tokens;
        # a plain .mean(dim=1) would average the padding vectors in as well.
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(embeddings)

# Function to calculate cosine similarity
def calculate_cosine_similarity(embeddings1, embeddings2):
    """Return the pairwise cosine similarities between two embedding sets.

    Both inputs are moved to the CPU before being handed to scikit-learn's
    ``cosine_similarity``.

    Args:
        embeddings1: Tensor of embeddings, one row per item.
        embeddings2: Tensor of embeddings, one row per item.

    Returns:
        The matrix produced by ``cosine_similarity``; entry ``[i][j]``
        compares row ``i`` of *embeddings1* with row ``j`` of *embeddings2*.
    """
    left = embeddings1.cpu()
    right = embeddings2.cpu()
    return cosine_similarity(left, right)

# Function to process the subfolder corresponding to a single entry in multiple languages
def _read_sentences_by_language(directory):
    """Collect the sentences of every ``*_<xx>.txt`` file, grouped by language code."""
    sentences_by_language = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the language
    # order (and therefore which side of a pair is "Language 1") reproducible.
    for file in sorted(os.listdir(directory)):
        match = re.search(r'_([a-zA-Z]{2})\.txt$', file)
        if not match:
            continue
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            text = f.read()
        language = match.group(1)
        sentences_by_language.setdefault(language, []).extend(split_into_sentences(text))
    return sentences_by_language


def process_subdirectory(directory, batch_size=8):
    """Compare every sentence of one entry across all of its languages.

    Files in *directory* whose names end in ``_<xx>.txt`` (a two-letter
    language code) are read and split into sentences. For every unordered
    pair of languages, each sentence of the first language is compared with
    each sentence of the second using cosine similarity of their embeddings.

    Args:
        directory: Path of the folder holding one entry's per-language files.
        batch_size: Batch size forwarded to ``get_labse_embedding``.

    Returns:
        A DataFrame with the columns 'Language 1', 'Sentence 1', 'Language 2',
        'Sentence 2' and 'Cosine similarity', one row per sentence pair. The
        frame is empty (but keeps these columns) when fewer than two languages
        contain any sentence.
    """
    columns = ['Language 1', 'Sentence 1', 'Language 2', 'Sentence 2', 'Cosine similarity']

    # Languages without sentences cannot contribute rows, and embedding an
    # empty list would fail, so drop them before any model work.
    sentences_by_language = {
        language: sentences
        for language, sentences in _read_sentences_by_language(directory).items()
        if sentences
    }
    languages = list(sentences_by_language)
    if len(languages) < 2:
        return pd.DataFrame(columns=columns)

    # One embedding matrix per language, one row per sentence. No padding is
    # needed: each pairwise similarity matrix has exactly the shape
    # (sentences in lang1, sentences in lang2).
    embeddings_by_language = {
        language: get_labse_embedding(sentences, batch_size=batch_size)
        for language, sentences in sentences_by_language.items()
    }

    similarities = []
    for i, lang1 in enumerate(languages):
        sentences1 = sentences_by_language[lang1]
        for lang2 in languages[i + 1:]:
            sentences2 = sentences_by_language[lang2]
            similarity = calculate_cosine_similarity(
                embeddings_by_language[lang1], embeddings_by_language[lang2]
            )
            for k, sentence1 in enumerate(sentences1):
                for l, sentence2 in enumerate(sentences2):
                    similarities.append({
                        'Language 1': lang1,
                        'Sentence 1': sentence1,
                        'Language 2': lang2,
                        'Sentence 2': sentence2,
                        'Cosine similarity': similarity[k][l]
                    })

    return pd.DataFrame(similarities, columns=columns)

# Main function
def main(main_directory="Gastronomia/FrenchCuisine_it-en-es-de-fr", sim_dir="Prova_full"):
    """Write one sentence-similarity CSV per entry folder.

    Every subdirectory of *main_directory* is passed to
    ``process_subdirectory`` and the resulting DataFrame is saved as
    ``<subdirectory>.csv`` inside *sim_dir*.

    Args:
        main_directory: Folder whose subdirectories each hold one entry in
            several languages.
        sim_dir: Output folder for the CSV files; created if missing.
    """
    # Keep only real directories, in sorted order, so the progress total
    # reflects the work to do and runs are reproducible.
    subdirectories = sorted(
        name for name in os.listdir(main_directory)
        if os.path.isdir(os.path.join(main_directory, name))
    )
    os.makedirs(sim_dir, exist_ok=True)

    progbar = tqdm(subdirectories, total=len(subdirectories))
    for subdirectory in progbar:
        progbar.set_description(desc=f'Current subdirectory: {subdirectory}')
        df = process_subdirectory(os.path.join(main_directory, subdirectory))
        df.to_csv(os.path.join(sim_dir, f"{subdirectory}.csv"), index=False)
        # Clear cached GPU memory between entries.
        torch.cuda.empty_cache()

# Run the pipeline only when this file is executed as the main module.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/lgaliero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Current subdirectory: Ratatouille_en-it-es-fr-de: 100%|██████████| 21/21 [00:17<00:00,  1.18it/s]    
