# In [None]:
import os
import re
import pandas as pd
import torch
import nltk
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm #library that creates a progression bar in the terminal

# Load the pretrained LaBSE tokenizer and BERT encoder. These statements run
# at import time, so importing this file triggers the model load.
tokenizer_labse = BertTokenizer.from_pretrained("sentence-transformers/LaBSE")
model = BertModel.from_pretrained("sentence-transformers/LaBSE", device_map='cuda')
# device_map='cuda' places the model on the GPU so PyTorch runs inference there.
nltk.download('punkt')  # fetch NLTK's 'punkt' sentence-tokenizer resource used by sent_tokenize

# Function to split text into sentences
def split_into_sentences(text):
    """Return the list of sentences NLTK's ``sent_tokenize`` finds in ``text``."""
    sentence_list = sent_tokenize(text)
    return sentence_list

# Function to get the LaBSE embeddings for each sentence
def get_labse_embedding(sentences, batch_size=8):
    """Embed sentences with LaBSE using attention-masked mean pooling.

    Args:
        sentences: List of sentence strings to embed.
        batch_size: Number of sentences tokenized and encoded per forward
            pass; lower it to reduce peak memory usage.

    Returns:
        A 2-D tensor on the model's device with one fixed-size row per input
        sentence, in input order. An empty input yields a tensor with zero
        rows instead of raising.
    """
    # Follow the model's placement instead of hard-coding 'cuda'.
    device = model.device
    if not sentences:
        # torch.cat() rejects an empty list, so return an empty matrix with
        # the right embedding width up front.
        return torch.empty((0, model.config.hidden_size), dtype=model.dtype, device=device)

    embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        tokens = tokenizer_labse(batch, return_tensors='pt', padding=True, truncation=True)
        # The tokenizer returns CPU tensors; move each one next to the model.
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}
        with torch.no_grad():
            outputs = model(**tokens)
        hidden = outputs.last_hidden_state
        # Shorter sentences are padded to the longest one in the batch. Weight
        # padded positions by 0 so they do not leak into the average;
        # otherwise a sentence's embedding would depend on its batch mates.
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings.append((hidden * mask).sum(dim=1) / token_counts)
    # NOTE(review): the published sentence-transformers LaBSE pipeline appears
    # to pool differently (CLS-based + normalization); mean pooling is kept
    # here because it is what this function has always intended - confirm.
    return torch.cat(embeddings)

# Function to calculate cosine similarity
def calculate_cosine_similarity(embeddings1, embeddings2):
    """Return the pairwise cosine-similarity matrix of two embedding tensors.

    Both tensors are copied to the CPU first because scikit-learn's
    ``cosine_similarity`` operates on host memory. Entry ``[i][j]`` of the
    result compares row ``i`` of ``embeddings1`` with row ``j`` of
    ``embeddings2``.
    """
    left_on_cpu = embeddings1.cpu()
    right_on_cpu = embeddings2.cpu()
    return cosine_similarity(left_on_cpu, right_on_cpu)

# Function to process the subfolder corresponding to a single entry in multiple languages
def process_subdirectory(directory, batch_size=8):
    """Compare every sentence of each language with every sentence of the others.

    Files in ``directory`` whose names end in ``_<xx>.txt`` (``xx`` being two
    ASCII letters) are read as UTF-8, split into sentences and grouped by
    that two-letter code. Each unordered pair of languages is then compared
    sentence by sentence.

    Args:
        directory: Path of the folder holding the per-language text files.
        batch_size: Batch size forwarded to ``get_labse_embedding``.

    Returns:
        A DataFrame with the columns 'Language 1', 'Sentence 1',
        'Language 2', 'Sentence 2' and 'Cosine similarity', one row per
        cross-language sentence pair. The columns are present even when no
        pair exists, so the resulting CSV always has a header.
    """
    result_columns = ['Language 1', 'Sentence 1', 'Language 2', 'Sentence 2', 'Cosine similarity']

    # Group sentences by language. os.listdir() order is arbitrary, so sort
    # the names to make the language order (and therefore which language
    # lands in 'Language 1') reproducible between runs.
    sentences_by_language = {}
    for file_name in sorted(os.listdir(directory)):
        match = re.search(r'_([a-zA-Z]{2})\.txt$', file_name)
        if not match:
            continue
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as f:
            text = f.read()
        sentences_by_language.setdefault(match.group(1), []).extend(split_into_sentences(text))

    # Embed each language once. Languages without any sentence are skipped:
    # they cannot contribute a pair and an empty batch list cannot be
    # concatenated by the embedding step. No zero-padding is needed because
    # the similarity matrix may be rectangular.
    embeddings_by_language = {
        language: get_labse_embedding(sentences, batch_size=batch_size)
        for language, sentences in sentences_by_language.items()
        if sentences
    }

    # Calculate cosine similarity between all pairs of sentences, visiting
    # each unordered language pair exactly once.
    similarities = []
    languages = list(embeddings_by_language)
    for index, lang1 in enumerate(languages):
        sentences1 = sentences_by_language[lang1]
        for lang2 in languages[index + 1:]:
            sentences2 = sentences_by_language[lang2]
            similarity = calculate_cosine_similarity(embeddings_by_language[lang1], embeddings_by_language[lang2])
            for row, sentence1 in enumerate(sentences1):
                for col, sentence2 in enumerate(sentences2):
                    similarities.append({
                        'Language 1': lang1,
                        'Sentence 1': sentence1,
                        'Language 2': lang2,
                        'Sentence 2': sentence2,
                        'Cosine similarity': similarity[row][col]
                    })

    # Create a DataFrame with the results and return it
    return pd.DataFrame(similarities, columns=result_columns)

# Main function
def main():
    """Write one similarity CSV per subdirectory of the input corpus.

    Every directory directly under "Scienza_puliti" is handed to
    ``process_subdirectory`` and the resulting DataFrame is saved as
    "<subdirectory name>.csv" inside "Prova_full", which is created if it
    does not exist yet. Entries that are not directories are ignored.
    """
    source_root = "Scienza_puliti"
    entries = os.listdir(source_root)
    output_root = "Prova_full"

    if not os.path.exists(output_root):
        os.makedirs(output_root)

    progress = tqdm(entries, total=len(entries))
    for entry in progress:
        progress.set_description(desc=f'Current subdirectory: {entry}')
        entry_path = os.path.join(source_root, entry)
        if not os.path.isdir(entry_path):
            continue
        results = process_subdirectory(entry_path)
        target_csv = os.path.join(output_root, f"{entry}.csv")
        results.to_csv(target_csv, index=False)
        # Release cached GPU memory before moving on to the next folder.
        torch.cuda.empty_cache()

# Run main() only when this file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()