In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the Drive
# paths used by this notebook (under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd

# Source folder containing the CSV files to filter
folder_path = "/content/drive/MyDrive/Similarity/F_Gastronomia_sim"
# Target folder where the filtered copies of those files are saved
filtered_folder_path = "/content/drive/MyDrive/Similarities_Gastronomia_filtered"

# Create the target folder if it is missing. exist_ok=True makes this a single
# idempotent call (no separate check-then-create step), so re-running the cell is safe.
os.makedirs(filtered_folder_path, exist_ok=True)

# Languages the project focuses on; rows involving any other language are dropped
languages = ["en", "it", "fr", "de", "es"]

def process_csv(file_path, relative_path, language_combination_counts,
                allowed_languages=None, output_root=None, min_similarity=0.75):
    """Filter one similarity CSV, tally its language pairs and save the result.

    The file is expected to provide the columns 'Cosine similarity',
    'Language 1' and 'Language 2'. Rows are kept when the similarity is at
    least ``min_similarity`` and both languages are in ``allowed_languages``.
    A 'language_combination' column (the two codes sorted and joined with '-')
    is added, and the filtered rows are written to
    ``output_root/relative_path``, creating parent folders as needed.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    relative_path : str
        Path of the file relative to the source folder; reused below
        ``output_root`` so the folder structure is replicated.
    language_combination_counts : pandas.DataFrame
        Square table of counts, updated in place. Its index and columns must
        contain every allowed language. Each kept pair (a, b) adds to both
        [a, b] and [b, a]; a pair of identical languages adds to the diagonal
        cell once.
    allowed_languages : list of str, optional
        Languages to keep. Defaults to the notebook-level ``languages`` list.
    output_root : str, optional
        Folder that receives the filtered file. Defaults to the notebook-level
        ``filtered_folder_path``.
    min_similarity : float, optional
        Inclusive lower bound on 'Cosine similarity' (default 0.75).

    Returns
    -------
    None
    """
    # Fall back to the notebook-level configuration when not given explicitly
    if allowed_languages is None:
        allowed_languages = languages
    if output_root is None:
        output_root = filtered_folder_path

    df = pd.read_csv(file_path)

    # Keep sufficiently similar pairs whose two languages are both allowed.
    # .copy() gives an independent frame, so adding a column below does not
    # write into a slice of the frame that was read.
    keep = (
        (df['Cosine similarity'] >= min_similarity)
        & df['Language 1'].isin(allowed_languages)
        & df['Language 2'].isin(allowed_languages)
    )
    df = df.loc[keep].copy()

    # Order-independent language pair for every row. A plain comprehension is
    # used instead of DataFrame.apply(axis=1) because apply does not reliably
    # return a Series when no rows are left, which breaks the column assignment.
    pairs = [tuple(sorted(pair)) for pair in zip(df['Language 1'], df['Language 2'])]
    df['language_combination'] = ['-'.join(pair) for pair in pairs]

    # Tally the pairs directly as tuples rather than splitting the joined
    # label again, so language codes containing '-' cannot be mis-parsed.
    pair_counts = {}
    for pair in pairs:
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Update the shared table symmetrically
    for (lang1, lang2), count in pair_counts.items():
        language_combination_counts.loc[lang1, lang2] += count
        if lang1 != lang2:
            language_combination_counts.loc[lang2, lang1] += count

    # Replicate the subfolder structure in the target folder and save
    filtered_file_path = os.path.join(output_root, relative_path)
    parent_dir = os.path.dirname(filtered_file_path)
    if parent_dir:  # os.makedirs('') raises, so skip when there is no parent part
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(filtered_file_path, index=False)

In [None]:
def get_prevailing_language(language_combination_counts):
    """Return the language with the largest column total and that total.

    Sums ``language_combination_counts`` down each column and returns a
    ``(label, total)`` tuple for the column with the highest sum. When several
    columns tie, the first one in column order is returned.
    """
    column_totals = language_combination_counts.sum(axis=0)
    return column_totals.idxmax(), column_totals.max()

# Walk every subfolder of the source folder, filter its CSV files (the folder
# structure is replicated in the target folder) and print a per-subfolder
# summary of the language-pair counts.
for root, dirs, files in os.walk(folder_path):
    # Skip the root folder itself: files placed directly in it are not processed
    if root == folder_path:
        continue

    # Folders without any CSV file have nothing to count. Reporting them would
    # print an all-zero table and name the first language as "prevailing".
    csv_file_names = [file_name for file_name in files if file_name.endswith('.csv')]
    if not csv_file_names:
        continue

    # Fresh count table for this subfolder; process_csv updates it in place
    language_combination_counts = pd.DataFrame(0, index=languages, columns=languages)

    for file_name in csv_file_names:
        file_path = os.path.join(root, file_name)
        relative_path = os.path.relpath(file_path, folder_path)
        process_csv(file_path, relative_path, language_combination_counts)

    # Column totals are computed once and reused for the report
    total_counts = language_combination_counts.sum(axis=0)
    prevailing_language, max_count = get_prevailing_language(language_combination_counts)

    # Print the results for the current subfolder
    relative_subfolder = os.path.relpath(root, folder_path)
    print(f"\nSubfolder: {relative_subfolder}")
    print("Language Combination Counts Summary:")
    print(language_combination_counts)
    print("\nTotal Counts per Language:")
    print(total_counts)
    if max_count > 0:
        print(f"\nThe prevailing language is '{prevailing_language}' with a total count of {max_count}.")
    else:
        # All counts are zero, so idxmax would merely return the first language
        print("\nNo sentence pairs passed the filters, so there is no prevailing language.")


Subfolder: It
Language Combination Counts Summary:
     en   it   fr  de   es
en    0  179  150  51  106
it  179    0  264  37  121
fr  150  264    0  49  125
de   51   37   49   0   36
es  106  121  125  36    0

Total Counts per Language:
en    486
it    601
fr    588
de    173
es    388
dtype: int64

The prevailing language is 'it' with a total count of 601.

Subfolder: Fr
Language Combination Counts Summary:
    en   it   fr  de   es
en   0   66   45  53   75
it  66    0  107  47   61
fr  45  107    0  40  109
de  53   47   40   0   47
es  75   61  109  47    0

Total Counts per Language:
en    239
it    281
fr    301
de    187
es    292
dtype: int64

The prevailing language is 'fr' with a total count of 301.

Subfolder: Es
Language Combination Counts Summary:
     en   it   fr   de   es
en    0   71  127  113  265
it   71    0   71   59  201
fr  127   71    0   78  253
de  113   59   78    0  137
es  265  201  253  137    0

Total Counts per Language:
en    576
it    402
fr    52