In [None]:
# Mount Google Drive at /content/drive; the folder paths used in the cells
# below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd

# Source folder containing the CSV files to filter
folder_path = "/content/drive/MyDrive/Similarity/F_Scienza"
# Target folder where the filtered files will be saved
filtered_folder_path = "/content/drive/MyDrive/Similarities_Scienza_filtered"

# Create the target folder if it is missing. exist_ok=True replaces the separate
# os.path.exists() check, so nothing can happen between checking and creating.
os.makedirs(filtered_folder_path, exist_ok=True)

# Languages the project focuses on; process_csv drops rows in which either
# language is not in this list
languages = ["en", "it", "fr", "de", "es"]

# Define the function to process each CSV file
def process_csv(file_path, relative_path, language_combination_counts):
    """Filter one similarity CSV, add its pair counts to a table, and save the result.

    A row is kept when its 'Cosine similarity' is at least 0.75 and both
    'Language 1' and 'Language 2' appear in the module-level ``languages`` list.
    Every kept row gets a 'language_combination' label: its two language codes
    sorted and joined with '-', so "it"/"en" and "en"/"it" both become "en-it".
    The filtered frame, including that extra column, is written to
    ``filtered_folder_path`` joined with ``relative_path``; missing parent
    folders are created first.

    Note: a later cell of this notebook defines a one-argument ``process_csv``.
    Re-run this cell before using the three-argument version again.

    Args:
        file_path: Path of the CSV file to read.
        relative_path: Output path relative to ``filtered_folder_path``.
        language_combination_counts: DataFrame whose index and columns are
            language codes. It is updated in place: a mixed pair adds its count
            to both [a, b] and [b, a], a same-language pair adds it once to the
            diagonal.

    Returns:
        None.
    """
    df = pd.read_csv(file_path)

    # Keep rows that reach the similarity threshold and involve only the
    # languages of interest.
    keep = (
        (df['Cosine similarity'] >= 0.75)
        & df['Language 1'].isin(languages)
        & df['Language 2'].isin(languages)
    )
    # .copy() yields an independent frame, so adding a column below does not
    # raise pandas' SettingWithCopyWarning.
    df = df[keep].copy()

    # Order-independent label for each pair. A plain comprehension also works
    # when no row survived the filters; DataFrame.apply(axis=1) hands back a
    # DataFrame for an empty frame, which cannot be assigned to a single column.
    df['language_combination'] = [
        '-'.join(sorted(pair))
        for pair in zip(df['Language 1'], df['Language 2'])
    ]

    # Add this file's counts to the caller's table, keeping it symmetric
    combination_counts = df['language_combination'].value_counts()
    for combination, count in combination_counts.items():
        lang1, lang2 = combination.split('-')
        language_combination_counts.loc[lang1, lang2] += count
        if lang1 != lang2:
            language_combination_counts.loc[lang2, lang1] += count

    # Replicate the subfolder structure in the target folder, then save
    filtered_file_path = os.path.join(filtered_folder_path, relative_path)
    os.makedirs(os.path.dirname(filtered_file_path), exist_ok=True)
    df.to_csv(filtered_file_path, index=False)

**Code working on folders with a clear hierarchy**
(Gastronomy domain)

In [None]:
# Function to determine the prevailing language in a given DataFrame
def get_prevailing_language(language_combination_counts):
    """Return the language with the largest column total, together with that total.

    Args:
        language_combination_counts: DataFrame of counts whose columns are
            language codes.

    Returns:
        A ``(language, total)`` tuple: the column label with the highest
        column sum and the sum itself.
    """
    per_language = language_combination_counts.sum(axis=0)
    return per_language.idxmax(), per_language.max()

# Traverse the source folder and process each CSV file while maintaining the folder structure
for root, dirs, files in os.walk(folder_path):
    # Skip the root folder itself: only CSV files inside subfolders are processed
    if root == folder_path:
        continue

    # A folder without CSV files (for example one that only holds other
    # folders) has nothing to count, so no all-zero summary is printed for it.
    csv_file_names = [name for name in files if name.endswith('.csv')]
    if not csv_file_names:
        continue

    # Fresh counts table for this subfolder; process_csv fills it in place
    language_combination_counts = pd.DataFrame(0, index=languages, columns=languages)

    for file_name in csv_file_names:
        file_path = os.path.join(root, file_name)
        # Path relative to the source root, so the output mirrors the hierarchy
        relative_path = os.path.relpath(file_path, folder_path)
        process_csv(file_path, relative_path, language_combination_counts)

    # Determine the prevailing language for the current subfolder
    prevailing_language, max_count = get_prevailing_language(language_combination_counts)

    # Print the results for the current subfolder
    relative_subfolder = os.path.relpath(root, folder_path)
    print(f"\nSubfolder: {relative_subfolder}")
    print("Language Combination Counts Summary:")
    print(language_combination_counts)
    print("\nTotal Counts per Language:")
    print(language_combination_counts.sum(axis=0))
    if max_count > 0:
        print(f"\nThe prevailing language is '{prevailing_language}' with a total count of {max_count}.")
    else:
        # With all totals at zero, idxmax merely returns the first language,
        # which would be reported as "prevailing" without any evidence.
        print("\nNo sentence pairs passed the filters, so there is no prevailing language.")

**Code working on a flat folder (CSV files stored directly in it, without subfolders)**
(Scientific domain)

In [None]:
import os
import pandas as pd

# Source folder containing the CSV files to filter
folder_path = "/content/drive/MyDrive/Similarity/F_Scienza_sim"
# Target folder where the filtered files will be saved. It must differ from
# folder_path: process_csv saves each result under the source file's own name,
# so sharing one folder would overwrite the unfiltered input files.
# NOTE(review): the "_filtered" suffix is a suggested name; adjust as needed.
filtered_folder_path = "/content/drive/MyDrive/Similarity/F_Scienza_sim_filtered"

# Create the target folder if it does not exist yet
os.makedirs(filtered_folder_path, exist_ok=True)

# Languages the project focuses on; process_csv drops rows in which either
# language is not in this list
languages = ["en", "it", "fr", "de", "es"]

# Table of pair counts, one row and one column per language, starting at zero.
# Re-running this cell resets it.
language_combination_counts = pd.DataFrame(0, index=languages, columns=languages)

# Define the function to process each CSV file
def process_csv(file_path):
    """Filter one similarity CSV, add its pair counts to the shared table, and save it.

    A row is kept when its 'Cosine similarity' is at least 0.75 and both
    'Language 1' and 'Language 2' appear in the module-level ``languages`` list.
    Every kept row gets a 'language_combination' label: its two language codes
    sorted and joined with '-', so "it"/"en" and "en"/"it" both become "en-it".
    The filtered frame, including that extra column, is written to
    ``filtered_folder_path`` under the base name of ``file_path``.

    The module-level ``language_combination_counts`` DataFrame is updated in
    place: a mixed pair adds its count to both [a, b] and [b, a], a
    same-language pair adds it once to the diagonal.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        None.
    """
    df = pd.read_csv(file_path)

    # Keep rows that reach the similarity threshold and involve only the
    # languages of interest.
    keep = (
        (df['Cosine similarity'] >= 0.75)
        & df['Language 1'].isin(languages)
        & df['Language 2'].isin(languages)
    )
    # .copy() yields an independent frame, so adding a column below does not
    # raise pandas' SettingWithCopyWarning.
    df = df[keep].copy()

    # Order-independent label for each pair. A plain comprehension also works
    # when no row survived the filters; DataFrame.apply(axis=1) hands back a
    # DataFrame for an empty frame, which cannot be assigned to a single column.
    df['language_combination'] = [
        '-'.join(sorted(pair))
        for pair in zip(df['Language 1'], df['Language 2'])
    ]

    # Add this file's counts to the shared table, keeping it symmetric. The
    # table is only mutated, never rebound, so no `global` statement is needed.
    combination_counts = df['language_combination'].value_counts()
    for combination, count in combination_counts.items():
        lang1, lang2 = combination.split('-')
        language_combination_counts.loc[lang1, lang2] += count
        if lang1 != lang2:
            language_combination_counts.loc[lang2, lang1] += count

    # Save the filtered DataFrame under the same file name in the target folder
    filtered_file_path = os.path.join(filtered_folder_path, os.path.basename(file_path))
    df.to_csv(filtered_file_path, index=False)

# Every entry found directly inside the source folder
file_list = os.listdir(folder_path)

# Hand each CSV file to process_csv; entries with any other extension are skipped
for file_name in file_list:
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, file_name)
    process_csv(file_path)

# Show the accumulated table of counts
print("Language Combination Counts Summary:")
print(language_combination_counts)

# Sum each column, then pick the language whose column total is the largest
total_counts = language_combination_counts.sum(axis=0)
max_count = total_counts.max()
prevailing_language = total_counts.idxmax()

print("\nTotal Counts per Language:")
print(total_counts)
print(f"\nThe prevailing language is '{prevailing_language}' with a total count of {max_count}.")


Language Combination Counts Summary:
      en    it    fr    de    es
en     0  4164  3018  1285  3425
it  4164     0  2468  1567  2998
fr  3018  2468     0  1450  2152
de  1285  1567  1450     0  1125
es  3425  2998  2152  1125     0

Total Counts per Language:
en    11892
it    11197
fr     9088
de     5427
es     9700
dtype: int64

The prevailing language is 'en' with a total count of 11892.
