## Alignment and estimated mutual information

In [212]:
import numpy as np
import os
import pandas as pd
import random
import pickle
import json
from src.utils.gpt2_letter_tokenizer import CustomGPT2Tokenizer, mGPTTokenizer, mBERTTokenizer, CustomBERTTokenizer
from src.data.components.datasets import TokenTaggingDataset, tokenize_text_with_labels

random.seed(42)
np.random.seed(42)

In [213]:
# storing entropy differences and their standard deviations
diff_ent = {
    "de": (10.19, 0.06),
    "en": (9.49, 0.03),
    "fr": (9.81, 0.06),
    "it": (9.79, 0.05),
    "ja": (9.25, 0.04),
    "kor": (10.63, 0.19),
    "sr": (7.23, 0.08),
    "sv": (9.75, 0.05),
    "th": (9.49, 0.07),
    "vi": (10.32, 0.07),
    "zh": (10.29, 0.02),
    "yue": (8.70, 0.07),
}

### Load data

In [214]:
model = "mbert"
lang = "yue"
parameters = 4 
mode = 'dct'

LAB_ROOT = f"/home/user/ding/Projects/Prosody/languages/{lang}/aligned"
PHONEME_LAB_ROOT = f"/home/user/ding/Projects/Prosody/languages/{lang}/aligned"
WAV_ROOT = f"/home/user/ding/Projects/Prosody/languages/{lang}/wav_files"
DATA_CACHE = f"/home/user/ding/Projects/Prosody/languages/{lang}/cache"

PRED_ROOT = f"/home/user/ding/Projects/Prosody/logs/train/runs/{model}"

orig_model = f"{PRED_ROOT}/{model}_{lang}_orig/metrics.json"
mis2_model = f"{PRED_ROOT}/{model}_{lang}_mis1/metrics.json" # naming is swapped
mis1_model = f"{PRED_ROOT}/{model}_{lang}_mis2/metrics.json" # naming is swapped
cus_model = f"{PRED_ROOT}/{model}_{lang}_noletter/metrics.json"
# letter_model = f"{PRED_ROOT}/{model}_{lang}_letter/metrics.json"

TRAIN_FILE = "train-clean-100"
VAL_FILE = "dev-clean"
TEST_FILE = "test-clean"

SAVE_DIR = f"/home/user/ding/Projects/Prosody/precomputed/{model}/mi_alignment_{lang}_{model}.csv"

In [215]:
def get_test_loss(model_path):
    with open(model_path, 'r') as file:
        metrics_data = json.load(file)
    return metrics_data.get("test/loss", None)

In [216]:
test_loss_0 = get_test_loss(orig_model)
test_loss_1 = get_test_loss(mis1_model)
test_loss_2 = get_test_loss(mis2_model)
test_loss_3 = get_test_loss(cus_model)

mi_mis0 = diff_ent[lang][0] - test_loss_0 if test_loss_0 is not None else None
mi_mis1 = diff_ent[lang][0] - test_loss_1 if test_loss_1 is not None else None
mi_mis2 = diff_ent[lang][0] - test_loss_2 if test_loss_2 is not None else None
mi_mis3 = diff_ent[lang][0] - test_loss_3 if test_loss_3 is not None else None

print(f"Conditional entropy (cond_ent) for original model: {mi_mis0:.2f}")

print(f"Conditional entropy (cond_ent) for misalignment 1 model: {mi_mis1:.2f}")

print(f"Conditional entropy (cond_ent) for misalignment 2 model: {mi_mis2:.2f}")

print(f"Conditional entropy (cond_ent) for custom model: {mi_mis3:.2f}")


Conditional entropy (cond_ent) for original model: 0.78
Conditional entropy (cond_ent) for misalignment 1 model: 0.75
Conditional entropy (cond_ent) for misalignment 2 model: 0.72
Conditional entropy (cond_ent) for custom model: 0.76


In [217]:
from src.data.f0_regression_datamodule import (
    F0RegressionDataModule as DataModule,
)

In [218]:
dm = DataModule(
    wav_root=WAV_ROOT,
    lab_root=LAB_ROOT,
    phoneme_lab_root=PHONEME_LAB_ROOT,
    data_cache=DATA_CACHE,
    train_file=TRAIN_FILE,
    val_file=VAL_FILE,
    test_file=TEST_FILE,
    dataset_name=f"CommonVoice_{lang}",
    model_name="bert-base-multilingual-cased",
    f0_mode=mode,
    f0_n_coeffs=parameters,
    score_last_token=True,
    tokenization_by_letter=False,
)

dm.setup()

Using bert-base-multilingual-cased tokenizer.
Dataloader: padding with token id: 0
Loading data from cache: ('/home/user/ding/Projects/Prosody/languages/yue/cache/train-clean-100', 'f0_dct_4.pkl')


Preprocessing samples: 100%|██████████| 4012/4012 [00:01<00:00, 2819.10it/s]


Failed 369/4012
Loading data from cache: ('/home/user/ding/Projects/Prosody/languages/yue/cache/dev-clean', 'f0_dct_4.pkl')


Preprocessing samples: 100%|██████████| 486/486 [00:00<00:00, 3791.73it/s]


Failed 49/486
Loading data from cache: ('/home/user/ding/Projects/Prosody/languages/yue/cache/test-clean', 'f0_dct_4.pkl')


Preprocessing samples: 100%|██████████| 502/502 [00:00<00:00, 3827.38it/s]

Failed 50/502
Train dataset size: 3643
Validation dataset size: 437
Test dataset size: 452





In [219]:
train_texts, train_labels = dm.train_texts, dm.train_durations
val_texts, val_labels = dm.val_texts, dm.val_durations
test_texts, test_labels = dm.test_texts, dm.test_durations

print(
    f"Lengths of train, val, test in samples: {len(train_texts), len(val_texts), len(test_texts)}"
)

Lengths of train, val, test in samples: (4012, 486, 502)


In [220]:
import re

def assign_labels(input_string, labels):
    # Create list to hold words and punctuation
    words_with_punctuation = re.findall(r'[\u0E00-\u0E7F\w]+|[.,!?;"-]|\'', input_string) #words_with_punctuation = re.findall(r"[\w']+|[.,!?;\"-]|'", input_string)

    # Create list to hold only words
    #words_only = re.findall(r"\w+'?\w*", input_string) #original
    words_only= re.findall(r'[\u0E00-\u0E7F\w]+', input_string) #gio


    # Make sure the number of labels matches the number of words
    if not len(labels) == len(words_only):
        # alignmend or extraction failed, skip sample
        return None, None, None

    # Create a generator for word-label pairs
    word_label_pairs = ((word, label) for word, label in zip(words_only, labels))

    # Create list of tuples where each word is matched to a label and each punctuation is matched to None
    words_with_labels = []
    for token in words_with_punctuation:
        #print(token)
        if re.match(r'[\u0E00-\u0E7F\w]+', token): # original: if re.match(r"\w+'?\w*", token):
            #print("match")
            words_with_labels.append(next(word_label_pairs))
        else:
            words_with_labels.append((token, None))
            #print("no match")

    return words_only, words_with_punctuation, words_with_labels


def assign_labels_to_sentences(sentences, labels):
    single_words = []
    single_labels = []
    for i in range(len(sentences)):
        words_only, words_with_punct, words_with_labels = assign_labels(
            sentences[i], labels[i]
        )
        # check if alignment failed
        if words_only is None:
            #print(f"Alignment failed for sentence {i}")
            continue
        #print(words_with_labels)
        # remove Nones
        words_with_labels = [(w, l) for w, l in words_with_labels if l is not None]
        if len(words_with_labels) == 0:
            #print("No labels for sentence {i}")
            continue
        # process words and labels
        words, word_labels = zip(
            *[(w, l) for w, l in words_with_labels if l is not None]
        )
        single_words.extend(words)
        single_labels.extend(word_labels)
        

    return single_words, single_labels

In [221]:
#from src.utils.text_processing import assign_labels_to_sentences

all_train_words, all_train_labels = assign_labels_to_sentences(
    train_texts, train_labels
)
all_dev_words, all_dev_labels = assign_labels_to_sentences(val_texts, val_labels)
all_test_words, all_test_labels = assign_labels_to_sentences(test_texts, test_labels)

print(f"Words and labels train: {len(all_train_words), len(all_train_labels)}")
print(f"Words and labels dev: {len(all_dev_words), len(all_dev_labels)}")
print(f"Words and labels test: {len(all_test_words), len(all_test_labels)}")

all_words = all_train_words + all_dev_words + all_test_words

# Compute the length of each word
word_lengths = [len(word) for word in all_words]

# Compute the mean word length
mean_word_length = sum(word_lengths) / len(word_lengths)

print(f"Mean word length: {mean_word_length}")

Words and labels train: (30157, 30157)
Words and labels dev: (3567, 3567)
Words and labels test: (3656, 3656)
Mean word length: 1.3604868913857677


In [222]:
# Function to extract input_ids and calculate correct alignment proportion for any dataset
def calculate_alignment_proportion(dataset):
    total_words = 0
    total_misalignment = 0
    discard_count_1 = 0
    discard_count_2 = 0
    discard_count_3 = 0
    
    # Iterate through all items in the dataset
    for i in range(len(dataset)):
        # Get each item from the dataset
        item = dataset.__getitem__(i)
        
        input_ids = item['input_ids']  
        loss_mask = item['loss_mask']  
        word_to_tokens = item["word_to_tokens"]
        
        # Iterate through only the 2nd, 4th, 6th, etc., elements excluding the last two
        for token_ids in word_to_tokens[1:-2:2]:  # Start at index 1 and step by 2, skipping last 2 elements
            if isinstance(token_ids, list) and len(token_ids) > 1:
                total_misalignment += 1
                discard_count_3 += 1
            if isinstance(token_ids, list) and len(token_ids) > 2:
                discard_count_2 += 1
            if isinstance(token_ids, list) and len(token_ids) > 3:
                discard_count_1 += 1

        num_words = sum(1 for mask in loss_mask if mask == 1)
        total_words += num_words

    misalignment_count_1 = total_misalignment - discard_count_1
    misalignment_count_2 = total_misalignment - discard_count_2
    misalignment_count_3 = total_misalignment - discard_count_3

    if total_words > 0:
        misalignment_proportion_0 = total_misalignment / total_words
        misalignment_proportion_1 = misalignment_count_1 / total_words
        misalignment_proportion_2 = misalignment_count_2 / total_words
        misalignment_proportion_3 = misalignment_count_3 / total_words
    else:
        misalignment_proportion_0 = 0
        misalignment_proportion_1 = 0
        misalignment_proportion_2 = 0
        misalignment_proportion_3 = 0
    
    return misalignment_proportion_0, total_misalignment, misalignment_proportion_1, misalignment_count_1, misalignment_proportion_2, misalignment_count_2, misalignment_proportion_3, misalignment_count_3, total_words


In [223]:
train_mis_proportion_0, train_total_misalignment, train_mis_proportion_1, train_misalignment_count_1, train_mis_proportion_2, train_misalignment_count_2, train_mis_proportion_3, train_misalignment_count_3, train_total_words = calculate_alignment_proportion(dm.train_dataset)
dev_mis_proportion_0, dev_total_misalignment, dev_mis_proportion_1, dev_misalignment_count_1, dev_mis_proportion_2, dev_misalignment_count_2, dev_mis_proportion_3, dev_misalignment_count_3, dev_total_words = calculate_alignment_proportion(dm.val_dataset)
test_mis_proportion_0, test_total_misalignment, test_mis_proportion_1, test_misalignment_count_1, test_mis_proportion_2, test_misalignment_count_2, test_mis_proportion_3, test_misalignment_count_3,test_total_words = calculate_alignment_proportion(dm.test_dataset)



all_misalignment = train_total_misalignment + dev_total_misalignment + test_total_misalignment
all_misalignment_count_1 = train_misalignment_count_1 + dev_misalignment_count_1 + test_misalignment_count_1
all_misalignment_count_2 = train_misalignment_count_2 + dev_misalignment_count_2 + test_misalignment_count_2
all_misalignment_count_3 = train_misalignment_count_3 + dev_misalignment_count_3 + test_misalignment_count_3

all_total_words = train_total_words + dev_total_words + test_total_words

all_mis_proportion_0 = all_misalignment / all_total_words 
all_mis_proportion_1 = all_misalignment_count_1 / all_total_words 
all_mis_proportion_2 = all_misalignment_count_2 / all_total_words 
all_mis_proportion_3 = all_misalignment_count_3 / all_total_words 

clean_words = all_total_words - all_misalignment
using_words_0 = all_total_words
using_words_1 = clean_words + all_misalignment_count_1
using_words_2 = clean_words + all_misalignment_count_2
using_words_3 = clean_words + all_misalignment_count_3

using_words_prop_0 = using_words_0 / all_total_words
using_words_prop_1 = using_words_1 / all_total_words
using_words_prop_2 = using_words_2 / all_total_words
using_words_prop_3 = using_words_3 / all_total_words


# Print the results
print(f"Train Dataset: Total Words = {train_total_words}")
print(f"Original wrong Alignments = {train_total_misalignment}, Original Proportion = {train_mis_proportion_0:.4f}")
print(f"Wrong Alignments 1 = {train_misalignment_count_1}, Proportion 1 = {train_mis_proportion_1:.4f}")
print(f"Wrong Alignments 2 = {train_misalignment_count_2}, Proportion 2 = {train_mis_proportion_2:.4f}")
print(f"Wrong Alignments 3 = {train_misalignment_count_3}, Proportion 3 = {train_mis_proportion_3:.4f}")
print(f"Dev Dataset: Total Words = {dev_total_words}")
print(f"Original wrong Alignments = {dev_total_misalignment}, Original Proportion = {dev_mis_proportion_0:.4f}")
print(f"Wrong Alignments 1 = {dev_misalignment_count_1}, Proportion 1 = {dev_mis_proportion_1:.4f}")
print(f"Wrong Alignments 2 = {dev_misalignment_count_2}, Proportion 2 = {dev_mis_proportion_2:.4f}")
print(f"Wrong Alignments 3 = {dev_misalignment_count_3}, Proportion 3 = {dev_mis_proportion_3:.4f}")
print(f"Test Dataset: Total Words = {test_total_words}")
print(f"Original wrong Alignments = {test_total_misalignment}, Original Proportion = {test_mis_proportion_0:.4f}")
print(f"Wrong Alignments 1 = {test_misalignment_count_1}, Proportion 1 = {test_mis_proportion_1:.4f}")
print(f"Wrong Alignments 2 = {test_misalignment_count_2}, Proportion 2 = {test_mis_proportion_2:.4f}")
print(f"Wrong Alignments 3 = {test_misalignment_count_3}, Proportion 3 = {test_mis_proportion_3:.4f}")

print("-" * 50) 

print(f"All Dataset: Total Words = {all_total_words}")
print(f"Original wrong Alignments = {all_misalignment}, Original Proportion = {all_mis_proportion_0:.4f}")
print(f"Wrong Alignments 1 = {all_misalignment_count_1}, Proportion 1 = {all_mis_proportion_1:.4f}")
print(f"Wrong Alignments 2 = {all_misalignment_count_2}, Proportion 2 = {all_mis_proportion_2:.4f}")
print(f"Wrong Alignments 3 = {all_misalignment_count_3}, Proportion 3 = {all_mis_proportion_3:.4f}")

print(f"Original used words = {using_words_0}, Original used Proportion = {using_words_prop_0:.4f}")
print(f"Used words 1 = {using_words_1}, Original used Proportion = {using_words_prop_1:.4f}")
print(f"Used words 2 = {using_words_2}, Original used Proportion = {using_words_prop_2:.4f}")
print(f"Used words 3 = {using_words_3}, Original used Proportion = {using_words_prop_3:.4f}")


Train Dataset: Total Words = 29133
Original wrong Alignments = 7515, Original Proportion = 0.2580
Wrong Alignments 1 = 7426, Proportion 1 = 0.2549
Wrong Alignments 2 = 6750, Proportion 2 = 0.2317
Wrong Alignments 3 = 0, Proportion 3 = 0.0000
Dev Dataset: Total Words = 3431
Original wrong Alignments = 942, Original Proportion = 0.2746
Wrong Alignments 1 = 928, Proportion 1 = 0.2705
Wrong Alignments 2 = 836, Proportion 2 = 0.2437
Wrong Alignments 3 = 0, Proportion 3 = 0.0000
Test Dataset: Total Words = 3513
Original wrong Alignments = 944, Original Proportion = 0.2687
Wrong Alignments 1 = 937, Proportion 1 = 0.2667
Wrong Alignments 2 = 855, Proportion 2 = 0.2434
Wrong Alignments 3 = 0, Proportion 3 = 0.0000
--------------------------------------------------
All Dataset: Total Words = 36077
Original wrong Alignments = 9401, Original Proportion = 0.2606
Wrong Alignments 1 = 9291, Proportion 1 = 0.2575
Wrong Alignments 2 = 8441, Proportion 2 = 0.2340
Wrong Alignments 3 = 0, Proportion 3 = 0

In [224]:
import csv

def format_float(value):
    if isinstance(value, float):
        return f"{value:.2f}"
    return value

# Function to write results to a CSV file
def write_results_to_csv(file_path, model, language, total_words, misalignment, mis_proportion_0, used_words_0, used_prop_0,
                         misalignment_count_1, mis_proportion_1, used_words_1, used_prop_1,
                         misalignment_count_2, mis_proportion_2, used_words_2, used_prop_2,
                         misalignment_count_3, mis_proportion_3, used_words_3, used_prop_3,
                         diff_entropy, test_loss_0, test_loss_1, test_loss_2, test_loss_3,
                         mi_mis0, mi_mis1, mi_mis2, mi_mis3):
    
    with open(file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        
        # Write the header
        writer.writerow([
            'model', 'lang', 'n_words', 'n_mis_orig', 'prop_mis_orig', 'n_used_orig', 'prop_used_orig',
            'n_mis_1', 'prop_mis_1', 'n_used_1', 'prop_used_1',
            'n_mis_2', 'prop_mis_2', 'n_used_2', 'prop_used_2',
            'n_mis_3', 'prop_mis_3', 'n_used_3', 'prop_used_3',
            "diff_entropy", "cond_ent_0", "cond_ent_1", "cond_ent_2", "cond_ent_3",
            "mi_0", "mi_1", "mi_2", "mi_3"
        ])
        
        # Write the results for the current language, formatting floats to two decimal places
        writer.writerow([
            model,
            language, 
            total_words, 
            misalignment, 
            format_float(mis_proportion_0), 
            used_words_0, 
            format_float(used_prop_0),
            misalignment_count_1, 
            format_float(mis_proportion_1), 
            used_words_1, 
            format_float(used_prop_1),
            misalignment_count_2, 
            format_float(mis_proportion_2), 
            used_words_2, 
            format_float(used_prop_2),
            misalignment_count_3, 
            format_float(mis_proportion_3), 
            used_words_3, 
            format_float(used_prop_3),
            format_float(diff_entropy), 
            format_float(test_loss_0), 
            format_float(test_loss_1), 
            format_float(test_loss_2), 
            format_float(test_loss_3),
            format_float(mi_mis0), 
            format_float(mi_mis1), 
            format_float(mi_mis2), 
            format_float(mi_mis3)
        ])

In [225]:
# write_results_to_csv(SAVE_DIR, model, lang, all_total_words, all_misalignment, all_mis_proportion_0, using_words_0, using_words_prop_0,
#                      all_misalignment_count_1, all_mis_proportion_1, using_words_1, using_words_prop_1,
#                      all_misalignment_count_2, all_mis_proportion_2, using_words_2, using_words_prop_2,
#                      all_misalignment_count_3, all_mis_proportion_3, using_words_3, using_words_prop_3,
#                      diff_ent[lang][0], test_loss_0, test_loss_1, test_loss_2, test_loss_3,
#                      mi_mis0, mi_mis1, mi_mis2, mi_mis3
#                      )

print("Results have been written to the CSV file.")

Results have been written to the CSV file.


In [226]:
mi_alig_dir = f"/home/user/ding/Projects/Prosody/precomputed/{model}"
# combine all csv files in mi_alig_dir

# List to hold data from all CSV files
csv_files = [f for f in os.listdir(mi_alig_dir) if f.endswith(f'{model}.csv')]

# List to store DataFrames
dataframes = []

# Read and store each CSV file as a DataFrame
for file in csv_files:
    file_path = os.path.join(mi_alig_dir, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Combine all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame into a new CSV file
# combined_csv_path = os.path.join(mi_alig_dir, f'mi_alignment_{model}.csv')
# combined_df.to_csv(combined_csv_path, index=False)

# print(f"All MI and alignment CSV files have been combined into {combined_csv_path}")

In [227]:

def calculate_special_tokens_proportion(dataset):
    total_words = 0
    count_1 = 0
    count_2 = 0
    count_3 = 0
    
    # Iterate through all items in the dataset
    for i in range(len(dataset)):
        # Get each item from the dataset
        item = dataset.__getitem__(i)
        
        input_ids = item['input_ids']  
        loss_mask = item['loss_mask']  
        word_to_tokens = item["word_to_tokens"]
        
        # Iterate through only the 2nd, 4th, 6th, etc., elements excluding the last two
        for token_ids in word_to_tokens[1:-2:2]:  # Start at index 1 and step by 2, skipping last 2 elements
            if isinstance(token_ids, list) and len(token_ids) == 1:
                count_1 += 1
            if isinstance(token_ids, list) and len(token_ids) == 2:
                count_2 += 1
            if isinstance(token_ids, list) and len(token_ids) > 2:
                count_3 += 1

        num_words = sum(1 for mask in loss_mask if mask == 1)
        total_words += num_words

    
    prop_1 = count_1 / total_words
    prop_2 = count_2 / total_words
    prop_3 = count_3 / total_words
    
    return total_words, count_1, count_2, count_3, prop_1, prop_2, prop_3

In [228]:
train_total_words, train_count_1, train_count_2, train_count_3, train_prop_1, train_prop_2, train_prop_3 = calculate_special_tokens_proportion(dm.train_dataset)
dev_total_words, dev_count_1, dev_count_2, dev_count_3, dev_prop_1, dev_prop_2, dev_prop_3 = calculate_special_tokens_proportion(dm.val_dataset)
test_total_words, test_count_1, test_count_2, test_count_3, test_prop_1, test_prop_2, test_prop_3 = calculate_special_tokens_proportion(dm.test_dataset)

all_total_words = train_total_words + dev_total_words + test_total_words
all_count_1 = train_count_1 + dev_count_1 + test_count_1
all_count_2 = train_count_2 + dev_count_2 + test_count_2
all_count_3 = train_count_3 + dev_count_3 + test_count_3

all_prop_1 = all_count_1 / all_total_words
all_prop_2 = all_count_2 / all_total_words
all_prop_3 = all_count_3 / all_total_words


In [229]:
loss_tn1 = f"{PRED_ROOT}/{model}_{lang}_token_n1/metrics.json"
loss_tn2 = f"{PRED_ROOT}/{model}_{lang}_token_n2/metrics.json"
loss_tn3= f"{PRED_ROOT}/{model}_{lang}_token_n3/metrics.json"


tn1_loss = get_test_loss(loss_tn1)
tn2_loss = get_test_loss(loss_tn2)
tn3_loss = get_test_loss(loss_tn3)


mi_tn1 = diff_ent[lang][0] - tn1_loss if tn1_loss is not None else None
mi_tn2 = diff_ent[lang][0] - tn2_loss if tn2_loss is not None else None
mi_tn3 = diff_ent[lang][0] - tn3_loss if tn3_loss is not None else None

In [230]:

output_csv = f"/home/user/ding/Projects/Prosody/precomputed/{model}/mi_by_token_type_{model}_{lang}.csv"

# Write the results to a CSV file
with open(output_csv, mode='w', newline='') as file:
    writer = csv.writer(file)
    
    # Write the header
    writer.writerow([
        'model', 'language', 'test_total_words',
        'token_1_n', 'token_2_n', 'token_3_n',
        'token_1_prop', 'token_2_prop', 'token_3_prop',
        'loss_token_1', 'loss_token_2', 'loss_token_3',
        'diff_ent', 'mi_token_1', 'mi_token_2', 'mi_token_3'
    ])
    
    # Write the data row
    writer.writerow([
        model, lang, test_total_words,
        test_count_1, test_count_2, test_count_3,
        format_float(test_prop_1), format_float(test_prop_2), format_float(test_prop_3),
        format_float(tn1_loss), format_float(tn2_loss), format_float(tn3_loss),
        format_float(diff_ent[lang][0]),
        format_float(mi_tn1), format_float(mi_tn2), format_float(mi_tn3)
    ])

print(f"Results have been saved to {output_csv}")


Results have been saved to /home/user/ding/Projects/Prosody/precomputed/mbert/mi_by_token_type_mbert_yue.csv


In [None]:
mi_token_dir = f"/home/user/ding/Projects/Prosody/precomputed/{model}"
# combine all csv files in mi_alig_dir

# List to hold data from all CSV files
csv_files = [f for f in os.listdir(mi_token_dir) if f.startswith(f'mi_by_token_type_mbert')]
print(csv_files)

# List to store DataFrames
dataframes = []

# Read and store each CSV file as a DataFrame
for file in csv_files:
    file_path = os.path.join(mi_alig_dir, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Combine all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame into a new CSV file
combined_csv_path = os.path.join(mi_alig_dir, f'mi_token_{model}.csv')
combined_df.to_csv(combined_csv_path, index=False)

print(f"All MI and alignment CSV files have been combined into {combined_csv_path}")