## Differential Entropy and Control functions 

In [886]:
import numpy as np
import random
import pickle
import torch
from transformers import (
    GPT2Tokenizer,
    BertTokenizer,
    AutoTokenizer,
    AutoModel,
    AdamW,
    GPT2Model,
    AutoConfig,
    AutoModelForCausalLM,
    GPT2Config,
    BertConfig,
    BertModel
)

from src.utils.gpt2_letter_tokenizer import CustomGPT2Tokenizer, mGPTTokenizer, mBERTTokenizer
from src.data.components.datasets import TokenTaggingDataset, tokenize_text_with_labels

# Seed every RNG this notebook imports so that Restart & Run All is
# reproducible: Python's `random`, PyTorch, and NumPy's legacy global
# generator (the one `np.random.choice` draws from further down).
# `np.random.seed` is kept last because it returns None, so the cell
# produces no output.
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

### Load data

In [887]:
# --- Experiment configuration ---
lang = "it"  # language code; selects the per-language data directory
parameters = 4  # forwarded to the data module as `f0_n_coeffs`
mode = 'dct'  # forwarded to the data module as `f0_mode`

# Single place to edit when the project lives somewhere else: every path
# below is derived from these two roots.
PROSODY_ROOT = "/home/user/ding/Projects/Prosody"
LANG_ROOT = f"{PROSODY_ROOT}/languages/{lang}"

LAB_ROOT = f"{LANG_ROOT}/aligned"
# The phoneme alignments are read from the same directory as LAB_ROOT.
PHONEME_LAB_ROOT = LAB_ROOT
WAV_ROOT = f"{LANG_ROOT}/wav_files"
DATA_CACHE = f"{LANG_ROOT}/cache"

TRAIN_FILE = "train-clean-100"
VAL_FILE = "dev-clean"
TEST_FILE = "test-clean"

SAVE_DIR = f"{PROSODY_ROOT}/precomputed/predictions/f0_{mode}_{parameters}"

In [888]:
from src.data.f0_regression_datamodule import (
    F0RegressionDataModule as DataModule,
)

In [889]:
# Build the F0 regression data module from the paths and settings defined in
# the config cell (`mode` -> `f0_mode`, `parameters` -> `f0_n_coeffs`).
# NOTE(review): `dataset_name` says "CommonVoice" while the split files are
# named "train-clean-100" / "dev-clean" / "test-clean" — confirm the name is
# only a label and does not select a different loader.
dm = DataModule(
    wav_root=WAV_ROOT,
    lab_root=LAB_ROOT,
    phoneme_lab_root=PHONEME_LAB_ROOT,
    data_cache=DATA_CACHE,
    train_file=TRAIN_FILE,
    val_file=VAL_FILE,
    test_file=TEST_FILE,
    dataset_name=f"CommonVoice_{lang}",
    model_name="ai-forever/mGPT",
    f0_mode=mode,
    f0_n_coeffs=parameters,
    score_last_token=True,
)

In [890]:
# Prepare the train / val / test splits. Per the log printed below, each split
# is loaded from the cache and preprocessed, and the number of samples that
# failed preprocessing is reported.
dm.setup()

Using ai-forever/mGPT tokenizer
Dataloader: padding with token id: 5
Loading data from cache: ('/home/user/ding/Projects/Prosody/languages/it/cache/train-clean-100', 'f0_dct_4.pkl')


Preprocessing samples: 100%|██████████| 3991/3991 [00:02<00:00, 1565.63it/s]


Failed 877/3991
Loading data from cache: ('/home/user/ding/Projects/Prosody/languages/it/cache/dev-clean', 'f0_dct_4.pkl')


Preprocessing samples: 100%|██████████| 499/499 [00:00<00:00, 2114.83it/s]


Failed 107/499
Loading data from cache: ('/home/user/ding/Projects/Prosody/languages/it/cache/test-clean', 'f0_dct_4.pkl')


Preprocessing samples: 100%|██████████| 500/500 [00:00<00:00, 2228.13it/s]

Failed 121/500
Train dataset size: 3114
Validation dataset size: 392
Test dataset size: 379





In [891]:
# Texts and label sequences for each split, as exposed by the data module.
# NOTE(review): the labels are read from the `*_durations` attributes even
# though this is the F0 data module — presumably they hold the F0 parameters
# here; confirm against the data module.
train_texts = dm.train_texts
train_labels = dm.train_durations

val_texts = dm.val_texts
val_labels = dm.val_durations

test_texts = dm.test_texts
test_labels = dm.test_durations

print(
    f"Lengths of train, val, test in samples: {len(train_texts), len(val_texts), len(test_texts)}"
)

Lengths of train, val, test in samples: (3991, 499, 500)


In [892]:
# # double check
# with open(f"{DATA_CACHE}/{TRAIN_FILE}/f0_dct_4.pkl", 'rb') as f1:
#     train = pickle.load(f1)
# train_texts = train["texts"]
# train_label = train["f0"]

# with open(f"{DATA_CACHE}/{VAL_FILE}/f0_dct_4.pkl", 'rb') as f2:
#     val = pickle.load(f2)
# val_texts = val["texts"]
# val_label = val["f0"]

# with open(f"{DATA_CACHE}/{TEST_FILE}/f0_dct_4.pkl", 'rb') as f3:
#     test = pickle.load(f3)
# test_texts = test["texts"]
# test_label = test["f0"]

# print(
#     f"Lengths of train, val, test in samples: {len(train_texts), len(val_texts), len(test_texts)}"
# )

In [893]:
import re

def assign_labels(input_string, labels):
    """Pair every word of ``input_string`` with its label.

    A "word" is a maximal run of ``\\w`` characters or characters from the
    U+0E00-U+0E7F block. The punctuation marks ``. , ! ? ; " -`` and ``'``
    are kept as separate tokens and paired with ``None``.

    Args:
        input_string: Sentence to tokenize.
        labels: One label per word, in word order.

    Returns:
        ``(words, tokens, pairs)`` where ``words`` is the list of words,
        ``tokens`` is the list of words and punctuation in sentence order, and
        ``pairs`` is a list of ``(token, label_or_None)`` tuples aligned with
        ``tokens``. If the number of labels differs from the number of words
        (alignment or extraction failed), ``(None, None, None)`` is returned so
        the caller can skip the sample.
    """
    word_pattern = r'[\u0E00-\u0E7F\w]+'

    # Words and punctuation, in order of appearance.
    tokens = re.findall(r'[\u0E00-\u0E7F\w]+|[.,!?;"-]|\'', input_string)
    # Words only.
    words = re.findall(word_pattern, input_string)

    if len(labels) != len(words):
        return None, None, None

    # Consume (word, label) pairs lazily while walking the mixed token list:
    # word tokens take the next pair, punctuation gets a None label.
    remaining_pairs = iter(zip(words, labels))
    pairs = [
        next(remaining_pairs) if re.match(word_pattern, token) else (token, None)
        for token in tokens
    ]

    return words, tokens, pairs


def assign_labels_to_sentences(sentences, labels):
    """Flatten sentence-level labels into parallel word / label lists.

    Each sentence is aligned with ``assign_labels``. Sentences whose alignment
    fails, or that end up with no labelled word, are skipped. Punctuation and
    any other token whose label is ``None`` is dropped.

    Args:
        sentences: Sequence of sentence strings.
        labels: Sequence indexable like ``sentences``; ``labels[i]`` holds one
            label per word of ``sentences[i]``.

    Returns:
        ``(single_words, single_labels)``: two lists of equal length with the
        retained words and their labels, concatenated over all sentences.
    """
    single_words = []
    single_labels = []

    for idx, sentence in enumerate(sentences):
        words_only, _, tagged_tokens = assign_labels(sentence, labels[idx])

        # Alignment failed for this sentence.
        if words_only is None:
            continue

        # Keep only tokens that actually carry a label.
        labelled = [(word, label) for word, label in tagged_tokens if label is not None]
        if not labelled:
            continue

        for word, label in labelled:
            single_words.append(word)
            single_labels.append(label)

    return single_words, single_labels

In [894]:
#from src.utils.text_processing import assign_labels_to_sentences

# Flatten each split into parallel word / label lists; sentences that cannot
# be aligned are skipped inside `assign_labels_to_sentences`.
all_train_words, all_train_labels = assign_labels_to_sentences(train_texts, train_labels)
all_dev_words, all_dev_labels = assign_labels_to_sentences(
    val_texts, val_labels
)
all_test_words, all_test_labels = assign_labels_to_sentences(
    test_texts, test_labels
)

print(f"Words and labels train: {len(all_train_words), len(all_train_labels)}")
print(f"Words and labels dev: {len(all_dev_words), len(all_dev_labels)}")
print(f"Words and labels test: {len(all_test_words), len(all_test_labels)}")

# Average number of characters per word over all three splits.
all_words = [*all_train_words, *all_dev_words, *all_test_words]
word_lengths = list(map(len, all_words))
mean_word_length = sum(word_lengths) / len(word_lengths)

print(f"Mean word length: {mean_word_length}")

Words and labels train: (31592, 31592)
Words and labels dev: (3983, 3983)
Words and labels test: (3838, 3838)
Mean word length: 5.285388069926166


In [895]:
# Turn the per-word label lists of each split into NumPy arrays, then display
# the resulting shapes as the cell output.
all_train_labels, all_dev_labels, all_test_labels = (
    np.array(split_labels)
    for split_labels in (all_train_labels, all_dev_labels, all_test_labels)
)

all_train_labels.shape, all_dev_labels.shape, all_test_labels.shape

((31592, 4), (3983, 4), (3838, 4))

In [896]:
from sklearn.model_selection import train_test_split

# Pool the word-level labels of the three original splits and re-split them at
# random. The split is done per word (row), not per utterance.
# NOTE(review): this rebinds `train_labels` and `test_labels`, which earlier
# held the per-sentence labels taken from the data module. Re-running the
# `assign_labels_to_sentences` cell after this one would feed it the wrong
# inputs — re-run from the data-loading cell instead.

# Combine all labels into a single dataset
all_labels_combined = np.concatenate([all_train_labels, all_dev_labels, all_test_labels])

# Step 1: Split into 80% train and 20% temporary (val+test)
train_labels, temp_labels = train_test_split(all_labels_combined, test_size=0.2, random_state=234)

# Step 2: Split 20% into 10% validation and 10% test
dev_labels, test_labels = train_test_split(temp_labels, test_size=0.5, random_state=234)


In [897]:
'''index = 4
all_train_labels = all_train_labels[:,index]
all_dev_labels = all_dev_labels[:,index]
all_test_labels = all_test_labels[:,index]'''

'index = 4\nall_train_labels = all_train_labels[:,index]\nall_dev_labels = all_dev_labels[:,index]\nall_test_labels = all_test_labels[:,index]'

### Kernel density estimation and Differential Entropy Computation

In [898]:
def monte_carlo_diff_entropy(density_func, samples, num_samples=10000):
    """Monte Carlo estimate of differential entropy, ``mean(-log p(x))``.

    Args:
        density_func: Callable that takes an array of points laid out as
            ``(n_dims, n_points)`` and returns the density at each point
            (e.g. a fitted ``scipy.stats.gaussian_kde``).
        samples: Evaluation points with shape ``(n_dims, n_points)``. A 1-D
            array is treated as ``n_points`` one-dimensional points.
        num_samples: Maximum number of points to evaluate. If fewer than
            ``n_points``, that many points are drawn without replacement from
            NumPy's global RNG; otherwise all points are used.

    Returns:
        The mean negative log-density over the evaluated points. Points with
        zero density contribute ``inf``.
    """
    samples = np.atleast_2d(samples)
    n_points = samples.shape[1]

    if num_samples < n_points:
        # Subsample whole points (columns). `np.random.choice` only accepts
        # 1-D input, so draw column indices rather than passing the 2-D array.
        keep = np.random.choice(n_points, num_samples, replace=False)
        samples = samples[:, keep]

    neg_log_densities = -np.log(density_func(samples))

    return np.mean(neg_log_densities)

In [899]:
# bootstrapping to get confidence intervals
from sklearn.utils import resample
from scipy.stats import gaussian_kde
from src.utils.approximation import cross_validate_gkde_bandwidth

# Subsample / bootstrap size for each split: 10% of its rows, truncated to int.
n_train_size, n_dev_size, n_test_size = (
    int(len(split_labels) * 0.1)
    for split_labels in (train_labels, dev_labels, test_labels)
)
print(
    f"n_train_size: {n_train_size}, n_dev_size: {n_dev_size}, n_test_size: {n_test_size}"
)

n_train_size: 3153, n_dev_size: 394, n_test_size: 394


In [900]:
# Sizes of the subsets used for the bandwidth search.
nb_train_samples = n_train_size  # 1500
nb_dev_samples = n_dev_size  # 500
nb_test_samples = n_test_size

# Draw row indices without replacement from NumPy's global RNG. Passing an int
# to `np.random.choice` samples from `np.arange(int)`.
train_indices = np.random.choice(len(train_labels), size=nb_train_samples, replace=False)
train_data = train_labels[train_indices]

dev_indices = np.random.choice(len(dev_labels), size=nb_dev_samples, replace=False)
dev_data = dev_labels[dev_indices]

# The helper receives the transposed subsets (columns are samples).
best_bw = cross_validate_gkde_bandwidth(train_data=train_data.T, test_data=dev_data.T)
print(f"best bw {best_bw}")

param scott, score -9.730869150608443
new best param scott, score -9.730869150608443
param silverman, score -9.730277088244076
new best param silverman, score -9.730277088244076
param 0.01, score -822.0661322779494
param 0.1, score -14.7088512345511
param 0.3, score -9.7586359595202
best bw silverman


In [901]:
# Bootstrap estimate of the differential entropy: repeat `n_iterations` times
# (1) resample each split, (2) fit a Gaussian KDE on the train resample with
# the bandwidth chosen above, (3) score the test resample.

# Re-seed here so the bootstrap does not depend on how much of the global RNG
# state earlier cells consumed.
random.seed(42)
np.random.seed(42)

n_iterations = 10
diff_entropy_list = []

for i in range(n_iterations):
    # No `random_state` is passed, so these draws presumably come from the
    # global NumPy RNG seeded above — their order matters for reproducibility.
    train_sample = resample(train_labels, n_samples=n_train_size)
    # NOTE(review): `dev_sample` is not used below, but removing the draw
    # would likely change the test resample and hence the reported numbers.
    dev_sample = resample(dev_labels, n_samples=n_dev_size)
    test_sample = resample(test_labels, n_samples=n_test_size)
    # combined_sample = np.vstack([train_sample, dev_sample, test_sample])
    # print("shape:", test_sample.shape)
    # best_bw = 0.01
    # best_bw = cross_validate_gkde_bandwidth(train_sample.T, dev_sample.T)
    # print(f"Best bandwidth: {best_bw}")
    # gaussian_kde expects (n_dims, n_samples), hence the transposes.
    density = gaussian_kde(train_sample .T, bw_method=best_bw)
    # num_samples equals the number of test points, so every point is scored
    # and no further subsampling happens inside the estimator.
    mc_entropy = monte_carlo_diff_entropy(density, test_sample.T, len(test_sample))
    diff_entropy_list.append(mc_entropy)
    print(
        f"Finished iteration {i+1} out of {n_iterations} with diff entropy: {mc_entropy}"
    )

# Summary over bootstrap iterations; `np.std` uses its default ddof=0.
diff_entropy_list = np.array(diff_entropy_list)
print(f"Mean: {np.mean(diff_entropy_list)}, std: {np.std(diff_entropy_list)}")

Finished iteration 1 out of 10 with diff entropy: 9.809771739286505
Finished iteration 2 out of 10 with diff entropy: 9.750048877661559
Finished iteration 3 out of 10 with diff entropy: 9.504738913650513
Finished iteration 4 out of 10 with diff entropy: 9.886439995081926
Finished iteration 5 out of 10 with diff entropy: 9.693491221752105
Finished iteration 6 out of 10 with diff entropy: 9.543142867248283
Finished iteration 7 out of 10 with diff entropy: 9.546238112630961
Finished iteration 8 out of 10 with diff entropy: 10.127104781056161
Finished iteration 9 out of 10 with diff entropy: 9.628495771018496
Finished iteration 10 out of 10 with diff entropy: 9.891696332708475
Mean: 9.738116861209498, std: 0.18587108770835306
