## Absolute Prominence Task - Differential Entropy and Control functions 

In [1]:
from src.data.components.helsinki import HelsinkiProminenceExtractor
from src.data.components.datasets import TokenTaggingDataset
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer
import numpy as np
import os
from tqdm import tqdm
import numpy as np

from src.utils.text_processing import python_lowercase_remove_punctuation
from src.utils.text_processing import get_wordlist_from_string

# only to create a valid dataset
dummy_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", add_special_tokens=True)

  from .autonotebook import tqdm as notebook_tqdm


### Load data

In [2]:
WAV_ROOT = "/Users/lukas/Desktop/projects/MIT/data/LibriTTS"
LAB_ROOT = "/Users/lukas/Desktop/projects/MIT/data/LibriTTSCorpusLabel"
PHONEME_LAB_ROOT = "/Users/lukas/Desktop/projects/MIT/data/LibriTTSCorpusLabelPhoneme"
DATA_CACHE = "/Users/lukas/Desktop/projects/MIT/data/cache"

TRAIN_FILE = "train-clean-100"
VAL_FILE = "dev-clean"
TEST_FILE = "test-clean"

SAVE_DIR = "/Users/lukas/Desktop/projects/MIT/MIT_prosody/precomputed/predictions/emnlp/f0_dct_4"

In [3]:
from src.data.f0_regression_datamodule import (
    F0RegressionDataModule as DataModule,
)

In [4]:
dm = DataModule(
    wav_root=WAV_ROOT,
    lab_root=LAB_ROOT,
    phoneme_lab_root=PHONEME_LAB_ROOT,
    data_cache=DATA_CACHE,
    train_file=TRAIN_FILE,
    val_file=VAL_FILE,
    test_file=TEST_FILE,
    dataset_name="libritts",
    model_name="gpt2",
    f0_mode="dct",
    f0_n_coeffs=4,
    score_last_token=True,
)

In [5]:
dm.setup()

Using GPT2 tokenizer
Dataloader: padding with token id: 50256
Loading data from cache: ('/Users/lukas/Desktop/projects/MIT/data/cache/train-clean-100', 'f0_dct_4.pkl')


Preprocessing samples: 100%|██████████| 31071/31071 [00:17<00:00, 1814.67it/s]


Failed 1590/31071
Loading data from cache: ('/Users/lukas/Desktop/projects/MIT/data/cache/dev-clean', 'f0_dct_4.pkl')


Preprocessing samples: 100%|██████████| 4217/4217 [00:02<00:00, 1917.07it/s]


Failed 217/4217
Loading data from cache: ('/Users/lukas/Desktop/projects/MIT/data/cache/test-clean', 'f0_dct_4.pkl')


Preprocessing samples: 100%|██████████| 4389/4389 [00:02<00:00, 1899.25it/s]

Failed 270/4389
Train dataset size: 29481
Validation dataset size: 4000
Test dataset size: 4119





In [6]:
train_texts, train_labels = dm.train_texts, dm.train_durations
val_texts, val_labels = dm.val_texts, dm.val_durations
test_texts, test_labels = dm.test_texts, dm.test_durations

print(
    f"Lengths of train, val, test in samples: {len(train_texts), len(val_texts), len(test_texts)}"
)

Lengths of train, val, test in samples: (31071, 4217, 4389)


In [7]:
from src.utils.text_processing import assign_labels_to_sentences

all_train_words, all_train_labels = assign_labels_to_sentences(
    train_texts, train_labels
)
all_dev_words, all_dev_labels = assign_labels_to_sentences(val_texts, val_labels)
all_test_words, all_test_labels = assign_labels_to_sentences(test_texts, test_labels)

print(f"Words and labels train: {len(all_train_words), len(all_train_labels)}")
print(f"Words and labels dev: {len(all_dev_words), len(all_dev_labels)}")
print(f"Words and labels test: {len(all_test_words), len(all_test_labels)}")

Words and labels train: (520358, 520358)
Words and labels dev: (72537, 72537)
Words and labels test: (77979, 77979)


In [8]:
all_train_labels = np.array(all_train_labels)
all_dev_labels = np.array(all_dev_labels)
all_test_labels = np.array(all_test_labels)

all_train_labels.shape, all_dev_labels.shape, all_test_labels.shape

((520358, 4), (72537, 4), (77979, 4))

### Kernel density estimation and Differential Entropy Computation

In [11]:
# bootstrapping to get confidence intervals
from sklearn.utils import resample
from scipy.stats import gaussian_kde
from src.utils.approximation import cross_validate_gkde_bandwidth
from src.utils.approximation import monte_carlo_diff_entropy

n_iterations = 10
n_train_size = int(len(all_train_labels) * 0.1)
n_dev_size = int(len(all_dev_labels) * 0.02)
n_test_size = int(len(all_test_labels) * 0.3)
print(
    f"n_train_size: {n_train_size}, n_dev_size: {n_dev_size}, n_test_size: {n_test_size}"
)

diff_entropy_list = []

for i in range(n_iterations):
    train_sample = resample(all_train_labels, n_samples=n_train_size)
    dev_sample = resample(all_dev_labels, n_samples=n_dev_size)
    test_sample = resample(all_test_labels, n_samples=n_test_size)
    # best_bw = 0.01
    # # best_bw = cross_validate_gkde_bandwidth(train_sample.T, dev_sample.T)
    # print(f"Best bandwidth: {best_bw}")
    density = gaussian_kde(all_train_labels.T, bw_method=0.1)
    mc_entropy = monte_carlo_diff_entropy(density, test_sample.T, len(test_sample))
    diff_entropy_list.append(mc_entropy)
    print(
        f"Finished iteration {i+1} out of {n_iterations} with diff entropy: {mc_entropy}"
    )

diff_entropy_list = np.array(diff_entropy_list)
print(f"Mean: {np.mean(diff_entropy_list)}, std: {np.std(diff_entropy_list)}")

n_train_size: 52035, n_dev_size: 1450, n_test_size: 23393
Finished iteration 1 out of 10 with diff entropy: 9.709436452446255
Finished iteration 2 out of 10 with diff entropy: 9.681574546466615
Finished iteration 3 out of 10 with diff entropy: 9.671035452211688
Finished iteration 4 out of 10 with diff entropy: 9.692982617089081
Finished iteration 5 out of 10 with diff entropy: 9.653652964713734
Finished iteration 6 out of 10 with diff entropy: 9.673444665170699
Finished iteration 7 out of 10 with diff entropy: 9.678973393367961
Finished iteration 8 out of 10 with diff entropy: 9.650818798463296
Finished iteration 9 out of 10 with diff entropy: 9.684177108634584
Finished iteration 10 out of 10 with diff entropy: 9.649133868803366
Mean: 9.674522986736728, std: 0.018378846723507404


In [36]:
from src.utils.approximation import cross_validate_gkde_bandwidth

nb_train_samples = 30000
nb_test_samples = 5000

train_indices = np.random.choice(
    np.arange(len(all_train_labels)), nb_train_samples, replace=False
)
train_data = all_train_labels[train_indices]
test_indices = np.random.choice(
    np.arange(len(all_test_labels)), nb_test_samples, replace=False
)
test_data = all_test_labels[test_indices]

best_bw = cross_validate_gkde_bandwidth(
    train_data=train_data.T,
    test_data=test_data.T,
)
print(f"best bw {best_bw}")

param scott, score -8.657614158143597
new best param scott, score -8.657614158143597
param silverman, score -8.76890310133348
param 0.01, score -22904.34414780233
param 0.1, score -206.66053136976763
param 0.3, score -18.74305193892195
param 0.5, score -9.18066585609889
best bw scott


In [41]:
# kernel density estimation
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt

density = gaussian_kde(np.array(all_train_labels[:20000]).T, bw_method=best_bw)

# xs = np.linspace(0, 6, 1000)
# plt.plot(xs, density(xs))
# plt.show()

In [43]:
indices = np.random.choice(len(all_test_labels), 50000, replace=False)
sampled_labels = all_test_labels[indices]

surprisals = density.logpdf(sampled_labels.T)
diff_entropy = -np.mean(surprisals)

print(f"Differential entropy: {diff_entropy:.4f}")

Differential entropy: 9.0481


In [25]:
from src.utils.plots import plot_vector_kde

# labels_non_nan = [p for ps in train_labels for p in ps if p]
train_labels_flat = np.concatenate(train_labels)

plot_vector_kde(
    train_labels_flat,
    bw_scalar=0.5,
    label_names=["DCT-1", "DCT-2", "DCT-3", "DCT-4"],
    title="Distribution of the 4 DCT f0 coefficients",
    # save_path="/Users/lukas/Desktop/projects/MIT/MIT_prosody/precomputed/predictions/energy_mean/energy_distribution.png",
)

AssertionError: Number of label names must match the number of coefficients

## Store text and labels as pickle 

In [None]:
SAVE_PATH = "/Users/lukas/Desktop/projects/MIT/data/baseline_data/f0_dct_4"

In [None]:
import pickle

with open(SAVE_PATH + "/train_words.pkl", "wb") as f:
    pickle.dump(all_train_words, f)

with open(SAVE_PATH + "/train_labels.pkl", "wb") as f:
    pickle.dump(all_train_labels, f)

with open(SAVE_PATH + "/test_words.pkl", "wb") as f:
    pickle.dump(all_test_words, f)

with open(SAVE_PATH + "/test_labels.pkl", "wb") as f:
    pickle.dump(all_test_labels, f)

with open(SAVE_PATH + "/dev_words.pkl", "wb") as f:
    pickle.dump(all_dev_words, f)

with open(SAVE_PATH + "/dev_labels.pkl", "wb") as f:
    pickle.dump(all_dev_labels, f)

# Baseline Models and Control Functions 

In [None]:
SAVE_PATH = "/Users/lukas/Desktop/projects/MIT/data/baseline_data/f0_dct_4"

In [None]:
# load data again

import pickle


with open(SAVE_PATH + "/train_words.pkl", "rb") as f:
    all_train_words = pickle.load(f)

with open(SAVE_PATH + "/train_labels.pkl", "rb") as f:
    all_train_labels = pickle.load(f)

with open(SAVE_PATH + "/test_words.pkl", "rb") as f:
    all_test_words = pickle.load(f)

with open(SAVE_PATH + "/test_labels.pkl", "rb") as f:
    all_test_labels = pickle.load(f)

with open(SAVE_PATH + "/dev_words.pkl", "rb") as f:
    all_dev_words = pickle.load(f)

with open(SAVE_PATH + "/dev_labels.pkl", "rb") as f:
    all_dev_labels = pickle.load(f)

print(len(all_train_words), len(all_train_labels))
print(len(all_dev_words), len(all_dev_labels))
print(len(all_test_words), len(all_test_labels))

520358 520358
72537 72537
77979 77979


In [None]:
from src.models.baselines.GloVe import GloVeModel

glove_model = GloVeModel(
    model_path="/Users/lukas/Desktop/projects/MIT/data/models/glove/glove.6B.100d.txt"
)

Loading GloVe: 100%|██████████| 400000/400000 [00:06<00:00, 60027.78it/s]


In [None]:
train_emb = [glove_model.get_word_embedding(word) for word in all_train_words]
dev_emb = [glove_model.get_word_embedding(word) for word in all_dev_words]
test_emb = [glove_model.get_word_embedding(word) for word in all_test_words]

print(
    f"Shapes of train, dev, test embeddings: {len(train_emb)}, {len(dev_emb)}, {len(test_emb)}"
)

# create numpy arrays and print shapes
import numpy as np

train_emb = np.array(train_emb)
dev_emb = np.array(dev_emb)
test_emb = np.array(test_emb)

train_labels = np.array(all_train_labels)
dev_labels = np.array(all_dev_labels)
test_labels = np.array(all_test_labels)


print(
    f"Shapes of train, dev, test embeddings: {train_emb.shape}, {dev_emb.shape}, {test_emb.shape}"
)

print(
    f"Shapes of train, dev, test labels: {train_labels.shape}, {dev_labels.shape}, {test_labels.shape}"
)

Shapes of train, dev, test embeddings: 520358, 72537, 77979
Shapes of train, dev, test embeddings: (520358, 100), (72537, 100), (77979, 100)
Shapes of train, dev, test labels: (520358, 4), (72537, 4), (77979, 4)


# Sklearn LinReg

In [None]:
# train sklearn model
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(train_emb, all_train_labels)

# compute mse
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from scipy.stats import pearsonr

predictions = reg.predict(test_emb)
mse = mean_absolute_error(all_test_labels, predictions)
print(f"Mean absolute error: {mse}")

# compute r2
r2 = r2_score(all_test_labels, predictions)
print(f"R2 score: {r2}")

# compute pearson
flat_labels = np.concatenate(all_test_labels)
flat_predictions = np.concatenate(predictions)
pearson = pearsonr(flat_labels, flat_predictions)
print(f"Pearson correlation: {pearson}")

Mean absolute error: 2.824094102286586
R2 score: 0.05075961504813015
Pearson correlation: PearsonRResult(statistic=0.2909611379801414, pvalue=0.0)
