## Relative-to-Previous Prominence Task - Differential Entropy and Control Functions

In [None]:
from src.data.components.helsinki import HelsinkiProminenceExtractor
from src.data.components.datasets import TokenTaggingDataset
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer
import numpy as np
import os
from tqdm import tqdm

from src.utils.text_processing import python_lowercase_remove_punctuation
from src.utils.text_processing import get_wordlist_from_string

# Pretrained GPT-2 tokenizer. It is only needed to create a valid dataset:
# it is handed to TokenTaggingDataset below via `tokenizer=dummy_tokenizer`.
dummy_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", add_special_tokens=True)

### Load data

In [None]:
# Folder passed to HelsinkiProminenceExtractor together with the split file names.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DATA_DIR = "/Users/lukas/Desktop/projects/MIT/MIT_prosody/data/Helsinki"
# Output folder for this task. NOTE(review): the baseline cells further down
# re-assign SAVE_DIR to a placeholder path, so this value is not the one used
# when the baseline predictions are pickled.
SAVE_DIR = "/Users/lukas/Desktop/projects/MIT/MIT_prosody/precomputed/predictions/emnlp/prominence_relative_prev"

In [None]:
def _load_split(split_file):
    """Create the extractor for one split file and read its texts and prominences.

    Returns the tuple ``(extractor, texts, prominences)``; the extractor is
    built first, then the texts are read, then the prominences.
    """
    extractor = HelsinkiProminenceExtractor(DATA_DIR, split_file)
    return extractor, extractor.get_all_texts(), extractor.get_all_real_prominence()


# One extractor per split file found under DATA_DIR.
train_extractor, train_texts, train_prominences = _load_split("train_360.txt")
dev_extractor, dev_texts, dev_prominences = _load_split("dev.txt")
test_extractor, test_texts, test_prominences = _load_split("test.txt")

# Report how many texts each split contains.
print(
    f"train_texts: {len(train_texts)}, dev_texts: {len(dev_texts)}, test_texts: {len(test_texts)}"
)

In [None]:
def _split_into_words(texts):
    """Return every whitespace-separated token of ``texts`` as one flat list."""
    words = []
    for text in texts:
        words.extend(text.split())
    return words


train_words = _split_into_words(train_texts)
dev_words = _split_into_words(dev_texts)
test_words = _split_into_words(test_texts)

# Report the token count of each split.
print(
    f"train_words: {len(train_words)}, dev_words: {len(dev_words)}, test_words: {len(test_words)}"
)

In [None]:
# Mean prominence over all annotated training words.
# Missing annotations are excluded explicitly. A plain truthiness test
# (`if p`) would also throw away genuine 0.0 prominences - biasing the mean
# upwards - while letting NaN values through.
# NOTE(review): assumes missing values are encoded as None or NaN - confirm
# against HelsinkiProminenceExtractor.get_all_real_prominence.
GLOBAL_MEAN_PROMINENCE = np.mean(
    [
        p
        for ps in train_prominences
        for p in ps
        if p is not None and not np.isnan(p)
    ]
)
GLOBAL_MEAN_PROMINENCE

In [None]:
from src.utils.plots import plot_kde

# Flat list of all training prominences that carry a real value.
# Only missing entries are removed; 0.0 is a valid prominence and is kept
# (a bare `if p` would drop it and would not remove NaN).
# NOTE(review): assumes missing values are encoded as None or NaN - confirm
# against HelsinkiProminenceExtractor.get_all_real_prominence.
labels_non_nan = [
    p
    for ps in train_prominences
    for p in ps
    if p is not None and not np.isnan(p)
]

# plot_kde(labels_non_nan, label_name="Absolute Prominence", title="Absolute Prominence Distribution", save_path=SAVE_DIR + "/absolute_prominence_distribution.png")

In [None]:
# Settings shared by the train and test datasets. The tokenizer is the dummy
# GPT-2 tokenizer created above.
# NOTE(review): relative_to_prev=True with n_prev=3 presumably makes each label
# relative to the 3 previous words (the plot below is titled
# "Relative to 3 Previous Prominence") - confirm in TokenTaggingDataset.
_shared_dataset_kwargs = {
    "tokenizer": dummy_tokenizer,
    "model_name": "gpt2",
    "score_last_token": True,
    "relative_to_prev": True,
    "n_prev": 3,
}

train_dataset = TokenTaggingDataset(
    input_texts=train_texts,
    targets=train_prominences,
    **_shared_dataset_kwargs,
)

test_dataset = TokenTaggingDataset(
    input_texts=test_texts,
    targets=test_prominences,
    **_shared_dataset_kwargs,
)

In [None]:
def _collect_sentences_and_labels(dataset):
    """Gather the input text and the loss-masked labels of every dataset item.

    Each item is read through normal subscription (``dataset[idx]``). Of its
    ``tokenized_labels`` only the positions whose ``loss_mask`` equals 1 are
    kept.

    Returns:
        A pair ``(sentences, labels)`` with one entry per dataset item:
        ``sentences[k]`` is the item's ``input_text`` and ``labels[k]`` is a
        NumPy array holding that item's unmasked labels.
    """
    sentences = []
    labels_per_sentence = []
    for idx in range(len(dataset)):
        item = dataset[idx]
        sentences.append(item["input_text"])
        mask = np.array(item["loss_mask"])
        labels = np.array(item["tokenized_labels"])
        # Boolean-mask indexing already yields a fresh array, so no extra copy.
        labels_per_sentence.append(labels[mask == 1])
    return sentences, labels_per_sentence


train_sentences, train_labels = _collect_sentences_and_labels(train_dataset)
test_sentences, test_labels = _collect_sentences_and_labels(test_dataset)

In [None]:
from src.utils.text_processing import assign_labels_to_sentences

# Pair words with labels for each split.
# NOTE(review): later cells zip the returned words with the returned labels and
# pass the labels straight to gaussian_kde, so both are presumably flat,
# equally long sequences - confirm in assign_labels_to_sentences.
train_alignment = assign_labels_to_sentences(train_sentences, train_labels)
all_train_words, all_train_labels = train_alignment

test_alignment = assign_labels_to_sentences(test_sentences, test_labels)
all_test_words, all_test_labels = test_alignment

# Sanity check: print the number of words and of labels for each split.
for split_words, split_labels in (train_alignment, test_alignment):
    print(len(split_words), len(split_labels))

In [None]:
from src.utils.plots import plot_kde

# labels_non_nan = [p for ps in all_train_labels for p in ps if p]

# Plot the distribution of the relative training labels. No save_path is
# passed in this call.
plot_kde(
    all_train_labels,
    label_name="Relative to 3 Previous Prominence",
    title="Relative Prominence Distribution",
)

### Kernel density estimation and Differential Entropy Computation

In [None]:
# kernel density estimation
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt

# Fit a Gaussian kernel density estimate to the training labels. `density`
# is handed to monte_carlo_diff_entropy in the next cell.
density = gaussian_kde(all_train_labels)

# xs = np.linspace(0, 6, 1000)
# plt.plot(xs, density(xs))
# plt.show()

In [None]:
from src.utils.approximation import monte_carlo_diff_entropy

# Estimate the differential entropy of the training labels from the fitted KDE
# (per the helper's name, via Monte-Carlo approximation).
# NOTE(review): the third argument (1000) is presumably the number of
# Monte-Carlo samples - confirm in src.utils.approximation.
diff_entropy = monte_carlo_diff_entropy(density, all_train_labels, 1000)
diff_entropy

In [None]:
# Re-display the estimate computed in the previous cell.
diff_entropy

# Baseline Models and Control Functions 

### Average over all words in the corpus

In [None]:
import os
import pickle

from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Corpus-average baseline: predict the mean training label for every test word.
# The labels here are the relative-to-previous values produced above, even
# though the printed message below calls them "prominence".
avg_difference = np.mean(all_train_labels)
print(f"Average prominence: {avg_difference}")

# One constant prediction per test label.
predictions = [avg_difference] * len(all_test_labels)

# Mean absolute error (stored as `mae`; this is not a mean squared error).
mae = mean_absolute_error(all_test_labels, predictions)
print(f"Mean absolute error: {mae}")

# Coefficient of determination.
r2 = r2_score(all_test_labels, predictions)
print(f"R2 score: {r2}")

# NOTE: the predictions are constant, so the Pearson correlation is undefined;
# SciPy reports NaN (with a constant-input warning). The call is kept so this
# baseline prints the same set of metrics as the other baselines.
pearson = pearsonr(all_test_labels, predictions)
print(f"Pearson correlation: {pearson}")

# Per-entry prediction lists: one constant value for each space-separated
# token of every entry in all_test_words.
avg_test_predictions = [
    [avg_difference] * len(entry.split(" ")) for entry in all_test_words
]

# NOTE(review): this re-assignment replaces any SAVE_DIR configured earlier in
# the notebook with a placeholder path.
SAVE_DIR = "./path/to/save/directory"  # Please specify your directory path
os.makedirs(f"{SAVE_DIR}/avg", exist_ok=True)

# store predictions
with open(f"{SAVE_DIR}/avg/pred_avg.pkl", "wb") as f:
    pickle.dump(avg_test_predictions, f)

# store texts
with open(f"{SAVE_DIR}/avg/texts_avg.pkl", "wb") as f:
    pickle.dump(all_test_words, f)

# store labels
with open(f"{SAVE_DIR}/avg/labels_avg.pkl", "wb") as f:
    pickle.dump(all_test_labels, f)

### Corpus statistics: predict average per word 

In [None]:
# collect the words types and their respective labels
word_prominence = {}
for word, prominence in zip(all_train_words, all_train_labels):
    if word not in word_prominence:
        word_prominence[word] = []
    word_prominence[word].append(prominence)

# compute the average prominence score for each word
word_prominence_avg = {}
for word, prominence in word_prominence.items():
    word_prominence_avg[word] = np.mean(prominence)

# for each word in the test set, get the average prominence score
predictions = []
for word in all_test_words:
    if word in word_prominence_avg:
        predictions.append(word_prominence_avg[word])
    else:
        predictions.append(avg_difference)  # avg_difference needs to be defined

print(f"Length of test set: {len(all_test_labels)}")
print(f"Length of predictions: {len(predictions)}")

# compute mae
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from scipy.stats import pearsonr

mse = mean_absolute_error(all_test_labels, predictions)
print(f"Mean absolute error: {mse}")

# compute r2
r2 = r2_score(all_test_labels, predictions)
print(f"R2 score: {r2}")

# compute pearson
pearson = pearsonr(all_test_labels, predictions)
print(f"Pearson correlation: {pearson}")

# store predictions
word_test_predictions = []
for sentence in all_test_words:
    sentence_predictions = [
        word_prominence_avg[word] if word in word_prominence_avg else avg_difference
        for word in sentence.split()
    ]
    word_test_predictions.append(sentence_predictions)

# store predictions
import pickle
import os

SAVE_DIR = "./path/to/save/directory"  # Please specify your directory path
os.makedirs(f"{SAVE_DIR}/wordavg", exist_ok=True)

with open(f"{SAVE_DIR}/wordavg/pred_wordavg.pkl", "wb") as f:
    pickle.dump(word_test_predictions, f)

# store texts
with open(f"{SAVE_DIR}/wordavg/texts_wordavg.pkl", "wb") as f:
    pickle.dump(all_test_words, f)

# store labels
with open(f"{SAVE_DIR}/wordavg/labels_wordavg.pkl", "wb") as f:
    pickle.dump(all_test_labels, f)

## GloVe Baseline

In [None]:
# Location of the 300-dimensional GloVe vectors used by the control function.
GLOVE_PATH = "/Users/lukas/Desktop/projects/MIT/data/models/glove/glove.6B.300d.txt"

# Settings describing the network itself.
_NETWORK_SETTINGS = {
    "num_layers": 3,
    "input_size": 300,  # Update this based on the word embedding model
    "hidden_size": 32,
    "num_labels": 1,
    "dropout_probability": 0.1,
}

# Settings describing the optimisation run.
_TRAINING_SETTINGS = {
    "learning_rate": 0.001,
    "batch_size": 32,
    "max_epochs": 3,
}

# Hyper-parameters handed to ControlFunction; key order: network, then training.
H_PARAMS = {**_NETWORK_SETTINGS, **_TRAINING_SETTINGS}

In [None]:
from src.models.baselines.control_function import ControlFunction

# Control function configured for GloVe embeddings read from GLOVE_PATH, with
# the hyper-parameters in H_PARAMS. It is fitted and evaluated in the
# following cells.
control_function = ControlFunction(
    word_embedding_type="glove", word_embedding_path=GLOVE_PATH, hparams=H_PARAMS
)

In [None]:
# Fit the control function on the training words and their labels.
control_function.fit(words=all_train_words, labels=all_train_labels)

In [None]:
# store predictions

# Predict with the fitted control function, then flatten the nested result
# into one flat list; it is compared against all_test_labels in the next cell.
nested_pred = control_function.predict(all_test_words)
pred = []
for item_predictions in nested_pred:
    pred.extend(item_predictions)

In [None]:
# compute mae
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(all_test_labels, pred)
print(f"Mean absolute error: {mae}")

# compute r2
from sklearn.metrics import r2_score

r2 = r2_score(all_test_labels, pred)
print(f"R2 score: {r2}")

# compute pearson
from scipy.stats import pearsonr

pearson = pearsonr(np.array(all_test_labels), pred)
print(f"Pearson correlation: {pearson}")

### Gradient Boosting

In [None]:
# Reuse the embedding model held by the control function; the next cell calls
# its get_word_embedding on every train and test word.
word_embedding_model = control_function.word_embedding_model

In [None]:
# Look up one embedding per word for both splits, in word order.
_embed_word = word_embedding_model.get_word_embedding
train_embeddings = list(map(_embed_word, all_train_words))
test_embeddings = list(map(_embed_word, all_test_words))

In [None]:
# sklearn histgrad regressor
from sklearn.ensemble import HistGradientBoostingRegressor

hgb = HistGradientBoostingRegressor(max_iter=500)
hgb.fit(train_embeddings, all_train_labels)

# store predictions
pred = hgb.predict(test_embeddings)

# compute mae
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(all_test_labels, pred)
print(f"Mean absolute error: {mae}")

# compute r2
from sklearn.metrics import r2_score

r2 = r2_score(all_test_labels, pred)
print(f"R2 score: {r2}")

# compute pearson
from scipy.stats import pearsonr

pearson = pearsonr(np.array(all_test_labels), pred)
print(f"Pearson correlation: {pearson}")