In [2]:
import numpy as np
from prosody.src.data_preparation.helsinki import HelsinkiProminenceExtractor
import os

In [5]:
root_dir = "/Users/lukas/Desktop/projects/MIT/MIT_prosody/data/Helsinki"
save_path = "/Users/lukas/Desktop/projects/MIT/MIT_prosody/precomputed/predictions"
train_filename = "train_360.txt"
test_filename = "test.txt"

In [6]:
from prosody.src.data_preparation.helsinki import HelsinkiProminenceExtractor

train_extractor = HelsinkiProminenceExtractor(root_dir, train_filename)
test_extractor = HelsinkiProminenceExtractor(root_dir, test_filename)

train_texts = train_extractor.get_all_texts()
train_prominences = train_extractor.get_all_real_prominence()

test_texts = test_extractor.get_all_texts()
test_prominences = test_extractor.get_all_real_prominence()

print(f"train utterances {len(train_texts)}, test utterances {len(test_texts)}")

Loaded 116263 utterances
Loaded 4822 utterances
train utterances 116263, test utterances 4822


# Create GloVe Samples 

In [4]:
glove_dir = "/Users/lukas/Desktop/projects/MIT/MIT_prosody/precomputed/glove"

In [5]:
!wget http://nlp.stanford.edu/data/glove.6B.zip -P {glove_dir}
!unzip glove.6B.zip
!ls -lat

--2023-04-11 21:41:21--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-04-11 21:41:21--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-04-11 21:41:22--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘/Users/lukas/Deskto

In [88]:
weight_dir = (
    "/Users/lukas/Desktop/projects/MIT/prosody/precomputed/models/GloVe_weights"
)

vocab, embeddings = [], []
with open(os.path.join(weight_dir, "glove.6B.300d.txt"), "rt") as fi:
    full_content = fi.read().strip().split("\n")
for i in range(len(full_content)):
    i_word = full_content[i].split(" ")[0]
    i_embeddings = [float(val) for val in full_content[i].split(" ")[1:]]
    vocab.append(i_word)
    embeddings.append(i_embeddings)

In [89]:
import numpy as np

vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)

print(f"Shapes vocab: {vocab_npa.shape}  embeddings: {embs_npa.shape}")

Shapes vocab: (400000,)  embeddings: (400000, 300)


In [90]:
vocab_npa = np.insert(vocab_npa, 0, "<pad>")
vocab_npa = np.insert(vocab_npa, 1, "<unk>")
print(vocab_npa[:10])

pad_emb_npa = np.zeros((1, embs_npa.shape[1]))  # embedding for '<pad>' token.
unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)  # embedding for '<unk>' token.

# insert embeddings for pad and unk tokens at top of embs_npa.
embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))
print(embs_npa.shape)

['<pad>' '<unk>' 'the' ',' '.' 'of' 'to' 'and' 'in' 'a']
(400002, 300)


In [91]:
import torch

my_embedding_layer = torch.nn.Embedding.from_pretrained(
    torch.from_numpy(embs_npa).float()
)

assert my_embedding_layer.weight.shape == embs_npa.shape
print(my_embedding_layer.weight.shape)

torch.Size([400002, 300])


In [18]:
with open("vocab_npa.npy", "wb") as f:
    np.save(f, vocab_npa)

with open("embs_npa.npy", "wb") as f:
    np.save(f, embs_npa)

In [19]:
word_to_idx = {word: i for i, word in enumerate(vocab_npa)}
idx_to_word = {i: word for i, word in enumerate(vocab_npa)}

print(word_to_idx["house"])

168


In [30]:
texts[0], prominences[0]

("A 'JOLLY' ART CRITIC", [0.128, 2.454, 0.986, 0.233])

In [107]:
X = []
y = []

failed = 0
for i, sentence in enumerate(texts):
    for j, word in enumerate(sentence.split(" ")):
        try:
            word = word.lower()
            idx = word_to_idx[word]
            label = prominences[i][j]
            X.append(torch.tensor([embs_npa[idx]]))
            y.append(torch.tensor([label]))
        except:
            failed += 1
            continue

print(f"len X: {len(X)}  len y: {len(y)}  failed: {failed}")

X = torch.cat(X, dim=0)
y = torch.cat(y, dim=0)

print(f"failed fraction {failed/(failed+len(X))}")

len X: 88534  len y: 88534  failed: 1516
failed fraction 0.016835091615769016


In [108]:
print(f"shapes of X and y: {X.shape}, {y.shape}")

shapes of X and y: torch.Size([88534, 300]), torch.Size([88534])


In [109]:
# store the embeddings and labels
save_path = "/Users/lukas/Desktop/projects/MIT/prosody/precomputed/data_embeddings"
data_str = train_filename.replace(".txt", "")
model_name = f"glove_6B_300d_helsinki_continuous_{data_str}"
torch.save(X, os.path.join(save_path, f"{model_name}_X.pt"))
torch.save(y, os.path.join(save_path, f"{model_name}_y.pt"))

# Regression training

In [110]:
import torch

In [111]:
save_path = "/Users/lukas/Desktop/projects/MIT/prosody/precomputed/data_embeddings"
model_name = "glove_6B_300d_helsinki_continuous"
X_train_path = os.path.join(save_path, f"{model_name}_train_100_X.pt")
y_train_path = os.path.join(save_path, f"{model_name}_train_100_y.pt")
X_test_path = os.path.join(save_path, f"{model_name}_test_X.pt")
y_test_path = os.path.join(save_path, f"{model_name}_test_y.pt")
X_train = torch.load(X_train_path)
y_train = torch.load(y_train_path)
X_test = torch.load(X_test_path)
y_test = torch.load(y_test_path)

In [112]:
# split data into train and test set
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(
    f"Shapes of train and test set: {X_train.shape}, {X_test.shape}, {y_train.shape}, {y_test.shape}"
)

Shapes of train and test set: torch.Size([560915, 300]), torch.Size([88534, 300]), torch.Size([560915]), torch.Size([88534])


### Sklearn models

In [113]:
from prosody.src.models.sklearn.sklearn_models import train_sklearn_regressor

In [114]:
# Train dummy
from sklearn.dummy import DummyRegressor

dummy_regressor = DummyRegressor(strategy="mean")
model, scores = train_sklearn_regressor(
    dummy_regressor, X_train, y_train, X_test, y_test
)
print("Dummy Regressor Scores: {}".format(scores))

Dummy Regressor Scores: {'train': {'mean_absolute_error': 0.6437916, 'mean_squared_error': 0.637565, 'r2_score': 0.0}, 'test': {'mean_absolute_error': 0.6464712, 'mean_squared_error': 0.64568114, 'r2_score': -1.3697985488558828e-05}}


In [115]:
# Linear Regression
from sklearn.linear_model import LinearRegression

linear_regressor = LinearRegression()
model, scores = train_sklearn_regressor(
    linear_regressor, X_train, y_train, X_test, y_test
)
print(f"Linear regression scores: {scores}")

Linear regression scores: {'train': {'mean_absolute_error': 0.48051304851959553, 'mean_squared_error': 0.44526446002107334, 'r2_score': 0.3016170541255997}, 'test': {'mean_absolute_error': 0.4846849476697734, 'mean_squared_error': 0.4536318040207255, 'r2_score': 0.29742717230591875}}


In [116]:
# GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

hist_regressor = HistGradientBoostingRegressor()
model, scores = train_sklearn_regressor(
    hist_regressor,
    X_train,
    y_train,
    X_test,
    y_test,
    save_model=True,
    save_path=save_path,
)
print(f"Hist Gradient Boosting Regressor scores: {scores}")

Hist Gradient Boosting Regressor scores: {'train': {'mean_absolute_error': 0.4465646042592211, 'mean_squared_error': 0.3980175811111108, 'r2_score': 0.3757222599957225}, 'test': {'mean_absolute_error': 0.4543714458066604, 'mean_squared_error': 0.4118446241972164, 'r2_score': 0.3621460408458822}}


In [117]:
# MLP
from sklearn.neural_network import MLPRegressor

mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(100, 10),
    max_iter=10,
    alpha=1e-4,
    solver="adam",
    tol=1e-4,
    random_state=1,
    learning_rate_init=0.1,
    early_stopping=True,
)
model, scores = train_sklearn_regressor(mlp_regressor, X_train, y_train, X_test, y_test)
print(f"MLP Regressor scores: {scores}")



MLP Regressor scores: {'train': {'mean_absolute_error': 0.6449205532514198, 'mean_squared_error': 0.6375879964949015, 'r2_score': -3.621044713986166e-05}, 'test': {'mean_absolute_error': 0.6476010928802634, 'mean_squared_error': 0.6456756848846676, 'r2_score': -5.263480176820323e-06}}
