# Create the baseline results

In [25]:
import numpy as np

## Load data via torch dataset

In [1]:
from transformers import BertTokenizer

# only to create a valid dataset
dummy_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-cased", add_special_tokens=True
)

In [2]:
from src.data.components.helsinki import HelsinkiProminenceExtractor
from src.data.components.datasets import TokenTaggingDataset
from torch.utils.data import DataLoader

In [3]:
train_extractor = HelsinkiProminenceExtractor(
    "/Users/lukas/Desktop/projects/MIT/prosody/prosody/repositories/helsinki-prosody/data",
    "train_360.txt",
)
train_texts = train_extractor.get_all_texts()
train_prominences = train_extractor.get_all_real_prominence()

test_extractor = HelsinkiProminenceExtractor(
    "/Users/lukas/Desktop/projects/MIT/prosody/prosody/repositories/helsinki-prosody/data",
    "test.txt",
)
test_texts = test_extractor.get_all_texts()
test_prominences = test_extractor.get_all_real_prominence()

print(f"train_texts: {len(train_texts)}, test_texts: {len(test_texts)}")

train_texts: 116263, test_texts: 4822


In [5]:
train_dataset = TokenTaggingDataset(
    train_texts,
    train_prominences,
    dummy_tokenizer,
    "bert-cased",
    score_first_token=True,
    relative_to_prev=True,
    n_prev=3,
)

test_dataset = TokenTaggingDataset(
    test_texts,
    test_prominences,
    dummy_tokenizer,
    "bert-cased",
    score_first_token=True,
    relative_to_prev=True,
    n_prev=3,
)

In [30]:
train_sentences = []
train_labels = []
for i in range(len(train_dataset)):
    item = train_dataset.__getitem__(i)
    train_sentences.append(item["input_text"])
    # print(f"length split {i}: {len(item['input_text'].split(' '))}")
    # original = np.array(item["original_labels"])
    # print("original", original)
    mask = np.array(item["loss_mask"])
    # print("mask", mask)
    labels = np.array(item["tokenized_labels"])
    # print("labels", labels)
    valid_labels = np.array(labels[mask == 1])
    # print(f"length valid {i}: {len(valid_labels)}")
    # print("valid", valid_labels)
    train_labels.append(valid_labels)

In [31]:
test_sentences = []
test_labels = []
for i in range(len(test_dataset)):
    item = test_dataset.__getitem__(i)
    test_sentences.append(item["input_text"])
    # print(f"length split {i}: {len(item['input_text'].split(' '))}")
    # original = np.array(item["original_labels"])
    # print("original", original)
    mask = np.array(item["loss_mask"])
    # print("mask", mask)
    labels = np.array(item["tokenized_labels"])
    # print("labels", labels)
    valid_labels = np.array(labels[mask == 1])
    # print(f"length valid {i}: {len(valid_labels)}")
    # print("valid", valid_labels)
    test_labels.append(valid_labels)

### Process data
#### Remove punctuation, lowercase everything 

In [32]:
from src.utils.text_processing import python_lowercase_remove_punctuation

train_sentences = python_lowercase_remove_punctuation(train_sentences)
test_sentences = python_lowercase_remove_punctuation(test_sentences)

train_sentences[:5]

['for man of you your characteristic race here may he hardy sweet gigantic grow here tower proportionate to nature here climb the vast pure spaces unconfined uncheckd by wall or roof here laugh with storm or sun here joy here patiently inure here heed himself unfold himself not others formulas heed here fill his time to duly fall to aid last to disappear to serve',
 'tom the pipers son',
 'tom tom the pipers son stole a pig and away he run the pig was eat and tom was beat and tom ran crying down the street',
 'there was not a worse vagabond in shrewsbury than old barney the piper',
 'he never did any work except to play the pipes and he played so badly that few pennies ever found their way into his pouch']

In [33]:
# create a list of all words in the training set
train_words = [word for sentence in train_sentences for word in sentence.split(" ")]
train_prominences = [prominence for sentence in train_labels for prominence in sentence]

# create a list of all words in the test set
test_words = [word for sentence in test_sentences for word in sentence.split(" ")]
test_prominences = [prominence for sentence in test_labels for prominence in sentence]

print(
    f"Lengths of train and test set: {len(train_words)} and {len(test_words)}, respectively"
)
print(
    f"Lengths of train and test labels: {len(train_prominences)} and {len(test_prominences)}, respectively"
)

Lengths of train and test set: 2075946 and 90050, respectively
Lengths of train and test labels: 2075946 and 90050, respectively


## Simple Models

### Random predictions 

In [34]:
from src.models.baselines.dummy_models import DummyModel

dummy_model = DummyModel(train_prominences, nb_sig=3)
print(f"Dummy model ")

predictions = dummy_model.predict(len(test_prominences))

# compute mse
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(test_prominences, predictions)
print(f"Mean squared error: {mse}")

Dummy model 
Mean squared error: 3.704456719075822


### Avg of all word in corpus

In [38]:
avg_difference = np.mean(train_prominences)
print(f"Average difference in prominence: {avg_difference}")

# compute mse
from sklearn.metrics import mean_squared_error

predictions = [avg_difference] * len(test_prominences)
mse = mean_squared_error(test_prominences, predictions)
print(f"Mean squared error: {mse}")

Average difference in prominence: 0.1367394704229622
Mean squared error: 0.967241405994297


### Corpus statistics: predict average diff per word 

In [39]:
# collect the words and their prominence scores
word_prominence = {}
for word, prominence in zip(train_words, train_prominences):
    if word not in word_prominence:
        word_prominence[word] = []
    word_prominence[word].append(prominence)

# compute the average prominence score for each word
word_prominence_avg = {}
for word, prominence in word_prominence.items():
    word_prominence_avg[word] = np.mean(prominence)

# for each word in the test set, get the average prominence score
predictions = []
for word in test_words:
    if word in word_prominence_avg:
        predictions.append(word_prominence_avg[word])
    else:
        predictions.append(avg_difference)

# compute mse
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(test_prominences, predictions)
print(f"Mean squared error: {mse}")

Mean squared error: 0.6902478109049962


## GloVe Embedding Baseline
#### Expects Embeddings already downloaded 

In [69]:
import os
from prosody.src.models.sklearn.sklearn_models import train_sklearn_regressor

In [42]:
weight_dir = "/Users/lukas/Desktop/projects/MIT/MIT_prosody/precomputed/glove"

vocab, embeddings = [], []
with open(os.path.join(weight_dir, "glove.6B.300d.txt"), "rt") as fi:
    full_content = fi.read().strip().split("\n")
for i in range(len(full_content)):
    i_word = full_content[i].split(" ")[0]
    i_embeddings = [float(val) for val in full_content[i].split(" ")[1:]]
    vocab.append(i_word)
    embeddings.append(i_embeddings)

In [43]:
import numpy as np

vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)

print(f"Shapes vocab: {vocab_npa.shape}  embeddings: {embs_npa.shape}")

Shapes vocab: (400000,)  embeddings: (400000, 300)


In [44]:
vocab_npa = np.insert(vocab_npa, 0, "<pad>")
vocab_npa = np.insert(vocab_npa, 1, "<unk>")
print(vocab_npa[:10])

pad_emb_npa = np.zeros((1, embs_npa.shape[1]))  # embedding for '<pad>' token.
unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)  # embedding for '<unk>' token.

# insert embeddings for pad and unk tokens at top of embs_npa.
embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))
print(embs_npa.shape)

['<pad>' '<unk>' 'the' ',' '.' 'of' 'to' 'and' 'in' 'a']
(400002, 300)


In [45]:
import torch

my_embedding_layer = torch.nn.Embedding.from_pretrained(
    torch.from_numpy(embs_npa).float()
)

assert my_embedding_layer.weight.shape == embs_npa.shape
print(my_embedding_layer.weight.shape)

torch.Size([400002, 300])


In [46]:
word_to_idx = {word: i for i, word in enumerate(vocab_npa)}
idx_to_word = {i: word for i, word in enumerate(vocab_npa)}

print(word_to_idx["house"])

168


In [55]:
# Create training and test data based on embedding of the word
train_data = []
for word in train_words:
    if word in word_to_idx:
        train_data.append(my_embedding_layer(torch.tensor(word_to_idx[word])))
    else:
        train_data.append(my_embedding_layer(torch.tensor(word_to_idx["<unk>"])))

test_data = []
for word in test_words:
    if word in word_to_idx:
        test_data.append(my_embedding_layer(torch.tensor(word_to_idx[word])))
    else:
        test_data.append(my_embedding_layer(torch.tensor(word_to_idx["<unk>"])))

print(
    f"Lengths of train and test set: {len(train_data)} and {len(test_data)}, respectively"
)

Lengths of train and test set: 2075946 and 90050, respectively


In [62]:
train_data = torch.stack(train_data)
test_data = torch.stack(test_data)

train_data.shape

TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor

In [65]:
train_labels = torch.tensor(train_prominences)
test_labels = torch.tensor(test_prominences)

train_labels.shape, test_labels.shape

(torch.Size([2075946]), torch.Size([90050]))

### GloVe: sklearn dummy regression

In [66]:
from sklearn.dummy import DummyRegressor

dummy_model = DummyRegressor(strategy="mean")
dummy_model.fit(train_data, train_labels)

predictions = dummy_model.predict(test_data)

# compute mse
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(test_labels, predictions)
print(f"Mean squared error: {mse}")

Mean squared error: 0.967241405994297


### GloVe: sklearn linear regression 

In [67]:
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
linear_model.fit(train_data, train_labels)

predictions = linear_model.predict(test_data)

# compute mse
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(test_labels, predictions)
print(f"Mean squared error: {mse}")

Mean squared error: 0.7338721230640632


### GloVe: sklearn histgradboost

In [68]:
from sklearn.ensemble import HistGradientBoostingRegressor

hist_model = HistGradientBoostingRegressor()
hist_model.fit(train_data, train_labels)

predictions = hist_model.predict(test_data)

# compute mse
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(test_labels, predictions)
print(f"Mean squared error: {mse}")

Mean squared error: 0.677030123714282
