In [1]:
from pytorch_lightning.utilities.seed import seed_everything
seed_everything(seed=42)

Global seed set to 42


42

# Data preparation

In [2]:
import numpy as np

with open("segmentation_dataset.npz", "rb") as f:
  npz = np.load(f, allow_pickle=True)
  X_train, y_train, X_valid, y_valid, X_test, y_test = npz.values()

Pad all the sequences to the longest sequence in the split and encode them using the embedding.

In [3]:
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import OneHotEncoder

labels = list(set.union(*[set(yi) for yi in y_train]))
encoder = OneHotEncoder().fit(np.array(labels).reshape(-1, 1))

def embed_data(model, X, y, encoder):
    seq_length = [len(xi) for xi in X]
    max_seq_length = max(seq_length)
    
    padded_seq = np.stack(
        [np.pad(xi, (0, max_seq_length - len(xi)), constant_values="P") 
         for xi in X])
    
    padding_mask = torch.tensor((padded_seq == "P").astype(int))
    padded_seq[padding_mask == 1] = "N"
    
    embedded_seq = torch.tensor(np.stack([[model[sample]
                                         for sample in seq] 
                                        for seq in padded_seq]))
    
    
    encoded_labels = [torch.tensor(encoder.transform(yi.reshape(-1, 1)).todense()) for yi in y]
    padded_labels = pad_sequence(encoded_labels, batch_first=True, padding_value=0)
  
    return embedded_seq, padded_labels, padding_mask

# LSTM Model

## Model definition

In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [5]:
import torch.nn as nn
import torch
import pytorch_lightning as pl
from segmentation.metrics import pairwise_metrics, under_over_segmentation
from collections import defaultdict
from more_itertools import stagger

class BaselineModel(pl.LightningModule):
    def __init__(self, 
                 embedding_dim: int = 10,
                 hidden_size: int = 100, 
                 dropout: float = 0.0,
                 num_layers: int = 1, 
                 num_labels: int = 10):
        super().__init__()
        self.save_hyperparameters()
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size,
                            dropout=dropout,
                            num_layers=num_layers,
                            batch_first=True)
        self.classification = nn.Linear(hidden_size, num_labels)
        self.softmax = nn.Softmax(dim=2)
        
    def _predict(self, batch):
        x, y, mask = batch
        x, _ = self.lstm(x)
        x = self.classification(x)
        x = self.softmax(x)
               
        loss = nn.functional.binary_cross_entropy(x[mask == 0].float(), y[mask == 0].float())
        return x, loss
    
    def _postprocess(self, pred):
        for i, (p, c, n) in enumerate(stagger(pred)):
            if p != None and p != c != n:
                pred[i] = n
        return pred
    
    def _test(self, batch):
        metrics = defaultdict(list)
        x, y, mask = batch
        
        with torch.no_grad():
            pred, loss = self._predict(batch)
            
            for pi, yi, mi in zip(pred, y, mask):
                pi = pi[mi == 0].argmax(axis=-1).cpu().numpy()
                _, pi = np.unique(pi, return_inverse=True)
            
                yi = yi[mi == 0].argmax(axis=-1).cpu().numpy()
                _, yi = np.unique(yi, return_inverse=True)
            
                precision, recall, f1 = pairwise_metrics(yi, pi)
                metrics["precision"].append(precision)
                metrics["recall"].append(recall)
                metrics["f1"].append(f1)
                under, over = under_over_segmentation(yi, pi)
                metrics["under"] = under
                metrics["over"] = over
        
        metrics = {k: np.mean(v) for k, v in metrics.items()}
        return loss, metrics

    def training_step(self, batch, batch_idx):
        _, loss = self._predict(batch)
        self.log("train_loss", loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss, metrics = self._test(batch)
        self.log("val_loss", loss)
        for k, m in metrics.items(): self.log(f"val_{k}", m)
        return loss
    
    def test_step(self, batch, batch_idx):
        loss, metrics = self._test(batch)        
        self.log("test_loss", loss)
        for k, m in metrics.items(): self.log(f"test_{k}", m)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "train_loss"
            }
        }

In [6]:
from torch.utils.data import DataLoader

class DataModule(pl.LightningDataModule):
    def __init__(self, train, valid = None, test = None, batch_size: int = 32):
        super().__init__()
        self._train = train
        self._valid = valid
        self._test = test
        self.batch_size = batch_size
        
    def train_dataloader(self):
        return DataLoader(list(zip(*self._train)), batch_size=self.batch_size)

    def val_dataloader(self):
        if self._valid is not None:
            return DataLoader(list(zip(*self._valid)), batch_size=self.batch_size)
        
    def test_dataloader(self):
        if self._test is not None:
            return DataLoader(list(zip(*self._test)), batch_size=self.batch_size)

## Train using harte2vec

In [76]:
from harte2vec.harte2vec import Harte2Vec
harte2vec = Harte2Vec.from_pretrimport pandas as pd

df = pd.DataFrame.from_dict([dict(**res[0], **res[1][0]) for res in results])
df.iloc[(df["val_under"] + df["val_over"] + df["val_f1"]).sort_values(ascending=False).index]ained("harte2vec.pt")

data = DataModule(embed_data(harte2vec, X_train, y_train, encoder), 
                  embed_data(harte2vec, X_valid, y_valid, encoder), 
                  embed_data(harte2vec, X_test, y_test, encoder))

In [77]:
from pytorch_lightning.callbacks import EarlyStopping, StochasticWeightAveraging
import logging

logging.getLogger("pytorch_lightning").setLevel(logging.CRITICAL)


def train(config, data=data, epochs=500, validate=True, log=False):
    model = BaselineModel(**config)
    
    trainer = pl.Trainer(max_epochs=epochs, accelerator="gpu", devices=1,
                         enable_progress_bar=False,
                         callbacks=[
                             EarlyStopping(monitor="train_loss", min_delta=0.00, patience=2),
                             StochasticWeightAveraging(swa_lrs=1e-2)
                         ])
    trainer.fit(model, datamodule=data)
    results = trainer.validate(model, datamodule=data, verbose=False) if validate else None
    return results, model

### Hyperparameter search

In [78]:
from itertools import product
from tqdm.auto import tqdm

parameters = { 
    "hidden_size": [2, 5, 10],
    "num_layers": [5, 10],
    "dropout": [0.0, 0.5],
}

results = list()
combinations = list(enumerate(product(*parameters.values())))


for idx, params in tqdm(combinations):
    params = dict(zip(parameters.keys(), params))
    res, _ = train(params, data=data, epochs=350)
    results.append((params, res))

  0%|          | 0/12 [00:00<?, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


In [79]:
import pandas as pd

df = pd.DataFrame.from_dict([dict(**res[0], **res[1][0]) for res in results])
df.iloc[(df["val_under"] + df["val_over"] + df["val_f1"]).sort_values(ascending=False).index]

Unnamed: 0,hidden_size,num_layers,dropout,val_loss,val_precision,val_recall,val_f1,val_under,val_over
2,2,10,0.0,0.246221,0.453463,0.940055,0.591652,0.956955,0.387198
8,10,5,0.0,0.235573,0.510368,0.897013,0.624548,0.900554,0.362908
6,5,10,0.0,0.238436,0.491842,0.923419,0.620334,0.900554,0.362908
10,10,10,0.0,0.234461,0.486818,0.933336,0.617251,0.900554,0.362908
0,2,5,0.0,0.25642,0.46173,0.954748,0.60187,0.900554,0.362908
7,5,10,0.5,0.248866,0.46173,0.954748,0.60187,0.900554,0.362908
11,10,10,0.5,0.24724,0.46173,0.954748,0.60187,0.900554,0.362908
3,2,10,0.5,0.265206,0.415539,1.0,0.571265,1.0,0.212421
9,10,5,0.5,0.243349,0.461484,0.944391,0.595604,0.881099,0.306476
1,2,5,0.5,0.262745,0.42521,0.997376,0.579649,0.956955,0.243891


### Test best hyperparameters

In [80]:
from harte2vec.harte2vec import Harte2Vec
harte2vec = Harte2Vec.from_pretrained("harte2vec.pt")

data = DataModule(embed_data(harte2vec, np.concatenate([X_train, X_valid]), np.concatenate([y_train, y_valid]), encoder),
                  test=embed_data(harte2vec, X_test, y_test, encoder))

In [92]:
model = BaselineModel(hidden_size=5, num_layers=10, dropout=0.0)

trainer = pl.Trainer(default_root_dir="models/harte2vec_lstm",
                     max_epochs=350, accelerator="gpu", devices=1,
                     callbacks=[
                         EarlyStopping(monitor="train_loss", patience=10),
                         StochasticWeightAveraging(swa_lrs=1e-2),
                     ])

trainer.fit(model, datamodule=data)

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [91]:
metrics = trainer.test(model, datamodule=data)

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         test_f1            0.5716114712680601
        test_loss           0.2264181226491928
        test_over           0.5705736238317893
     test_precision         0.4536421055554555
       test_recall          0.8340693963702562
       test_under           0.9521838657630957
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


## Train using word2vec

In [69]:
from gensim.models import Word2Vec
word2vec = Word2Vec.load("word2vec.gensim")

data = DataModule(embed_data(word2vec.wv, X_train, y_train, encoder), 
                  embed_data(word2vec.wv, X_valid, y_valid, encoder), 
                  embed_data(word2vec.wv, X_test, y_test, encoder))

### Hyperparameter search

In [70]:
from itertools import product
from tqdm.auto import tqdm

parameters = { 
    "hidden_size": [100, 150, 200],
    "num_layers": [5, 10],
    "dropout": [0.0, 0.3, 0.5],
    "embedding_dim": [word2vec.vector_size]
}

results = list()
combinations = list(enumerate(product(*parameters.values())))

for idx, params in tqdm(combinations):
    params = dict(zip(parameters.keys(), params))
    res, _ = train(params, data=data, epochs=500)
    results.append((params, res))

  0%|          | 0/18 [00:00<?, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


In [71]:
import pandas as pd

df = pd.DataFrame.from_dict([dict(**res[0], **res[1][0]) for res in results])
df.iloc[(df["val_under"] + df["val_over"] + df["val_f1"]).sort_values(ascending=False).index]

Unnamed: 0,hidden_size,num_layers,dropout,embedding_dim,val_loss,val_precision,val_recall,val_f1,val_under,val_over
13,200,5,0.3,300,0.238776,0.504178,0.889871,0.621914,0.953125,0.422784
4,100,10,0.3,300,0.243439,0.465856,0.938855,0.601375,0.953125,0.422784
16,200,10,0.3,300,0.241027,0.465856,0.938855,0.601375,0.953125,0.422784
1,100,5,0.3,300,0.235364,0.469606,0.914642,0.594758,0.937257,0.399985
2,100,5,0.5,300,0.230996,0.484252,0.942096,0.616464,0.91754,0.375694
3,100,10,0.0,300,0.236915,0.474295,0.919815,0.603955,0.895268,0.404294
17,200,10,0.5,300,0.238449,0.463079,0.953341,0.602682,0.91754,0.375694
11,150,10,0.5,300,0.235693,0.462152,0.953432,0.601858,0.91754,0.375694
7,150,5,0.3,300,0.228508,0.479463,0.900951,0.602858,0.924982,0.364232
15,200,10,0.0,300,0.250593,0.472174,0.907787,0.598567,0.857806,0.39816


### Test best hyperparameters

In [72]:
from gensim.models import Word2Vec
word2vec = Word2Vec.load("word2vec.gensim")

data = DataModule(embed_data(word2vec.wv, np.concatenate([X_train, X_valid]), np.concatenate([y_train, y_valid]), encoder),
                  test=embed_data(word2vec.wv, X_test, y_test, encoder))

In [73]:
from pytorch_lightning.callbacks import EarlyStopping, StochasticWeightAveraging
import logging

logging.getLogger("pytorch_lightning").setLevel(logging.CRITICAL)

model = BaselineModel(hidden_size=200, num_layers=5, dropout=0.3, embedding_dim=word2vec.vector_size)

trainer = pl.Trainer(default_root_dir="models/word2vec_lstm",
                     max_epochs=500, accelerator="gpu", devices=1,
                     callbacks=[
                         EarlyStopping(monitor="train_loss", min_delta=0.00, patience=5),
                         StochasticWeightAveraging(swa_lrs=1e-2),
                     ])

trainer.fit(model, datamodule=data)

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn("One of given dataloaders is None and it will be skipped.")
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [74]:
metrics = trainer.test(model, datamodule=data)

  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         test_f1            0.5612626493466071
        test_loss           0.22821615636348724
        test_over           0.5890391885847057
     test_precision         0.4264151985527739
       test_recall          0.9138498778367697
       test_under           0.9296958362175696
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


## Train using fasttext

In [62]:
from gensim.models import FastText
fasttext = FastText.load("fasttext.gensim")

data = DataModule(embed_data(fasttext.wv, X_train, y_train, encoder), 
                  embed_data(fasttext.wv, X_valid, y_valid, encoder), 
                  embed_data(fasttext.wv, X_test, y_test, encoder))

### Hyperparameter search

In [63]:
from itertools import product
from tqdm.auto import tqdm

parameters = { 
    "hidden_size": [100, 150, 200],
    "num_layers": [5, 10],
    "dropout": [0.0, 0.3, 0.5],
    "embedding_dim": [fasttext.vector_size]
}

results = list()
combinations = list(enumerate(product(*parameters.values())))

for idx, params in tqdm(combinations):
    params = dict(zip(parameters.keys(), params))
    res, _ = train(params, data=data, epochs=500)
    results.append((params, res))

  0%|          | 0/18 [00:00<?, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


In [65]:
import pandas as pd

df = pd.DataFrame.from_dict([dict(**res[0], **res[1][0]) for res in results])
df.iloc[(df["val_under"] + df["val_over"] + df["val_f1"]).sort_values(ascending=False).index]

Unnamed: 0,hidden_size,num_layers,dropout,embedding_dim,val_loss,val_precision,val_recall,val_f1,val_under,val_over
12,200,5,0.0,300,0.23164,0.493279,0.917324,0.618234,0.953125,0.422784
8,150,5,0.5,300,0.234651,0.47788,0.926903,0.608981,0.953125,0.422784
9,150,10,0.0,300,0.240005,0.468102,0.938602,0.603304,0.953125,0.422784
17,200,10,0.5,300,0.251168,0.465856,0.938855,0.601375,0.953125,0.422784
4,100,10,0.3,300,0.240357,0.502448,0.879451,0.617864,0.895268,0.404294
2,100,5,0.5,300,0.228496,0.488206,0.951508,0.619465,0.91754,0.375694
11,150,10,0.5,300,0.237049,0.467153,0.951446,0.605947,0.91754,0.375694
14,200,5,0.5,300,0.240002,0.462152,0.953432,0.601858,0.91754,0.375694
5,100,10,0.5,300,0.252493,0.467859,0.923907,0.599378,0.865254,0.38849
3,100,10,0.0,300,0.254995,0.467859,0.923907,0.599378,0.865254,0.38849


### Test best hyperparameters

In [66]:
from gensim.models import FastText
fasttext = FastText.load("fasttext.gensim")

data = DataModule(embed_data(fasttext.wv, np.concatenate([X_train, X_valid]), np.concatenate([y_train, y_valid]), encoder),
                  test=embed_data(fasttext.wv, X_test, y_test, encoder))

In [67]:
from pytorch_lightning.callbacks import EarlyStopping, StochasticWeightAveraging
import logging
logging.getLogger("pytorch_lightning").setLevel(logging.CRITICAL)

model = BaselineModel(hidden_size=200, num_layers=5, dropout=0.0, embedding_dim=fasttext.vector_size)

trainer = pl.Trainer(default_root_dir="models/fasttext_lstm",
                     max_epochs=350, accelerator="gpu", devices=1,
                     callbacks=[
                         EarlyStopping(monitor="train_loss", min_delta=0.00, patience=2),
                         StochasticWeightAveraging(swa_lrs=1e-2),
                     ])

trainer.fit(model, datamodule=data)

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn("One of given dataloaders is None and it will be skipped.")
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [68]:
metrics = trainer.test(model, datamodule=data)

  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         test_f1            0.5579043528480463
        test_loss           0.22975552082061768
        test_over           0.5705736238317893
     test_precision         0.42399803102213496
       test_recall          0.9242292002107206
       test_under           0.9521838657630957
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


# FORM Baseline

In [41]:
from segmentation.form import FORM
from segmentation.metrics import pairwise_metrics, under_over_segmentation
import numpy as np

In [43]:
metrics = list()

for xi, yi in zip(X_test, y_test):
    _, pred = np.unique(FORM(list(xi)), return_inverse=True)
    _, target = np.unique(yi, return_inverse=True)
    metrics.append((*pairwise_metrics(target, pred), *under_over_segmentation(target, pred)))

In [44]:
precision, recall, f1, under, over = zip(*metrics)

In [45]:
print("precision", np.mean(precision))
print("recall", np.mean(recall))
print("f1", np.mean(f1))
print("under", np.mean(under))
print("over", np.mean(over))

precision 0.5814049743223666
recall 0.5133868699516293
f1 0.5321519792560437
under 0.5352970694536924
over 0.6040644823138202


# Final results

| Model | Precision | Recall | F1 |
|---|---|---|---|
|FORM|0.58|0.51|0.53|
|Word2Vec LSTM|0.43|0.91|0.57|
|FastText LSTM|0.44|0.90|0.57|
|harte2vec LSTM|0.47|0.82|0.58|