In [1]:
# Run if working locally
# autoreload (mode 2) re-imports edited project modules before each cell runs,
# so changes under src/ and db/ are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# nb_black is a third-party extension (must be installed separately);
# presumably it is what emits the "Javascript object" output after every cell.
%load_ext nb_black

<IPython.core.display.Javascript object>

In [4]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

# The notebook is expected to run from a sub-directory of the project, so the
# parent of the current working directory is taken as the project root.
config.root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Prepend the root to the import search path before the project-local
# `src.*` / `db.*` imports that follow.
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string

<IPython.core.display.Javascript object>

In [5]:
# initialize the coherence library
# Constructing it loads the bert-base-uncased checkpoint (see the log output
# below), so this cell is slow and only needs to run once per kernel.
# NOTE(review): max_words_per_step presumably caps how many coherence words are
# produced per comparison -- confirm in src.encoders.coherence.
coherence = Coherence(max_words_per_step=4)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Metal device set to: Apple M1 Max


2023-03-31 23:38:42.901353: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-31 23:38:42.901491: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-03-31 23:38:44.726209: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-31 23:38:44.757493: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-03-31 23:38:45.552513: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [6]:
# Set up the keyword-extraction and embedding helpers.
# `Similarities` owns the bert-base-uncased model and tokenizer; the keyword and
# embedding helpers are handed that same pair instead of loading their own.
similarities_lib = Similarities("bert-base-uncased")
keywords_lib, embedding_lib = (
    helper_cls(similarities_lib.model, similarities_lib.tokenizer)
    for helper_cls in (Keywords, Embedding)
)

# Pretty-printer for inspecting nested results by hand.
pp = pprint.PrettyPrinter(indent=4)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-03-31 23:38:50.299752: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [7]:
# Dataset whose tables are used for the rest of the notebook.
dataset_type = "city"

# One handle per table flavour, all bound to the same dataset.
table, augmented_table, train_test_table = (
    table_cls(dataset_type)
    for table_cls in (Table, AugmentedTable, TrainTestTable)
)

<IPython.core.display.Javascript object>

In [8]:
# Pull every row of the table once, then split it into parallel lists.
data = table.get_all()

# Column 1 of each row is the sentence text, column 2 its label; any other
# columns are ignored here.
text_data = [text for _, text, *_ in data]
text_labels = [label for _, _, label, *_ in data]

<IPython.core.display.Javascript object>

In [9]:
# Rows grouped by segment: one inner sequence per segment.
all_segments = table.get_all_segments()

# Keep only the text column (index 1) of every row in every segment.
text_segments = [[entry[1] for entry in segment] for segment in all_segments]

# Label the first row of each segment 1 and every following row 0.
segments_labels = [
    [int(position == 0) for position, _ in enumerate(segment)]
    for segment in all_segments
]

<IPython.core.display.Javascript object>

In [20]:
# Number of segments to walk and the token budget used when truncating text.
# NOTE: `max_tokens` set here is also read by the "Sample Segments" cell below.
samples = 5
max_tokens = 400

# Placeholder: pairs each sentence of the first `samples` segments with its
# segment-start label but does nothing with them yet (the body is only `pass`).
for i, (segment, labels) in enumerate(
    zip(text_segments[:samples], segments_labels[:samples])
):
    for sentence, label in zip(segment, labels):
        # this is the training case. During inference, we will have no idea
        # when segments start and when they end.
        pass

<IPython.core.display.Javascript object>

In [20]:
# Peek at the first 25 labels as a sanity check.
# NOTE(review): a 1 presumably marks the first sentence of a segment -- confirm
# against how the table stores labels.
text_labels[:25]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

<IPython.core.display.Javascript object>

In [30]:
samples = 6
max_tokens = 400  # want to keep this under 512
max_str_length = 30
coherence_word_sim = 0.2  # `word_sim` setting handed to coherence.get_coherence

# Slice once instead of re-slicing `text_data` on every iteration.
sample_rows = text_data[:samples]
sample_labels = text_labels[:samples]

# Coherence entries accumulated over all pairs seen so far.
# NOTE(review): entries are never de-duplicated or pruned, so the map grows with
# every pair and repeats words (visible in the printed output) -- confirm that
# is intended.
coherence_map = []
for i, (row, label) in enumerate(zip(sample_rows, sample_labels)):
    # compare the current sentence to the previous one; the first row has no
    # predecessor, so there is nothing to compare yet
    if i == 0:
        continue

    prev_row = sample_rows[i - 1]

    row = truncate_by_token(row, max_tokens)
    prev_row = truncate_by_token(prev_row, max_tokens)

    # Run the coherence extraction once per pair and reuse the result for both
    # the map update and the printout; previously it ran twice with identical
    # arguments, doubling the model work.
    pair_coherence = list(
        coherence.get_coherence([row, prev_row], word_sim=coherence_word_sim)
    )
    # add the keywords to the coherence map
    coherence_map.extend(pair_coherence)
    print(f"Coherence: {[x[0] for x in pair_coherence]}")

    # truncate the strings for printing
    truncated_row = truncate_string(row, max_str_length)
    truncated_prev_row = truncate_string(prev_row, max_str_length)

    # get the keywords for the current sentences
    keywords_current = keywords_lib.get_keywords_with_embeddings(row)
    keywords_prev = keywords_lib.get_keywords_with_embeddings(prev_row)

    # compute the word comparisons between the previous (with the coherence map)
    # and the current (possibly the first sentence in a new segment)
    word_comparisons = embedding_lib.compare_keyword_tuples(
        [*coherence_map, *keywords_prev], keywords_current
    )

    # Index 2 of each comparison is its similarity score.
    similarities = [comparison[2] for comparison in word_comparisons]
    if similarities:
        avg_similarity = sum(similarities) / len(similarities)
    else:
        # No keyword pairs were produced; report NaN instead of dividing by zero.
        avg_similarity = float("nan")

    print(
        f"{[x[0] for x in coherence_map]} <> {[x[0] for x in keywords_prev]} <> {[x[0] for x in keywords_current]}"
    )
    print(f"{label}, {avg_similarity}, {truncated_prev_row} <> {truncated_row}")

Coherence: ['shoreline', 'basque', 'seashore', 'basque']
['shoreline', 'basque', 'seashore', 'basque'] <> ['basque', 'sebastián', 'saint', 'san', 'latin'] <> ['shoreline', 'seashore', 'coast', 'wetlands', 'seaside']
0, tensor([0.3396]), In spite of appearances both t.. <> The city is in the north of th..
Coherence: ['precipitation', 'shoreline', 'climate', 'shoreline']
['shoreline', 'basque', 'seashore', 'basque', 'precipitation', 'shoreline', 'climate', 'shoreline'] <> ['shoreline', 'seashore', 'coast', 'wetlands', 'seaside'] <> ['precipitation', 'climate', 'winters', 'overcast', 'cloudy']
0, tensor([0.3303]), The city is in the north of th.. <> San Sebastián features an ocea..
Coherence: ['paleolithic', 'precipitation', 'evidence', 'precipitation']
['shoreline', 'basque', 'seashore', 'basque', 'precipitation', 'shoreline', 'climate', 'shoreline', 'paleolithic', 'precipitation', 'evidence', 'precipitation'] <> ['precipitation', 'climate', 'winters', 'overcast', 'cloudy'] <> ['paleolit

<IPython.core.display.Javascript object>

## Sample Segments

In [21]:
# How many leading segments to use for the hand-built sample pairs.
segments_to_test = 10

# Token-truncate every sentence of those segments, keeping the nested
# segment -> sentences layout. `max_tokens` comes from an earlier cell.
text_segments_to_check = [
    [truncate_by_token(sentence, max_tokens) for sentence in sentences]
    for sentences in text_segments[:segments_to_test]
]

<IPython.core.display.Javascript object>

In [22]:
# Pairs that straddle a segment boundary: the last sentence of segment k
# followed by the first sentence of segment k + 1, for k = 0..4.
(
    different_segment_1,
    different_segment_2,
    different_segment_3,
    different_segment_4,
    different_segment_5,
) = [
    [text_segments_to_check[k][-1], text_segments_to_check[k + 1][0]]
    for k in range(5)
]

# Pairs from inside one segment: the first two sentences of segment k,
# for k = 0..4.
(
    same_segment_1,
    same_segment_2,
    same_segment_3,
    same_segment_4,
    same_segment_5,
) = [
    [text_segments_to_check[k][0], text_segments_to_check[k][1]]
    for k in range(5)
]

<IPython.core.display.Javascript object>