In [34]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [37]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string

<IPython.core.display.Javascript object>

In [38]:
# initialize the coherence library
coherence = Coherence(max_words_per_step=4)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-03-31 14:11:39.034474: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [39]:
# initialize the keywords and embeddings library
pp = pprint.PrettyPrinter(indent=4)
similarities_lib = Similarities("bert-base-uncased")
keywords_lib = Keywords(similarities_lib.model, similarities_lib.tokenizer)
embedding_lib = Embedding(similarities_lib.model, similarities_lib.tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-03-31 14:11:55.966683: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [7]:
dataset_type = "city"
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [8]:
data = table.get_all()

text_data = [x[1] for x in data]
text_labels = [x[2] for x in data]

<IPython.core.display.Javascript object>

In [17]:
all_segments = table.get_all_segments()
text_segments = [[y[1] for y in x] for x in all_segments]
segments_labels = [[1 if i == 0 else 0 for i, y in enumerate(x)] for x in all_segments]

<IPython.core.display.Javascript object>

In [None]:
segments_to_test = 10
max_tokens = 400  # want to keep this under 512

for segment, labels in zip(
    text_segments[:segments_to_test], segments_labels[:segments_to_test]
):
    truncated_segment = [truncate_by_token(s, max_tokens) for s in segment]
    print(coherence.get_coherence(truncated_segment))
    
text_segments_to_check = [
    [truncate_by_token(s, max_tokens) for s in segment]
    for segment in text_segments[:segments_to_test]
]

In [20]:
samples = 5
max_tokens = 400

for i, (segment, labels) in enumerate(
    zip(text_segments[:samples], segments_labels[:samples])
):
    for sentence, label in zip(segment, labels):
        # this is the training case. During inference, we will have no idea
        # when segments start and when they end.
        pass

<IPython.core.display.Javascript object>

In [33]:
samples = 6
max_tokens = 400  # want to keep this under 512
max_str_length = 30

coherence_map = []
for i, (row, label) in enumerate(zip(text_data[:samples], text_labels[:samples])):
    # compare the current sentence to the previous one
    if i == 0:
        pass
    else:
        prev_row = text_data[:samples][i - 1]

        row = truncate_by_token(row, max_tokens)
        prev_row = truncate_by_token(prev_row, max_tokens)

        # add the keywords to the coherence map
        coherence_map.extend(coherence.get_coherence([row, prev_row]))

        # truncate the strings for printing
        truncated_row = truncate_string(row, max_str_length)
        truncated_prev_row = truncate_string(prev_row, max_str_length)

        # get the keywords for the current sentences
        keywords_current = keywords_lib.get_keywords(row)
        keywords_prev = keywords_lib.get_keywords(prev_row)

        print(keywords_prev)
        print(keywords_current)
        print(coherence_map)
        print([*coherence_map, *keywords_prev])

        # compute the word comparisons between the previous (with the coherence map)
        # and the current (possibly the first sentence in a new segment)
        word_comparisons = embedding_lib.compare_keyword_tuples(
            prev_row, row, coherence_map.extend(keywords_prev), keywords_current
        )

        similarities = [comparison[2] for comparison in word_comparisons]
        avg_similarity = sum(similarities) / len(similarities)

        print(
            f"{label}, {avg_similarity}, {truncated_prev_string} <> {truncated_string}"
        )

[('basque', 0.4506), ('sebastián', 0.3756), ('saint', 0.2981), ('san', 0.2837), ('latin', 0.2224)]
[('shoreline', 0.1984), ('seashore', 0.1778), ('coast', 0.1769), ('wetlands', 0.1763), ('seaside', 0.1623)]
[]
[('basque', 0.4506), ('sebastián', 0.3756), ('saint', 0.2981), ('san', 0.2837), ('latin', 0.2224)]


TypeError: 'NoneType' object is not iterable

<IPython.core.display.Javascript object>