In [1]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint

from utils.metrics import windowdiff, pk

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string

<IPython.core.display.Javascript object>

In [6]:
dataset_type = "city"
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [7]:
data = table.get_all()

text_data = [x[1] for x in data]
text_labels = [x[2] for x in data]

<IPython.core.display.Javascript object>

In [8]:
all_segments = table.get_all_segments()
text_segments = [[y[1] for y in x] for x in all_segments]
segments_labels = [[1 if i == 0 else 0 for i, y in enumerate(x)] for x in all_segments]

<IPython.core.display.Javascript object>

In [9]:
samples = 5
max_tokens = 400

for i, (segment, labels) in enumerate(
    zip(text_segments[:samples], segments_labels[:samples])
):
    for sentence, label in zip(segment, labels):
        # this is the training case. During inference, we will have no idea
        # when segments start and when they end.
        pass

<IPython.core.display.Javascript object>

In [10]:
text_labels[:25]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

<IPython.core.display.Javascript object>

In [45]:
# initialize the coherence library
max_words_per_step = 4
coherence = Coherence(max_words_per_step=max_words_per_step)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-04-16 01:27:22.643016: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [417]:
def get_weighted_average(weighted_similarities, weights):
    return sum(weighted_similarities) / sum(weights)


# importance testing
def compare_coherent_words(
    coherence_map,
    keywords_current,
    suppress_errors=False,
    same_word_multiplier=3,  # if set to 1, don't amplify the same words found
):
    word_comparisons = []
    weights = []
    for i, keywords in enumerate(coherence_map[::-1]):
        for word_tuple in keywords:
            word = word_tuple[0]
            for second_word_tuple in keywords_current:
                second_word = second_word_tuple[0]
                second_word_importance = second_word_tuple[1]

                try:
                    word_one_emb = word_tuple[2]
                    word_two_emb = second_word_tuple[2]

                    if same_word_multiplier > 1:
                        flattened_coherence_words_only = [
                            element[0]
                            for sublist in coherence_map
                            for element in sublist
                        ]

                        num_occurrences = flattened_coherence_words_only.count(
                            second_word
                        )

                        if num_occurrences > 0:
                            # amplify words that are found as duplicates in the coherence map
                            # if the word shows up 1 time, amplify the weight by 2 times
                            weighting_multiplier = flattened_coherence_words_only.count(
                                second_word
                            ) + (same_word_multiplier - 1)
                        else:
                            #                             weighting_multiplier = 1
                            weighting_multiplier = (
                                1 / same_word_multiplier
                            )  # reduce the importance of this word

                    else:
                        weighting_multiplier = 1  # set to 1 in case this is turned off.

                    # this weight is a recipricol function that will grow smaller the further the keywords are away
                    # we want to put more importance on the current words, so we apply twice as much weight.
                    if i == 0:
                        weight = (weighting_multiplier * 2) / (i + 1)
                    else:
                        weight = (weighting_multiplier * 1) / (i + 1)

                    # multiply the weighting factor by the importance of the second word
                    weight *= second_word_importance

                    word_comparisons.append(
                        (
                            word,
                            second_word,
                            weight
                            * coherence.embedding_lib.get_similarity(
                                word_one_emb, word_two_emb
                            ),
                        )
                    )
                    weights.append(weight)
                except AssertionError as e:
                    if not suppress_errors:
                        print(e, word, second_word)

    return word_comparisons, weights


# TODO: add weighted average: https://www.google.com/search?q=weighted+average&rlz=1C5CHFA_enCA1019CA1024&sxsrf=APwXEdcb6dhJ5L_mvWvrWr4AxQcxOFB01g:1681098698316&tbm=isch&source=iu&ictx=1&vet=1&fir=V-LTDKtCElo89M%252C2WVwd1NrPkHFOM%252C_%253BVGk_lj0HALhXQM%252C2WVwd1NrPkHFOM%252C_%253ByzfbB4i3SpPTFM%252C5e7an03wLAdfhM%252C_%253B47HYmoDH6WlThM%252CsRXbJWfpyOLEOM%252C_%253BOsB4jtfzenfuyM%252CHKcmLkpfJ3xWqM%252C_&usg=AI4_-kRmBXgUWAm_nR3vDsLT17TqM5AvSQ&sa=X&ved=2ahUKEwi6hvvVtJ7-AhXJkIkEHe4JCX4Q_h16BAgoEAE#imgrc=V-LTDKtCElo89M
def coherence_tester(
    text_data,
    text_labels,
    max_tokens=256,
    max_str_length=30,
    prediction_thresh=0.25,
    pruning=0,  # remove one sentence worth of keywords
    pruning_min=6,  # remove the first sentence in the coherence map once it grows passed 6
    dynamic_threshold=False,
    threshold_warmup=10,  # number of iterations before using dynamic threshold
    last_n_threshold=5,  # will only consider the last n thresholds for dynamic threshold
):
    coherence_map = []
    predictions = []
    thresholds = []
    for i, (row, label) in enumerate(zip(text_data, text_labels)):
        threshold = prediction_thresh
        if dynamic_threshold and (i + 1) > threshold_warmup:
            last_n_thresholds = thresholds[(0 - last_n_threshold) :]
            last_n_thresholds.sort()
            mid = len(last_n_thresholds) // 2
            threshold = (last_n_thresholds[mid] + last_n_thresholds[~mid]) / 2
            print(f"median threshold: {threshold}")
        # compare the current sentence to the previous one
        if i == 0:
            predictions.append((0, 0))
        else:
            prev_row = text_data[i - 1]

            row = truncate_by_token(row, max_tokens)
            prev_row = truncate_by_token(prev_row, max_tokens)

            cohesion, keywords_prev, keywords_current = coherence.get_coherence(
                [row, prev_row], coherence_threshold=0.3
            )

            # add the keywords to the coherence map
            coherence_map.append(cohesion)
            if pruning > 0 and len(coherence_map) >= pruning_min:
                print("pruning...", len(coherence_map))
                coherence_map = coherence_map[
                    pruning:
                ]  # remove the pruning amount from the beginning of the list
                print("done pruning...", len(coherence_map))

            # truncate the strings for printing
            truncated_row = truncate_string(row, max_str_length)
            truncated_prev_row = truncate_string(prev_row, max_str_length)
            print(
                f"Coherence Map: {[[x[0] for x in c] for c in coherence_map]}, KW Curr: {[x[0] for x in keywords_current]}"
            )

            # compute the word comparisons between the previous (with the coherence map)
            # and the current (possibly the first sentence in a new segment)
            word_comparisons_with_coherence, weights = compare_coherent_words(
                [*coherence_map, keywords_prev], keywords_current
            )

            similarities_with_coherence = [
                comparison[2] for comparison in word_comparisons_with_coherence
            ]
            avg_similarity_with_coherence = sum(similarities_with_coherence) / (
                len(similarities_with_coherence) or 1
            )
            weighted_avg_similarity_with_coherence = get_weighted_average(
                similarities_with_coherence, weights
            )
            print(f"weighted: {weighted_avg_similarity_with_coherence}")

            # if the two sentences are similar, create a cohesive prediction
            # otherwise, predict a new segment
            if weighted_avg_similarity_with_coherence > threshold:
                print(
                    f"Label: {label}, Prediction: {0}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 0))
            else:
                # start of a new segment, empty the map
#                 coherence_map = []
                print(
                    f"Label: {label}, Prediction: {1}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 1))

            thresholds.append(weighted_avg_similarity_with_coherence)
            print("===============================================")

    return predictions

<IPython.core.display.Javascript object>

In [418]:
start = 0
num_samples = 250
max_tokens = 256  # want to keep this under 512
max_str_length = 30

true_labels = text_labels[start : start + num_samples]

predictions = coherence_tester(
    text_data[start : start + num_samples],
    true_labels,
    max_tokens=max_tokens,
    max_str_length=max_str_length,
)

['donostia', 'sebastian', 'sebastián', 'sebastiane']
['pasaia', 'urumea', 'biscay', 'adarra']
Got the keywords in 0.2830 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['sebastián', 'biscay']], KW Curr: ['donostia', 'sebastian', 'sebastián', 'sebastiane']
weighted: tensor([0.2756])
Label: 0, Prediction: 0, logit: tensor([0.2756])
['pasaia', 'urumea', 'biscay', 'adarra']
['climate', 'temperatures', 'winters', 'sebastián']
Got the keywords in 0.2695 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['sebastián', 'biscay'], ['pasaia', 'urumea', 'temperatures', 'winters']], KW Curr: ['pasaia', 'urumea', 'biscay', 'adarra']
weighted: tensor([0.2337])
Label: 0, Prediction: 1, logit: tensor([0.2337])
['climate', 'temperatures', 'winters', 'sebastián']
['paleolithic', 'ametzagaña', 'settlers', 'sapiens']
Got the keywords in 0.2023 seconds
Got the embeddings and comparisons in 0.0006 seconds
Coherence Map: [['paleolithic', 'ametzagaña', '

['egia', 'donostia', 'urumea', 'anoeta']
['intxaurrondo', 'walnut', 'basque', 'situated']
Got the keywords in 0.2918 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['navarre', 'donostia', 'navarre', 'fuero'], ['gipuzkoa', 'donostia', 'navarre', 'spain'], ['cortazar', 'gipuzkoa', 'donostia', 'tolosa'], ['cortazar', 'koxkeroak', 'vieja', 'joxemaritarrak'], ['koxkeroak', 'vieja', 'miramar', 'león'], ['euskotren', 'miramar'], ['amara', 'madrid'], ['amara', 'district', 'city', 'built'], ['palace', 'district', 'residence', 'area'], ['egia', 'donostia', 'palace', 'anoeta'], ['egia', 'intxaurrondo', 'donostia', 'walnut']], KW Curr: ['egia', 'donostia', 'urumea', 'anoeta']
weighted: tensor([0.4195])
Label: 0, Prediction: 0, logit: tensor([0.4195])
['intxaurrondo', 'walnut', 'basque', 'situated']
['altza', 'basque', 'sebastián', 'san']
Got the keywords in 0.2652 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['navarre', 'donostia', 'nav

['festival', 'biscay', 'rowing', 'basque']
['agatha', 'saint', 'carnival', 'eve']
Got the keywords in 0.2731 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['ulia', 'exclave'], ['exclave', 'festivals'], ['festival', 'tamborrada', 'celebrations', 'festivals'], ['festival', 'festival', 'tamborrada', 'semana'], ['festival', 'festival', 'biscay', 'grande'], ['biscay', 'rowing', 'saint', 'carnival']], KW Curr: ['festival', 'biscay', 'rowing', 'basque']
weighted: tensor([0.2798])
Label: 0, Prediction: 0, logit: tensor([0.2798])
['agatha', 'saint', 'carnival', 'eve']
['festival', 'gypsy', 'romani', 'carnival']
Got the keywords in 0.2211 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['ulia', 'exclave'], ['exclave', 'festivals'], ['festival', 'tamborrada', 'celebrations', 'festivals'], ['festival', 'festival', 'tamborrada', 'semana'], ['festival', 'festival', 'biscay', 'grande'], ['biscay', 'rowing', 'saint', 'carnival'], ['festival',

weighted: tensor([0.3032])
Label: 0, Prediction: 0, logit: tensor([0.3032])
['izium', '1943', 'bridgehead', '1942']
['sloviansk', 'ukraine', 'izium', 'russia']
Got the keywords in 0.2837 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['donostia', 'universidad'], ['sociedad', 'universidad', 'liga', 'university'], ['sociedad', 'hospers'], ['hospers', 'located'], ['hospers', 'census', 'population', 'households'], ['census', 'population', 'population', 'households'], ['population', 'hospers', 'census', '280'], ['izium', 'moc'], ['izium', 'izium', 'bridgehead', 'ukraine'], ['sloviansk', 'ukraine', 'izium', 'izium']], KW Curr: ['izium', '1943', 'bridgehead', '1942']
weighted: tensor([0.4422])
Label: 0, Prediction: 0, logit: tensor([0.4422])
['sloviansk', 'ukraine', 'izium', 'russia']
['lennon', 'square', 'soviet', 'decommunization']
Got the keywords in 0.2523 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['donostia', 'universidad']

['population', 'census', 'households', 'families']
['suva', 'fiji', 'polynesia', 'islands']
Got the keywords in 0.3807 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['wallingford', 'census', 'population', 'households'], ['census', 'census', 'population', 'population'], [], ['hydrotherapy', 'banya', 'ancient', 'gora'], ['hydrotherapy', 'boris'], ['harvard', 'boris'], ['harvard', '096554'], ['harvard', 'census', 'population', 'households'], ['population', 'census', 'census', 'population'], ['suva', 'census', 'fiji', 'polynesia']], KW Curr: ['population', 'census', 'households', 'families']
weighted: tensor([0.3657])
Label: 1, Prediction: 0, logit: tensor([0.3657])
['suva', 'fiji', 'polynesia', 'islands']
['suva', 'fiji', 'pacific', 'viti']
Got the keywords in 0.3665 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['wallingford', 'census', 'population', 'households'], ['census', 'census', 'population', 'population'], [], ['hydrot

['suva', 'seats', 'seating', 'venues']
['suva', 'fiji', 'gardens', 'garden']
Got the keywords in 0.1976 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['suva', 'fiji', 'fijian', 'suva'], [], ['suva', 'fiji'], ['suva', 'suva', 'pacific', 'infrastructure'], [], []], KW Curr: ['suva', 'seats', 'seating', 'venues']
weighted: tensor([0.1885])
Label: 0, Prediction: 1, logit: tensor([0.1885])
['suva', 'fiji', 'gardens', 'garden']
['suva', 'concerts', 'performances', 'singers']
Got the keywords in 0.2432 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['suva', 'suva', 'concerts', 'fiji']], KW Curr: ['suva', 'fiji', 'gardens', 'garden']
weighted: tensor([0.3086])
Label: 0, Prediction: 0, logit: tensor([0.3086])
['suva', 'concerts', 'performances', 'singers']
['suva', 'fijian', 'fijians', 'fiji']
Got the keywords in 0.2314 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['suva', 'suva', 'concerts', 'fiji'], [

['alvarado', 'mexico', 'city', 'office']
['area', 'census', 'land', 'city']
Got the keywords in 0.0909 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['greenville', 'havana', 'population', 'census'], ['alvarado', 'population'], ['census', 'office']], KW Curr: ['alvarado', 'mexico', 'city', 'office']
weighted: tensor([0.2909])
Label: 0, Prediction: 0, logit: tensor([0.2909])
['area', 'census', 'land', 'city']
['census', 'population', 'households', 'families']
Got the keywords in 0.1647 seconds
Got the embeddings and comparisons in 0.0003 seconds
Coherence Map: [['greenville', 'havana', 'population', 'census'], ['alvarado', 'population'], ['census', 'office'], ['census', 'city']], KW Curr: ['area', 'census', 'land', 'city']
weighted: tensor([0.2769])
Label: 0, Prediction: 0, logit: tensor([0.2769])
['census', 'population', 'households', 'families']
['population', 'census', 'households', 'household']
Got the keywords in 0.2545 seconds
Got the embeddings and c

['ada', 'oklahoma', 'tulsa', 'texas']
['ada', 'census', 'population', 'households']
Got the keywords in 0.2298 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['greenville', 'havana', 'population', 'census'], ['alvarado', 'population'], ['census', 'office'], ['census', 'city'], ['census', 'population', 'population', 'census'], ['hanska', 'population', 'census', 'households'], ['hanska', 'city'], ['population', 'city'], ['population', 'census', 'census', 'population'], ['washta', 'population', 'census', 'households'], ['washta', 'census', 'population', 'households'], ['census', 'population', 'population', 'households'], ['population', 'prairie', 'reed', 'ada'], [], ['tulsa', 'ada', 'texas', 'camp'], ['ada', 'ada']], KW Curr: ['ada', 'oklahoma', 'tulsa', 'texas']
weighted: tensor([0.3950])
Label: 0, Prediction: 0, logit: tensor([0.3950])
['ada', 'census', 'population', 'households']
['ada', 'headquartered', 'headquarters', 'companies']
Got the keywords in 0.4

['afrikaners', 'afrikaner', 'settlers', 'argentina']
['comodoro', 'rivadavia', 'climate', 'arid']
Got the keywords in 0.3638 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['greenville', 'havana', 'population', 'census'], ['alvarado', 'population'], ['census', 'office'], ['census', 'city'], ['census', 'population', 'population', 'census'], ['hanska', 'population', 'census', 'households'], ['hanska', 'city'], ['population', 'city'], ['population', 'census', 'census', 'population'], ['washta', 'population', 'census', 'households'], ['washta', 'census', 'population', 'households'], ['census', 'population', 'population', 'households'], ['population', 'prairie', 'reed', 'ada'], [], ['tulsa', 'ada', 'texas', 'camp'], ['ada', 'ada'], ['ada', 'ada', 'census', 'headquartered'], ['ecu', 'ada', 'university', 'central'], ['ecu', 'school'], ['pontotoc', 'ada', 'schools', 'located'], ['pontotoc', 'prosecutor'], ['ada', 'comodoro', 'oklahoma', 'jorge'], ['afrikaners', 'a

['shipyard', 'pier', 'port', 'punta']
['concrete', 'portland', 'production', 'petroquimica']
Got the keywords in 0.2864 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['comodoro', 'jorge', 'started', 'petrolíferos'], ['comodoro', 'argentine'], ['argentine', 'comodoro', 'comodoro', 'rivadavia'], ['pier', 'port', 'comodoro', 'punta'], ['shipyard', 'pier', 'concrete', 'port']], KW Curr: ['shipyard', 'pier', 'port', 'punta']
weighted: tensor([0.4953])
Label: 0, Prediction: 0, logit: tensor([0.4953])
['concrete', 'portland', 'production', 'petroquimica']
['rivadavia', 'comodoro', 'wind', 'generators']
Got the keywords in 0.1420 seconds
Got the embeddings and comparisons in 0.0002 seconds
Coherence Map: [['comodoro', 'jorge', 'started', 'petrolíferos'], ['comodoro', 'argentine'], ['argentine', 'comodoro', 'comodoro', 'rivadavia'], ['pier', 'port', 'comodoro', 'punta'], ['shipyard', 'pier', 'concrete', 'port'], ['concrete', 'rivadavia', 'comodoro', 'portland']], 

['humid', 'subtropical', 'climate', 'summers']
['delphi', 'erie', 'courthouse', 'county']
Got the keywords in 0.2032 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['comodoro', 'jorge', 'started', 'petrolíferos'], ['comodoro', 'argentine'], ['argentine', 'comodoro', 'comodoro', 'rivadavia'], ['pier', 'port', 'comodoro', 'punta'], ['shipyard', 'pier', 'concrete', 'port'], ['concrete', 'rivadavia', 'comodoro', 'portland'], ['comodoro', 'rivadavia'], ['comodoro', 'zolotonosha'], ['chattanooga', 'zolotonosha', 'nashville', 'strunkovka'], ['hohenwald', 'chattanooga', 'nashville', '5479'], ['hohenwald', 'population', 'census', 'households'], ['population', 'census', 'bay', 'households'], ['census', 'population', 'bay', 'households'], ['census', 'population', 'springs', 'bay'], ['springs', 'academy'], ['humid', 'climate', 'bay', 'school'], ['delphi', 'humid', 'subtropical', 'erie']], KW Curr: ['humid', 'subtropical', 'climate', 'summers']
weighted: tensor([0.2880

['census', 'population', 'households', 'families']
['census', 'population', 'households', 'families']
Got the keywords in 0.2525 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['dialect', 'larestani', 'ajami', 'oregon'], ['area', 'helix', 'land', 'city'], ['population', 'city'], ['census', 'census', 'population', 'population']], KW Curr: ['census', 'population', 'households', 'families']
weighted: tensor([0.5981])
Label: 0, Prediction: 0, logit: tensor([0.5981])
['census', 'population', 'households', 'families']
['helix', 'farm', 'farming', 'stateline']
Got the keywords in 0.2095 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['dialect', 'larestani', 'ajami', 'oregon'], ['area', 'helix', 'land', 'city'], ['population', 'city'], ['census', 'census', 'population', 'population'], []], KW Curr: ['census', 'population', 'households', 'families']
weighted: tensor([0.2753])
Label: 0, Prediction: 0, logit: tensor([0.2753])
['helix', '

['shkodër', 'shkodra', 'industries', 'industry']
['albania', 'tirana', 'shkodra', 'albanian']
Got the keywords in 0.2611 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['shkodër', 'ituverava', 'shkodra', 'guarani'], ['shkodër', 'shkodër'], ['shkodër', 'humid', 'temperature', 'kir'], ['shkodër', 'humid', 'temperature', 'serbs'], ['shkodër', 'shkodër', 'ottomans', 'earliest'], ['shkodër', 'shkodër', 'prizren', 'ottomans'], ['shkodër', 'shkodër', 'prizren', 'albania'], ['shkodër', 'shkodër', 'municipality', 'municipalities'], ['shkodër', 'shkodra', 'municipality', 'municipalities'], ['shkodër', 'shkodra', 'albania', 'tirana']], KW Curr: ['shkodër', 'shkodra', 'industries', 'industry']
weighted: tensor([0.4068])
Label: 0, Prediction: 0, logit: tensor([0.4068])
['albania', 'tirana', 'shkodra', 'albanian']
['shkodër', 'shkodra', 'albania', 'castle']
Got the keywords in 0.3260 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['shkodër'

['census', 'population', 'households', 'families']
['population', 'census', 'households', 'household']
Got the keywords in 0.3725 seconds
Got the embeddings and comparisons in 0.0003 seconds
Coherence Map: [['karamay', 'karamay', 'climate', 'temperature'], ['karamay', 'karamay', 'population', 'climate'], ['karamay', 'population', 'gdp', 'census'], ['kensett', 'gdp', 'railroad', 'oil'], ['railroad', 'area'], ['kensett', 'census', 'population', 'households'], ['census', 'population', 'population', 'census']], KW Curr: ['census', 'population', 'households', 'families']
weighted: tensor([0.4620])
Label: 0, Prediction: 0, logit: tensor([0.4620])
['population', 'census', 'households', 'household']
['warren', 'railroad', 'county', 'line']
Got the keywords in 0.3885 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['karamay', 'karamay', 'climate', 'temperature'], ['karamay', 'karamay', 'population', 'climate'], ['karamay', 'population', 'gdp', 'census'], ['kensett',

['climate', 'temperatures', 'warm', 'summers']
['jones', 'population', 'fort', 'census']
Got the keywords in 0.2696 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['scottsville', 'justice'], ['scottsburg', 'scottsville', 'jones', 'fort'], ['fort', 'jones', 'jones', 'fort'], ['climate', 'fort', 'jones', 'summers'], ['climate', 'jones', 'population', 'summers']], KW Curr: ['climate', 'temperatures', 'warm', 'summers']
weighted: tensor([0.4789])
Label: 0, Prediction: 0, logit: tensor([0.4789])
['jones', 'population', 'fort', 'census']
['population', 'census', 'households', 'families']
Got the keywords in 0.3717 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['scottsville', 'justice'], ['scottsburg', 'scottsville', 'jones', 'fort'], ['fort', 'jones', 'jones', 'fort'], ['climate', 'fort', 'jones', 'summers'], ['climate', 'jones', 'population', 'summers'], ['population', 'census', 'jones', 'population']], KW Curr: ['jones', 'populat

['airport', 'isle', 'presque', 'planes']
['maine', 'college', 'aroostook', 'school']
Got the keywords in 0.2196 seconds
Got the embeddings and comparisons in 0.0003 seconds
Coherence Map: [['settlers', 'settlements', 'fairbanks', 'pioneers'], ['industry', 'fairbanks', 'presque', 'isle'], ['industry', 'maine', 'potatoes', 'factory'], ['railway', 'maine', 'brunswick', 'agriculture'], ['airport', 'railway', 'brunswick', 'isle'], ['airport', 'maine', 'college', 'aroostook']], KW Curr: ['airport', 'isle', 'presque', 'planes']
weighted: tensor([0.6173])
Label: 0, Prediction: 0, logit: tensor([0.6173])
['maine', 'college', 'aroostook', 'school']
['hospital', 'isle', 'nurses', 'presque']
Got the keywords in 0.2893 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['settlers', 'settlements', 'fairbanks', 'pioneers'], ['industry', 'fairbanks', 'presque', 'isle'], ['industry', 'maine', 'potatoes', 'factory'], ['railway', 'maine', 'brunswick', 'agriculture'], ['airport',

['island', 'sky', '1953', 'film']
['المنامة', 'manãma', 'arabic', 'al']
Got the keywords in 0.1128 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['houlton', 'film'], ['sky', 'al']], KW Curr: ['island', 'sky', '1953', 'film']
weighted: tensor([0.2595])
Label: 1, Prediction: 0, logit: tensor([0.2595])
['المنامة', 'manãma', 'arabic', 'al']
['bahrain', 'mesopotamia', 'urbanisation', 'rural']
Got the keywords in 0.2837 seconds
Got the embeddings and comparisons in 0.0006 seconds
Coherence Map: [['houlton', 'film'], ['sky', 'al'], []], KW Curr: ['المنامة', 'manãma', 'arabic', 'al']
weighted: tensor([0.2294])
Label: 0, Prediction: 1, logit: tensor([0.2294])
['bahrain', 'mesopotamia', 'urbanisation', 'rural']
['bahrain', 'manama', 'bahraini', 'persia']
Got the keywords in 0.4862 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['bahrain', 'bahrain', 'manama', 'bahraini']], KW Curr: ['bahrain', 'mesopotamia', 'urbanisation', 'rural']
we

['manama', 'bahrain', 'bahraini', 'popular']
['vila', 'archaeological', 'joão', 'conde']
Got the keywords in 0.3717 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['manama', 'vila', 'archaeological', 'popular']], KW Curr: ['manama', 'bahrain', 'bahraini', 'popular']
weighted: tensor([0.3823])
Label: 1, Prediction: 0, logit: tensor([0.3823])
['vila', 'archaeological', 'joão', 'conde']
['vila', 'porto', 'conde', 'portugal']
Got the keywords in 0.4699 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['manama', 'vila', 'archaeological', 'popular'], ['vila', 'vila', 'porto', 'conde']], KW Curr: ['vila', 'archaeological', 'joão', 'conde']
weighted: tensor([0.3843])
Label: 0, Prediction: 0, logit: tensor([0.3843])
['vila', 'porto', 'conde', 'portugal']
['azurara', 'vila', 'árvore', 'municipality']
Got the keywords in 0.2502 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['manama', 'vila', 'archaeological',

['westfield', 'noblesville', 'newspaper', 'news']
['park', 'grand', 'softball', 'fieldhouse']
Got the keywords in 0.2137 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['westfield', 'westfield', 'quakers', 'quaker'], ['census', 'westfield', 'population', 'households'], [], []], KW Curr: ['westfield', 'noblesville', 'newspaper', 'news']
weighted: tensor([0.2081])
Label: 0, Prediction: 1, logit: tensor([0.2081])
['park', 'grand', 'softball', 'fieldhouse']
['westfield', 'indianapolis', 'noblesville', 'indiana']
Got the keywords in 0.2877 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['park', 'westfield', 'grand', 'indianapolis']], KW Curr: ['park', 'grand', 'softball', 'fieldhouse']
weighted: tensor([0.3549])
Label: 0, Prediction: 0, logit: tensor([0.3549])
['westfield', 'indianapolis', 'noblesville', 'indiana']
['rustica', 'alemanni', 'palaeolithic', 'auderiensium']
Got the keywords in 0.2567 seconds
Got the embeddings and comp

['area', 'census', 'land', 'city']
['population', 'census', 'households', 'families']
Got the keywords in 0.1896 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['park', 'westfield', 'grand', 'indianapolis'], ['westfield', 'rustica', 'indianapolis', 'alemanni'], ['umstadt', 'rustica', 'palaeolithic', 'auderiensium'], ['umstadt', 'umstadt', 'castles', 'umstaedter'], ['umstadt', 'hesse', 'langstadt', 'kleestadt'], ['umstadt', 'wenigumstadt', 'hesse', 'langstadt'], [], ['umstadt', 'september'], ['umstadt', 'neolithic'], ['unemployed', 'industry', 'unemployment', 'ghica'], ['picayune', 'unemployed', 'industry', 'unemployment'], [], ['katrina', 'census', 'orleans', 'land'], ['population', 'area']], KW Curr: ['area', 'census', 'land', 'city']
weighted: tensor([0.3320])
Label: 0, Prediction: 0, logit: tensor([0.3320])


<IPython.core.display.Javascript object>

In [419]:
print([x[1] for x in predictions])
print(true_labels)

[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,

<IPython.core.display.Javascript object>

In [420]:
pred_string = "".join(str([x[1] for x in predictions]))
true_string = "".join(str(true_labels))

<IPython.core.display.Javascript object>

In [421]:
avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

<IPython.core.display.Javascript object>

In [422]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 6
wd = 0.32483221476510066
pk = 0.3087248322147651


<IPython.core.display.Javascript object>

## Prediction Tuning

In [423]:
pred_thresholds = [0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3]

<IPython.core.display.Javascript object>

In [424]:
pred_thresh = 0.26

<IPython.core.display.Javascript object>

In [425]:
for pred_thresh in pred_thresholds:
    modified_predictions = [
        1 if x < pred_thresh else 0 for x in [x[0] for x in predictions]
    ]

    pred_string = "".join(str(modified_predictions))
    true_string = "".join(str(true_labels))

    avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

    wd_score = windowdiff(pred_string, true_string, avg_k)
    pk_score = pk(pred_string, true_string, avg_k)

    print(f"pred_thresh = {pred_thresh}")
    print(f"k = {avg_k}")
    print(f"wd = {wd_score}")
    print(f"pk = {pk_score}")
    print("===========================================")

pred_thresh = 0.23
k = 6
wd = 0.3221476510067114
pk = 0.31409395973154364
pred_thresh = 0.24
k = 6
wd = 0.3302013422818792
pk = 0.3181208053691275
pred_thresh = 0.25
k = 6
wd = 0.3221476510067114
pk = 0.30604026845637583
pred_thresh = 0.26
k = 6
wd = 0.35838926174496644
pk = 0.33422818791946307
pred_thresh = 0.27
k = 6
wd = 0.3785234899328859
pk = 0.34630872483221475
pred_thresh = 0.28
k = 6
wd = 0.46174496644295304
pk = 0.41073825503355704
pred_thresh = 0.29
k = 6
wd = 0.4818791946308725
pk = 0.43087248322147653
pred_thresh = 0.3
k = 6
wd = 0.5100671140939598
pk = 0.45503355704697984


<IPython.core.display.Javascript object>

In [389]:
print(pred_string)
print(true_string)

[1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,

<IPython.core.display.Javascript object>

In [375]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 6
wd = 0.4684563758389262
pk = 0.4214765100671141


<IPython.core.display.Javascript object>

## KeyBERT Embedding Comparison

In [172]:
curr = 230
prev = curr - 1

<IPython.core.display.Javascript object>

In [None]:
# initialize the keywords and embeddings library
pp = pprint.PrettyPrinter(indent=4)
similarities_lib = Similarities("bert-base-uncased")
keywords_lib = Keywords(similarities_lib.model, similarities_lib.tokenizer)
embedding_lib = Embedding(similarities_lib.model, similarities_lib.tokenizer)

In [205]:
cohesion = coherence.get_coherence(
    [text_data[curr], text_data[prev]], coherence_threshold=0.25
)
print([k[0] for k in cohesion])

Got the keywords in 0.6567 seconds
Got the embeddings and comparisons in 0.0007 seconds
['cantonese', 'languages', 'vietnamese', 'communes']


<IPython.core.display.Javascript object>

In [206]:
# get the keywords for the current sentences
keywords_current = keywords_lib.get_keywords_with_kb_embeddings(text_data[curr])
keywords_prev = keywords_lib.get_keywords_with_kb_embeddings(text_data[prev])

# compute the word comparisons between the previous (with the coherence map)
# and the current (possibly the first sentence in a new segment)
word_comparisons_with_coherence, weights = compare_coherent_words(
    [keywords_prev], keywords_current
)

<IPython.core.display.Javascript object>

In [207]:
[(x[0], x[1]) for x in keywords_current], [(x[0], x[1]) for x in keywords_prev]

([('township', 0.2304),
  ('communes', 0.1857),
  ('hải', 0.1399),
  ('wards', 0.1397),
  ('đông', 0.1224)],
 [('cantonese', 0.5038),
  ('mandarin', 0.464),
  ('languages', 0.3483),
  ('language', 0.343),
  ('vietnamese', 0.3184)])

<IPython.core.display.Javascript object>

# KeyBERT Embedding Testing

In [679]:
docs = [
    "Hi my name is Devarsh",
    "Devarsh likes to play Basketball.",
    "I love to watch Cricket.",
    "I am a strong programmer. And my name is Devarsh",
]

<IPython.core.display.Javascript object>

In [680]:
from keybert import KeyBERT

kw_model = KeyBERT()
doc_embeddings, word_embeddings = kw_model.extract_embeddings(
    docs, min_df=1, stop_words="english"
)
keywords = kw_model.extract_keywords(
    docs,
    min_df=1,
    stop_words="english",
    doc_embeddings=doc_embeddings,
    word_embeddings=word_embeddings,
)

<IPython.core.display.Javascript object>

In [681]:
len(doc_embeddings)

4

<IPython.core.display.Javascript object>

In [682]:
len(word_embeddings)

10

<IPython.core.display.Javascript object>

In [683]:
keywords

[[('devarsh', 0.6267), ('hi', 0.5216)],
 [('devarsh', 0.6549),
  ('basketball', 0.5558),
  ('play', 0.3787),
  ('likes', 0.2284)],
 [('cricket', 0.7118), ('watch', 0.3656), ('love', 0.307)],
 [('programmer', 0.5942), ('devarsh', 0.5528), ('strong', 0.3452)]]

<IPython.core.display.Javascript object>

In [701]:
kw_model = KeyBERT()
import torch


def get_keywords_with_embeddings_test(
    data,
) -> list[tuple[str, float, torch.Tensor]]:
    doc_embeddings, word_embeddings = kw_model.extract_embeddings(data)

    keywords = kw_model.extract_keywords(
        data, doc_embeddings=doc_embeddings, word_embeddings=word_embeddings
    )

    keywords_with_embeddings = []
    count = 0
    print(len(word_embeddings))
    for i, (kw, we) in enumerate(zip(keywords, word_embeddings)):
        for j, words in enumerate(kw):
            keywords_with_embeddings.append((words[0], words[1], torch.tensor(we)))
            count += 1

    return keywords_with_embeddings

<IPython.core.display.Javascript object>

In [702]:
embeddings = get_keywords_with_embeddings_test(docs)

10


<IPython.core.display.Javascript object>

In [703]:
len(embeddings)

12

<IPython.core.display.Javascript object>