In [1]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint

from utils.metrics import windowdiff, pk

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string

<IPython.core.display.Javascript object>

In [181]:
# initialize the coherence library
max_words_per_step = 3
coherence = Coherence(max_words_per_step=max_words_per_step)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-04-14 17:43:12.026334: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [14]:
# initialize the keywords and embeddings library
pp = pprint.PrettyPrinter(indent=4)
similarities_lib = Similarities("bert-base-uncased")
keywords_lib = Keywords(similarities_lib.model, similarities_lib.tokenizer)
embedding_lib = Embedding(similarities_lib.model, similarities_lib.tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-04-14 13:30:34.417238: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [7]:
dataset_type = "city"
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [8]:
data = table.get_all()

text_data = [x[1] for x in data]
text_labels = [x[2] for x in data]

<IPython.core.display.Javascript object>

In [9]:
all_segments = table.get_all_segments()
text_segments = [[y[1] for y in x] for x in all_segments]
segments_labels = [[1 if i == 0 else 0 for i, y in enumerate(x)] for x in all_segments]

<IPython.core.display.Javascript object>

In [10]:
samples = 5
max_tokens = 400

for i, (segment, labels) in enumerate(
    zip(text_segments[:samples], segments_labels[:samples])
):
    for sentence, label in zip(segment, labels):
        # this is the training case. During inference, we will have no idea
        # when segments start and when they end.
        pass

<IPython.core.display.Javascript object>

In [11]:
text_labels[:25]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

<IPython.core.display.Javascript object>

In [196]:
pruning = 0  # remove the lowest n important words from coherence map
pruning_min = 10  # only prune after n words in the coherence map


def get_weighted_average(weighted_similarities, weights):
    return sum(weighted_similarities) / sum(weights)


# importance testing
def compare_coherent_words(
    coherence_map, keywords_current, suppress_errors=False, same_word_multiplier=False
):
    word_comparisons = []
    weights = []
    for i, keywords in enumerate(coherence_map[::-1]):
        for word_tuple in keywords:
            word = word_tuple[0]
            for second_word_tuple in keywords_current:
                second_word = second_word_tuple[0]

                try:
                    word_one_emb = word_tuple[2]
                    word_two_emb = second_word_tuple[2]

                    if same_word_multiplier:
                        flattened_coherence_words_only = [
                            element[0]
                            for sublist in coherence_map
                            for element in sublist
                        ]

                        num_occurrences = flattened_coherence_words_only.count(
                            second_word
                        )

                        multiplier = 2
                        if num_occurrences > 0:
                            # amplify words that are found as duplicates in the coherence map
                            # if the word shows up 1 time, amplify the weight by 2 times
                            weighting_multiplier = flattened_coherence_words_only.count(
                                second_word
                            ) + (multiplier - 1)
                        else:
                            weighting_multiplier = (
                                1 / multiplier
                            )  # reduce the importance of this word

                    else:
                        weighting_multiplier = 1  # set to 1 in case this is turned off.

                    # this weight is a recipricol function that will grow smaller the further the keywords are away
                    # we want to put more importance on the current words, so we apply twice as much weight.
                    if i == 0:
                        weight = (weighting_multiplier * 2) / (i + 1)
                    else:
                        weight = (weighting_multiplier * 1) / (i + 1)

                    word_comparisons.append(
                        (
                            word,
                            second_word,
                            weight
                            * embedding_lib.get_similarity(word_one_emb, word_two_emb),
                        )
                    )
                    weights.append(weight)
                except AssertionError as e:
                    if not suppress_errors:
                        print(e, word, second_word)

    return word_comparisons, weights


# TODO: add weighted average: https://www.google.com/search?q=weighted+average&rlz=1C5CHFA_enCA1019CA1024&sxsrf=APwXEdcb6dhJ5L_mvWvrWr4AxQcxOFB01g:1681098698316&tbm=isch&source=iu&ictx=1&vet=1&fir=V-LTDKtCElo89M%252C2WVwd1NrPkHFOM%252C_%253BVGk_lj0HALhXQM%252C2WVwd1NrPkHFOM%252C_%253ByzfbB4i3SpPTFM%252C5e7an03wLAdfhM%252C_%253B47HYmoDH6WlThM%252CsRXbJWfpyOLEOM%252C_%253BOsB4jtfzenfuyM%252CHKcmLkpfJ3xWqM%252C_&usg=AI4_-kRmBXgUWAm_nR3vDsLT17TqM5AvSQ&sa=X&ved=2ahUKEwi6hvvVtJ7-AhXJkIkEHe4JCX4Q_h16BAgoEAE#imgrc=V-LTDKtCElo89M
def coherence_tester(
    text_data,
    text_labels,
    max_tokens=400,
    max_str_length=30,
    prediction_thresh=0.7,
    dynamic_threshold=False,
    threshold_warmup=10,  # number of iterations before using dynamic threshold
    last_n_threshold=5,  # will only consider the last n thresholds for dynamic threshold
):
    coherence_map = []
    predictions = []
    thresholds = []
    for i, (row, label) in enumerate(zip(text_data, text_labels)):
        threshold = prediction_thresh
        if dynamic_threshold and (i + 1) > threshold_warmup:
            last_n_thresholds = thresholds[(0 - last_n_threshold) :]
            last_n_thresholds.sort()
            mid = len(last_n_thresholds) // 2
            threshold = (last_n_thresholds[mid] + last_n_thresholds[~mid]) / 2
            print(f"median threshold: {threshold}")
        # compare the current sentence to the previous one
        if i == 0:
            predictions.append((0, 0))
        else:
            prev_row = text_data[i - 1]

            row = truncate_by_token(row, max_tokens)
            prev_row = truncate_by_token(prev_row, max_tokens)

            cohesion, keywords_prev, keywords_current = coherence.get_coherence(
                [row, prev_row], coherence_threshold=0.2
            )

            pruning = 1  # remove one sentence worth of keywords
            pruning_min = 6  # remove the first sentence in the coherence map once it grows passed 6

            # add the keywords to the coherence map
            coherence_map.append(cohesion)
            if pruning > 0 and len(coherence_map) >= pruning_min:
                print("pruning...", len(coherence_map))
                coherence_map = coherence_map[::-1][pruning:][
                    ::-1
                ]  # get the last n - pruning values and reverse the list
                print("done pruning...", len(coherence_map))

            # truncate the strings for printing
            truncated_row = truncate_string(row, max_str_length)
            truncated_prev_row = truncate_string(prev_row, max_str_length)
            print(
                f"Coherence Map: {[[x[0] for x in c] for c in coherence_map]}, KW Curr: {[x[0] for x in keywords_current]}"
            )

            # compute the word comparisons between the previous (with the coherence map)
            # and the current (possibly the first sentence in a new segment)
            word_comparisons_with_coherence, weights = compare_coherent_words(
                [*coherence_map, keywords_prev], keywords_current
            )

            similarities_with_coherence = [
                comparison[2] for comparison in word_comparisons_with_coherence
            ]
            avg_similarity_with_coherence = sum(similarities_with_coherence) / (
                len(similarities_with_coherence) or 1
            )
            weighted_avg_similarity_with_coherence = get_weighted_average(
                similarities_with_coherence, weights
            )
            print(f"weighted: {weighted_avg_similarity_with_coherence}")

            # if the two sentences are similar, create a cohesive prediction
            # otherwise, predict a new segment
            if weighted_avg_similarity_with_coherence > threshold:
                print(
                    f"Label: {label}, Prediction: {0}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 0))
            else:
                # start of a new segment, empty the map
                coherence_map = []
                print(
                    f"Label: {label}, Prediction: {1}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 1))

            thresholds.append(weighted_avg_similarity_with_coherence)
            print("===============================================")

    return predictions

<IPython.core.display.Javascript object>

In [202]:
start = 50
num_samples = 500
max_tokens = 256  # want to keep this under 512
max_str_length = 30

true_labels = text_labels[start : start + num_samples]

predictions = coherence_tester(
    text_data[start : start + num_samples],
    true_labels,
    max_tokens=max_tokens,
    max_str_length=max_str_length,
)

['hydrotherapy', 'hydrothermal', 'village', 'mineral', 'sources']
['bagarov', 'rheumatism', 'summerhouse', 'verdure', 'banya']
Got the keywords in 0.2880 seconds
Got the embeddings and comparisons in 0.0011 seconds
Coherence Map: [['bagarov', 'hydrotherapy', 'rheumatism']], KW Curr: ['hydrotherapy', 'hydrothermal', 'village', 'mineral', 'sources']
weighted: tensor([0.6836])
Label: 0, Prediction: 1, logit: tensor([0.6836])
['bagarov', 'rheumatism', 'summerhouse', 'verdure', 'banya']
['massachusetts', 'railroad', 'harvard', '1871', 'university']
Got the keywords in 0.6283 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['massachusetts', 'bagarov', 'railroad']], KW Curr: ['bagarov', 'rheumatism', 'summerhouse', 'verdure', 'banya']
weighted: tensor([0.7308])
Label: 1, Prediction: 0, logit: tensor([0.7308])
['massachusetts', 'railroad', 'harvard', '1871', 'university']
['620276', '096554', 'census', 'city', 'land']
Got the keywords in 0.1560 seconds
Got the embe

['tuvalu', 'kiribati', 'nauru', 'micronesia', 'geoscience']
['infrastructure', 'events', 'suva', 'capital', 'event']
Got the keywords in 0.9150 seconds
Got the embeddings and comparisons in 0.0008 seconds
pruning... 6
done pruning... 5
Coherence Map: [['males', 'male'], ['lomaiviti', 'males'], ['mangroves', 'lomaiviti', 'laucala'], ['suva', 'mangroves', 'area'], ['muanikau', 'suva', 'nabua']], KW Curr: ['tuvalu', 'kiribati', 'nauru', 'micronesia', 'geoscience']
weighted: tensor([0.7852])
Label: 0, Prediction: 0, logit: tensor([0.7852])
['infrastructure', 'events', 'suva', 'capital', 'event']
['grandstand', 'vodafone', 'multipurpose', 'arena', 'gymnasium']
Got the keywords in 0.7142 seconds
Got the embeddings and comparisons in 0.0002 seconds
pruning... 6
done pruning... 5
Coherence Map: [['males', 'male'], ['lomaiviti', 'males'], ['mangroves', 'lomaiviti', 'laucala'], ['suva', 'mangroves', 'area'], ['muanikau', 'suva', 'nabua']], KW Curr: ['infrastructure', 'events', 'suva', 'capital',

['habikino', 'kashiwara', 'yamato', 'fujidera', 'prefectural']
['kashiwara', 'katakami', 'katashimo', 'kokubu', 'kawachi']
Got the keywords in 0.7237 seconds
Got the embeddings and comparisons in 0.0012 seconds
pruning... 6
done pruning... 5
Coherence Map: [['males', 'male'], ['lomaiviti', 'males'], ['mangroves', 'lomaiviti', 'laucala'], ['suva', 'mangroves', 'area'], ['muanikau', 'suva', 'nabua']], KW Curr: ['habikino', 'kashiwara', 'yamato', 'fujidera', 'prefectural']
weighted: tensor([0.7380])
Label: 0, Prediction: 0, logit: tensor([0.7380])
['kashiwara', 'katakami', 'katashimo', 'kokubu', 'kawachi']
['529064', '111021', 'census', 'city', 'states']
Got the keywords in 0.6271 seconds
Got the embeddings and comparisons in 0.0002 seconds
pruning... 6
done pruning... 5
Coherence Map: [['males', 'male'], ['lomaiviti', 'males'], ['mangroves', 'lomaiviti', 'laucala'], ['suva', 'mangroves', 'area'], ['muanikau', 'suva', 'nabua']], KW Curr: ['kashiwara', 'katakami', 'katashimo', 'kokubu', 'k

['pontotoc', 'courthouse', 'wintersmith', 'mijo', 'sugg']
['texas', 'tulsa', 'oklahoma', 'census', 'city']
Got the keywords in 0.6858 seconds
Got the embeddings and comparisons in 0.0102 seconds
Coherence Map: [['males', 'male'], ['ardmoreite', 'males'], ['pontotoc', 'ardmoreite', 'courthouse'], ['texas', 'pontotoc', 'tulsa']], KW Curr: ['pontotoc', 'courthouse', 'wintersmith', 'mijo', 'sugg']
weighted: tensor([0.8106])
Label: 0, Prediction: 0, logit: tensor([0.8106])
['texas', 'tulsa', 'oklahoma', 'census', 'city']
['862', 'disparity', '803', '805', '977']
Got the keywords in 0.8737 seconds
Got the embeddings and comparisons in 0.0009 seconds
Coherence Map: [['males', 'male'], ['ardmoreite', 'males'], ['pontotoc', 'ardmoreite', 'courthouse'], ['texas', 'pontotoc', 'tulsa'], ['disparity', 'texas']], KW Curr: ['texas', 'tulsa', 'oklahoma', 'census', 'city']
weighted: tensor([0.7849])
Label: 0, Prediction: 0, logit: tensor([0.7849])
['862', 'disparity', '803', '805', '977']
['chickasaw',

['yacimientos', 'petrolíferos', 'yrigoyen', 'chubut', 'huemul']
['chubut', 'rivadavia', 'comodoro', 'argentina', 'san']
Got the keywords in 1.0063 seconds
Got the embeddings and comparisons in 0.0009 seconds
pruning... 6
done pruning... 5
Coherence Map: [['males', 'male'], ['ardmoreite', 'males'], ['pontotoc', 'ardmoreite', 'courthouse'], ['texas', 'pontotoc', 'tulsa'], ['disparity', 'texas']], KW Curr: ['yacimientos', 'petrolíferos', 'yrigoyen', 'chubut', 'huemul']
weighted: tensor([0.6814])
Label: 0, Prediction: 1, logit: tensor([0.6814])
['chubut', 'rivadavia', 'comodoro', 'argentina', 'san']
['h310', 'h314', 'rivadavia', 'cartography', 'caleta']
Got the keywords in 0.6823 seconds
Got the embeddings and comparisons in 0.0015 seconds
Coherence Map: [['h310', 'chubut', 'h314']], KW Curr: ['chubut', 'rivadavia', 'comodoro', 'argentina', 'san']
weighted: tensor([0.6663])
Label: 0, Prediction: 1, logit: tensor([0.6663])
['h310', 'h314', 'rivadavia', 'cartography', 'caleta']
['rivadavia',

['census', 'land', 'area', 'delphi', 'total']
['893', 'male', '694', 'census', 'female']
Got the keywords in 0.3373 seconds
Got the embeddings and comparisons in 0.0004 seconds
pruning... 6
done pruning... 5
Coherence Map: [[], [], [], [], ['sylva', 'sunbeam', 'school']], KW Curr: ['census', 'land', 'area', 'delphi', 'total']
weighted: tensor([0.7163])
Label: 0, Prediction: 0, logit: tensor([0.7163])
['893', 'male', '694', 'census', 'female']
['388', '575', '485', '454', '878']
Got the keywords in 0.9864 seconds
Got the embeddings and comparisons in 0.0000 seconds
pruning... 6
done pruning... 5
Coherence Map: [[], [], [], [], ['sylva', 'sunbeam', 'school']], KW Curr: ['893', 'male', '694', 'census', 'female']
weighted: tensor([0.8221])
Label: 0, Prediction: 0, logit: tensor([0.8221])
['388', '575', '485', '454', '878']
['schools', 'delphi', 'school', 'high', 'community']
Got the keywords in 0.7948 seconds
Got the embeddings and comparisons in 0.0000 seconds
pruning... 6
done pruning...

['mbishkodra', 'shkodër', 'hydrologically', 'maranaj', 'marshlands']
['wettest', 'precipitation', 'shkodër', 'climate', 'mediterranean']
Got the keywords in 0.9209 seconds
Got the embeddings and comparisons in 0.0007 seconds
pruning... 6
done pruning... 5
Coherence Map: [['mcarthur', 'اچمی', 'story'], ['census', 'mcarthur', 'land'], ['male', 'census', 'census'], ['males', 'male', 'census'], ['dryland', 'males', 'stateline']], KW Curr: ['mbishkodra', 'shkodër', 'hydrologically', 'maranaj', 'marshlands']
weighted: tensor([0.7301])
Label: 0, Prediction: 0, logit: tensor([0.7301])
['wettest', 'precipitation', 'shkodër', 'climate', 'mediterranean']
['praevalitana', 'illyrian', 'slavs', 'heraclius', 'rozafa']
Got the keywords in 1.0159 seconds
Got the embeddings and comparisons in 0.0005 seconds
pruning... 6
done pruning... 5
Coherence Map: [['mcarthur', 'اچمی', 'story'], ['census', 'mcarthur', 'land'], ['male', 'census', 'census'], ['males', 'male', 'census'], ['dryland', 'males', 'statelin

['dushanzi', 'tacheng', 'kuytun', 'lanxin', 'karamay']
['dzungarian', 'huangyangquan', 'hoboksar', 'baiyang', 'kuytun']
Got the keywords in 0.8571 seconds
Got the embeddings and comparisons in 0.0007 seconds
pruning... 6
done pruning... 5
Coherence Map: [['mcarthur', 'اچمی', 'story'], ['census', 'mcarthur', 'land'], ['male', 'census', 'census'], ['males', 'male', 'census'], ['dryland', 'males', 'stateline']], KW Curr: ['dushanzi', 'tacheng', 'kuytun', 'lanxin', 'karamay']
weighted: tensor([0.7082])
Label: 0, Prediction: 0, logit: tensor([0.7082])
['dzungarian', 'huangyangquan', 'hoboksar', 'baiyang', 'kuytun']
['karamay', 'humidity', 'summer', 'precipitation', 'bwk']
Got the keywords in 0.9726 seconds
Got the embeddings and comparisons in 0.0007 seconds
pruning... 6
done pruning... 5
Coherence Map: [['mcarthur', 'اچمی', 'story'], ['census', 'mcarthur', 'land'], ['male', 'census', 'census'], ['males', 'male', 'census'], ['dryland', 'males', 'stateline']], KW Curr: ['dzungarian', 'huangy

['ophthalmologist', 'kamrowski', 'steffan', 'gerome', 'panzram']
['dwindling', 'ottitiewa', 'ranchers', 'scottsburg', 'scottsville']
Got the keywords in 1.0426 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['ophthalmologist', 'chalkboard', 'kamrowski'], ['dwindling', 'ophthalmologist', 'ottitiewa']], KW Curr: ['ophthalmologist', 'kamrowski', 'steffan', 'gerome', 'panzram']
weighted: tensor([0.7759])
Label: 1, Prediction: 0, logit: tensor([0.7759])
['dwindling', 'ottitiewa', 'ranchers', 'scottsburg', 'scottsville']
['yreka', 'brevet', 'siskiyou', 'confederate', 'march']
Got the keywords in 1.1363 seconds
Got the embeddings and comparisons in 0.0002 seconds
Coherence Map: [['ophthalmologist', 'chalkboard', 'kamrowski'], ['dwindling', 'ophthalmologist', 'ottitiewa'], ['yreka', 'dwindling', 'brevet']], KW Curr: ['dwindling', 'ottitiewa', 'ranchers', 'scottsburg', 'scottsville']
weighted: tensor([0.8461])
Label: 0, Prediction: 0, logit: tensor([0.8461])
['yrek

['aroostook', 'starch', 'agriculture', 'agricultural', 'years']
['aroostook', 'railroad', '1881', 'bangor', '1895']
Got the keywords in 0.6783 seconds
Got the embeddings and comparisons in 0.0003 seconds
pruning... 6
done pruning... 5
Coherence Map: [['precipitation', 'westside', 'climate'], ['climate', 'precipitation', 'geiger'], ['aroostook', 'climate', 'surveyors'], ['aroostook', 'aroostook', 'ashburton'], ['lumbermills', 'aroostook', 'gristmill']], KW Curr: ['aroostook', 'starch', 'agriculture', 'agricultural', 'years']
weighted: tensor([0.7437])
Label: 0, Prediction: 0, logit: tensor([0.7437])
['aroostook', 'railroad', '1881', 'bangor', '1895']
['skyway', 'airport', 'maine', 'war', 'military']
Got the keywords in 0.3517 seconds
Got the embeddings and comparisons in 0.0002 seconds
pruning... 6
done pruning... 5
Coherence Map: [['precipitation', 'westside', 'climate'], ['climate', 'precipitation', 'geiger'], ['aroostook', 'climate', 'surveyors'], ['aroostook', 'aroostook', 'ashburto

['safavids', 'safavid', 'qadeem', 'bushehr', 'qatif']
['reorganised', 'abdicate', 'hakim', 'customs', 'belgrave']
Got the keywords in 1.3046 seconds
Got the embeddings and comparisons in 0.0009 seconds
Coherence Map: [['safavids', 'achaemenid', 'safavid'], ['reorganised', 'safavids', 'abdicate']], KW Curr: ['safavids', 'safavid', 'qadeem', 'bushehr', 'qatif']
weighted: tensor([0.7757])
Label: 0, Prediction: 0, logit: tensor([0.7757])
['reorganised', 'abdicate', 'hakim', 'customs', 'belgrave']
['manamah', 'souq', 'governorates', 'manama', 'municipality']
Got the keywords in 1.0205 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['safavids', 'achaemenid', 'safavid'], ['reorganised', 'safavids', 'abdicate'], ['manamah', 'reorganised', 'souq']], KW Curr: ['reorganised', 'abdicate', 'hakim', 'customs', 'belgrave']
weighted: tensor([0.7454])
Label: 0, Prediction: 0, logit: tensor([0.7454])
['manamah', 'souq', 'governorates', 'manama', 'municipality']
['smelting',

['vila', 'azurara', 'municipality', 'civil', 'parishes']
['customshouse', 'requalification', 'buildings', 'centric', 'economy']
Got the keywords in 0.3541 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['customshouse', 'vila', 'requalification']], KW Curr: ['vila', 'azurara', 'municipality', 'civil', 'parishes']
weighted: tensor([0.7222])
Label: 0, Prediction: 0, logit: tensor([0.7222])
['customshouse', 'requalification', 'buildings', 'centric', 'economy']
['cerveira', 'famalicão', 'agglomeration', 'guimarães', 'bytransdev']
Got the keywords in 1.0063 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['customshouse', 'vila', 'requalification'], ['cerveira', 'customshouse', 'famalicão']], KW Curr: ['customshouse', 'requalification', 'buildings', 'centric', 'economy']
weighted: tensor([0.7252])
Label: 0, Prediction: 0, logit: tensor([0.7252])
['cerveira', 'famalicão', 'agglomeration', 'guimarães', 'bytransdev']
['junqueira', 'azura

['otzberg', 'kurpfalz', 'pfälzer', 'burghers', 'schloss']
['schlierbach', 'harperthausen', 'barbistere', 'hanau', 'kleestadt']
Got the keywords in 0.9220 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['otzberg', 'uninterrupted', 'autmundisstat'], ['otzberg', 'otzberg', 'kurpfalz'], ['schlierbach', 'otzberg', 'harperthausen']], KW Curr: ['otzberg', 'kurpfalz', 'pfälzer', 'burghers', 'schloss']
weighted: tensor([0.7650])
Label: 0, Prediction: 0, logit: tensor([0.7650])
['schlierbach', 'harperthausen', 'barbistere', 'hanau', 'kleestadt']
['wenigumstadt', 'dieburg', 'desecrated', 'landgraviate', 'umstadt']
Got the keywords in 0.8871 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['otzberg', 'uninterrupted', 'autmundisstat'], ['otzberg', 'otzberg', 'kurpfalz'], ['schlierbach', 'otzberg', 'harperthausen'], ['wenigumstadt', 'schlierbach', 'dieburg']], KW Curr: ['schlierbach', 'harperthausen', 'barbistere', 'hanau', 'kleestadt']
weig

['nauphlet', 'misspelled', 'norphlet', 'discovered', 'geological']
['317824', '663985', 'norphlet', 'census', 'city']
Got the keywords in 0.7030 seconds
Got the embeddings and comparisons in 0.0003 seconds
Coherence Map: [['picayune', 'kmjd', 'library'], ['nauphlet', 'picayune', 'misspelled'], ['norphlet', 'nauphlet', 'census']], KW Curr: ['nauphlet', 'misspelled', 'norphlet', 'discovered', 'geological']
weighted: tensor([0.6787])
Label: 0, Prediction: 1, logit: tensor([0.6787])
['317824', '663985', 'norphlet', 'census', 'city']
['398', '822', '864', 'males', 'census']
Got the keywords in 0.5188 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['males', 'norphlet', 'census']], KW Curr: ['317824', '663985', 'norphlet', 'census', 'city']
weighted: tensor([0.7051])
Label: 0, Prediction: 0, logit: tensor([0.7051])
['398', '822', '864', 'males', 'census']
['prekindergarten', 'norphlet', 'elementary', 'secondary', 'education']
Got the keywords in 0.8402 seconds
Go

['grapefruits', 'pomegranates', 'grapefruit', 'narindj', 'mulberries']
['jalalabad', 'pakhtunkhwa', 'resurfaced', 'isaf', 'nangarhar']
Got the keywords in 0.9810 seconds
Got the embeddings and comparisons in 0.0008 seconds
pruning... 6
done pruning... 5
Coherence Map: [['bactrian', 'shibetsu', 'sandrocottus'], ['jalalabad', 'bactrian', 'chowk'], ['jalalabad', 'jalalabad', 'agriculture'], ['jalalabad', 'jalalabad', 'frosts'], ['grapefruits', 'jalalabad', 'pomegranates']], KW Curr: ['grapefruits', 'pomegranates', 'grapefruit', 'narindj', 'mulberries']
weighted: tensor([0.7507])
Label: 0, Prediction: 0, logit: tensor([0.7507])
['jalalabad', 'pakhtunkhwa', 'resurfaced', 'isaf', 'nangarhar']
['jalalabad', 'qaderi', 'amanullah', 'ghazi', 'hamidullah']
Got the keywords in 1.1665 seconds
Got the embeddings and comparisons in 0.0006 seconds
pruning... 6
done pruning... 5
Coherence Map: [['bactrian', 'shibetsu', 'sandrocottus'], ['jalalabad', 'bactrian', 'chowk'], ['jalalabad', 'jalalabad', 'agr

['bulldgos', 'boys', 'arma', 'high', 'kansas']
['mubarak', 'overturning', 'dissidents', 'demonstrations', 'protests']
Got the keywords in 0.8307 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['mubarak', 'bulldgos', 'overturning']], KW Curr: ['bulldgos', 'boys', 'arma', 'high', 'kansas']
weighted: tensor([0.6874])
Label: 1, Prediction: 1, logit: tensor([0.6874])
['mubarak', 'overturning', 'dissidents', 'demonstrations', 'protests']
['protesters', 'kubra', 'egypt', 'workers', 'july']
Got the keywords in 1.0238 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['protesters', 'mubarak', 'kubra']], KW Curr: ['mubarak', 'overturning', 'dissidents', 'demonstrations', 'protests']
weighted: tensor([0.8031])
Label: 0, Prediction: 0, logit: tensor([0.8031])
['protesters', 'kubra', 'egypt', 'workers', 'july']
['geiger', 'climate', 'hot', 'köppen', 'bwh']
Got the keywords in 0.7383 seconds
Got the embeddings and comparisons in 0.0004 seconds

['kankakee', 'joliet', 'years', 'area', 'southwest']
['648', '497', '966', '097', '659']
Got the keywords in 0.9603 seconds
Got the embeddings and comparisons in 0.0000 seconds
pruning... 6
done pruning... 5
Coherence Map: [['hockenheim', 'kreisstadt', 'law'], ['hockenheimring', 'hockenheim', 'hockenheim'], ['hockenheimring', 'hockenheimring', 'hockenheim'], ['hohenstein', 'hockenheimring', 'hockenheim'], ['kankakee', 'hohenstein', 'muffler']], KW Curr: ['kankakee', 'joliet', 'years', 'area', 'southwest']
weighted: tensor([0.7215])
Label: 0, Prediction: 0, logit: tensor([0.7215])
['648', '497', '966', '097', '659']
['trashistan', 'rock', 'wilmington', 'confusion', 'song']
Got the keywords in 0.8587 seconds
Got the embeddings and comparisons in 0.0000 seconds
pruning... 6
done pruning... 5
Coherence Map: [['hockenheim', 'kreisstadt', 'law'], ['hockenheimring', 'hockenheim', 'hockenheim'], ['hockenheimring', 'hockenheimring', 'hockenheim'], ['hohenstein', 'hockenheimring', 'hockenheim'],

['boğazkere', 'sırın', 'işkın', 'meatballs', 'unleavened']
['elazığ', 'ağın', 'antalya', 'çemişgezek', 'adana']
Got the keywords in 1.1849 seconds
Got the embeddings and comparisons in 0.0007 seconds
pruning... 6
done pruning... 5
Coherence Map: [['hockenheim', 'kreisstadt', 'law'], ['hockenheimring', 'hockenheim', 'hockenheim'], ['hockenheimring', 'hockenheimring', 'hockenheim'], ['hohenstein', 'hockenheimring', 'hockenheim'], ['kankakee', 'hohenstein', 'muffler']], KW Curr: ['boğazkere', 'sırın', 'işkın', 'meatballs', 'unleavened']
weighted: tensor([0.6455])
Label: 0, Prediction: 1, logit: tensor([0.6455])
['elazığ', 'ağın', 'antalya', 'çemişgezek', 'adana']
['elazığ', 'fırat', 'turkey', 'institutions', 'university']
Got the keywords in 0.7324 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['elazığ', 'elazığ', 'fırat']], KW Curr: ['elazığ', 'ağın', 'antalya', 'çemişgezek', 'adana']
weighted: tensor([0.6285])
Label: 0, Prediction: 1, logit: tensor([0.6285

['tanganyika', 'earthquake', 'kalemie', 'epicentre', 'destroyed']
['oberpfalz', 'magdeburger', 'strasse', 'weiden', '1241']
Got the keywords in 0.7027 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['steinheim', 'kalundu', 'kalemie'], ['tanganyika', 'steinheim', 'earthquake'], ['oberpfalz', 'tanganyika', 'magdeburger']], KW Curr: ['tanganyika', 'earthquake', 'kalemie', 'epicentre', 'destroyed']
weighted: tensor([0.7229])
Label: 1, Prediction: 0, logit: tensor([0.7229])
['oberpfalz', 'magdeburger', 'strasse', 'weiden', '1241']
['precipitation', 'climate', 'equable', 'subtype', 'coast']
Got the keywords in 0.7176 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['steinheim', 'kalundu', 'kalemie'], ['tanganyika', 'steinheim', 'earthquake'], ['oberpfalz', 'tanganyika', 'magdeburger'], ['precipitation', 'oberpfalz', 'climate']], KW Curr: ['oberpfalz', 'magdeburger', 'strasse', 'weiden', '1241']
weighted: tensor([0.7203])
Label: 0, Pr

['489', '671', '788', '803', '833']
['499', 'teachers', 'elementary', 'high', 'school']
Got the keywords in 0.8878 seconds
Got the embeddings and comparisons in 0.0000 seconds
Coherence Map: [['climate', 'jacksboro', 'winters'], ['inexplicably', 'climate', 'oconee'], ['homerville', 'inexplicably', 'valdosta'], [], []], KW Curr: ['489', '671', '788', '803', '833']
weighted: tensor([0.7038])
Label: 0, Prediction: 0, logit: tensor([0.7038])
['499', 'teachers', 'elementary', 'high', 'school']
['waxahachie', '35e', 'downtown', 'pecan', 'cities']
Got the keywords in 0.7722 seconds
Got the embeddings and comparisons in 0.0005 seconds
pruning... 6
done pruning... 5
Coherence Map: [['climate', 'jacksboro', 'winters'], ['inexplicably', 'climate', 'oconee'], ['homerville', 'inexplicably', 'valdosta'], [], []], KW Curr: ['499', 'teachers', 'elementary', 'high', 'school']
weighted: tensor([0.7311])
Label: 1, Prediction: 0, logit: tensor([0.7311])
['waxahachie', '35e', 'downtown', 'pecan', 'cities']

['mustangs', 'boys', 'high', 'schools', 'doniphan']
['kjccc', 'jayhawk', 'kansas', 'college', 'school']
Got the keywords in 0.6436 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['male', 'kansas', 'census'], [], [], ['kjccc', 'mustangs', 'jayhawk']], KW Curr: ['mustangs', 'boys', 'high', 'schools', 'doniphan']
weighted: tensor([0.7640])
Label: 0, Prediction: 0, logit: tensor([0.7640])
['kjccc', 'jayhawk', 'kansas', 'college', 'school']
['jodhpur', 'junagarh', 'mughals', 'rathore', 'singhji']
Got the keywords in 0.5682 seconds
Got the embeddings and comparisons in 0.0008 seconds
Coherence Map: [['male', 'kansas', 'census'], [], [], ['kjccc', 'mustangs', 'jayhawk'], ['jodhpur', 'kjccc', 'junagarh']], KW Curr: ['kjccc', 'jayhawk', 'kansas', 'college', 'school']
weighted: tensor([0.6863])
Label: 1, Prediction: 1, logit: tensor([0.6863])
['jodhpur', 'junagarh', 'mughals', 'rathore', 'singhji']
['jabalpur', 'guwahati', 'jodhpur', 'kurukshetra', 'bhopal']
Got the

['tianwei', 'polluted', 'baoding', 'china', 'currently']
['jingguang', 'huanghua', 'baoding', 'jingshi', 'baojin']
Got the keywords in 0.8291 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['tianwei', 'shijiazhuang', 'polluted'], ['jingguang', 'tianwei', 'huanghua']], KW Curr: ['tianwei', 'polluted', 'baoding', 'china', 'currently']
weighted: tensor([0.7378])
Label: 0, Prediction: 0, logit: tensor([0.7378])
['jingguang', 'huanghua', 'baoding', 'jingshi', 'baojin']
['baoding', 'prc', 'armies', 'beijing', 'army']
Got the keywords in 0.7008 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['tianwei', 'shijiazhuang', 'polluted'], ['jingguang', 'tianwei', 'huanghua'], ['baoding', 'jingguang', 'prc']], KW Curr: ['jingguang', 'huanghua', 'baoding', 'jingshi', 'baojin']
weighted: tensor([0.7041])
Label: 0, Prediction: 0, logit: tensor([0.7041])
['baoding', 'prc', 'armies', 'beijing', 'army']
['baiyangdian', 'chaoyang', '圈头村音乐会', 'yuhua'

['subdialects', 'nuanced', 'kisilier', 'urums', 'rumeíka']
['tsentralnyi', 'livoberezhnyi', 'illyichevsky', 'volonterobvka', 'preobrazheniye']
Got the keywords in 1.1244 seconds
Got the embeddings and comparisons in 0.0003 seconds
Coherence Map: [['ukrainians', 'ukrainians', 'priazovye'], ['subdialects', 'ukrainians', 'nuanced'], ['tsentralnyi', 'subdialects', 'livoberezhnyi']], KW Curr: ['subdialects', 'nuanced', 'kisilier', 'urums', 'rumeíka']
weighted: tensor([0.7007])
Label: 0, Prediction: 0, logit: tensor([0.7007])
['tsentralnyi', 'livoberezhnyi', 'illyichevsky', 'volonterobvka', 'preobrazheniye']
['mariupol', 'unemployed', 'economy', 'unemployment', 'industry']
Got the keywords in 0.8568 seconds
Got the embeddings and comparisons in 0.0003 seconds
Coherence Map: [['ukrainians', 'ukrainians', 'priazovye'], ['subdialects', 'ukrainians', 'nuanced'], ['tsentralnyi', 'subdialects', 'livoberezhnyi'], ['mariupol', 'tsentralnyi', 'unemployed']], KW Curr: ['tsentralnyi', 'livoberezhnyi', 

['telephone', 'soviet', 'ukrainian', 'operational', 'recently']
['gastroenterology', 'sanatoriums', 'microsurgery', 'pancreatic', 'thoracic']
Got the keywords in 0.4593 seconds
Got the embeddings and comparisons in 0.0011 seconds
Coherence Map: [['mashinobudivnykiv', 'budivelnykiv', 'nezalezhnosti'], ['pershotravnevy', 'mashinobudivnykiv', 'volodarsky'], ['telephone', 'pershotravnevy', 'soviet'], ['gastroenterology', 'telephone', 'sanatoriums']], KW Curr: ['telephone', 'soviet', 'ukrainian', 'operational', 'recently']
weighted: tensor([0.7750])
Label: 0, Prediction: 0, logit: tensor([0.7750])
['gastroenterology', 'sanatoriums', 'microsurgery', 'pancreatic', 'thoracic']
['priazovsky', 'mariupol', 'lyceums', 'establishments', 'azovsky']
Got the keywords in 0.8745 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['mashinobudivnykiv', 'budivelnykiv', 'nezalezhnosti'], ['pershotravnevy', 'mashinobudivnykiv', 'volodarsky'], ['telephone', 'pershotravnevy', 'soviet'

['machagai', 'bermejo', 'qom', 'language', 'slope']
['patagonia', 'machagai', 'chaco', 'argentines', 'populate']
Got the keywords in 0.4533 seconds
Got the embeddings and comparisons in 0.0008 seconds
pruning... 6
done pruning... 5
Coherence Map: [['mashinobudivnykiv', 'budivelnykiv', 'nezalezhnosti'], ['pershotravnevy', 'mashinobudivnykiv', 'volodarsky'], ['telephone', 'pershotravnevy', 'soviet'], ['gastroenterology', 'telephone', 'sanatoriums'], ['priazovsky', 'gastroenterology', 'mariupol']], KW Curr: ['machagai', 'bermejo', 'qom', 'language', 'slope']
weighted: tensor([0.6962])
Label: 0, Prediction: 1, logit: tensor([0.6962])
['patagonia', 'machagai', 'chaco', 'argentines', 'populate']
['1872', '1869', '1870', 'town', 'railroad']
Got the keywords in 0.7410 seconds
Got the embeddings and comparisons in 0.0006 seconds
Coherence Map: [['town', 'patagonia', 'railroad']], KW Curr: ['patagonia', 'machagai', 'chaco', 'argentines', 'populate']
weighted: tensor([0.7882])
Label: 1, Predictio

['bosnasaray', 'florentine', 'extraterritorial', 'novibazar', 'vassalage']
['ragusans', 'zamorin', 'luccari', 'lukarić', 'sponza']
Got the keywords in 1.5531 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['pelješac', 'ooryphas', 'dalmatia'], ['dubrovačka', 'pelješac', 'pelješac'], ['bosnasaray', 'dubrovačka', 'florentine'], ['ragusans', 'bosnasaray', 'zamorin']], KW Curr: ['bosnasaray', 'florentine', 'extraterritorial', 'novibazar', 'vassalage']
weighted: tensor([0.7386])
Label: 0, Prediction: 0, logit: tensor([0.7386])
['ragusans', 'zamorin', 'luccari', 'lukarić', 'sponza']
['unprepared', 'dalmatia', 'vlaho', 'epidaurus', 'njegoš']
Got the keywords in 1.5664 seconds
Got the embeddings and comparisons in 0.0008 seconds
Coherence Map: [['pelješac', 'ooryphas', 'dalmatia'], ['dubrovačka', 'pelješac', 'pelješac'], ['bosnasaray', 'dubrovačka', 'florentine'], ['ragusans', 'bosnasaray', 'zamorin'], ['unprepared', 'ragusans', 'dalmatia']], KW Curr: ['ragusans', 

['males', 'census', 'female', 'females', 'population']
['torzymska', 'lubusz', 'postglacial', 'lake', 'western']
Got the keywords in 0.8905 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['speedway', 'owenton', 'xfinity'], ['males', 'speedway', 'census'], ['torzymska', 'males', 'lubusz']], KW Curr: ['males', 'census', 'female', 'females', 'population']
weighted: tensor([0.7642])
Label: 1, Prediction: 0, logit: tensor([0.7642])
['torzymska', 'lubusz', 'postglacial', 'lake', 'western']
['głębiniec', 'łęczna', 'długie', 'rzepsko', 'rzepin']
Got the keywords in 0.5278 seconds
Got the embeddings and comparisons in 0.0003 seconds
Coherence Map: [['speedway', 'owenton', 'xfinity'], ['males', 'speedway', 'census'], ['torzymska', 'males', 'lubusz'], ['głębiniec', 'torzymska', 'łęczna']], KW Curr: ['torzymska', 'lubusz', 'postglacial', 'lake', 'western']
weighted: tensor([0.7314])
Label: 0, Prediction: 0, logit: tensor([0.7314])
['głębiniec', 'łęczna', 'długie', 'rz

['moharraq', 'zubaida', 'hamdani', 'abdulazeez', 'muharraq']
['zayayina', 'sanqal', 'khmais', 'khatir', 'ghatim']
Got the keywords in 1.0046 seconds
Got the embeddings and comparisons in 0.0006 seconds
Coherence Map: [['zayayina', 'moharraq', 'sanqal']], KW Curr: ['moharraq', 'zubaida', 'hamdani', 'abdulazeez', 'muharraq']
weighted: tensor([0.6685])
Label: 0, Prediction: 1, logit: tensor([0.6685])
['zayayina', 'sanqal', 'khmais', 'khatir', 'ghatim']
['1803', 'formed', 'town', '1787', '1780']
Got the keywords in 0.7979 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['formed', 'zayayina', 'town']], KW Curr: ['zayayina', 'sanqal', 'khmais', 'khatir', 'ghatim']
weighted: tensor([0.7142])
Label: 1, Prediction: 0, logit: tensor([0.7142])
['1803', 'formed', 'town', '1787', '1780']
['oconee', '571528', 'eatonton', '180921', 'georgia']
Got the keywords in 0.4305 seconds
Got the embeddings and comparisons in 0.0005 seconds
Coherence Map: [['formed', 'zayayina', 'tow

['higashiyodogawa', 'kōtoku', 'kojiki', 'toyosaki', 'yamato']
['nobunaga', 'heihachirō', 'ishiyama', 'kabuki', 'hideyoshi']
Got the keywords in 1.4625 seconds
Got the embeddings and comparisons in 0.0006 seconds
pruning... 6
done pruning... 5
Coherence Map: [['formed', 'zayayina', 'town'], ['oconee', 'formed', 'eatonton'], ['males', 'oconee', 'census'], ['oconee', 'males', 'schools'], ['orofino', 'oconee', 'mildest']], KW Curr: ['higashiyodogawa', 'kōtoku', 'kojiki', 'toyosaki', 'yamato']
weighted: tensor([0.7369])
Label: 0, Prediction: 0, logit: tensor([0.7369])
['nobunaga', 'heihachirō', 'ishiyama', 'kabuki', 'hideyoshi']
['slums', 'industrialization', 'daihatsu', 'oldsmobile', 'nishi']
Got the keywords in 1.5895 seconds
Got the embeddings and comparisons in 0.0017 seconds
pruning... 6
done pruning... 5
Coherence Map: [['formed', 'zayayina', 'town'], ['oconee', 'formed', 'eatonton'], ['males', 'oconee', 'census'], ['oconee', 'males', 'schools'], ['orofino', 'oconee', 'mildest']], KW 

['shōryō', 'ebisu', 'ikukunitama', 'sumiyoshi', 'matsuri']
['tennōji', 'planetarium', 'omnimax', 'celadon', 'storied']
Got the keywords in 0.9611 seconds
Got the embeddings and comparisons in 0.0010 seconds
pruning... 6
done pruning... 5
Coherence Map: [['kashiwagi', 'verbs', 'toshifumi'], ['disposing', 'kashiwagi', 'kepco'], ['hinterlands', 'disposing', 'sanyo'], ['kintetsu', 'hinterlands', 'kitakyushu'], ['yodobashi', 'kintetsu', 'tenjinbashi']], KW Curr: ['shōryō', 'ebisu', 'ikukunitama', 'sumiyoshi', 'matsuri']
weighted: tensor([0.7838])
Label: 0, Prediction: 0, logit: tensor([0.7838])
['tennōji', 'planetarium', 'omnimax', 'celadon', 'storied']
['newspapers', 'osaka', 'tokyo', 'newspaper', 'companies']
Got the keywords in 0.9124 seconds
Got the embeddings and comparisons in 0.0007 seconds
pruning... 6
done pruning... 5
Coherence Map: [['kashiwagi', 'verbs', 'toshifumi'], ['disposing', 'kashiwagi', 'kepco'], ['hinterlands', 'disposing', 'sanyo'], ['kintetsu', 'hinterlands', 'kitakyu

['courthouse', 'gratiot', 'downtown', 'districts', 'places']
['platted', 'alleys', 'jeffery', 'gratiot', 'village']
Got the keywords in 0.4870 seconds
Got the embeddings and comparisons in 0.0007 seconds
pruning... 6
done pruning... 5
Coherence Map: [['minamikawachi', 'nishinomiya', 'morinomiya'], ['busan', 'minamikawachi', 'cities'], ['census', 'busan', 'area'], [], []], KW Curr: ['courthouse', 'gratiot', 'downtown', 'districts', 'places']
weighted: tensor([0.7198])
Label: 0, Prediction: 0, logit: tensor([0.7198])
['platted', 'alleys', 'jeffery', 'gratiot', 'village']
['coached', 'state', 'present', '1977', 'track']
Got the keywords in 0.7276 seconds
Got the embeddings and comparisons in 0.0006 seconds
pruning... 6
done pruning... 5
Coherence Map: [['minamikawachi', 'nishinomiya', 'morinomiya'], ['busan', 'minamikawachi', 'cities'], ['census', 'busan', 'area'], [], []], KW Curr: ['platted', 'alleys', 'jeffery', 'gratiot', 'village']
weighted: tensor([0.7084])
Label: 0, Prediction: 0, 

['acadians', 'gaurhept', 'soybean', 'vigilante', 'broussard']
['acadians', 'billeaud', 'broussard', 'contemporaneous', 'leblanc']
Got the keywords in 1.1567 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['acadians', 'acadians', 'billeaud']], KW Curr: ['acadians', 'gaurhept', 'soybean', 'vigilante', 'broussard']
weighted: tensor([0.7343])
Label: 0, Prediction: 0, logit: tensor([0.7343])
['acadians', 'billeaud', 'broussard', 'contemporaneous', 'leblanc']
['142329', '963644', 'broussard', 'census', 'land']
Got the keywords in 0.9278 seconds
Got the embeddings and comparisons in 0.0006 seconds
Coherence Map: [['acadians', 'acadians', 'billeaud'], ['broussard', 'acadians', 'census']], KW Curr: ['acadians', 'billeaud', 'broussard', 'contemporaneous', 'leblanc']
weighted: tensor([0.6601])
Label: 0, Prediction: 1, logit: tensor([0.6601])
['142329', '963644', 'broussard', 'census', 'land']
['515', '676', '619', '346', '874']
Got the keywords in 0.9030 seconds
Got 

<IPython.core.display.Javascript object>

In [204]:
print([x[1] for x in predictions])
print(true_labels)

[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 

<IPython.core.display.Javascript object>

In [205]:
pred_string = "".join(str([x[1] for x in predictions]))
true_string = "".join(str(true_labels))

<IPython.core.display.Javascript object>

In [206]:
avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

<IPython.core.display.Javascript object>

In [207]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 6
wd = 0.4802675585284281
pk = 0.43210702341137125


<IPython.core.display.Javascript object>

## Prediction Tuning

In [216]:
pred_thresh = 0.64

<IPython.core.display.Javascript object>

In [217]:
modified_predictions = [
    1 if x < pred_thresh else 0 for x in [x[0] for x in predictions]
]

pred_string = "".join(str(modified_predictions))
true_string = "".join(str(true_labels))

avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

<IPython.core.display.Javascript object>

In [218]:
print(pred_string)
print(true_string)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 

<IPython.core.display.Javascript object>

In [219]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 6
wd = 0.3505016722408027
pk = 0.34448160535117056


<IPython.core.display.Javascript object>

## KeyBERT Embedding Comparison

In [172]:
curr = 230
prev = curr - 1

<IPython.core.display.Javascript object>

In [205]:
cohesion = coherence.get_coherence(
    [text_data[curr], text_data[prev]], coherence_threshold=0.25
)
print([k[0] for k in cohesion])

Got the keywords in 0.6567 seconds
Got the embeddings and comparisons in 0.0007 seconds
['cantonese', 'languages', 'vietnamese', 'communes']


<IPython.core.display.Javascript object>

In [206]:
# get the keywords for the current sentences
keywords_current = keywords_lib.get_keywords_with_kb_embeddings(text_data[curr])
keywords_prev = keywords_lib.get_keywords_with_kb_embeddings(text_data[prev])

# compute the word comparisons between the previous (with the coherence map)
# and the current (possibly the first sentence in a new segment)
word_comparisons_with_coherence, weights = compare_coherent_words(
    [keywords_prev], keywords_current
)

<IPython.core.display.Javascript object>

In [207]:
[(x[0], x[1]) for x in keywords_current], [(x[0], x[1]) for x in keywords_prev]

([('township', 0.2304),
  ('communes', 0.1857),
  ('hải', 0.1399),
  ('wards', 0.1397),
  ('đông', 0.1224)],
 [('cantonese', 0.5038),
  ('mandarin', 0.464),
  ('languages', 0.3483),
  ('language', 0.343),
  ('vietnamese', 0.3184)])

<IPython.core.display.Javascript object>

# KeyBERT Embedding Testing

In [679]:
docs = [
    "Hi my name is Devarsh",
    "Devarsh likes to play Basketball.",
    "I love to watch Cricket.",
    "I am a strong programmer. And my name is Devarsh",
]

<IPython.core.display.Javascript object>

In [680]:
from keybert import KeyBERT

kw_model = KeyBERT()
doc_embeddings, word_embeddings = kw_model.extract_embeddings(
    docs, min_df=1, stop_words="english"
)
keywords = kw_model.extract_keywords(
    docs,
    min_df=1,
    stop_words="english",
    doc_embeddings=doc_embeddings,
    word_embeddings=word_embeddings,
)

<IPython.core.display.Javascript object>

In [681]:
len(doc_embeddings)

4

<IPython.core.display.Javascript object>

In [682]:
len(word_embeddings)

10

<IPython.core.display.Javascript object>

In [683]:
keywords

[[('devarsh', 0.6267), ('hi', 0.5216)],
 [('devarsh', 0.6549),
  ('basketball', 0.5558),
  ('play', 0.3787),
  ('likes', 0.2284)],
 [('cricket', 0.7118), ('watch', 0.3656), ('love', 0.307)],
 [('programmer', 0.5942), ('devarsh', 0.5528), ('strong', 0.3452)]]

<IPython.core.display.Javascript object>

In [701]:
kw_model = KeyBERT()
import torch


def get_keywords_with_embeddings_test(
    data,
) -> list[tuple[str, float, torch.Tensor]]:
    doc_embeddings, word_embeddings = kw_model.extract_embeddings(data)

    keywords = kw_model.extract_keywords(
        data, doc_embeddings=doc_embeddings, word_embeddings=word_embeddings
    )

    keywords_with_embeddings = []
    count = 0
    print(len(word_embeddings))
    for i, (kw, we) in enumerate(zip(keywords, word_embeddings)):
        for j, words in enumerate(kw):
            keywords_with_embeddings.append((words[0], words[1], torch.tensor(we)))
            count += 1

    return keywords_with_embeddings

<IPython.core.display.Javascript object>

In [702]:
embeddings = get_keywords_with_embeddings_test(docs)

10


<IPython.core.display.Javascript object>

In [703]:
len(embeddings)

12

<IPython.core.display.Javascript object>