In [1]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

<IPython.core.display.Javascript object>

In [73]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint

from utils.metrics import windowdiff, pk

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string

<IPython.core.display.Javascript object>

In [4]:
# initialize the coherence library
coherence = Coherence(max_words_per_step=4)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Metal device set to: Apple M1 Max


2023-04-10 19:55:51.283054: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-04-10 19:55:51.283490: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-04-10 19:55:53.189197: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-04-10 19:55:53.220352: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-10 19:55:54.102574: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [5]:
# initialize the keywords and embeddings library
pp = pprint.PrettyPrinter(indent=4)
similarities_lib = Similarities("bert-base-uncased")
keywords_lib = Keywords(similarities_lib.model, similarities_lib.tokenizer)
embedding_lib = Embedding(similarities_lib.model, similarities_lib.tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-04-10 19:56:09.444770: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [6]:
dataset_type = "city"
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [7]:
data = table.get_all()

text_data = [x[1] for x in data]
text_labels = [x[2] for x in data]

<IPython.core.display.Javascript object>

In [8]:
all_segments = table.get_all_segments()
text_segments = [[y[1] for y in x] for x in all_segments]
segments_labels = [[1 if i == 0 else 0 for i, y in enumerate(x)] for x in all_segments]

<IPython.core.display.Javascript object>

In [9]:
samples = 5
max_tokens = 400

for i, (segment, labels) in enumerate(
    zip(text_segments[:samples], segments_labels[:samples])
):
    for sentence, label in zip(segment, labels):
        # this is the training case. During inference, we will have no idea
        # when segments start and when they end.
        pass

<IPython.core.display.Javascript object>

In [10]:
text_labels[:25]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

<IPython.core.display.Javascript object>

In [72]:
pruning = 0  # remove the lowest n important words from coherence map
pruning_min = 10  # only prune after n words in the coherence map


def get_weighted_average(weighted_similarities, weights):
    return sum(weighted_similarities) / sum(weights)


# importance testing
def compare_coherent_words(coherence_map, keywords_current, suppress_errors=False):
    word_comparisons = []
    weights = []
    for i, keywords in enumerate(coherence_map[::-1]):
        for word_tuple in keywords:
            word = word_tuple[0]
            for second_word_tuple in keywords_current:
                second_word = second_word_tuple[0]

                try:
                    word_one_emb = word_tuple[2]
                    word_two_emb = second_word_tuple[2]
                    weight = 1 / (
                        i + 1
                    )  # this weight is a recipricol function that will grow smaller the further the keywords are away

                    word_comparisons.append(
                        (
                            word,
                            second_word,
                            weight
                            * embedding_lib.get_similarity(word_one_emb, word_two_emb),
                        )
                    )
                    weights.append(weight)
                except AssertionError as e:
                    if not suppress_errors:
                        print(e, word, second_word)

    return word_comparisons, weights


# TODO: add weighted average: https://www.google.com/search?q=weighted+average&rlz=1C5CHFA_enCA1019CA1024&sxsrf=APwXEdcb6dhJ5L_mvWvrWr4AxQcxOFB01g:1681098698316&tbm=isch&source=iu&ictx=1&vet=1&fir=V-LTDKtCElo89M%252C2WVwd1NrPkHFOM%252C_%253BVGk_lj0HALhXQM%252C2WVwd1NrPkHFOM%252C_%253ByzfbB4i3SpPTFM%252C5e7an03wLAdfhM%252C_%253B47HYmoDH6WlThM%252CsRXbJWfpyOLEOM%252C_%253BOsB4jtfzenfuyM%252CHKcmLkpfJ3xWqM%252C_&usg=AI4_-kRmBXgUWAm_nR3vDsLT17TqM5AvSQ&sa=X&ved=2ahUKEwi6hvvVtJ7-AhXJkIkEHe4JCX4Q_h16BAgoEAE#imgrc=V-LTDKtCElo89M
def coherence_tester(
    text_data, text_labels, max_tokens=400, max_str_length=30, prediction_thresh=0.25
):
    coherence_map = []
    predictions = []
    for i, (row, label) in enumerate(zip(text_data, text_labels)):
        # compare the current sentence to the previous one
        if i == 0:
            predictions.append((0, 0))
        else:
            prev_row = text_data[i - 1]

            row = truncate_by_token(row, max_tokens)
            prev_row = truncate_by_token(prev_row, max_tokens)

            # add the keywords to the coherence map
            coherence_map.append(
                coherence.get_coherence([row, prev_row], coherence_threshold=0.2)
            )
            print(f"Coherence Map: {[[x[0] for x in c] for c in coherence_map]}")
            if pruning > 0 and len(coherence_map) >= pruning_min:
                print("pruning...", len(coherence_map))
                sorted_map = sorted(
                    coherence_map, key=lambda tup: tup[1]
                )  # sort asc by importance based on keybert
                coherence_map = sorted_map[pruning:][
                    ::-1
                ]  # get the last n - pruning values and reverse the list
                print("done pruning...", len(coherence_map))

            # truncate the strings for printing
            truncated_row = truncate_string(row, max_str_length)
            truncated_prev_row = truncate_string(prev_row, max_str_length)

            # get the keywords for the current sentences
            keywords_current = keywords_lib.get_keywords_with_embeddings(row)
            keywords_prev = keywords_lib.get_keywords_with_embeddings(prev_row)

            # compute the word comparisons between the previous (with the coherence map)
            # and the current (possibly the first sentence in a new segment)
            word_comparisons_with_coherence, weights = compare_coherent_words(
                [*coherence_map, keywords_prev], keywords_current
            )

            similarities_with_coherence = [
                comparison[2] for comparison in word_comparisons_with_coherence
            ]
            avg_similarity_with_coherence = sum(similarities_with_coherence) / len(
                similarities_with_coherence
            )
            weighted_avg_similarity_with_coherence = get_weighted_average(
                similarities_with_coherence, weights
            )
            print(f"weighted: {weighted_avg_similarity_with_coherence}")

            # if the two sentences are similar, create a cohesive prediction
            # otherwise, predict a new segment
            if weighted_avg_similarity_with_coherence > prediction_thresh:
                print(
                    f"Label: {label}, Prediction: {0}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 0))
            else:
                # start of a new segment, empty the map
                coherence_map = []
                print(
                    f"Label: {label}, Prediction: {1}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 1))

            print("===============================================")

    return predictions

<IPython.core.display.Javascript object>

In [83]:
start = 250
num_samples = 1000
max_tokens = 256  # want to keep this under 512
max_str_length = 30

true_labels = text_labels[start : start + num_samples]

predictions = coherence_tester(
    text_data[start : start + num_samples],
    true_labels,
    max_tokens=max_tokens,
    max_str_length=max_str_length,
)

Coherence Map: [['atlanta', 'radio', 'mississippi', 'radio']]
weighted: tensor([0.3013])
Label: 0, Prediction: 0, logit: tensor([0.3013])
Coherence Map: [['atlanta', 'radio', 'mississippi', 'radio'], ['margaret', 'atlanta', 'reed', 'atlanta']]
weighted: tensor([0.2695])
Label: 0, Prediction: 0, logit: tensor([0.2695])
Coherence Map: [['atlanta', 'radio', 'mississippi', 'radio'], ['margaret', 'atlanta', 'reed', 'atlanta'], ['postal', 'margaret', 'geological', 'margaret']]
weighted: tensor([0.2791])
Label: 1, Prediction: 0, logit: tensor([0.2791])
Coherence Map: [['atlanta', 'radio', 'mississippi', 'radio'], ['margaret', 'atlanta', 'reed', 'atlanta'], ['postal', 'margaret', 'geological', 'margaret'], ['census', 'postal', 'area', 'postal']]
weighted: tensor([0.2672])
Label: 0, Prediction: 0, logit: tensor([0.2672])
Coherence Map: [['atlanta', 'radio', 'mississippi', 'radio'], ['margaret', 'atlanta', 'reed', 'atlanta'], ['postal', 'margaret', 'geological', 'margaret'], ['census', 'postal',

weighted: tensor([0.3131])
Label: 0, Prediction: 0, logit: tensor([0.3131])
Coherence Map: [['hindu', 'maurya', 'muslim', 'maurya'], ['districts', 'hindu', 'plots', 'hindu'], ['climate', 'districts', 'winter', 'districts'], ['grapefruits', 'climate', 'grapefruit', 'climate'], ['pakistan', 'grapefruits', 'afghan', 'grapefruits'], ['afghanistan', 'afghanistan', 'afghan', 'afghanistan'], ['garrison', 'afghanistan', 'settlement', 'afghanistan'], ['sanitation', 'garrison', 'mayor', 'garrison'], ['apes', 'sanitation', 'palace', 'sanitation'], ['climate', 'cameroon', 'rainy', 'cameroon'], ['bus', 'climate', 'buses', 'climate']]
weighted: tensor([0.3205])
Label: 0, Prediction: 0, logit: tensor([0.3205])
Coherence Map: [['hindu', 'maurya', 'muslim', 'maurya'], ['districts', 'hindu', 'plots', 'hindu'], ['climate', 'districts', 'winter', 'districts'], ['grapefruits', 'climate', 'grapefruit', 'climate'], ['pakistan', 'grapefruits', 'afghan', 'grapefruits'], ['afghanistan', 'afghanistan', 'afghan',

Coherence Map: [['districts', 'wetlands', 'settlements', 'wetlands'], ['baden', 'districts', 'mainz', 'districts'], ['mayor', 'baden', 'municipal', 'baden'], ['lion', 'mayor', 'symbols', 'mayor'], ['hockenheim', 'lion', 'racing', 'lion'], ['buildings', 'hockenheim', 'building', 'hockenheim'], ['germany', 'buildings', 'reunification', 'buildings'], ['route', 'samba', 'rt', 'samba']]
weighted: tensor([0.2399])
Label: 1, Prediction: 1, logit: tensor([0.2399])
Coherence Map: [['geographical', 'route', 'wilmington', 'route']]
weighted: tensor([0.2859])
Label: 0, Prediction: 0, logit: tensor([0.2859])
Coherence Map: [['geographical', 'route', 'wilmington', 'route'], ['median', 'geographical', 'census', 'geographical']]
weighted: tensor([0.2289])
Label: 0, Prediction: 1, logit: tensor([0.2289])
Coherence Map: [['charlie', 'median', 'wilmington', 'census']]
weighted: tensor([0.2066])
Label: 0, Prediction: 1, logit: tensor([0.2066])
Coherence Map: [['founded', 'wilmington', 'founded', 'song']]


Coherence Map: [['province', 'regime', 'manufactures', 'regime'], ['zinc', 'textiles', 'copper', 'textiles']]
weighted: tensor([0.3813])
Label: 0, Prediction: 0, logit: tensor([0.3813])
Coherence Map: [['province', 'regime', 'manufactures', 'regime'], ['zinc', 'textiles', 'copper', 'textiles'], ['railway', 'zinc', 'lines', 'zinc']]
weighted: tensor([0.3265])
Label: 0, Prediction: 0, logit: tensor([0.3265])
Coherence Map: [['province', 'regime', 'manufactures', 'regime'], ['zinc', 'textiles', 'copper', 'textiles'], ['railway', 'zinc', 'lines', 'zinc'], ['zambia', 'railway', 'tanzania', 'railway']]
weighted: tensor([0.3182])
Label: 0, Prediction: 0, logit: tensor([0.3182])
Coherence Map: [['province', 'regime', 'manufactures', 'regime'], ['zinc', 'textiles', 'copper', 'textiles'], ['railway', 'zinc', 'lines', 'zinc'], ['zambia', 'railway', 'tanzania', 'railway'], ['dialect', 'zambia', 'oriental', 'zambia']]
weighted: tensor([0.3125])
Label: 0, Prediction: 0, logit: tensor([0.3125])
Coher

weighted: tensor([0.2689])
Label: 0, Prediction: 0, logit: tensor([0.2689])
Coherence Map: [['county', 'courthouse', 'valdosta', 'courthouse'], ['census', 'county', 'median', 'county'], ['district', 'households', 'elementary', 'households'], ['oak', 'district', 'northwest', 'district']]
weighted: tensor([0.2649])
Label: 1, Prediction: 0, logit: tensor([0.2649])
Coherence Map: [['county', 'courthouse', 'valdosta', 'courthouse'], ['census', 'county', 'median', 'county'], ['district', 'households', 'elementary', 'households'], ['oak', 'district', 'northwest', 'district'], ['climate', 'oak', 'subtropical', 'oak']]
weighted: tensor([0.2487])
Label: 0, Prediction: 1, logit: tensor([0.2487])
Coherence Map: [['median', 'climate', 'census', 'climate']]
weighted: tensor([0.3403])
Label: 0, Prediction: 0, logit: tensor([0.3403])
Coherence Map: [['median', 'climate', 'census', 'climate'], ['red', 'households', 'elementary', 'households']]
weighted: tensor([0.2361])
Label: 0, Prediction: 1, logit: 

weighted: tensor([0.3503])
Label: 0, Prediction: 0, logit: tensor([0.3503])
Coherence Map: [['jayhawk', 'mustangs', 'college', 'mustangs'], ['mughal', 'jayhawk', 'maharaja', 'jayhawk'], ['ahmedabad', 'mughals', 'jammu', 'mughals'], ['climate', 'ahmedabad', 'temperatures', 'ahmedabad'], ['rajasthan', 'climate', 'palaces', 'climate'], ['palace', 'rajasthan', 'designed', 'rajasthan'], ['fort', 'palace', 'built', 'palace'], ['durga', 'fort', 'mata', 'fort'], ['temple', 'durga', 'rules', 'durga'], ['equine', 'temple', 'horses', 'temple']]
weighted: tensor([0.2496])
Label: 0, Prediction: 1, logit: tensor([0.2496])
Coherence Map: [['mongols', 'equine', 'dynasty', 'equine']]
weighted: tensor([0.2650])
Label: 1, Prediction: 0, logit: tensor([0.2650])
Coherence Map: [['mongols', 'equine', 'dynasty', 'equine'], ['mountains', 'mongols', 'northeast', 'mongols']]
weighted: tensor([0.2829])
Label: 0, Prediction: 0, logit: tensor([0.2829])
Coherence Map: [['mongols', 'equine', 'dynasty', 'equine'], ['

weighted: tensor([0.3353])
Label: 0, Prediction: 0, logit: tensor([0.3353])
Coherence Map: [['mongols', 'equine', 'dynasty', 'equine'], ['mountains', 'mongols', 'northeast', 'mongols'], ['humid', 'mountains', 'rainfall', 'mountains'], ['districts', 'humid', 'counties', 'humid'], ['mandarin', 'districts', 'beijing', 'districts'], ['tianjin', 'mandarin', 'beijing', 'mandarin'], ['turbines', 'tianjin', 'turbine', 'tianjin'], ['railway', 'turbines', 'beijing', 'turbines'], ['armies', 'railway', 'army', 'railway'], ['streets', 'armies', 'street', 'armies'], ['county', 'streets', 'relics', 'streets'], ['muscovy', 'qing', 'steppe', 'qing'], ['azov', 'muscovy', 'crimea', 'muscovy'], ['donetsk', 'azov', 'separatists', 'azov'], ['steppe', 'donetsk', 'donetsk', 'donetsk'], ['climate', 'steppe', 'crops', 'steppe']]
weighted: tensor([0.2699])
Label: 0, Prediction: 0, logit: tensor([0.2699])
Coherence Map: [['mongols', 'equine', 'dynasty', 'equine'], ['mountains', 'mongols', 'northeast', 'mongols'],

weighted: tensor([0.2838])
Label: 0, Prediction: 0, logit: tensor([0.2838])
Coherence Map: [['day', 'azure', 'holiday', 'azure'], ['armenians', 'day', 'greeks', 'day'], ['ukrainians', 'armenians', 'ukraine', 'armenians'], ['dialects', 'ukrainians', 'dialect', 'ukrainians'], ['borough', 'dialects', 'churches', 'dialects'], ['occupation', 'borough', 'economy', 'borough'], ['industrial', 'unemployment', 'metallurgy', 'unemployment'], ['million', 'industrial', 'gdp', 'industrial'], ['libraries', 'million', 'library', 'million'], ['ukrainian', 'libraries', 'ukraine', 'libraries']]
weighted: tensor([0.3043])
Label: 0, Prediction: 0, logit: tensor([0.3043])
Coherence Map: [['day', 'azure', 'holiday', 'azure'], ['armenians', 'day', 'greeks', 'day'], ['ukrainians', 'armenians', 'ukraine', 'armenians'], ['dialects', 'ukrainians', 'dialect', 'ukrainians'], ['borough', 'dialects', 'churches', 'dialects'], ['occupation', 'borough', 'economy', 'borough'], ['industrial', 'unemployment', 'metallurgy',

weighted: tensor([0.3927])
Label: 0, Prediction: 0, logit: tensor([0.3927])
Coherence Map: [['day', 'azure', 'holiday', 'azure'], ['armenians', 'day', 'greeks', 'day'], ['ukrainians', 'armenians', 'ukraine', 'armenians'], ['dialects', 'ukrainians', 'dialect', 'ukrainians'], ['borough', 'dialects', 'churches', 'dialects'], ['occupation', 'borough', 'economy', 'borough'], ['industrial', 'unemployment', 'metallurgy', 'unemployment'], ['million', 'industrial', 'gdp', 'industrial'], ['libraries', 'million', 'library', 'million'], ['ukrainian', 'libraries', 'ukraine', 'libraries'], ['beaches', 'ukrainian', 'coast', 'ukrainian'], ['monuments', 'beaches', 'monument', 'beaches'], ['romance', 'monuments', 'club', 'monuments'], ['sport', 'romance', 'sports', 'romance'], ['ukraine', 'sport', 'donetsk', 'sport'], ['civil', 'donetsk', 'residential', 'donetsk'], ['streets', 'buildings', 'street', 'buildings'], ['trams', 'streets', 'buses', 'streets'], ['telephone', 'trams', 'communications', 'trams']

weighted: tensor([0.3006])
Label: 0, Prediction: 0, logit: tensor([0.3006])
Coherence Map: [['day', 'azure', 'holiday', 'azure'], ['armenians', 'day', 'greeks', 'day'], ['ukrainians', 'armenians', 'ukraine', 'armenians'], ['dialects', 'ukrainians', 'dialect', 'ukrainians'], ['borough', 'dialects', 'churches', 'dialects'], ['occupation', 'borough', 'economy', 'borough'], ['industrial', 'unemployment', 'metallurgy', 'unemployment'], ['million', 'industrial', 'gdp', 'industrial'], ['libraries', 'million', 'library', 'million'], ['ukrainian', 'libraries', 'ukraine', 'libraries'], ['beaches', 'ukrainian', 'coast', 'ukrainian'], ['monuments', 'beaches', 'monument', 'beaches'], ['romance', 'monuments', 'club', 'monuments'], ['sport', 'romance', 'sports', 'romance'], ['ukraine', 'sport', 'donetsk', 'sport'], ['civil', 'donetsk', 'residential', 'donetsk'], ['streets', 'buildings', 'street', 'buildings'], ['trams', 'streets', 'buses', 'streets'], ['telephone', 'trams', 'communications', 'trams']

Coherence Map: [['islands', 'croatian', 'peninsula', 'croatian']]
weighted: tensor([0.3183])
Label: 0, Prediction: 0, logit: tensor([0.3183])
Coherence Map: [['islands', 'croatian', 'peninsula', 'croatian'], ['slavs', 'islands', 'croats', 'islands']]
weighted: tensor([0.3099])
Label: 0, Prediction: 0, logit: tensor([0.3099])
Coherence Map: [['islands', 'croatian', 'peninsula', 'croatian'], ['slavs', 'islands', 'croats', 'islands'], ['byzantium', 'byzantine', 'byzantine', 'byzantine']]
weighted: tensor([0.3999])
Label: 0, Prediction: 0, logit: tensor([0.3999])
Coherence Map: [['islands', 'croatian', 'peninsula', 'croatian'], ['slavs', 'islands', 'croats', 'islands'], ['byzantium', 'byzantine', 'byzantine', 'byzantine'], ['venice', 'byzantium', 'adriatic', 'byzantium']]
weighted: tensor([0.3911])
Label: 0, Prediction: 0, logit: tensor([0.3911])
Coherence Map: [['islands', 'croatian', 'peninsula', 'croatian'], ['slavs', 'islands', 'croats', 'islands'], ['byzantium', 'byzantine', 'byzantin

weighted: tensor([0.4309])
Label: 0, Prediction: 0, logit: tensor([0.4309])
Coherence Map: [['islands', 'croatian', 'peninsula', 'croatian'], ['slavs', 'islands', 'croats', 'islands'], ['byzantium', 'byzantine', 'byzantine', 'byzantine'], ['venice', 'byzantium', 'adriatic', 'byzantium'], ['bosnian', 'venice', 'bosnia', 'venice'], ['ottomans', 'bosnian', 'ottoman', 'bosnian'], ['venetian', 'ottomans', 'baroque', 'ottomans'], ['ottomans', 'venetian', 'treaty', 'venetian'], ['napoleonic', 'ottomans', 'nobility', 'ottomans'], ['aristocracy', 'napoleonic', 'nobility', 'napoleonic'], ['aristocracy', 'aristocracy', 'families', 'aristocracy'], ['nobility', 'aristocracy', 'absolutism', 'aristocracy'], ['croatian', 'nobility', 'flag', 'nobility'], ['inhabitants', 'croatian', 'dubrovnik', 'croatian'], ['dialect', 'inhabitants', 'latin', 'inhabitants'], ['croatian', 'dialect', 'croatia', 'dialect'], ['croatian', 'croatian', 'croats', 'croatian']]
weighted: tensor([0.4638])
Label: 0, Prediction: 0,

weighted: tensor([0.2918])
Label: 0, Prediction: 0, logit: tensor([0.2918])
Coherence Map: [['tributary', 'valley', 'river', 'valley'], ['railway', 'tributary', 'founded', 'tributary'], ['etymology', 'railway', 'turnip', 'railway'], ['craftsmen', 'etymology', 'citizens', 'etymology'], ['medieval', 'craftsmen', 'alley', 'craftsmen'], ['woodland', 'medieval', 'forests', 'medieval'], ['square', 'woodland', 'squares', 'woodland'], ['cinema', 'square', 'cultural', 'square'], ['warsaw', 'cinema', 'frankfurt', 'cinema'], ['coal', 'warsaw', 'industry', 'warsaw'], ['county', 'coal', 'schools', 'coal'], ['county', 'county', 'southeast', 'county']]
weighted: tensor([0.2889])
Label: 0, Prediction: 0, logit: tensor([0.2889])
Coherence Map: [['tributary', 'valley', 'river', 'valley'], ['railway', 'tributary', 'founded', 'tributary'], ['etymology', 'railway', 'turnip', 'railway'], ['craftsmen', 'etymology', 'citizens', 'etymology'], ['medieval', 'craftsmen', 'alley', 'craftsmen'], ['woodland', 'medie

weighted: tensor([0.2220])
Label: 0, Prediction: 1, logit: tensor([0.2220])
Coherence Map: [['tokyo', 'humid', 'japanese', 'humid']]
weighted: tensor([0.3149])
Label: 0, Prediction: 0, logit: tensor([0.3149])
Coherence Map: [['tokyo', 'humid', 'japanese', 'humid'], ['osaka', 'osaka', 'shinsekai', 'osaka']]
weighted: tensor([0.3481])
Label: 0, Prediction: 0, logit: tensor([0.3481])
Coherence Map: [['tokyo', 'humid', 'japanese', 'humid'], ['osaka', 'osaka', 'shinsekai', 'osaka'], ['tokyo', 'osaka', 'koreans', 'osaka']]
weighted: tensor([0.3668])
Label: 0, Prediction: 0, logit: tensor([0.3668])
Coherence Map: [['tokyo', 'humid', 'japanese', 'humid'], ['osaka', 'osaka', 'shinsekai', 'osaka'], ['tokyo', 'osaka', 'koreans', 'osaka'], ['kansai', 'tokyo', 'osaka', 'tokyo']]
weighted: tensor([0.3168])
Label: 0, Prediction: 0, logit: tensor([0.3168])
Coherence Map: [['tokyo', 'humid', 'japanese', 'humid'], ['osaka', 'osaka', 'shinsekai', 'osaka'], ['tokyo', 'osaka', 'koreans', 'osaka'], ['kansai

weighted: tensor([0.4463])
Label: 0, Prediction: 0, logit: tensor([0.4463])
Coherence Map: [['tokyo', 'humid', 'japanese', 'humid'], ['osaka', 'osaka', 'shinsekai', 'osaka'], ['tokyo', 'osaka', 'koreans', 'osaka'], ['kansai', 'tokyo', 'osaka', 'tokyo'], ['mayors', 'kansai', 'municipal', 'kansai'], ['kyoto', 'mayor', 'osaka', 'mayor'], ['osaka', 'kyoto', 'tokyo', 'kyoto'], ['shinkansen', 'osaka', 'osaka', 'osaka'], ['akihabara', 'shinkansen', 'osaka', 'shinkansen'], ['osaka', 'akihabara', 'matsuri', 'akihabara'], ['osaka', 'osaka', 'ceramics', 'osaka'], ['osaka', 'osaka', 'tokyo', 'osaka'], ['osaka', 'osaka', 'nihon', 'osaka'], ['nhk', 'osaka', 'osaka', 'osaka'], ['osaka', 'nhk', 'izumi', 'nhk'], ['osaka', 'osaka', 'nippon', 'osaka'], ['osaka', 'osaka', 'kansai', 'osaka'], ['budapest', 'osaka', 'buenos', 'osaka']]
weighted: tensor([0.2974])
Label: 0, Prediction: 0, logit: tensor([0.2974])
Coherence Map: [['tokyo', 'humid', 'japanese', 'humid'], ['osaka', 'osaka', 'shinsekai', 'osaka'], 

Coherence Map: [['heritage', 'creole', 'properties', 'creole'], ['area', 'heritage', 'miles', 'heritage']]
weighted: tensor([0.2477])
Label: 0, Prediction: 1, logit: tensor([0.2477])
Coherence Map: [['households', 'km²', 'census', 'km²']]
weighted: tensor([0.3552])
Label: 0, Prediction: 0, logit: tensor([0.3552])
Coherence Map: [['households', 'km²', 'census', 'km²'], ['triomphe', 'population']]
weighted: tensor([0.1670])
Label: 0, Prediction: 1, logit: tensor([0.1670])
Coherence Map: [['texas', 'golf', 'southwest', 'club']]
weighted: tensor([0.2339])
Label: 1, Prediction: 1, logit: tensor([0.2339])
Coherence Map: [['census', 'odessa', 'median', 'southwest']]
weighted: tensor([0.2650])
Label: 0, Prediction: 0, logit: tensor([0.2650])
Coherence Map: [['census', 'odessa', 'median', 'southwest'], ['district', 'households', 'school', 'households']]
weighted: tensor([0.2339])
Label: 0, Prediction: 1, logit: tensor([0.2339])
Coherence Map: [['jews', 'county', 'jewish', 'county']]
weighted: t

weighted: tensor([0.2831])
Label: 0, Prediction: 0, logit: tensor([0.2831])
Coherence Map: [['police', 'assam', 'officers', 'assam'], ['judiciary', 'police', 'court', 'police'], ['infrastructure', 'judiciary', 'nehru', 'judiciary'], ['metro', 'infrastructure', 'cities', 'infrastructure'], ['kerosene', 'guwahati', 'refinery', 'metro'], ['kolkata', 'refinery', 'mumbai', 'refinery'], ['railways', 'kolkata', 'railway', 'kolkata'], ['buses', 'railways', 'bus', 'railways'], ['transport', 'buses', 'transportation', 'buses'], ['institutes', 'transport', 'assam', 'transport'], ['stadium', 'institutes', 'stadiums', 'institutes'], ['doordarshan', 'stadium', 'airtel', 'stadium'], ['bihar', 'doordarshan', 'maharashtra', 'doordarshan']]
weighted: tensor([0.3308])
Label: 0, Prediction: 0, logit: tensor([0.3308])
Coherence Map: [['police', 'assam', 'officers', 'assam'], ['judiciary', 'police', 'court', 'police'], ['infrastructure', 'judiciary', 'nehru', 'judiciary'], ['metro', 'infrastructure', 'citie

weighted: tensor([0.3097])
Label: 0, Prediction: 0, logit: tensor([0.3097])
Coherence Map: [['municipality', 'apparel', 'san', 'apparel'], ['taino', 'municipality', 'conquistadors', 'municipality'], ['tobacco', 'taino', 'cigar', 'taino'], ['university', 'tobacco', 'institution', 'tobacco'], ['municipality', 'university', 'density', 'university'], ['iata', 'municipality', 'icao', 'municipality'], ['fc', 'iata', 'río', 'iata']]
weighted: tensor([0.3116])
Label: 0, Prediction: 0, logit: tensor([0.3116])
Coherence Map: [['municipality', 'apparel', 'san', 'apparel'], ['taino', 'municipality', 'conquistadors', 'municipality'], ['tobacco', 'taino', 'cigar', 'taino'], ['university', 'tobacco', 'institution', 'tobacco'], ['municipality', 'university', 'density', 'university'], ['iata', 'municipality', 'icao', 'municipality'], ['fc', 'iata', 'río', 'iata'], ['novgorod', 'fc', 'leningrad', 'fc']]
weighted: tensor([0.2364])
Label: 1, Prediction: 1, logit: tensor([0.2364])
Coherence Map: [['municip

Coherence Map: [['vicksburg', 'mississippi', 'muskets', 'mississippi'], ['riverfront', 'vicksburg', 'foundation', 'vicksburg'], ['households', 'riverfront', 'median', 'riverfront'], ['founded', 'households', 'nomadic', 'households'], ['museum', 'tributary', 'exhibits', 'tributary']]
weighted: tensor([0.2860])
Label: 0, Prediction: 0, logit: tensor([0.2860])
Coherence Map: [['vicksburg', 'mississippi', 'muskets', 'mississippi'], ['riverfront', 'vicksburg', 'foundation', 'vicksburg'], ['households', 'riverfront', 'median', 'riverfront'], ['founded', 'households', 'nomadic', 'households'], ['museum', 'tributary', 'exhibits', 'tributary'], ['administrative', 'museum', 'municipal', 'museum']]
weighted: tensor([0.2439])
Label: 0, Prediction: 1, logit: tensor([0.2439])
Coherence Map: [['dupage', 'district']]
weighted: tensor([0.1784])
Label: 1, Prediction: 1, logit: tensor([0.1784])
Coherence Map: [['area', 'tavern', 'land', 'tavern']]
weighted: tensor([0.2669])
Label: 0, Prediction: 0, logit

Coherence Map: [['whaling', '1914', 'whales', '1914'], ['island', 'whaling', 'settlement', 'whaling'], ['oceanic', 'island', 'classification', 'isthmus']]
weighted: tensor([0.1920])
Label: 0, Prediction: 1, logit: tensor([0.1920])
Coherence Map: [['census', 'climate', 'households', 'climate']]
weighted: tensor([0.2669])
Label: 0, Prediction: 0, logit: tensor([0.2669])
Coherence Map: [['census', 'climate', 'households', 'climate'], ['city', 'census', 'council', 'census']]
weighted: tensor([0.2113])
Label: 0, Prediction: 1, logit: tensor([0.2113])
Coherence Map: [['electric', 'mayor', 'bills', 'mayor']]
weighted: tensor([0.2932])
Label: 0, Prediction: 0, logit: tensor([0.2932])
Coherence Map: [['electric', 'mayor', 'bills', 'mayor'], ['chatham', 'fishing', 'elementary', 'electric']]
weighted: tensor([0.2639])
Label: 0, Prediction: 0, logit: tensor([0.2639])
Coherence Map: [['electric', 'mayor', 'bills', 'mayor'], ['chatham', 'fishing', 'elementary', 'electric'], ['railroad', 'elementary'

Coherence Map: [['libraries', 'school', 'library', 'school']]
weighted: tensor([0.3274])
Label: 0, Prediction: 0, logit: tensor([0.3274])
Coherence Map: [['libraries', 'school', 'library', 'school'], ['census', 'libraries', 'area', 'libraries']]
weighted: tensor([0.2795])
Label: 1, Prediction: 0, logit: tensor([0.2795])
Coherence Map: [['libraries', 'school', 'library', 'school'], ['census', 'libraries', 'area', 'libraries'], ['township', 'census', 'erie', 'area']]
weighted: tensor([0.2184])
Label: 0, Prediction: 1, logit: tensor([0.2184])
Coherence Map: [['households', 'erie', 'census', 'erie']]
weighted: tensor([0.2509])
Label: 0, Prediction: 0, logit: tensor([0.2509])
Coherence Map: [['households', 'erie', 'census', 'erie'], ['households', 'households', 'median', 'households']]
weighted: tensor([0.4911])
Label: 0, Prediction: 0, logit: tensor([0.4911])
Coherence Map: [['households', 'erie', 'census', 'erie'], ['households', 'households', 'median', 'households'], ['preschool', 'house

weighted: tensor([0.4223])
Label: 0, Prediction: 0, logit: tensor([0.4223])
Coherence Map: [['poland', 'denoting', 'teutonic', 'denoting'], ['villages', 'poland', 'village', 'poland'], ['nobility', 'villages', 'mayor', 'villages'], ['landowners', 'nobility', 'nobles', 'nobility'], ['habsburg', 'landowners', 'protestant', 'landowners'], ['prussia', 'habsburg', 'prussian', 'habsburg'], ['prussian', 'prussia', 'prussia', 'prussia'], ['prussia', 'prussian', 'prussian', 'prussian']]
weighted: tensor([0.4315])
Label: 0, Prediction: 0, logit: tensor([0.4315])
Coherence Map: [['poland', 'denoting', 'teutonic', 'denoting'], ['villages', 'poland', 'village', 'poland'], ['nobility', 'villages', 'mayor', 'villages'], ['landowners', 'nobility', 'nobles', 'nobility'], ['habsburg', 'landowners', 'protestant', 'landowners'], ['prussia', 'habsburg', 'prussian', 'habsburg'], ['prussian', 'prussia', 'prussia', 'prussia'], ['prussia', 'prussian', 'prussian', 'prussian'], ['prussian', 'prussia', 'prussia',

weighted: tensor([0.3179])
Label: 0, Prediction: 0, logit: tensor([0.3179])
Coherence Map: [['population', 'bolivia', 'inhabitants', 'bolivia'], ['prefecture', 'census', 'flooding', 'census'], ['humid', 'prefecture', 'climate', 'prefecture'], ['census', 'humid', 'population', 'humid'], ['shogunate', 'census', 'meiji', 'census'], ['elected', 'shogunate', 'unicameral', 'shogunate'], ['regional', 'mayor', 'dominating', 'mayor'], ['school', 'regional', 'public', 'regional'], ['administrative', 'school', 'municipality', 'school']]
weighted: tensor([0.2661])
Label: 1, Prediction: 0, logit: tensor([0.2661])
Coherence Map: [['population', 'bolivia', 'inhabitants', 'bolivia'], ['prefecture', 'census', 'flooding', 'census'], ['humid', 'prefecture', 'climate', 'prefecture'], ['census', 'humid', 'population', 'humid'], ['shogunate', 'census', 'meiji', 'census'], ['elected', 'shogunate', 'unicameral', 'shogunate'], ['regional', 'mayor', 'dominating', 'mayor'], ['school', 'regional', 'public', 'regi

Coherence Map: [['climate', 'regency', 'tropical', 'regency'], ['elections', 'climate', 'mayor', 'tropical'], ['indonesia', 'mayor', 'kota', 'mayor'], ['ethnic', 'surabaya', 'javanese', 'surabaya'], ['dialect', 'indonesians', 'indonesian', 'indonesians'], ['hinduism', 'dialect', 'indonesians', 'dialect'], ['ports', 'hinduism', 'exports', 'hinduism'], ['jakarta', 'ports', 'indonesia', 'ports'], ['surabaya', 'jakarta', 'jalan', 'jakarta'], ['surabaya', 'surabaya', 'malls', 'jalan'], ['buildings', 'malls', 'indonesia', 'malls'], ['submarine', 'buildings', 'navy', 'buildings'], ['transportation', 'submarine', 'transport', 'submarine']]
weighted: tensor([0.3015])
Label: 0, Prediction: 0, logit: tensor([0.3015])
Coherence Map: [['climate', 'regency', 'tropical', 'regency'], ['elections', 'climate', 'mayor', 'tropical'], ['indonesia', 'mayor', 'kota', 'mayor'], ['ethnic', 'surabaya', 'javanese', 'surabaya'], ['dialect', 'indonesians', 'indonesian', 'indonesians'], ['hinduism', 'dialect', 'ind

weighted: tensor([0.2877])
Label: 0, Prediction: 0, logit: tensor([0.2877])
Coherence Map: [['climate', 'regency', 'tropical', 'regency'], ['elections', 'climate', 'mayor', 'tropical'], ['indonesia', 'mayor', 'kota', 'mayor'], ['ethnic', 'surabaya', 'javanese', 'surabaya'], ['dialect', 'indonesians', 'indonesian', 'indonesians'], ['hinduism', 'dialect', 'indonesians', 'dialect'], ['ports', 'hinduism', 'exports', 'hinduism'], ['jakarta', 'ports', 'indonesia', 'ports'], ['surabaya', 'jakarta', 'jalan', 'jakarta'], ['surabaya', 'surabaya', 'malls', 'jalan'], ['buildings', 'malls', 'indonesia', 'malls'], ['submarine', 'buildings', 'navy', 'buildings'], ['transportation', 'submarine', 'transport', 'submarine'], ['airport', 'transportation', 'surabaya', 'transportation'], ['ports', 'airport', 'port', 'airport'], ['jakarta', 'ports', 'kota', 'ports'], ['surabaya', 'surabaya', 'other', 'surabaya'], ['uber', 'terminal', 'taxi', 'terminal'], ['bridge', 'uber', 'seaport', 'uber'], ['stadium', 'br

weighted: tensor([0.2867])
Label: 0, Prediction: 0, logit: tensor([0.2867])
Coherence Map: [['creole', 'catholic', 'creoles', 'catholic'], ['caribbean', 'creole', 'rum', 'creole'], ['ferries', 'caribbean', 'cruise', 'caribbean'], ['caribbean', 'ferries', 'culture', 'ferries'], ['danish', 'caribbean', 'danes', 'caribbean'], ['drive', 'danish', 'taxi', 'danish'], ['charlotte', 'drive', 'st', 'drive'], ['seminole', 'charlotte', 'cavalry', 'charlotte']]
weighted: tensor([0.2529])
Label: 1, Prediction: 0, logit: tensor([0.2529])
Coherence Map: [['creole', 'catholic', 'creoles', 'catholic'], ['caribbean', 'creole', 'rum', 'creole'], ['ferries', 'caribbean', 'cruise', 'caribbean'], ['caribbean', 'ferries', 'culture', 'ferries'], ['danish', 'caribbean', 'danes', 'caribbean'], ['drive', 'danish', 'taxi', 'danish'], ['charlotte', 'drive', 'st', 'drive'], ['seminole', 'charlotte', 'cavalry', 'charlotte'], ['alamo', 'seminole', 'films', 'seminole']]
weighted: tensor([0.2695])
Label: 0, Prediction:

weighted: tensor([0.3545])
Label: 0, Prediction: 0, logit: tensor([0.3545])
Coherence Map: [['galleries', 'mining', 'northwest', 'mining'], ['diploma', 'galleries', 'academy', 'galleries'], ['interstate', 'diploma', 'route', 'diploma'], ['routes', 'highways', 'buses', 'highways'], ['aviation', 'routes', 'airport', 'routes']]
weighted: tensor([0.2818])
Label: 0, Prediction: 0, logit: tensor([0.2818])
Coherence Map: [['galleries', 'mining', 'northwest', 'mining'], ['diploma', 'galleries', 'academy', 'galleries'], ['interstate', 'diploma', 'route', 'diploma'], ['routes', 'highways', 'buses', 'highways'], ['aviation', 'routes', 'airport', 'routes'], ['sewer', 'aviation', 'municipal', 'aviation']]
weighted: tensor([0.3009])
Label: 0, Prediction: 0, logit: tensor([0.3009])
Coherence Map: [['galleries', 'mining', 'northwest', 'mining'], ['diploma', 'galleries', 'academy', 'galleries'], ['interstate', 'diploma', 'route', 'diploma'], ['routes', 'highways', 'buses', 'highways'], ['aviation', 'ro

weighted: tensor([0.3200])
Label: 0, Prediction: 0, logit: tensor([0.3200])
Coherence Map: [['bishop', 'earthquake', 'diocese', 'earthquake'], ['municipality', 'bishop', 'doña', 'bishop'], ['beaches', 'municipality', 'gravel', 'municipality'], ['rainfall', 'beaches', 'climate', 'beaches'], ['castle', 'rainfall', 'tower', 'rainfall'], ['españa', 'castle', 'monuments', 'castle'], ['pope', 'españa', 'renaissance', 'españa'], ['archaeological', 'pope', 'museo', 'pope']]
weighted: tensor([0.2785])
Label: 0, Prediction: 0, logit: tensor([0.2785])
Coherence Map: [['bishop', 'earthquake', 'diocese', 'earthquake'], ['municipality', 'bishop', 'doña', 'bishop'], ['beaches', 'municipality', 'gravel', 'municipality'], ['rainfall', 'beaches', 'climate', 'beaches'], ['castle', 'rainfall', 'tower', 'rainfall'], ['españa', 'castle', 'monuments', 'castle'], ['pope', 'españa', 'renaissance', 'españa'], ['archaeological', 'pope', 'museo', 'pope'], ['baroque', 'archaeological', 'architecture', 'archaeologi

weighted: tensor([0.2439])
Label: 0, Prediction: 1, logit: tensor([0.2439])
Coherence Map: [['km²', 'winter', 'monroe', 'winter']]
weighted: tensor([0.2396])
Label: 1, Prediction: 1, logit: tensor([0.2396])
Coherence Map: [['households', 'km²', 'census', 'km²']]
weighted: tensor([0.2944])
Label: 0, Prediction: 0, logit: tensor([0.2944])
Coherence Map: [['households', 'km²', 'census', 'km²'], ['monroe', 'census', 'mayor', 'census']]
weighted: tensor([0.2157])
Label: 0, Prediction: 1, logit: tensor([0.2157])
Coherence Map: [['engineer', 'monroe', 'monroe', 'monroe']]
weighted: tensor([0.2597])
Label: 0, Prediction: 0, logit: tensor([0.2597])
Coherence Map: [['engineer', 'monroe', 'monroe', 'monroe'], ['founded', 'engineer', 'handicapped', 'engineer']]
weighted: tensor([0.2688])
Label: 1, Prediction: 0, logit: tensor([0.2688])
Coherence Map: [['engineer', 'monroe', 'monroe', 'monroe'], ['founded', 'engineer', 'handicapped', 'engineer'], ['census', 'founded', 'area', 'founded']]
weighted: 

weighted: tensor([0.4084])
Label: 0, Prediction: 0, logit: tensor([0.4084])
Coherence Map: [['prostitution', 'station', 'motels', 'station'], ['junior', 'prostitution', 'proposed', 'prostitution'], ['sabah', 'junior', 'sarawak', 'junior'], ['plan', 'sabah', 'proposed', 'sarawak'], ['malaysia', 'proposed', 'sarawak', 'proposed'], ['supermarket', 'malaysia', 'mall', 'malaysia'], ['sarawak', 'supermarket', 'checkpoint', 'supermarket'], ['boat', 'sarawak', 'transport', 'sarawak'], ['kota', 'boat', 'sabah', 'boat'], ['brunei', 'kota', 'sarawak', 'kota'], ['kampung', 'brunei', 'brunei', 'brunei'], ['malay', 'kampung', 'brunei', 'kampung']]
weighted: tensor([0.3774])
Label: 0, Prediction: 0, logit: tensor([0.3774])
Coherence Map: [['prostitution', 'station', 'motels', 'station'], ['junior', 'prostitution', 'proposed', 'prostitution'], ['sabah', 'junior', 'sarawak', 'junior'], ['plan', 'sabah', 'proposed', 'sarawak'], ['malaysia', 'proposed', 'sarawak', 'proposed'], ['supermarket', 'malaysia',

Coherence Map: [['operation', '1919', '1887', '1919']]
weighted: tensor([0.2407])
Label: 1, Prediction: 1, logit: tensor([0.2407])
Coherence Map: [['river', 'italy', 'confluence', 'italy']]
weighted: tensor([0.2831])
Label: 0, Prediction: 0, logit: tensor([0.2831])
Coherence Map: [['river', 'italy', 'confluence', 'italy'], ['households', 'census', 'census', 'census']]
weighted: tensor([0.3028])
Label: 0, Prediction: 0, logit: tensor([0.3028])
Coherence Map: [['river', 'italy', 'confluence', 'italy'], ['households', 'census', 'census', 'census'], ['households', 'households', 'median', 'households']]
weighted: tensor([0.4579])
Label: 0, Prediction: 0, logit: tensor([0.4579])
Coherence Map: [['river', 'italy', 'confluence', 'italy'], ['households', 'census', 'census', 'census'], ['households', 'households', 'median', 'households'], ['choctaw', 'households', 'louisiana', 'households']]
weighted: tensor([0.2728])
Label: 1, Prediction: 0, logit: tensor([0.2728])
Coherence Map: [['river', 'it

weighted: tensor([0.2664])
Label: 0, Prediction: 0, logit: tensor([0.2664])
Coherence Map: [['river', 'italy', 'confluence', 'italy'], ['households', 'census', 'census', 'census'], ['households', 'households', 'median', 'households'], ['choctaw', 'households', 'louisiana', 'households'], ['metroplex', 'choctaw', 'texas', 'choctaw'], ['census', 'metroplex', 'population', 'metroplex'], ['choctaw', 'census', 'tulsa', 'census'], ['memorial', 'choctaw', 'oklahoma', 'choctaw'], ['soccer', 'memorial', 'baseball', 'memorial'], ['parks', 'soccer', 'park', 'soccer'], ['mayor', 'parks', 'municipal', 'parks'], ['aerospace', 'mayor', 'degree', 'mayor'], ['oklahoma', 'aerospace', 'department', 'aerospace'], ['district', 'oklahoma', 'schools', 'oklahoma'], ['cbs', 'district', 'radio', 'district'], ['freeway', 'cbs', 'highway', 'cbs'], ['airfield', 'freeway', 'airport', 'freeway']]
weighted: tensor([0.2872])
Label: 0, Prediction: 0, logit: tensor([0.2872])
Coherence Map: [['river', 'italy', 'confluenc

Coherence Map: [['annual', 'oregon', 'salem', 'oregon']]
weighted: tensor([0.3061])
Label: 0, Prediction: 0, logit: tensor([0.3061])
Coherence Map: [['annual', 'oregon', 'salem', 'oregon'], ['associations', 'annual', 'groups', 'annual']]
weighted: tensor([0.3267])
Label: 0, Prediction: 0, logit: tensor([0.3267])
Coherence Map: [['annual', 'oregon', 'salem', 'oregon'], ['associations', 'annual', 'groups', 'annual'], ['festival', 'associations', 'fairgrounds', 'associations']]
weighted: tensor([0.2933])
Label: 0, Prediction: 0, logit: tensor([0.2933])
Coherence Map: [['annual', 'oregon', 'salem', 'oregon'], ['associations', 'annual', 'groups', 'annual'], ['festival', 'associations', 'fairgrounds', 'associations'], ['salem', 'salem', 'intersection', 'salem']]
weighted: tensor([0.3908])
Label: 0, Prediction: 0, logit: tensor([0.3908])
Coherence Map: [['annual', 'oregon', 'salem', 'oregon'], ['associations', 'annual', 'groups', 'annual'], ['festival', 'associations', 'fairgrounds', 'associa

weighted: tensor([0.2826])
Label: 0, Prediction: 0, logit: tensor([0.2826])
Coherence Map: [['households', 'households', 'median', 'households'], ['census', 'households', 'area', 'households'], ['households', 'census', 'median', 'census'], ['schools', 'households', 'school', 'households'], ['contains', 'schools', 'contains', 'school']]
weighted: tensor([0.1700])
Label: 1, Prediction: 1, logit: tensor([0.1700])
Coherence Map: [['psg', 'nam', 'beach', 'nam']]
weighted: tensor([0.2669])
Label: 1, Prediction: 0, logit: tensor([0.2669])
Coherence Map: [['psg', 'nam', 'beach', 'nam'], ['census', 'development', 'area', 'development']]
weighted: tensor([0.2640])
Label: 1, Prediction: 0, logit: tensor([0.2640])
Coherence Map: [['psg', 'nam', 'beach', 'nam'], ['census', 'development', 'area', 'development'], ['households', 'census', 'median', 'census']]
weighted: tensor([0.3347])
Label: 0, Prediction: 0, logit: tensor([0.3347])
Coherence Map: [['psg', 'nam', 'beach', 'nam'], ['census', 'developm

Coherence Map: [['census', 'climate', 'hispanic', 'climate'], ['households', 'households', 'inhabitants', 'households'], ['covina', 'households', 'san', 'households'], ['san', 'covina', 'pomona', 'covina'], ['sheriff', 'san', 'san', 'san']]
weighted: tensor([0.3761])
Label: 0, Prediction: 0, logit: tensor([0.3761])
Coherence Map: [['census', 'climate', 'hispanic', 'climate'], ['households', 'households', 'inhabitants', 'households'], ['covina', 'households', 'san', 'households'], ['san', 'covina', 'pomona', 'covina'], ['sheriff', 'san', 'san', 'san'], ['san', 'sheriff', 'fire', 'sheriff']]
weighted: tensor([0.4324])
Label: 0, Prediction: 0, logit: tensor([0.4324])
Coherence Map: [['census', 'climate', 'hispanic', 'climate'], ['households', 'households', 'inhabitants', 'households'], ['covina', 'households', 'san', 'households'], ['san', 'covina', 'pomona', 'covina'], ['sheriff', 'san', 'san', 'san'], ['san', 'sheriff', 'fire', 'sheriff'], ['ted', 'san', 'filmed', 'san']]
weighted: tens

weighted: tensor([0.2458])
Label: 0, Prediction: 1, logit: tensor([0.2458])
Coherence Map: [['voters', 'stadium', 'incumbents', 'stadium']]
weighted: tensor([0.2739])
Label: 0, Prediction: 0, logit: tensor([0.2739])
Coherence Map: [['voters', 'stadium', 'incumbents', 'stadium'], ['dallas', 'voters', 'irving', 'voters']]
weighted: tensor([0.2542])
Label: 0, Prediction: 0, logit: tensor([0.2542])
Coherence Map: [['voters', 'stadium', 'incumbents', 'stadium'], ['dallas', 'voters', 'irving', 'voters'], ['postal', 'hospital', 'offices', 'hospital']]
weighted: tensor([0.3092])
Label: 0, Prediction: 0, logit: tensor([0.3092])
Coherence Map: [['voters', 'stadium', 'incumbents', 'stadium'], ['dallas', 'voters', 'irving', 'voters'], ['postal', 'hospital', 'offices', 'hospital'], ['irving', 'irving', 'carrollton', 'irving']]
weighted: tensor([0.2837])
Label: 0, Prediction: 0, logit: tensor([0.2837])
Coherence Map: [['voters', 'stadium', 'incumbents', 'stadium'], ['dallas', 'voters', 'irving', 'vo

weighted: tensor([0.3268])
Label: 1, Prediction: 0, logit: tensor([0.3268])
Coherence Map: [['census', 'township', 'residents', 'township'], ['households', 'census', 'median', 'census'], ['census', 'households', 'area', 'households'], ['households', 'census', 'census', 'census']]
weighted: tensor([0.3790])
Label: 0, Prediction: 0, logit: tensor([0.3790])
Coherence Map: [['census', 'township', 'residents', 'township'], ['households', 'census', 'median', 'census'], ['census', 'households', 'area', 'households'], ['households', 'census', 'census', 'census'], ['households', 'households', 'median', 'households']]
weighted: tensor([0.4760])
Label: 0, Prediction: 0, logit: tensor([0.4760])
Coherence Map: [['census', 'township', 'residents', 'township'], ['households', 'census', 'median', 'census'], ['census', 'households', 'area', 'households'], ['households', 'census', 'census', 'census'], ['households', 'households', 'median', 'households'], ['census', 'households', 'land', 'households']]
w

Coherence Map: [['flooding', 'river', 'typhoon', 'river'], ['climate', 'flooding', 'humidity', 'flooding']]
weighted: tensor([0.4153])
Label: 0, Prediction: 0, logit: tensor([0.4153])
Coherence Map: [['flooding', 'river', 'typhoon', 'river'], ['climate', 'flooding', 'humidity', 'flooding'], ['jesuit', 'climate', 'pueblo', 'climate']]
weighted: tensor([0.2753])
Label: 0, Prediction: 0, logit: tensor([0.2753])
Coherence Map: [['flooding', 'river', 'typhoon', 'river'], ['climate', 'flooding', 'humidity', 'flooding'], ['jesuit', 'climate', 'pueblo', 'climate'], ['rizal', 'jesuit', 'luzon', 'jesuit']]
weighted: tensor([0.3485])
Label: 0, Prediction: 0, logit: tensor([0.3485])
Coherence Map: [['flooding', 'river', 'typhoon', 'river'], ['climate', 'flooding', 'humidity', 'flooding'], ['jesuit', 'climate', 'pueblo', 'climate'], ['rizal', 'jesuit', 'luzon', 'jesuit'], ['flooding', 'rizal', 'flood', 'rizal']]
weighted: tensor([0.3235])
Label: 0, Prediction: 0, logit: tensor([0.3235])
Coherence M

weighted: tensor([0.2940])
Label: 0, Prediction: 0, logit: tensor([0.2940])
Coherence Map: [['malls', 'footwear', 'avenue', 'footwear'], ['market', 'marikina', 'market', 'malls'], ['malls', 'market', 'manila', 'market'], ['mayor', 'malls', 'manila', 'malls'], ['districts', 'mayor', 'district', 'mayor'], ['symbolizes', 'districts', 'represents', 'districts'], ['thai', 'symbolizes', 'filipino', 'symbolizes'], ['festival', 'thai', 'festivities', 'thai'], ['sports', 'festival', 'gymnasium', 'festival'], ['jeepneys', 'sports', 'lrt', 'sports'], ['rizal', 'jeepneys', 'quezon', 'jeepneys'], ['barangay', 'rizal', 'pasig', 'rizal'], ['ferries', 'barangay', 'ferry', 'barangay'], ['hospitals', 'ferries', 'hospital', 'ferries']]
weighted: tensor([0.3340])
Label: 0, Prediction: 0, logit: tensor([0.3340])
Coherence Map: [['malls', 'footwear', 'avenue', 'footwear'], ['market', 'marikina', 'market', 'malls'], ['malls', 'market', 'manila', 'market'], ['mayor', 'malls', 'manila', 'malls'], ['districts',

Coherence Map: [['residents', 'east', 'population', 'east'], ['households', 'households', 'median', 'households'], ['shrine', 'population', 'hachiman', 'population']]
weighted: tensor([0.2192])
Label: 1, Prediction: 1, logit: tensor([0.2192])
Coherence Map: [['kobe', 'shrine', 'route', 'shrine']]
weighted: tensor([0.3530])
Label: 0, Prediction: 0, logit: tensor([0.3530])
Coherence Map: [['kobe', 'shrine', 'route', 'shrine'], ['oak', 'kobe', 'oakland', 'kobe']]
weighted: tensor([0.2584])
Label: 1, Prediction: 0, logit: tensor([0.2584])
Coherence Map: [['kobe', 'shrine', 'route', 'shrine'], ['oak', 'kobe', 'oakland', 'kobe'], ['oakland', 'oak', 'city', 'oak']]
weighted: tensor([0.2845])
Label: 0, Prediction: 0, logit: tensor([0.2845])
Coherence Map: [['kobe', 'shrine', 'route', 'shrine'], ['oak', 'kobe', 'oakland', 'kobe'], ['oakland', 'oak', 'city', 'oak'], ['hispanic', 'oakland', 'households', 'city']]
weighted: tensor([0.2494])
Label: 0, Prediction: 1, logit: tensor([0.2494])
Coherenc

Coherence Map: [['treaty', 'joão', 'portugal', 'joão']]
weighted: tensor([0.3675])
Label: 0, Prediction: 0, logit: tensor([0.3675])
Coherence Map: [['treaty', 'joão', 'portugal', 'joão'], ['erected', 'treaty', 'building', 'treaty']]
weighted: tensor([0.2439])
Label: 1, Prediction: 1, logit: tensor([0.2439])
Coherence Map: [['marcus', 'marcus', 'located', 'marcus']]
weighted: tensor([0.2984])
Label: 0, Prediction: 0, logit: tensor([0.2984])
Coherence Map: [['marcus', 'marcus', 'located', 'marcus'], ['households', 'census', 'census', 'census']]
weighted: tensor([0.3284])
Label: 0, Prediction: 0, logit: tensor([0.3284])
Coherence Map: [['marcus', 'marcus', 'located', 'marcus'], ['households', 'census', 'census', 'census'], ['households', 'households', 'median', 'households']]
weighted: tensor([0.4620])
Label: 0, Prediction: 0, logit: tensor([0.4620])
Coherence Map: [['marcus', 'marcus', 'located', 'marcus'], ['households', 'census', 'census', 'census'], ['households', 'households', 'media

weighted: tensor([0.2816])
Label: 0, Prediction: 0, logit: tensor([0.2816])
Coherence Map: [['winters', 'temple', 'winter', 'temple'], ['ahmedabad', 'winters', 'kolkata', 'winters'], ['highways', 'ahmedabad', 'south', 'ahmedabad'], ['howrah', 'highways', 'station', 'highways'], ['airlines', 'howrah', 'airways', 'howrah'], ['spring', 'airlines', 'winter', 'airlines'], ['khan', 'spring', 'village', 'spring'], ['villages', 'khan', 'village', 'khan'], ['dialects', 'villages', 'languages', 'villages'], ['factory', 'dialects', 'apple', 'dialects'], ['zeus', 'factory', 'persia', 'factory'], ['buddhist', 'zeus', 'monks', 'zeus'], ['archaeological', 'buddhist', 'statuettes', 'buddhist'], ['region', 'archaeological', 'western', 'archaeological'], ['airport', 'nearest', 'is', 'nearest'], ['founded', 'airport', 'fief', 'airport']]
weighted: tensor([0.2632])
Label: 1, Prediction: 0, logit: tensor([0.2632])
Coherence Map: [['winters', 'temple', 'winter', 'temple'], ['ahmedabad', 'winters', 'kolkata'

Coherence Map: [['filmed', 'mascot', 'film', 'mascot'], ['founded', 'filmed', '1932', 'filmed'], ['administrative', 'founded', 'district', 'founded'], ['settlement', 'administrative', '1957', 'administrative'], ['administrative', 'settlement', 'town', 'settlement'], ['railway', 'administrative', 'perm', 'administrative'], ['porcelain', 'railway', 'factory', 'railway']]
weighted: tensor([0.2992])
Label: 1, Prediction: 0, logit: tensor([0.2992])
Coherence Map: [['filmed', 'mascot', 'film', 'mascot'], ['founded', 'filmed', '1932', 'filmed'], ['administrative', 'founded', 'district', 'founded'], ['settlement', 'administrative', '1957', 'administrative'], ['administrative', 'settlement', 'town', 'settlement'], ['railway', 'administrative', 'perm', 'administrative'], ['porcelain', 'railway', 'factory', 'railway'], ['village', 'porcelain', 'villagers', 'porcelain']]
weighted: tensor([0.2777])
Label: 0, Prediction: 0, logit: tensor([0.2777])
Coherence Map: [['filmed', 'mascot', 'film', 'mascot

Coherence Map: [['province', 'modernization', 'mongolia', 'modernization'], ['humid', 'province', 'climate', 'province'], ['development', 'humid', 'district', 'humid'], ['billion', 'counties', 'largest', 'counties'], ['harbin', 'billion', 'railways', 'billion'], ['sawmill', 'harbin', 'built', 'harbin'], ['closure', 'sawmill', 'residents', 'sawmill'], ['creek', 'closure', 'confluence', 'closure'], ['climate', 'creek', 'continental', 'creek'], ['000', 'humid', 'population', 'humid'], ['population', '1906', 'households', '000'], ['households', 'households', 'median', 'households'], ['settlement', 'census', 'hill', 'census']]
weighted: tensor([0.2423])
Label: 1, Prediction: 1, logit: tensor([0.2423])
Coherence Map: [['miles', 'settlement', 'kansas', 'settlement']]
weighted: tensor([0.2251])
Label: 0, Prediction: 1, logit: tensor([0.2251])
Coherence Map: [['winters', 'miles', 'climate', 'kansas']]
weighted: tensor([0.1912])
Label: 0, Prediction: 1, logit: tensor([0.1912])
Coherence Map: [['

Coherence Map: [['mining', 'saline', 'victor', 'saline'], ['victor', 'mining', 'destroyed', 'mining'], ['strike', 'victor', 'unionized', 'victor'], ['mining', 'strike', 'miners', 'strike']]
weighted: tensor([0.4249])
Label: 0, Prediction: 0, logit: tensor([0.4249])
Coherence Map: [['mining', 'saline', 'victor', 'saline'], ['victor', 'mining', 'destroyed', 'mining'], ['strike', 'victor', 'unionized', 'victor'], ['mining', 'strike', 'miners', 'strike'], ['mining', 'mining', 'mines', 'mining']]
weighted: tensor([0.4723])
Label: 0, Prediction: 0, logit: tensor([0.4723])
Coherence Map: [['mining', 'saline', 'victor', 'saline'], ['victor', 'mining', 'destroyed', 'mining'], ['strike', 'victor', 'unionized', 'victor'], ['mining', 'strike', 'miners', 'strike'], ['mining', 'mining', 'mines', 'mining'], ['census', 'mining', 'victor', 'mining']]
weighted: tensor([0.2454])
Label: 0, Prediction: 1, logit: tensor([0.2454])
Coherence Map: [['households', 'census', 'median', 'census']]
weighted: tensor

weighted: tensor([0.2703])
Label: 0, Prediction: 0, logit: tensor([0.2703])
Coherence Map: [['painting', 'elementary', 'mural', 'elementary'], ['weaver', 'painting', 'weaver', 'mural']]
weighted: tensor([0.2075])
Label: 1, Prediction: 1, logit: tensor([0.2075])
Coherence Map: [['households', 'census', 'census', 'census']]
weighted: tensor([0.3089])
Label: 0, Prediction: 0, logit: tensor([0.3089])
Coherence Map: [['households', 'census', 'census', 'census'], ['fulmer', 'census', 'calhoun', 'census']]
weighted: tensor([0.2177])
Label: 0, Prediction: 1, logit: tensor([0.2177])
Coherence Map: [['conquests', 'fulmer', 'metropolis', 'coaching']]
weighted: tensor([0.1828])
Label: 1, Prediction: 1, logit: tensor([0.1828])
Coherence Map: [['excavations', 'indians', 'citadel', 'indians']]
weighted: tensor([0.3119])
Label: 0, Prediction: 0, logit: tensor([0.3119])
Coherence Map: [['excavations', 'indians', 'citadel', 'indians'], ['manufacture', 'excavations', 'ohio', 'excavations']]
weighted: ten

<IPython.core.display.Javascript object>

In [84]:
print([x[1] for x in predictions])
print(true_labels)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 

<IPython.core.display.Javascript object>

In [126]:
pred_string = "".join(str([x[1] for x in predictions]))
true_string = "".join(str(true_labels))

<IPython.core.display.Javascript object>

In [127]:
avg_k = len(true_labels) // true_labels.count(1)  # get avg segment size

<IPython.core.display.Javascript object>

In [128]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 6
wd = 0.3071786310517529
pk = 0.28213689482470783


<IPython.core.display.Javascript object>

## Prediction Tuning

In [129]:
pred_thresh = 0.24

<IPython.core.display.Javascript object>

In [130]:
modified_predictions = [
    1 if x < pred_thresh else 0 for x in [x[0] for x in predictions]
]

pred_string = "".join(str(modified_predictions))
true_string = "".join(str(true_labels))

avg_k = len(true_labels) // true_labels.count(1)  # get avg segment size

<IPython.core.display.Javascript object>

In [131]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 6
wd = 0.27879799666110183
pk = 0.2647746243739566


<IPython.core.display.Javascript object>

## Sample Segments

In [26]:
segments_to_test = 10

text_segments_to_check = [
    [truncate_by_token(s, max_tokens) for s in segment]
    for segment in text_segments[:segments_to_test]
]
text_labels_to_check = segments_labels[:segments_to_test]

<IPython.core.display.Javascript object>

In [28]:
len(text_segments_to_check[0]), len(text_labels_to_check[0])

(36, 36)

<IPython.core.display.Javascript object>

In [41]:
different_segment_1 = [
    text_segments_to_check[0][-3],
    text_segments_to_check[0][-2],
    text_segments_to_check[0][-1],
    text_segments_to_check[1][0],
    text_segments_to_check[1][1],
    text_segments_to_check[1][2],
]
different_segment_2 = [
    text_segments_to_check[1][-3],
    text_segments_to_check[1][-2],
    text_segments_to_check[1][-1],
    text_segments_to_check[2][0],
    text_segments_to_check[2][1],
    text_segments_to_check[2][2],
]
different_segment_3 = [
    text_segments_to_check[2][-3],
    text_segments_to_check[2][-2],
    text_segments_to_check[2][-1],
    text_segments_to_check[3][0],
    text_segments_to_check[3][1],
    text_segments_to_check[3][2],
]
different_segment_4 = [
    text_segments_to_check[3][-3],
    text_segments_to_check[3][-2],
    text_segments_to_check[3][-1],
    text_segments_to_check[4][0],
    text_segments_to_check[4][1],
    text_segments_to_check[4][2],
]
different_segment_5 = [
    text_segments_to_check[4][-3],
    text_segments_to_check[4][-2],
    text_segments_to_check[4][-1],
    text_segments_to_check[5][0],
    text_segments_to_check[5][1],
    text_segments_to_check[5][2],
]

same_segment_1 = [
    text_segments_to_check[0][0],
    text_segments_to_check[0][1],
    text_segments_to_check[0][2],
]
same_segment_2 = [
    text_segments_to_check[1][0],
    text_segments_to_check[1][1],
    text_segments_to_check[1][2],
]
same_segment_3 = [
    text_segments_to_check[2][0],
    text_segments_to_check[2][1],
    text_segments_to_check[2][2],
]
same_segment_4 = [
    text_segments_to_check[3][0],
    text_segments_to_check[3][1],
    text_segments_to_check[3][2],
]
same_segment_5 = [
    text_segments_to_check[4][0],
    text_segments_to_check[4][1],
    text_segments_to_check[4][2],
]

<IPython.core.display.Javascript object>

In [47]:
num_sentences_to_check = 6  # test only the first n in the segment
target_labels = [0, 0, 0, 1, 0, 0]

# test coherence on different segments
coherence_tester(
    different_segment_1[:num_sentences_to_check], target_labels[:num_sentences_to_check]
)
coherence_tester(
    different_segment_2[:num_sentences_to_check], target_labels[:num_sentences_to_check]
)
coherence_tester(
    different_segment_3[:num_sentences_to_check], target_labels[:num_sentences_to_check]
)
coherence_tester(
    different_segment_4[:num_sentences_to_check], target_labels[:num_sentences_to_check]
)
coherence_tester(
    different_segment_5[:num_sentences_to_check], target_labels[:num_sentences_to_check]
)

Coherence Map: ['universidad', 'michelin', 'universities', 'michelin']
Label: 0, Prediction: 0
Coherence Map: ['universidad', 'michelin', 'universities', 'michelin', 'uci', 'universidad', 'sociedad', 'universidad']
Label: 0, Prediction: 0
Coherence Map: ['universidad', 'michelin', 'universities', 'michelin', 'uci', 'universidad', 'sociedad', 'universidad', 'sioux', 'uci', 'railroad', 'uci']
pruning... 12
done pruning... 8
Label: 1, Prediction: 1
Coherence Map: ['census', 'sioux', 'area', 'sioux']
Label: 0, Prediction: 1
Coherence Map: ['households', 'census', 'census', 'census']
Label: 0, Prediction: 0
Coherence Map: ['households', 'households', 'median', 'households']
Label: 0, Prediction: 0
Coherence Map: ['households', 'households', 'median', 'households', 'elementary', 'households', 'elementary', 'median']
Label: 0, Prediction: 1
Coherence Map: ['cathedral', 'elementary', 'buildings', 'elementary']
Label: 1, Prediction: 0
Coherence Map: ['cathedral', 'elementary', 'buildings', 'ele

[(tensor(0.3717), 0),
 (tensor(0.3243), 0),
 (tensor(0.2591), 1),
 (tensor(0.2496), 1),
 (tensor(0.3721), 0)]

<IPython.core.display.Javascript object>

In [18]:
# test coherence on same segments
coherence_tester(same_segment_1, [0, 0, 0])
coherence_tester(same_segment_2, [0, 0, 0])
coherence_tester(same_segment_3, [0, 0, 0])
coherence_tester(same_segment_4, [0, 0, 0])
coherence_tester(same_segment_5, [0, 0, 0])

Coherence: ['shoreline', 'basque', 'seashore', 'basque']
0, tensor([0.2451]), In spite of appearances both t.. <> The city is in the north of th..
0, tensor([0.3396]), In spite of appearances both t.. <> The city is in the north of th..
Coherence: ['precipitation', 'shoreline', 'climate', 'shoreline']
0, tensor([0.2880]), The city is in the north of th.. <> San Sebastián features an ocea..
0, tensor([0.3303]), The city is in the north of th.. <> San Sebastián features an ocea..
Coherence: ['census', 'sioux', 'area', 'sioux']
0, tensor([0.2409]), Hospers was founded in 1872 wh.. <> Hospers is located at 43 07103..
0, tensor([0.2987]), Hospers was founded in 1872 wh.. <> Hospers is located at 43 07103..
Coherence: ['households', 'census', 'census', 'census']
0, tensor([0.2941]), Hospers is located at 43 07103.. <> As of the census of 2010 there..
0, tensor([0.3422]), Hospers is located at 43 07103.. <> As of the census of 2010 there..
Coherence: ['1943', 'cossacks', '1942', 'cossacks']
0

<IPython.core.display.Javascript object>

In [48]:
# test on a full segment
predictions = coherence_tester(text_segments_to_check[0], text_labels_to_check[0])

Coherence Map: ['shoreline', 'basque', 'seashore', 'basque']
Label: 0, Prediction: 0
Coherence Map: ['shoreline', 'basque', 'seashore', 'basque', 'precipitation', 'shoreline', 'climate', 'shoreline']
Label: 0, Prediction: 0
Coherence Map: ['shoreline', 'basque', 'seashore', 'basque', 'precipitation', 'shoreline', 'climate', 'shoreline', 'paleolithic', 'precipitation', 'evidence', 'precipitation']
pruning... 12
done pruning... 8
Label: 0, Prediction: 1
Coherence Map: ['basque', 'paleolithic', 'roman', 'paleolithic']
Label: 0, Prediction: 1
Coherence Map: ['castile', 'san', 'colonizers', 'san']
Label: 0, Prediction: 0
Coherence Map: ['castile', 'san', 'colonizers', 'san', 'navarre', 'castile', 'iberian', 'castile']
Label: 0, Prediction: 0
Coherence Map: ['castile', 'san', 'colonizers', 'san', 'navarre', 'castile', 'iberian', 'castile', 'neoclassical', 'navarre', 'bourgeois', 'navarre']
pruning... 12
done pruning... 8
Label: 0, Prediction: 1
Coherence Map: ['railway', 'neoclassical', 'bri

<IPython.core.display.Javascript object>

# pruning test

In [44]:
data = [("word1", 0.3), ("word2", 0.1), ("word3", 0.7)]

sorted_arr = sorted(data, key=lambda tup: tup[1])

<IPython.core.display.Javascript object>

In [45]:
sorted_arr

[('word2', 0.1), ('word1', 0.3), ('word3', 0.7)]

<IPython.core.display.Javascript object>