In [1]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

<IPython.core.display.Javascript object>

In [8]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint


from utils.metrics import windowdiff, pk

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string

<IPython.core.display.Javascript object>

In [5]:
dataset_type = "city"
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [6]:
data = table.get_all()

text_data = [x[1] for x in data]
text_labels = [x[2] for x in data]

<IPython.core.display.Javascript object>

In [10]:
all_segments = table.get_all_segments()

segments = [[y[1] for y in x] for x in all_segments]
segments_labels = [[1 if i == 0 else 0 for i, y in enumerate(x)] for x in all_segments]

<IPython.core.display.Javascript object>

In [16]:
flattened_segments = flatten(segments)
flattened_labels = flatten(segments_labels)
len(flattened_segments), len(flattened_labels)

(92833, 92833)

<IPython.core.display.Javascript object>

## Predictions

In [59]:
# initialize the coherence library
coherence = Coherence(max_words_per_step=4, coherence_threshold=0.6)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-04-09 11:24:23.289098: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<IPython.core.display.Javascript object>

In [60]:
num_samples = 25
start = 100

predictions = coherence.predict_verbose(
    text_data=flattened_segments[start : start + num_samples],
    max_tokens=400,
    prediction_thresh=0.25,
    pruning=5,
    pruning_min=20,
)

Coherence Map before pruning: []
Coherence Map: [], Keywords Current: ['accredited', 'accreditation', 'programs', 'program', 'ecu']
Similarity: 0.26778197288513184, Prediction: 0
Coherence Map before pruning: []
Coherence Map: [], Keywords Current: ['ada', 'center', 'primary', 'secondary', 'school']
Similarity: 0.26031675934791565, Prediction: 0
Coherence Map before pruning: ['ada', 'ada']
Coherence Map: ['ada', 'ada'], Keywords Current: ['tech', 'technology', 'ada', 'formerly', 'located']
Similarity: 0.31694144010543823, Prediction: 0
Coherence Map before pruning: ['ada', 'ada']
Coherence Map: ['ada', 'ada'], Keywords Current: ['convicted', 'grisham', 'prosecutor', 'prosecutors', 'prosecution']
Similarity: 0.18328769505023956, Prediction: 1
Coherence Map before pruning: []
Coherence Map: [], Keywords Current: ['argentine', 'concha', 'jorge', 'colonia', 'patagonia']
Similarity: 0.17850324511528015, Prediction: 1
Coherence Map before pruning: []
Coherence Map: [], Keywords Current: ['wi

<IPython.core.display.Javascript object>

In [67]:
predictions

[(0, 0),
 (tensor(0.2678), 0),
 (tensor(0.2603), 0),
 (tensor(0.3169), 0),
 (tensor(0.1833), 1),
 (tensor(0.1785), 1),
 (tensor(0.2021), 1),
 (tensor(0.2638), 0),
 (tensor(0.2675), 0),
 (tensor(0.2765), 0),
 (tensor(0.2241), 1),
 (tensor(0.3192), 0),
 (tensor(0.3078), 0),
 (tensor(0.2181), 1),
 (tensor(0.1935), 1),
 (tensor(0.3885), 0),
 (tensor(0.3242), 0),
 (tensor(0.2934), 0),
 (tensor(0.2845), 0),
 (tensor(0.2329), 1),
 (tensor(0.1993), 1),
 (tensor(0.2309), 1),
 (tensor(0.3376), 0),
 (tensor(0.3800), 0),
 (tensor(0.4170), 0)]

<IPython.core.display.Javascript object>

In [65]:
flattened_segments[start : start + num_samples]

['The economy of Ada is diversified. In the mid and late 20th century, the town was a manufacturing center, producing products such as Wrangler jeans, auto parts, cement and concrete, plasticware, and other products. Since the start of the 21st century, however, most large manufacturing centers have left or have downsized considerably.\nIn 1975, the Chickasaw Nation opened its headquarters in Ada. Revenues for the Nation were over 12 billion dollars in 2011, most of which is funneled through Ada. The Robert S. Kerr Environmental Research Center, a large water research lab staffed by the Environmental Protection Agency, opened in 1966. LegalShield, a multi-level marketing provider of pre-paid legal services, is headquartered in the city. Oil and natural gas are still very much a part of the regional economy, but no large companies that provide significant employment exist in the city.\nThe largest employers in the region are the following:\n- Ada City Schools\n- Chickasaw Nation\n- East

<IPython.core.display.Javascript object>

In [66]:
[x[1] for x in predictions], flattened_labels[start : start + num_samples]

([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0])

<IPython.core.display.Javascript object>