In [1]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint

from utils.metrics import windowdiff, pk

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string

<IPython.core.display.Javascript object>

In [3]:
dataset_type = "city"
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [4]:
data = table.get_all()

text_data = [x[1] for x in data]
text_labels = [x[2] for x in data]

<IPython.core.display.Javascript object>

In [5]:
all_segments = table.get_all_segments()
text_segments = [[y[1] for y in x] for x in all_segments]
segments_labels = [[1 if i == 0 else 0 for i, y in enumerate(x)] for x in all_segments]

<IPython.core.display.Javascript object>

In [6]:
samples = 5
max_tokens = 400

for i, (segment, labels) in enumerate(
    zip(text_segments[:samples], segments_labels[:samples])
):
    for sentence, label in zip(segment, labels):
        # this is the training case. During inference, we will have no idea
        # when segments start and when they end.
        pass

<IPython.core.display.Javascript object>

In [7]:
text_labels[:25]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

<IPython.core.display.Javascript object>

In [54]:
# initialize the coherence library
max_words_per_step = 4
coherence = Coherence(max_words_per_step=max_words_per_step)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2023-04-21 23:13:42.323174: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
Some weights of the model checkpoint at /Users/amitmaraj/.cache/torch/sentence_transformers/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-04-21 23:13:45.391854: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_r

<IPython.core.display.Javascript object>

In [57]:
def get_weighted_average(weighted_similarities, weights):
    return sum(weighted_similarities) / sum(weights)


# importance testing
def compare_coherent_words(
    coherence_map,
    keywords_current,
    suppress_errors=False,
    same_word_multiplier=2,  # if set to 1, don't amplify the same words found
    no_same_word_penalty=1,  # if set to 1, don't penalize for not finding the same word.
):
    word_comparisons = []
    weights = []
    for i, keywords in enumerate(coherence_map[::-1]):
        for word_tuple in keywords:
            word = word_tuple[0]
            for second_word_tuple in keywords_current:
                second_word = second_word_tuple[0]
                second_word_importance = second_word_tuple[1]

                try:
                    word_one_emb = word_tuple[2]
                    word_two_emb = second_word_tuple[2]

                    if same_word_multiplier > 1:
                        flattened_coherence_words_only = [
                            element[0]
                            for sublist in coherence_map
                            for element in sublist
                        ]

                        num_occurrences = flattened_coherence_words_only.count(
                            second_word
                        )

                        if num_occurrences > 0:
                            # amplify words that are found as duplicates in the coherence map
                            # if the word shows up 1 time, amplify the weight by 2 times
                            weighting_multiplier = flattened_coherence_words_only.count(
                                second_word
                            ) + (same_word_multiplier - 1)
                        else:
                            # no same word penalty
                            weighting_multiplier = (
                                1 / no_same_word_penalty
                            )  # reduce the importance of this word

                    else:
                        weighting_multiplier = 1  # set to 1 in case this is turned off.

                    # this weight is a recipricol function that will grow smaller the further the keywords are away
                    # we want to put more importance on the current words, so we apply twice as much weight.
                    if i == 0:
                        weight = (weighting_multiplier * 2) / (i + 1)
                    else:
                        weight = (weighting_multiplier * 1) / (i + 1)

                    # multiply the weighting factor by the importance of the second word
                    weight *= second_word_importance

                    word_comparisons.append(
                        (
                            word,
                            second_word,
                            weight
                            * coherence.embedding_lib.get_similarity(
                                word_one_emb, word_two_emb
                            ),
                        )
                    )
                    weights.append(weight)
                except AssertionError as e:
                    if not suppress_errors:
                        print(e, word, second_word)

    return word_comparisons, weights


# TODO: add weighted average: https://www.google.com/search?q=weighted+average&rlz=1C5CHFA_enCA1019CA1024&sxsrf=APwXEdcb6dhJ5L_mvWvrWr4AxQcxOFB01g:1681098698316&tbm=isch&source=iu&ictx=1&vet=1&fir=V-LTDKtCElo89M%252C2WVwd1NrPkHFOM%252C_%253BVGk_lj0HALhXQM%252C2WVwd1NrPkHFOM%252C_%253ByzfbB4i3SpPTFM%252C5e7an03wLAdfhM%252C_%253B47HYmoDH6WlThM%252CsRXbJWfpyOLEOM%252C_%253BOsB4jtfzenfuyM%252CHKcmLkpfJ3xWqM%252C_&usg=AI4_-kRmBXgUWAm_nR3vDsLT17TqM5AvSQ&sa=X&ved=2ahUKEwi6hvvVtJ7-AhXJkIkEHe4JCX4Q_h16BAgoEAE#imgrc=V-LTDKtCElo89M
def coherence_tester(
    text_data,
    text_labels,
    max_tokens=256,
    max_str_length=30,
    prediction_thresh=0.7,
    pruning=1,  # remove one sentence worth of keywords
    pruning_min=7,  # remove the first sentence in the coherence map once it grows passed 6
    dynamic_threshold=False,
    threshold_warmup=10,  # number of iterations before using dynamic threshold
    last_n_threshold=5,  # will only consider the last n thresholds for dynamic threshold
):
    coherence_map = []
    predictions = []
    thresholds = []
    for i, (row, label) in enumerate(zip(text_data, text_labels)):
        threshold = prediction_thresh
        if dynamic_threshold and (i + 1) > threshold_warmup:
            last_n_thresholds = thresholds[(0 - last_n_threshold) :]
            last_n_thresholds.sort()
            mid = len(last_n_thresholds) // 2
            threshold = (last_n_thresholds[mid] + last_n_thresholds[~mid]) / 2
            print(f"median threshold: {threshold}")
        # compare the current sentence to the previous one
        if i == 0:
            predictions.append((0, 0))
        else:
            print(f"Sample Number: {i}")
            prev_row = text_data[i - 1]

            row = truncate_by_token(row, max_tokens)
            prev_row = truncate_by_token(prev_row, max_tokens)

            cohesion, keywords_prev, keywords_current = coherence.get_coherence(
                [row, prev_row], coherence_threshold=0
            )

            # add the keywords to the coherence map
            coherence_map.append(cohesion)
            if pruning > 0 and len(coherence_map) >= pruning_min:
                print("pruning...", len(coherence_map))
                coherence_map = coherence_map[
                    pruning:
                ]  # remove the pruning amount from the beginning of the list
                print("done pruning...", len(coherence_map))

            # truncate the strings for printing
            truncated_row = truncate_string(row, max_str_length)
            truncated_prev_row = truncate_string(prev_row, max_str_length)
            print(
                f"Coherence Map: {[[x[0] for x in c] for c in coherence_map]}, KW Curr: {[x[0] for x in keywords_current]}"
            )

            # compute the word comparisons between the previous (with the coherence map)
            # and the current (possibly the first sentence in a new segment)
            word_comparisons_with_coherence, weights = compare_coherent_words(
                [*coherence_map, keywords_prev], keywords_current
            )

            similarities_with_coherence = [
                comparison[2] for comparison in word_comparisons_with_coherence
            ]
            avg_similarity_with_coherence = sum(similarities_with_coherence) / (
                len(similarities_with_coherence) or 1
            )
            weighted_avg_similarity_with_coherence = get_weighted_average(
                similarities_with_coherence, weights
            )
            print(f"weighted: {weighted_avg_similarity_with_coherence}")

            # if the two sentences are similar, create a cohesive prediction
            # otherwise, predict a new segment
            if weighted_avg_similarity_with_coherence > threshold:
                print(
                    f"Label: {label}, Prediction: {0}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 0))
            else:
                # start of a new segment, empty the map
                coherence_map = []
                print(
                    f"Label: {label}, Prediction: {1}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 1))

            thresholds.append(weighted_avg_similarity_with_coherence)
            print("===============================================")

    return predictions

<IPython.core.display.Javascript object>

In [58]:
start = 0
num_samples = 250
max_tokens = 256  # want to keep this under 512
max_str_length = 30

true_labels = text_labels[start : start + num_samples]

predictions = coherence_tester(
    text_data[start : start + num_samples],
    true_labels,
    max_tokens=max_tokens,
    max_str_length=max_str_length,
)

Sample Number: 1
['sebastiáne', 'donesebastia', 'sebastián', 'sebastiane'] In spite of appearances both the Basque form Donostia and the Spanish form San Sebastián have the same meaning of Saint Sebastian The dona done doni element in Basque place names signifies saint and is derived from Latin domine the second part of Donostia contains shortened form of the saint name There are two hypotheses regarding the evolution of the Basque name one says it was Done Sebastiáne Donasa astiai Donasastia Donastia Donostia the other one says it was Done Sebastiane Done Sebastiae Done Sebastie Donesebastia Donasastia Donastia Donostia
['sebastián', 'surroundings', 'wetlands', 'picturesque'] The city is in the north of the Basque Autonomous Community on the southern coast of the Bay of Biscay San Sebastián picturesque shoreline makes it popular beach resort The seaside environment is enhanced by hilly surroundings that are easily accessible Urgull at the heart of the city by the seashore romantic Mou

['conquered', 'inhabiting', 'sebastián', 'pamplona'] After long period of silence in evidence in 1014 the monastery of St Sebastián with its apple orchards for cider located in the term of Hernani is donated to the Abbey of Leire by Sancho III of Pamplona By 1181 the city is chartered given fuero by king Sancho VI of Pamplona on the site of Izurum having jurisdiction over all the territory between the rivers Oria and Bidasoa In 1200 the city was conquered by Castile whose king Alfonso VIII confirmed its charter fuero but the Kingdom of Navarre was deprived of its main direct access out to the sea Perhaps as soon as 1204 or earlier the city nucleus at the foot of Urgull started to be populated with Gascon speaking colonizers from Bayonne and beyond who left an important imprint in the city identity in the centuries to come In 1265 the use of the city as seaport is granted to Navarre as part of wedding pact The large quantity of Gascons inhabiting the town favoured the development of tra

['sebastián', 'cristina', 'demolished', 'haussmannian'] As result of Donostia sprawling in all directions first into the flatlands shaped by the river Urumea and later up the hills new districts arose after the walls of the city were demolished in 1863 The first expansion of the old town stretched out to the river mouth on the old quarter called Zurriola name later given by Council decision to the sand area and the avenue across the river The orthogonal layout nowadays making up the city centre the Cortazar development was built up to 1914 first phase finished much in tune with Parisian Haussmannian style The arcades of the Buen Pastor square were fashioned after the ones of the Rue de Rivoli with the Maria Cristina Bridge being inspired by the Pont Alexandre III that spans the Seine The Estación del Norte train station standing right across the bridge was inaugurated in 1864 just after the arrival of the railway to San Sebastián with its metallic roof being designed by Gustave Eiffel 

['overshadowed', 'offices', 'sancho', 'nowadays'] This city expansion to the south came about as of the 1940s after the works to canalize the river were achieved Nowadays the name Amara usually applies to this sector the newer district having overshadowed the original nucleus both in size and population The district harbours the main road entrance to the city Facilities of many state run agencies were established here and presently Amara buildings house many business offices The district revolves around the axis of Avenida Sancho el Sabio and Avenida de Madrid
['proprietors', 'zurriola', 'demolished', 'chofre'] The district is built on the sandy terrain across the river The Gros or Zurriola surf beach by the river mouth bears witness to that type of soil In the 19th century shanties and workshops started to dot the area Tomas Gros being one of its main proprietors as well as providing the name for this part of the city The area held the former monumental bullring Chofre demolished in 1

['bidebieta', 'inhabitants', 'sebastián', 'trintxerpe'] Altza Basque for alder tree is the easternmost district of San Sebastián along with Bidebieta and Trintxerpe It was but quaint village comprising scattered farmhouses and small nucleus century ago 683 inhabitants in 1910 yet on the arrival of thousands of immigrants in the 1960s and 1970s rapid and chaotic housing and building activity ensued resulting in maze of grey landscape of skyscrapers and 32 531 inhabitants crammed in them data of 1970 the figure is 20 000 as of 2013
['sebastián', 'institutions', 'konporta', 'demolished'] Ibaeta stands on the former location for various factories Cervezas El León of San Sebastián with the buildings of the old industrial estate being demolished in the late 20th century The levelling of this large flat area paved the ground for carefully planned modern and elegant housing estate featuring new university campus for the public University of the Basque Country UPV EHU and institutions such as t

['institutions', 'architectonic', 'nursery', 'nurseries'] This part stands on the east side of the city at the foot of the Mount Ulia Park on the left hand side of the road heading from Donostia to Pasaia and Irun It consists of residential area besides holding number of educational institutions culture and sports centres built since 1980 The Park of Nurseries of Ulia sits at the base of the road leading to Mount Ulia with its name deriving from its function as nursery of plants for the public gardens of Donostia during all the 20th century and until 2008 It includes two ancient water tanks architectonic elements and specific flora and fauna
['zubieta', 'reconstruction', 'neighbours', 'picturesque'] The exclave Zubieta meaning place of bridges was picturesque old village up to recent years with bunch of houses unique handball pitch on account of its single wall as opposed to the regular two and church Yet it has undergone great urban development which has rendered the location built up

['carnival', 'tamborrada', 'fountains', 'sebastián'] Every year on 20 January the feast of Saint Sebastian the people of San Sebastián celebrate festival known as the Tamborrada At midnight in the Konstituzio plaza in the Alde Zaharra Parte Vieja Old Part the mayor raises the flag of San Sebastián see in the infobox For 24 hours the entire city is awash with the sound of drums The adults dressed as cooks and soldiers march around the city They march all night with their cook hats and white aprons with the March of San Sebastián On this day procession was held in the early 19th century from the Santa Maria Church in the Old Part to the San Sebastián Church in the district of Antiguo while later limited on the grounds of weather conditions to the in wall area The event finished with popular dancing accompanied on the military band flutes and drums In addition every day soldier parade took place to change the guards at the town southern walls Since the San Sebastián Day was the first fest

['crowds', 'rhythmically', 'carnival', 'began'] This is local festival held on the first Saturday of February linked to the upcoming Carnival where different groups of people dressed in Romani Gypsy tinkers attire take to the streets banging rhythmically hammer or spoon against pot or pan and usually bar hop while they sing the traditional songs for the occasion They were just male voices some time ago but women participate and sing currently too and the main event is at the City Hall where the city band plays marches while the crowds bang the pots and pans The festival began in 1884
['crowds', 'gipuzkoa', 'konstituzio', 'chorizo'] This popular festival takes place on the 21 December date frequently shrouded in winter cold From early in the morning stalls are arranged across the city centre and people from all Gipuzkoa flock to the streets of the centre and the Old Part with crowds of people often dressed in traditional Basque farmer outfit turning out and filling the area Traditional 

['universidad', 'sebastián', 'robotics', 'informatics'] Donostia San Sebastián has become an important University town Four universities and superior conservatory are present in the city University of the Basque Country UPV EHU San Sebastián hosts the Gipuzkoa Campus of the public university University of Navarra The private university has an engineering centered campus Tecnun in San Sebastián Universidad de Deusto Built in 1956 the San Sebastián campus of the private university offers different university degrees Mondragon University The pioneering Faculty of Gastronomic Sciences of this private university is located in San Sebastián Musikene The Higher School of Music of the Basque Country is located in San Sebastián The secondary studies activity is having an increasing impact on social cultural technological and economical levels of the city and surroundings With its pushing innovative and research centers and its research strategies it is becoming one of Spain main Science product

['households', 'km²', 'families', 'females'] As of the census of 2000 there were 672 people 262 households and 183 families residing in the city The population density was 427 people per square mile 552 km² There were 280 housing units at an average density of 594 per square mile 230 km² The racial makeup of the city was 96 28 White 04 Native American 45 Asian 60 from other races and 64 from two or more races Hispanic or Latino of any race were 60 of the population There were 262 households out of which 31 had children under the age of 18 living with them 64 were married couples living together had female householder with no husband present and 29 were non families 27 of all households were made up of individuals and 14 had someone living alone who was 65 years of age or older The average household size was 56 and the average family size was 14 In the city the population was spread out with 28 under the age of 18 from 18 to 24 25 from 25 to 44 19 from 45 to 64 and 18 who were 65 years 

['decommunization', 'lennon', 'february', 'renamed'] In order to comply with decommunization laws the local Soviet Square was renamed John Lennon Square in February 2016
['köppen', 'subtype', 'classification', 'summer'] The Köppen Climate Classification subtype for this climate is Dfb Warm Summer Continental Climate
Got the keywords in 0.6835 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['spalding', 'furniture', 'hospers', 'catholic'], ['neoclassical', 'counterattacking', 'eliminated', 'baroque'], ['counterattacking', 'eliminated', 'kharkov', 'separatists'], ['decommunization', 'lennon', 'february', 'separatists'], ['decommunization', 'lennon', 'köppen', 'february']], KW Curr: ['decommunization', 'lennon', 'february', 'renamed']
weighted: tensor([0.7806])
Label: 0, Prediction: 0, logit: tensor([0.7806])
Sample Number: 46
['köppen', 'subtype', 'classification', 'summer'] The Köppen Climate Classification subtype for this climate is Dfb Warm Summer Contine

['sashtinska', 'tourism', 'hotels', 'rehabilitation'] The health resort village of Banya is in large park at the foot of the Sashtinska Sredna gora mountain near the geographic centre of the country in the Rose Valley between the Balkan and the Sredna Gora mountains It is easily reachable by car train or bus The beaches swimming pools sport complexes balneological hotels and rehabilitation establishments in town create wonderful conditions for pleasant summer holidays and tourism The fans of the Bulgarian wines may sample the quality drinks produced by the famous Rose Valley winery
['hydrothermal', 'hydrotherapy', 'village', 'sources'] The remains of an ancient village prove that the town was used for hydrotherapy in ancient times There are nine hydrothermal sources There are plenty of mineral springs
Got the keywords in 0.9607 seconds
Got the embeddings and comparisons in 0.0004 seconds
pruning... 7
done pruning... 6
Coherence Map: [['decommunization', 'lennon', 'köppen', 'february'],

['households', 'km²', 'families', 'females'] As of the census of 2000 there were 998 people 385 households and 259 families residing in the city The population density was 558 people per square mile 602 km² There were 450 housing units at an average density of 702 per square mile 271 km² The racial makeup of the city was 95 19 White 10 African American 70 Native American 10 Asian 21 from other races and 70 from two or more races Hispanic or Latino of any race were 12 32 of the population There were 385 households out of which 31 had children under the age of 18 living with them 56 were married couples living together had female householder with no husband present and 32 were non families 28 of all households were made up of individuals and 13 had someone living alone who was 65 years of age or older The average household size was 51 and the average family size was 08 In the city the population was spread out with 28 under the age of 18 from 18 to 24 23 from 25 to 44 22 from 45 to 64 an

['muanikau', 'cunningham', 'recreational', 'tamavua'] The city six wards beginning from the city centre then north then clockwise rotation Central city centre CBD nucleus of the city Tamavua residential and urban area Cunningham semi urban and residential area Nabua military base Southern Division Police Headquarters urban residential separate town centre and industrial zone Samabula urban residential separate town centre university and large industrial zones Muanikau residential urban large sporting venues university and recreational areas
['conurbation', 'stretches', 'highway', 'corridor'] Suva sits in the middle of an urban conurbation that stretches from Lami to the immediate west of the city along the Queens Highway and Nasinu town on its eastern border all the way to the Rewa River along the Kings Highway This conurbation sometimes known as the Suva Urban Complex continues till Nausori over the Rewa River The north of the city to its northeast contains the rainforest park areas o

['councillors', 'corporations', 'constituencies', 'dismissed'] Suva has municipal status and is supposed to be governed by Lord Mayor and 20 member city council The Suva City Council is the municipal law making body of the city of Suva Fiji capital It consists of 20 Councillors elected for three year terms from four multi member constituencies called wards Councillors who are elected by residents landowners and representatives of corporations owning or occupying rateable property in Suva elect Lord Mayor and Deputy Lord Mayor from among their own members they serve one year terms and are eligible for re election In 2009 the Military backed interim government dismissed all municipal governments throughout Fiji and appointed special administrators to run the urban areas As of 2015 elected municipal government has not been restored The special administrator of Suva along with nearby Nasinu is Chandu Umaria former Lord Mayor of Suva
['institution', 'residence', 'preservation', 'tradition']

['polytechnic', 'geoscience', 'nursing', 'micronesia'] Suva is host to more international and regional intergovernmental agencies and NGOs than any other Pacific Island capital Some of the bodies with presence in Suva are The TRAFFIC Oceania South Pacific Programme funded by the UK Foreign and Commonwealth Office is in Suva in the offices of the WWF South Pacific Programme The programme assists in the implementation of CITES and strengthens collaboration with the World Wide Fund for Nature The Fiji School of Medicine which is now classed as regional agency and member of the Council of Regional Organisations in the Pacific The University of Fiji The Fiji School of Nursing The University of the South Pacific which operates campus in Suva as well as at other South Pacific locations The Fiji National University which is major polytechnic in Fiji and caters students from many small Pacific Island nations It has centres in other Fiji towns of Nadi Ba and Labasa The Fiji College of Advanced L

['vegetables', 'influenced', 'fijians', 'labourers'] Suva offers varied and interesting culinary experience where almost every if not all major cuisines are represented Particularly popular cuisines are Fijian Indian Chinese American and foods from other cultural and ethnic backgrounds Fijians of Indian descent have influenced Fiji cuisine in the process creating the uniquely Fiji Indian curry Indentured labourers brought with them spices chilies and other herbs and vegetables which now are part of the Fijian culinary experience
['tradeshow', 'carnival', 'performances', 'festivals'] During the course of the year arts music and trade festivals are held in Suva albeit on small scale There are few large and notable festivals that occur annually and these include the Hibiscus Festival largest carnival in the South Pacific islands the New Years Street Party and the Fiji Show Case tradeshow that includes carnival rides food as well as magic and circus performances
Got the keywords in 0.9735 

['corporation', 'newspapers', 'repúblika', 'affairs'] Headquartered in Suva are the three main national television stations Fiji One FBC TV and MAI TV along with the Fiji Ministry of Information which produces government programs as well as national news and current affairs bulletins Fiji One produces and airs its evening National News bulletin from its studios in Gladstone Road in Central FBC TV airs its FBC News bulletin from its studios also on Gladstone Road Sky Pacific and Pacific Broadcasting Services Fiji are the two pay satellite television company headquartered here Suva is home to the national radio broadcasters Fiji Broadcasting Corporation FBC and Communications Fiji Limited CFL between them providing 12 of the national radio stations The two dailies The Fiji Times and The Fiji Sun are printed here and formerly the Fiji Post Many other weekly newspapers are headquartered and published in Suva including Inside Fiji Nai Lalakai iTaukei language weekly Shanti Dut Fiji Hindi we

['yalimaiwai', 'civoniceva', 'koroibulu', 'professionnel'] This is list of famous people who are either currently living in or are originally from Suva Petero Civoniceva born in Suva Australian rugby league player Noor Dean Fiji Indian lawyer and politician Suva City Council and House of Representatives Josua Koroibulu plays rugby league for the Fiji national rugby league team Nalini Krishan Star Wars film actress Craig Parker New Zealand actor Paulini born in Suva Australian singer and songwriter Semi Radradra Parramatta Eels player and plays for the Fiji national rugby league team Waisale Serevi Fiji Rugby Team Devanesh Sharma leading Suva lawyer and former President of the Fiji Law Society Sitiveni Sivivatu All Black Chiefs Super rugby franchise Son Sungah member of Korean female vocal group Nine Muses band Semi Tadulala plays rugby union for Gloucester Rugby in England and Fiji in rugby union previously rugby league player for Melbourne Storm Bradford Bulls and the Fiji national ru

['households', 'km²', 'families', 'couples'] As of the census of 2000 there were 392 people 141 households and 101 families residing in the city The population density was 822 people per square mile 315 km² There were 160 housing units at an average density of 335 sq mi 128 km² The racial makeup of the city was 83 93 White 77 Native American 79 Asian 11 22 from other races and 30 from two or more races 17 86 of the population were Hispanic or Latino of any race There were 141 households out of which 38 had children under the age of 18 living with them 55 were married couples living together 13 had female householder with no husband present and 27 were non families 22 of all households were made up of individuals and 10 had someone living alone who was 65 years of age or older The average household size was 78 and the average family size was 25 In the city the population was spread out with 29 under the age of 18 12 from 18 to 24 29 from 25 to 44 14 from 45 to 64 and 14 who were 65 year

['hanska', 'incorporated', 'village', 'platted'] post office called Hanska has been in operation since 1890 Hanska was platted in 1899 and incorporated as village in 1901
['census', 'bureau', 'total', 'states'] According to the United States Census Bureau the city has total area of all of it land
Got the keywords in 0.6955 seconds
Got the embeddings and comparisons in 0.0003 seconds
Coherence Map: [['hanska', 'incorporated', 'village', 'platted']], KW Curr: ['hanska', 'incorporated', 'village', 'platted']
weighted: tensor([0.7644])
Label: 0, Prediction: 0, logit: tensor([0.7644])
Sample Number: 91
['census', 'bureau', 'total', 'states'] According to the United States Census Bureau the city has total area of all of it land
['households', 'residents', 'couples', 'families'] As of the census of 2010 there were 402 people 176 households and 105 families residing in the city The population density was There were 197 housing units at an average density of The racial makeup of the city was 99

['households', 'residents', 'couples', 'families'] As of the census of 2010 there were 248 people 110 households and 73 families residing in the city The population density was There were 123 housing units at an average density of The racial makeup of the city was 99 White Asian and from two or more races Hispanic or Latino of any race were of the population There were 110 households of which 26 had children under the age of 18 living with them 50 were married couples living together 11 had female householder with no husband present had male householder with no wife present and 33 were non families 30 of all households were made up of individuals and 20 had someone living alone who was 65 years of age or older The average household size was 25 and the average family size was 77 The median age in the city was 45 years 21 of residents were under the age of 18 were between the ages of 18 and 24 19 were from 25 to 44 28 were from 45 to 64 and 23 were 65 years of age or older The gender mak

['tulsa', 'located', 'oklahoma', 'hills'] Ada is located in the rolling hills of southeastern Oklahoma Ada is from Oklahoma City from Tulsa and from Dallas Texas According to the United States Census Bureau the city has total area of of which is land and 44 is water
['hispanics', 'households', 'disparity', 'km²'] As of the 2010 census Ada 16 810 residents consisted of 697 households and 803 families The population density was 999 people per square mile 385 km² The 862 housing units were dispersed at an average density of 475 per square mile 183 km² Ada 2006 racial makeup was 73 81 White 54 African American 15 10 Native American 83 Asian 01 Pacific Islander 89 from other races and 81 from two or more races Hispanics or Latinos of any race were 89 of the population Of Ada 697 households 25 had children under the age of 18 living with them 40 were married couples living together 12 had female householder with no husband present and 43 were non families The 15 of those 65 years or older li

['glenwood', 'childhood', 'junior', 'washington'] Ada Public Schools has six primary and secondary schools Glenwood Early Childhood Center Hayes Grade Center Washington Grade Center Willard Grade Center Ada Junior High School Ada High School
['pontotoc', 'located', 'technology', 'area'] Pontotoc Technology Center formerly Pontotoc Area Vo Tech is located in Ada
Got the keywords in 0.7042 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['plasticware', 'diversified', 'dollars', 'institution'], ['institution', 'internationally', 'glenwood', 'accreditation'], ['pontotoc', 'located', 'glenwood', 'technology']], KW Curr: ['glenwood', 'childhood', 'junior', 'washington']
weighted: tensor([0.8065])
Label: 0, Prediction: 0, logit: tensor([0.8065])
Sample Number: 104
['pontotoc', 'located', 'technology', 'area'] Pontotoc Technology Center formerly Pontotoc Area Vo Tech is located in Ada
['innocence', 'imprisonments', 'prosecutors', 'disagreements'] Due to its short p

['petrolíferos', 'frondizi', 'tourism', 'accommodation'] In 1903 six hundred Afrikaner families arrived in Argentina following the loss of the Second Boer War They were given farming land in the lands around Comodoro Rivadavia but due to shortage of water had to bring water in by ox wagon The lack of water was big impediment to the development of the settlement At the Afrikaners insistence drilling began in 1907 in an effort to look for water but instead they struck oil Although much of the oil was discovered on land given to Afrikaans settlers they could not benefit directly from the discovery due to Argentinian law which decrees that all mineral deposits belong to the state Therefore most of the town Afrikaans settlers moved on to Sarmiento and surrounding regions to set up farms there The discovery of oil in 1907 boosted economic growth in Comodoro Rivadavia By the end of 1919 most of the 1719 workers were given accommodation in small metal sheet houses without any heating or electr

['petrolíferos', 'frondizi', 'concession', 'nación'] The oil production started in Chubut in 1907 when drilling rig which was looking for water discovered oil instead The Argentine oil industry started in Comodoro Rivadavia and was facilitated by the 1886 National Mining Code Codigo de Mineria de la Nación This code established that the oil fields belonged to the State and that they could also be exploited by the private sector by concession In 1922 YPF Yacimientos Petrolíferos Fiscales the first state owned oil company in the world was created by President Hipólito Yrigoyen government This company helped the society by improving the construction of houses providing new jobs and health care Engineer Enrique Mosconi was in charge of running the company By 1933 648 wells had been drilled in Comodoro Rivadavia 88 of them were economically productive In 1935 the First Oil Law was passed It established that the National and provincial States would receive as contribution the 12 of the Gross

['petroquimica', 'puzolanic', 'bricklaying', 'industry'] The city also is home to factory that produces concrete property of Petroquimica Comodoro Rivadavia It produces different types of concrete Standard Portland Puzolanic BCA ARI bricklaying concrete Caltex concrete for oil industry The total production of concrete for the year 2002 was 228 000 tons
['plans', 'comodoro', 'regions', 'america'] The wind farm of Comodoro Rivadavia has capacity of 18 820 kW with 26 generators and is the most important in Latin America Plans exist to connection it the national energy which could allow the sale of energy to other regions
Got the keywords in 0.7462 seconds
Got the embeddings and comparisons in 0.0004 seconds
pruning... 7
done pruning... 6
Coherence Map: [['sarmiento', 'routes', 'chubut', 'petrolíferos'], ['sarmiento', 'routes', 'córdova', 'latitude'], ['córdova', 'latitude', 'shipments', 'comodoro'], ['shipments', 'comodoro', 'construction', 'stopped'], ['petroquimica', 'puzolanic', 'brick

['hohenwald', 'located', 'census', 'bureau'] Hohenwald is located at 35 5479 87 5520 According to the United States Census Bureau the city has total area of all land
['households', 'km²', 'families', '056'] As of the census of 2000 there were 754 people 534 households and 989 families residing in the city The population density was 861 people per square mile 332 km² There were 708 housing units at an average density of 391 per square mile 151 km² The racial makeup of the city was 96 59 White 08 Black 11 Native American 16 Asian 32 from other races and 75 from two or more races Hispanic or Latino people of any race were 12 of the population There were 534 households out of which 28 had children under the age of 18 living with them 48 were married couples living together 13 had female householder with no husband present and 35 were non families 32 of all households were made up of individuals and 17 had someone living alone who was 65 years of age or older The average household size was 

['corporation', 'opportunities', 'mississippi', 'facilities'] Bay Springs was the site of one of six Sunbeam plants in Mississippi When Albert Dunlap downsized the company and closed the plant 300 people lost their jobs The last workers left the plant at the same time that Dunlap was negotiating new contract for himself worth over 46 million The average annual salary at the Bay Springs plant had been less than 25 000 More recently the Hol Mac Corporation has located light industrial manufacturing facilities in and around the Bay Springs area One of the county largest employers this developing corporation has partnered with nearby Jones County Junior College with regard to job training and continues to expand employment opportunities in the local community Hol Mac operates facilities both in the town of Bay Springs and north of town in designated industrial areas between Bay Springs and the community of Louin Mississippi As of 2015 it has three main manufacturing facilities in the area 

['households', 'km²', 'families', 'females'] As of the census of 2000 there were 015 people 161 households and 748 families residing in the city The population density was 179 people per square mile 454 km² There were 241 housing units at an average density of 485 per square mile 187 km² The racial makeup of the city was 92 57 White 13 African American 20 Native American 27 Asian 87 from other races and 96 from two or more races Hispanic or Latino of any race were 12 17 of the population There were 161 households out of which 31 had children under the age of 18 living with them 52 were married couples living together had female householder with no husband present and 35 were non families 31 of all households were made up of individuals and 17 had someone living alone who was 65 years of age or older The average household size was 50 and the average family size was 13 In the city the population was spread out with 25 under the age of 18 from 18 to 24 29 from 25 to 44 18 from 45 to 64 an

['households', 'residents', 'couples', 'families'] As of the census of 2010 there were 184 people 55 households and 46 families residing in the city The population density was There were 68 housing units at an average density of The racial makeup of the city was 81 White Native American from other races and from two or more races Hispanic or Latino of any race were of the population There were 55 households of which 43 had children under the age of 18 living with them 63 were married couples living together 10 had female householder with no husband present had male householder with no wife present and 16 were non families of all households were made up of individuals and had someone living alone who was 65 years of age or older The average household size was 35 and the average family size was 54 The median age in the city was 35 years 34 of residents were under the age of 18 were between the ages of 18 and 24 26 were from 25 to 44 21 were from 45 to 64 and 11 were 65 years of age or ol

['chapel', 'municipality', 'nossa', 'guarani'] The town was founded in the beginning of the 19th century and grew around chapel dedicated to Our Lady of Mount Carmel Nossa Senhora do Carmo In 1847 it became district of the municipality Franca under the name Carmo de Franca It became an independent municipality in 1885 In 1899 the name was changed to Ituverava which is Tupi Guarani for shining waterfall
['antiquity', 'σκόδρα', 'linguists', 'linguistic'] The etymology of the term Shkodër is subject which attracts debate The name was first attested in antiquity in the Latin form Scodra the Ancient Greek Σκόδρα and the Ancient Greek genitive of the Skodrians which was discovered on coins from the 2nd century BC Although the ultimate origin of the term is uncertain The further development of the name has been subject of discussion among linguists over the linguistic provenance of the Albanian people and the Albanian language While Eqrem Çabej and Shaban Demiraj treat the development from Sk

['conquered', 'inhabited', 'macedonia', 'praevalitana'] The earliest signs of human activity in the lands of Shkodër can be traced back to the Bronze Age The favorable conditions on the fertile plain around the lake have brought people here from early antiquity Artefacts and inscriptions discovered in the Rozafa Castle are assumed to be the earliest examples of symbolic behaviour in humans in the city Although it was known under the name Scodra and was inhabited by the Illyrian tribe of the Ardiaei which ruled over large territory between modern Albania up to Croatia Queen Teuta King Agron and King Gentius were among the most famous personalities of the Labeates The city was first mentioned during the antiquity as the site of the Illyrian Labeates in which he minted coins and that of Queen Teuta In 168 BC the city was captured by the Romans and became an important trade and military route The Romans colonized the town Scodra remained in the province of Illyricum and later Dalmatia By i

['catholicism', 'islamic', 'catholics', 'egyptians'] Shkodër is the 4th populous city in Albania and the largest city in the Shkodër County According to the Institute of Statistics INSTAT the city of Shkodër include 77 075 people as of the 2011 Census About 197 357 91 65 of the population are Albanians 694 32 Ashkali and Balkan Egyptians 282 13 Montenegrins and 13 665 35 did not declare their ethnicity The city of Shkodër was one of the most important centers for Islamic scholars and cultural and literary activity in Albania Here stands the site of the only institution in Albania which provides high level education in Arabic Turkish and Islamic Studies Shkodër is the center of Roman Catholicism in Albania The Roman Catholic Church is represented in Shkodër by the episcopal seat of the Metropolitan Roman Catholic Archdiocese of Shkodër Pult Scutari Pulati in Shkodër Cathedral with the current seat of the prelacy According to Institute of Statistics INSTAT Catholics make up about 47 of t

['institutions', 'carnival', 'migjeni', 'prestigious'] Shkodër is an important educational and industrial center The city produces various mechanical and electrical components along with textile and food products Luigj Gurakuqi University of Shkodër is one of the more prestigious learning centers of Albania The public library of the city contains more than 250 000 books Other cultural institutions include the Cultural Center the Marubi Photo Archives the Artists and Writers Association the Migjeni Theatre named after Millosh Gjergj Nikolla the Gallery of Arts and the Museum of History Historic cultural architecture includes the Castle of Shkodër the Turkish Bath and the Lead Mosque The Castle of Shkodër became famous during the First Balkan War when it was protected by the Turkish general Hasan Riza Pasha and Esad Pasha Many festivals take place on an annual basis such as Carnival Children Festival Lake Day and Shkodra Jazz Fest
['sophisticated', 'clarinet', 'descriptions', 'hafizi'] C

['友谊馆', 'neglecting', 'friendship', 'sentences'] On December 1994 fire broke out in Friendship Theatre 友谊馆 Karamay which caused the death of 325 people including 288 school children according to official figures Many teachers were killed while trying to protect and evacuate their students from the building which lacked adequate safety features show was being organized at that moment for number of local government officials who managed to escape ahead of the others on spotting the fire and were afterwards charged with neglecting their duty and received prison sentences of up to years
['dushanzi', 'jurisdiction', 'lanxin', 'districts'] Karamay City has jurisdiction over districts qu They are not contiguous as Dushanzi District is located south of the Lanxin Railway and forms an exclave separated from the rest of Karamay City by Kuytun City Together with Kuytun City Karamay City forms an enclave surrounded on all sides by Tacheng Prefecture
Got the keywords in 1.0556 seconds
Got the embed

['discovered', 'ranks', 'reached', 'cities'] In 1955 one of the largest oil fields in China was discovered there Since then the city has grown into an oil producing and refining center In 2008 the GDP reached 66 billion and GDP per capita reached 242 391 US 34 901 The GDP per capita ranks first among 659 cities in Mainland China
['territory', 'railroad', '1872', 'platted'] Kensett was platted in 1872 shortly after the railroad was built through that territory in 1871
Got the keywords in 0.6851 seconds
Got the embeddings and comparisons in 0.0004 seconds
pruning... 7
done pruning... 6
Coherence Map: [['dushanzi', '友谊馆', 'jurisdiction', 'lanxin'], ['dushanzi', 'jurisdiction', 'lanxin', 'districts'], ['达尔不特河', 'reservoirs', 'huangyangquan', 'replenished'], ['inhabitants', 'kazakhs', 'ethnicity', 'minorities'], ['inhabitants', 'kazakhs', 'ethnicity', 'minorities'], ['territory', 'railroad', '1872', 'platted']], KW Curr: ['discovered', 'ranks', 'reached', 'cities']
weighted: tensor([0.7734]

['construction', 'incident', 'relocation', 'petitioned'] Warren was platted in 1879 and named for Charles Warren railroad official post office has been in operation at Warren since 1880 Although several times larger than the next largest city in the county Warren prominence as the county seat has been threatened several times in its history The original plan for the Soo Line Railroad completed in 1905 branch line that passes through Warren called for it to run from Thief River Falls to Argyle and then west Argyle interests hoped the establishment of railroad junction there would lead to the removal of the county seat from Warren to Argyle Other interests prevailed although the railroad line forms parabola extending north from Thief River Falls and then south to Warren as if the plan changed while the line was being built In 1974 citizens of the eastern part of the county noting Warren location in the western quarter of the long county and very nearly at its southern boundary petitioned

['evangelical', 'lutheran', 'catholic', 'churches'] The city has three Lutheran churches two affiliated with the Evangelical Lutheran Church in America and one affiliated with the Lutheran Church Missouri Synod Roman Catholic parish and cemetery an Evangelical Covenant church United Methodist church and an Assembly of God church In the rural area surrounding the city there are three small Lutheran churches
['agribusiness', 'agriculture', 'floods', 'principal'] Agriculture and agribusiness have been the mainstays of Warren throughout its history In fact for most of its history there has been no other industry at all Although the Red River Valley has short growing season which is often made even shorter due to floods the area has excellent crop yields which help make farming in the area less risky than in other areas with poorer soil The principal crops are wheat soybeans potatoes and sugar beets Warren was at one time home to the largest independent elevator in Minnesota Northwest Grain

['confederate', 'routes', 'dragoons', 'evacuate'] Located at the post of Fort Jones was established on October 18 1852 by its first commandant Captain brevet Major Edward Fitzgerald Company 1st Dragoons Fort Jones was named in honor of Colonel Roger Jones who had been the Adjutant General of the Army from March 1825 to July 1852 Such military posts were to be established in the vicinity of major stage routes which would have meant locating the post in the vicinity of Yreka sixteen miles to the Northeast The areas around Yreka did not contain sufficient resources including forage for their animals so Capt Fitzgerald located his troop some sixteen miles to the southwest in what was then known as Beaver Valley Fort Jones would continue to serve Siskiyou County military needs until the order was received to evacuate some six years later on June 23 1858 Among the officers stationed at Fort Jones who would attain national prominence in ensuing years were Phil Sheridan Union Army William Wing

['households', 'km²', 'families', 'females'] As of the census of 2000 there were 660 people 298 households and 185 families residing in the city The population density was 096 people per square mile 424 km² There were 328 housing units at an average density of 545 per square mile 211 km² The racial makeup of the city was 88 64 White 15 African American 18 Native American 45 Pacific Islander 52 from other races and 06 from two or more races Hispanic or Latino of any race were 03 of the population There were 298 households out of which 28 had children under the age of 18 living with them 44 were married couples living together 12 had female householder with no husband present and 37 were non families 33 of all households were made up of individuals and 17 had someone living alone who was 65 years of age or older The average household size was 21 and the average family size was 81 In the city the population was spread out with 23 under the age of 18 from 18 to 24 23 from 25 to 44 24 from 

['consolidated', 'residents', 'district', 'bono'] Bono residents are served by the Westside Consolidated School District The district opened in 1966
['precipitation', 'köppen', 'temperatures', 'subtropical'] Climate is characterized by relatively high temperatures and evenly distributed precipitation throughout the year The Köppen Climate Classification sub type for this climate is Cfa Humid Subtropical Climate
Got the keywords in 0.7351 seconds
Got the embeddings and comparisons in 0.0004 seconds
Coherence Map: [['consolidated', 'residents', 'district', 'precipitation']], KW Curr: ['consolidated', 'residents', 'district', 'bono']
weighted: tensor([0.7931])
Label: 0, Prediction: 0, logit: tensor([0.7931])
Sample Number: 188
['precipitation', 'köppen', 'temperatures', 'subtropical'] Climate is characterized by relatively high temperatures and evenly distributed precipitation throughout the year The Köppen Climate Classification sub type for this climate is Cfa Humid Subtropical Climate


['purchased', 'agriculture', 'agricultural', 'experiment'] During the last 30 years of the 19th century agriculture became important and the entire county became noted for its production of potatoes On October 1851 the first Northern Maine Fair exhibition opened starch factory was opened in 1874 providing ready market for local potatoes In 1914 the Aroostook Farm was purchased as Maine Agricultural Experiment Station
['brunswick', 'railroad', 'aroostook', 'railway'] In 1881 the New Brunswick Railway created the first rail connection in Presque Isle The Bangor and Aroostook Railroad arrived in 1895
Got the keywords in 0.7920 seconds
Got the embeddings and comparisons in 0.0004 seconds
pruning... 7
done pruning... 6
Coherence Map: [['köppen', 'precipitation', 'köppen', 'temperatures'], ['köppen', 'loyalists', 'agriculture', 'ownership'], ['accommodate', 'disputes', 'maysville', 'british'], ['lumbermills', 'furniture', 'gristmill', 'carriage'], ['lumbermills', 'furniture', 'gristmill', 'c

['launched', 'celebration', 'organizations', 'commemorative'] On August 11 1978 the Double Eagle II was launched from Presque Isle field carrying three passengers It made the first successful transatlantic balloon crossing In honor of the Double Eagle II the city holds an annual celebration called The Crown of Maine Balloon Festival The popular event includes balloon rides plane tours amateur photo contests and children fair rides Sponsored by the Chamber of Commerce and other local organizations it is held in late August The field from which the Double Eagle II lifted off now features commemorative model balloon
['köppen', '679321', '002166', 'located'] Presque Isle is located at 46 679321 68 002166 According to the United States Census Bureau the city has total area of of which is land and is water Presque Isle is drained by the Aroostook River and Presque Isle Stream Presque Isle has humid continental climate Köppen Dfb typified by long cold winters and short warm summers
Got the ke

['snowmobiling', 'agriculture', 'surrounding', 'industry'] Due to being Aroostook County largest city Presque Isle is the retail center for large number of both American and Canadian towns The Aroostook Centre Mall became major shopping center for residents during the 1990s Super Walmart center also attracts large number of shoppers Agriculture remains top industry of Presque Isle and the surrounding area with potatoes being the top crop During the winter months many local businesses rely on the snowmobiling industry as there is highly regarded trail system connecting far away towns and cities with Presque Isle Presque Isle is also home to significant industrial park near the Northern Maine Regional Airport Presque Isle typically has slightly higher unemployment level when compared to the state average
['wlbz', 'wvii', 'wagm', 'affiliate'] Presque Isle is home to one of the smallest TV markets as defined by Nielsen market research It consists of WAGM TV channel CBS Fox affiliate and WM

['archipelago', 'agricultural', 'persecution', 'economy'] The Safavids sidelining Manama designated the nearby town of Bilad Al Qadeem as the provincial capital The town was also the seat of the Persian governor and the Shaikh al Islam of the islands The position of Shaikh al Islam lied under jurisdiction of the central Safavid government and as such candidates were carefully vetted by the Isfahan courts During the Safavid era the islands continued to be centre for Twelver Shi ism scholarship producing clerics for use in mainland Persia Additionally the rich agricultural northern region of Bahrain continued to flourish due to an abundance of date palm farms and orchards The Portuguese traveler Pedro Teixeira commented on the extensive cultivation of crops like barley and wheat The opening of Persian markets to Bahraini exports especially pearls boosted the islands export economy The yearly income of exported Bahraini pearls was 600 000 ducats collected by around 000 pearling dhows Anot

['multinationals', 'tourism', 'institutions', 'islamic'] Manama is the focal point of the Bahraini economy While petroleum has decreased in importance in recent years due to depleting reserves and growth in other industries it is still the mainstay of the economy Heavy industry aluminium smelting ship repair banking and finance and tourism are among the industries which have experienced recent growth Several multinationals have facilities and offices in and around Manama The primary industry in Manama itself is financial services with over two hundred financial institutions and banks based in the CBD and the Diplomatic Area Manama is financial hub for the Persian Gulf region and center of Islamic banking There is also large retail sector in the shopping malls around Seef while the center of Manama is dominated by small workshops and traders Manama economy in the early 20th century relied heavily on pearling in 1907 the pearling industry was estimated to include 917 boats providing empl

['comprehensive', 'routes', 'transportation', 'launched'] Manama has recently reformed comprehensive bus service that launched on April 2015 with fleet of 141 MAN buses Regulated by the Ministry of Transportation bus routes extend across Bahrain and around Manama with fares of minimum 200 Fils BD0 200 around 50 USD 30
['strategically', 'destinations', 'muharraq', 'arabia'] Bahrain International Airport is located on the nearby Muharraq Island approximately from the CBD It is premier hub airport in the Middle East Strategically located in the Northern Persian Gulf between the major markets of Saudi Arabia and Iran the airport has one of the widest range and highest frequency of regional services with connections to major international destinations in Europe Asia Africa and North America
Got the keywords in 0.9107 seconds
Got the embeddings and comparisons in 0.0004 seconds
pruning... 7
done pruning... 6
Coherence Map: [['councils', 'neighborhoods', 'municipality', 'constituencies'], ['c

['nightclubs', 'influences', 'attracting', 'muharram'] The country attracts large number of foreigners and foreign influences with just under one third of the population hailing from abroad Alcohol is legal in the country with bars and nightclubs operating in the city Bahrain gave women the right to vote in elections for the first time in 2002 Football is the most popular sport in Manama and the rest of the country with teams from Manama participating in the Bahraini Premier League The central areas of Manama are the main location for Muharram processions in the country attracting hundreds of thousands of people annually
['paleolithic', 'guimarães', 'civilizations', 'agricultural'] Vila do Conde is one of the oldest settlements in northern Portugal Geological artifacts dating to the Paleolithic have been discovered in sites in the parishes of Modivas Malta and Labruge dating from 100 000 to 15 000 years In other parishes there have also been discoveries of implements and mounds dating 

['municipality', 'subdivided', 'árvore', 'parishes'] The municipality is subdivided into the following local government civil parishes of which Azurara Árvore and Vila do Conde form the urbanized city of Vila do Conde
['tourist', 'concentrated', 'involving', 'communities'] The municipal authorities have promoted future looking environment in Vila do Conde that have concentrated on the tourist market involving the requalification of many of the historical buildings such as the Convent and customshouse in addition to promoting an ocean centric tourist economy associated with the Programa Polis initiative The communities along the coast which include the main beaches and bird sanctuary are essentially beach resorts with high technology industrial park in Mindelo with some semi conductor industries
Got the keywords in 0.8722 seconds
Got the embeddings and comparisons in 0.0004 seconds
pruning... 7
done pruning... 6
Coherence Map: [['located', 'generally', 'temperatures', 'climatic'], ['nig

['agrícolas', 'gastronomia', 'artesanato', 'nacional'] Vila do Conde is the centre of one of the more prestigious fairs of traditional artisans that include quilts wool sweaters ironworks in addition to needlework During the summer the municipality is known for the several secular and religious celebrations These include the the Feira Nacional de Artesanato theFeira da Gastronomia the Feira das Actividades Agrícolas and the Festival of São João Every four years the religious Festivals of Corpo de Deus are significant and known for the road covered mats of flowers used in processions from the circuit the main churches
['vietnam', 'revenue', 'located', 'regarding'] The volume of trade between Vietnam and China through the Móng Cái border gate reached US billion and billion in 2007 and 2008 respectively the highest among the Vietnam China border crossings The cash flow via the banks located in Móng Cái was VND180 469 billion US 11 billion The volume of trade is increasing significantly ye

['slavery', 'jolietville', 'revitalization', 'wesleyan'] Westfield was founded on May 1834 by North Carolina Quakers Asa Bales Ambrose Osborne and Simon Moon It is believed that the town was planned as stop on the Underground Railroad with many families of the Religious Society of Friends and the Wesleyan Methodist Church supporting the cause When the laws against aiding escaped slaves were made harsher part of the Westfield Quaker Friends Meeting House split into the Anti Slavery Friends meeting Westfield was incorporated as town in 1848 On January 2008 Westfield was incorporated as city and Andy Cook was sworn in as mayor With recent annexations in southern Washington Township and rapid population growth in areas already occupied by the pre existing town the city population in 2010 30 068 was more than triple that of 2000 293 Because of the growing size of the city officials are planning major revitalization of city downtown New additions to downtown Westfield are expected to include

['indianapolis', '032266', 'highway', 'noblesville'] Westfield is located in western Hamilton County at 40 032266 86 129015 It is bordered to the east by Noblesville and to the south by Carmel To the west it is bordered by Zionsville in Boone County Route 31 is the main highway through the city leading north to Kokomo and south to Interstate 465 the beltway around Indianapolis Downtown Indianapolis is south of the center of Westfield Indiana State Road 32 is Westfield Main Street and leads east to Noblesville the county seat and west to Lebanon According to the 2010 census Westfield has total area of of which or 99 11 is land and or 89 is water
['palaeolithic', 'alemannic', 'frankish', 'excavated'] Traces of palaeolithic habitation have been identified outside the town The settlement of Civitas Auderiensium was founded at Dieburg in AD 125 in the context of the Roman occupation of the section of the province of Germania Superior on the right bank of the Rhine The foundation was followe

['harperthausen', 'barbistere', 'conquered', 'kleestadt'] In 1504 the town was conquered by Landgrave Wilhelm II of Hesse who occupied it until the ownership of the town was clarified by the Diet of Worms in 1521 This resulted in another condominium this time between Hesse and the Palatinate As compensation Hanau received 12 000 guilders and several nearby villages viz Harperthausen Kleestadt Langstadt and Schlierbach Under this common ownership the town developed until shortly before the Thirty Years War the still extant town hall was built from 1596 During the war the town was protected by its strong fortifications preventing major destruction but nevertheless there was great suffering not least during the plague in 1634 36 The town was briefly occupied and laid waste by force of 600 dragoons under Marquis of Barbistere in December 1688 during the war of the Palatine succession
['inhabitants', 'landgraviate', 'desecrated', 'darmstadt'] In 1802 the Landgraviate of Hesse Darmstadt took

['lumberyard', 'tourism', 'initiatives', 'industry'] As of 2003 it had an unemployment rate of 18 much higher than the country average and the town was declared an underdeveloped region The town is in the centre of large coal field and there are also smaller amounts of oil in the area The coal mine which in 1989 employed 000 people was finally closed down in 2005 leaving the last group of miners 260 in all unemployed The other major industry of the town is forestry but the large lumberyard and factory were also closed this area of enterprise is now dominated by small businesses There have been new initiatives in recent months to attract investment into the area in both the industrial and tourism sectors
['mississippi', 'newspaper', 'federally', 'picayune'] Picayune was founded in 1904 named by Eliza Jane Poitevent Nicholson the owner and publisher of the New Orleans Daily Picayune newspaper named for the Spanish coin The post office contains mural Lumber Regions of Mississippi painted 

<IPython.core.display.Javascript object>

In [59]:
print([x[1] for x in predictions])
print(true_labels)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,

<IPython.core.display.Javascript object>

In [60]:
pred_string = "".join(str([x[1] for x in predictions]))
true_string = "".join(str(true_labels))

<IPython.core.display.Javascript object>

In [61]:
avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

<IPython.core.display.Javascript object>

In [62]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 6
wd = 0.3919463087248322
pk = 0.35570469798657717


<IPython.core.display.Javascript object>

## Prediction Tuning

In [94]:
pred_thresholds = [
    0.2,
    0.21,
    0.22,
    0.23,
    0.24,
    0.25,
    0.26,
    0.27,
    0.28,
    0.29,
    0.3,
]  # bert base uncased
pred_thresholds = [
    0.4,
    0.41,
    0.42,
    0.43,
    0.44,
    0.45,
    0.46,
    0.47,
    0.48,
    0.49,
    0.5,
]  # labse
pred_thresholds = [
    0.06,
    0.07,
    0.08,
    0.09,
    0.1,
    0.11,
    0.12,
    0.13,
    0.14,
    0.15,
    0.16,
    0.17,
    0.18,
    0.19,
    0.2,
    0.11,
    0.06,
]  # sentence-transformers
pred_thresholds = [
    0.6,
    0.61,
    0.62,
    0.63,
    0.64,
    0.65,
    0.66,
    0.67,
    0.68,
    0.69,
    0.7,
]  # USE
pred_thresholds = [
    0.65,
    0.66,
    0.67,
    0.68,
    0.69,
    0.7,
    0.71,
    0.72,
    0.73,
    0.74,
    0.75,
    0.76,
    0.77,
    0.78,
    0.79,
    0.64,
]  # Roberta

<IPython.core.display.Javascript object>

In [95]:
for pred_thresh in pred_thresholds:
    modified_predictions = [
        1 if x < pred_thresh else 0 for x in [x[0] for x in predictions]
    ]

    pred_string = "".join(str(modified_predictions))
    true_string = "".join(str(true_labels))

    avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

    wd_score = windowdiff(pred_string, true_string, avg_k)
    pk_score = pk(pred_string, true_string, avg_k)

    print(f"pred_thresh = {pred_thresh}")
    print(f"k = {avg_k}")
    print(f"wd = {wd_score}")
    print(f"pk = {pk_score}")
    print("===========================================")

pred_thresh = 0.65
k = 6
wd = 0.2536912751677852
pk = 0.2536912751677852
pred_thresh = 0.66
k = 6
wd = 0.2778523489932886
pk = 0.2778523489932886
pred_thresh = 0.67
k = 6
wd = 0.2778523489932886
pk = 0.2778523489932886
pred_thresh = 0.68
k = 6
wd = 0.2697986577181208
pk = 0.2657718120805369
pred_thresh = 0.69
k = 6
wd = 0.3046979865771812
pk = 0.28859060402684567
pred_thresh = 0.7
k = 6
wd = 0.38926174496644295
pk = 0.3530201342281879
pred_thresh = 0.71
k = 6
wd = 0.3932885906040268
pk = 0.348993288590604
pred_thresh = 0.72
k = 6
wd = 0.44966442953020136
pk = 0.40134228187919463
pred_thresh = 0.73
k = 6
wd = 0.4818791946308725
pk = 0.425503355704698
pred_thresh = 0.74
k = 6
wd = 0.5503355704697986
pk = 0.4657718120805369
pred_thresh = 0.75
k = 6
wd = 0.5865771812080537
pk = 0.49395973154362416
pred_thresh = 0.76
k = 6
wd = 0.625503355704698
pk = 0.5181208053691275
pred_thresh = 0.77
k = 6
wd = 0.6899328859060403
pk = 0.5624161073825503
pred_thresh = 0.78
k = 6
wd = 0.7463087248322148
p

<IPython.core.display.Javascript object>

In [96]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

print(pred_string)
print(true_string)

tn, fp, fn, tp = confusion_matrix(modified_predictions, true_labels).ravel()
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, modified_predictions, average="macro"
)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,

<IPython.core.display.Javascript object>

In [97]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")
print(f"tn = {tn}")
print(f"fp = {fp}")
print(f"fn = {fn}")
print(f"tp = {tp}")
print(f"precision = {precision}")
print(f"recall = {recall}")
print(f"f1 = {f1}")

k = 6
wd = 0.23758389261744967
pk = 0.23758389261744967
tn = 210
fp = 31
fn = 3
tp = 6
precision = 0.7690179806362378
recall = 0.5740388275599544
f1 = 0.5929898486879909


<IPython.core.display.Javascript object>

## KeyBERT Embedding Comparison

In [172]:
curr = 230
prev = curr - 1

<IPython.core.display.Javascript object>

In [None]:
# initialize the keywords and embeddings library
pp = pprint.PrettyPrinter(indent=4)
similarities_lib = Similarities("bert-base-uncased")
keywords_lib = Keywords(similarities_lib.model, similarities_lib.tokenizer)
embedding_lib = Embedding(similarities_lib.model, similarities_lib.tokenizer)

In [205]:
cohesion = coherence.get_coherence(
    [text_data[curr], text_data[prev]], coherence_threshold=0.25
)
print([k[0] for k in cohesion])

Got the keywords in 0.6567 seconds
Got the embeddings and comparisons in 0.0007 seconds
['cantonese', 'languages', 'vietnamese', 'communes']


<IPython.core.display.Javascript object>

In [206]:
# get the keywords for the current sentences
keywords_current = keywords_lib.get_keywords_with_kb_embeddings(text_data[curr])
keywords_prev = keywords_lib.get_keywords_with_kb_embeddings(text_data[prev])

# compute the word comparisons between the previous (with the coherence map)
# and the current (possibly the first sentence in a new segment)
word_comparisons_with_coherence, weights = compare_coherent_words(
    [keywords_prev], keywords_current
)

<IPython.core.display.Javascript object>

In [207]:
[(x[0], x[1]) for x in keywords_current], [(x[0], x[1]) for x in keywords_prev]

([('township', 0.2304),
  ('communes', 0.1857),
  ('hải', 0.1399),
  ('wards', 0.1397),
  ('đông', 0.1224)],
 [('cantonese', 0.5038),
  ('mandarin', 0.464),
  ('languages', 0.3483),
  ('language', 0.343),
  ('vietnamese', 0.3184)])

<IPython.core.display.Javascript object>

# KeyBERT Embedding Testing

In [679]:
docs = [
    "Hi my name is Devarsh",
    "Devarsh likes to play Basketball.",
    "I love to watch Cricket.",
    "I am a strong programmer. And my name is Devarsh",
]

<IPython.core.display.Javascript object>

In [680]:
from keybert import KeyBERT

kw_model = KeyBERT()
doc_embeddings, word_embeddings = kw_model.extract_embeddings(
    docs, min_df=1, stop_words="english"
)
keywords = kw_model.extract_keywords(
    docs,
    min_df=1,
    stop_words="english",
    doc_embeddings=doc_embeddings,
    word_embeddings=word_embeddings,
)

<IPython.core.display.Javascript object>

In [681]:
len(doc_embeddings)

4

<IPython.core.display.Javascript object>

In [682]:
len(word_embeddings)

10

<IPython.core.display.Javascript object>

In [683]:
keywords

[[('devarsh', 0.6267), ('hi', 0.5216)],
 [('devarsh', 0.6549),
  ('basketball', 0.5558),
  ('play', 0.3787),
  ('likes', 0.2284)],
 [('cricket', 0.7118), ('watch', 0.3656), ('love', 0.307)],
 [('programmer', 0.5942), ('devarsh', 0.5528), ('strong', 0.3452)]]

<IPython.core.display.Javascript object>

In [701]:
kw_model = KeyBERT()
import torch


def get_keywords_with_embeddings_test(
    data,
) -> list[tuple[str, float, torch.Tensor]]:
    doc_embeddings, word_embeddings = kw_model.extract_embeddings(data)

    keywords = kw_model.extract_keywords(
        data, doc_embeddings=doc_embeddings, word_embeddings=word_embeddings
    )

    keywords_with_embeddings = []
    count = 0
    print(len(word_embeddings))
    for i, (kw, we) in enumerate(zip(keywords, word_embeddings)):
        for j, words in enumerate(kw):
            keywords_with_embeddings.append((words[0], words[1], torch.tensor(we)))
            count += 1

    return keywords_with_embeddings

<IPython.core.display.Javascript object>

In [702]:
embeddings = get_keywords_with_embeddings_test(docs)

10


<IPython.core.display.Javascript object>

In [703]:
len(embeddings)

12

<IPython.core.display.Javascript object>