In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete


In [2]:
from urllib import request
url = 'https://www.gutenberg.org/cache/epub/56796/pg56796.txt'
# Use the response as a context manager so the HTTP connection is closed
# as soon as every line has been read and decoded.
with request.urlopen(url=url) as response:
    lines = [line.decode('utf-8') for line in response]
print(len(lines))
# str.split() with no argument splits on any run of whitespace (including
# '\n' and '\r'), so re-joining with single spaces yields one normalised
# line of text without separate newline replacements.
text = ' '.join(' '.join(lines).split())


8355


In [3]:
import nltk.data

# Load the English Punkt sentence tokenizer from the NLTK data directory.
# NOTE(review): newer NLTK releases appear to ship Punkt as `punkt_tab` rather
# than a pickle — confirm this resource path against the installed version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Materialise the tokenizer output as a plain list of sentence strings.
sentences = [sentence for sentence in tokenizer.tokenize(text)]
print(len(sentences))

2668


In [4]:
# Number of sentences discarded from the start and the end of the tokenised text
# (presumably Project Gutenberg front matter and licence text — verify against the source).
LEADING_SENTENCES_TO_DROP = 13
TRAILING_SENTENCES_TO_DROP = 120
# Derive the trimmed list from `tokenizer` and `text` instead of slicing
# `sentences` in place, so re-running this cell does not trim a second time.
sentences = list(tokenizer.tokenize(text))[LEADING_SENTENCES_TO_DROP:-TRAILING_SENTENCES_TO_DROP]

In [5]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

MIN_DF = 3
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'

# The clock starts before the KeyBERT model is constructed, so both timings
# printed below include model construction.
model_start = now()
model = KeyBERT(model=MODEL)
# model.max_seq_length = 512
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=STOP_WORDS,
    min_df=MIN_DF,
)

# Compute sentence and candidate-term embeddings once; they are handed back to
# extract_keywords below instead of being recomputed there.
document_embeddings, word_embeddings = model.extract_embeddings(
    docs=sentences,
    vectorizer=vectorizer,
)
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))

# Keep only the single best keyword per sentence (top_n=1).
keywords = model.extract_keywords(
    docs=sentences,
    vectorizer=vectorizer,
    stop_words=STOP_WORDS,
    min_df=MIN_DF,
    top_n=1,
    doc_embeddings=document_embeddings,
    word_embeddings=word_embeddings,
)
print('model time: {}'.format(now() - model_start))

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:00:47.105653
we have 2535 documents and 2913 words.
model time: 0:00:48.789089


In [6]:
import pandas as pd
from plotly.express import scatter
from umap import UMAP

umap_start = now()
# Project the sentence embeddings down to two dimensions with a fixed seed.
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
projection = umap_model.fit_transform(X=document_embeddings)
df = pd.DataFrame(data=projection, columns=['u0', 'u1'])
df['text'] = sentences
# Hover label: the first 20 whitespace-separated tokens of each sentence.
df['short text'] = [' '.join(sentence.split()[:20]) for sentence in sentences]
# Each entry of `keywords` is a list of results; take the first result's first
# element, or the '-none-' placeholder when the list is empty.
df['keyword'] = [hits[0][0] if len(hits) else '-none-' for hits in keywords]
IGNORE = {'-none-', 'section', 'paragraph'}
# Plot only the rows whose keyword is not in the ignore set.
keep_mask = ~df['keyword'].isin(IGNORE)
figure = scatter(
    data_frame=df[keep_mask],
    x='u0',
    y='u1',
    hover_name='short text',
    hover_data=['keyword'],
    height=900,
)
figure.show()
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:15.568940


In [7]:
# Show how many sentences were assigned each value in the 'keyword' column.
df['keyword'].value_counts()

keyword
-none-       94
patient      18
physician    16
mls          14
doctor       13
             ..
art           1
annual        1
music         1
teachers      1
errors        1
Name: count, Length: 1226, dtype: int64

In [8]:
import numpy as np
from plotly.express import scatter
from plotly.graph_objects import Figure

def plot_words(arg_words: list, arg_keywords: list, arg_model: UMAP, arg_embeddings: np.ndarray) -> Figure:
    """Plot the embeddings of selected words in the space of a fitted UMAP model.

    Args:
        arg_words: Vocabulary aligned row-for-row with ``arg_embeddings``.
            May be a plain list or any object exposing ``tolist()`` (for
            example a NumPy array).
        arg_keywords: Words to plot; each must occur in ``arg_words``.
        arg_model: Model whose ``transform`` maps embedding rows to two columns.
        arg_embeddings: One embedding row per entry of ``arg_words``.

    Returns:
        A scatter figure with one small, text-labelled marker per keyword.

    Raises:
        ValueError: If any keyword is missing from ``arg_words``.
    """
    # Accept both array-likes and plain lists; the annotation says `list`,
    # but callers pass an array, and a real list has no tolist().
    vocabulary = arg_words.tolist() if hasattr(arg_words, 'tolist') else list(arg_words)

    # Build the word -> row lookup once instead of rebuilding the list and
    # scanning it for every keyword. setdefault keeps the first occurrence,
    # matching list.index().
    first_index = {}
    for position, word in enumerate(vocabulary):
        first_index.setdefault(word, position)

    keywords = list(arg_keywords)
    missing = [keyword for keyword in keywords if keyword not in first_index]
    if missing:
        # Same exception type list.index() raised, with a clearer message.
        raise ValueError('keywords not found in arg_words: {}'.format(missing[:5]))

    top_indices = [first_index[keyword] for keyword in keywords]
    selected_embeddings = np.asarray(arg_embeddings)[top_indices]
    result_df = pd.DataFrame(data=arg_model.transform(X=selected_embeddings),
                             columns=['u0', 'u1'])
    result_df['word'] = keywords
    return scatter(data_frame=result_df, x='u0', y='u1', text='word', height=900).update_traces(marker={'size': 3})

# Plot every distinct keyword except the '-none-' placeholder.
plotted_keywords = [keyword for keyword in df['keyword'].unique().tolist() if keyword != '-none-']
vocabulary = vectorizer.get_feature_names_out()
words_figure = plot_words(
    arg_words=vocabulary,
    arg_keywords=plotted_keywords,
    arg_model=umap_model,
    arg_embeddings=word_embeddings,
)
words_figure.show()