In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete


In [2]:
import pandas as pd
filename = '/kaggle/input/the-online-plain-text-english-dictionary-opted/OPTED-Dictionary.csv'
# Load the dictionary and discard rows that have no headword.
df = pd.read_csv(filepath_or_buffer=filename).dropna(subset=['Word'])
# One vectorised pass flags hyphenated headwords; regex=False makes '-' a literal
# substring test and na=False treats any non-string entry as "not hyphenated".
is_hyphenated = df['Word'].str.contains('-', regex=False, na=False)
# Keep the list of removed headwords available under its original name.
dash_words = df.loc[is_hyphenated, 'Word'].tolist()
df = df[~is_hyphenated]
df.head()

Unnamed: 0,Word,Count,POS,Definition
0,A,1,"""""","""The first letter of the English and of many o..."
1,A,1,"""""","""The name of the sixth tone in the model major..."
2,A,1,"""""","""An adjective commonly called the indefinite ..."
3,A,1,"""""","""In each; to or for each; as """"""""twenty leagu..."
4,A,1,"""prep.""","""In; on; at; by."""


In [3]:
# Number of dictionary entries to draw for the experiment.
SIZE = 500
# Seeded draw so the same entries are selected on every run.
sample_df = df.loc[:, ['Word', 'Definition']].sample(n=SIZE, random_state=2024)
# Documents are all sampled definitions followed by all sampled headwords,
# so the list holds 2 * SIZE entries.
raw_documents = [
    *sample_df['Definition'].values.tolist(),
    *sample_df['Word'].values.tolist(),
]

In [4]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

MIN_DF = 1
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'

# A single clock is started here; both timings printed below are measured from it,
# so the second figure includes the embedding time.
model_start = now()
model = KeyBERT(model=MODEL)
# model.max_seq_length = 512
# Unigram vocabulary only; the same vectorizer instance is reused for both KeyBERT calls.
vectorizer = CountVectorizer(
    min_df=MIN_DF,
    stop_words=STOP_WORDS,
    ngram_range=(1, 1),
)
document_embeddings, word_embeddings = model.extract_embeddings(
    docs=raw_documents,
    vectorizer=vectorizer,
)
embedding_elapsed = now() - model_start
print('embedding time: {}'.format(embedding_elapsed))
n_documents = len(document_embeddings)
n_words = len(word_embeddings)
print('we have {} documents and {} words.'.format(n_documents, n_words))
# The precomputed embeddings are handed back in so they are not recomputed.
# NOTE(review): stop_words/min_df are presumably redundant when an explicit
# vectorizer is supplied; confirm against the KeyBERT documentation.
keywords = model.extract_keywords(
    docs=raw_documents,
    top_n=1,
    stop_words=STOP_WORDS,
    vectorizer=vectorizer,
    doc_embeddings=document_embeddings,
    word_embeddings=word_embeddings,
    min_df=MIN_DF,
)
print('model time: {}'.format(now() - model_start))

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:00:18.447666
we have 1000 documents and 2427 words.
model time: 0:00:19.165811


In [5]:
from umap import UMAP

# Time the 2-D projection of the document embeddings.
umap_start = now()
# Fixed seed for a repeatable layout.
# NOTE(review): n_jobs=1 presumably accompanies the fixed random_state; confirm.
umap_model = UMAP(
    n_components=2,
    n_jobs=1,
    random_state=2024,
    verbose=False,
)
umap_result = umap_model.fit_transform(document_embeddings)
umap_elapsed = now() - umap_start
print('UMAP time: {}'.format(umap_elapsed))

UMAP time: 0:00:11.271782


In [6]:
from plotly.express import scatter
from plotly.graph_objects import Figure
import numpy as np

def plot_words(arg_words: np.ndarray, arg_keywords: list, arg_model: UMAP, arg_embeddings: np.ndarray) -> Figure:
    """Project the embeddings of selected vocabulary words and draw them as a labelled scatter plot.

    Args:
        arg_words: Vocabulary aligned with ``arg_embeddings``: ``arg_words[i]`` is the
            word whose embedding is ``arg_embeddings[i]``. An ndarray or a plain list.
        arg_keywords: Words to plot; each one must occur in ``arg_words``.
        arg_model: Reducer whose ``transform`` maps the selected embeddings to two
            columns (the result is labelled ``u0`` / ``u1``).
        arg_embeddings: Embeddings indexed by vocabulary position.

    Returns:
        The figure produced by ``plotly.express.scatter``, with one text label per keyword.

    Raises:
        ValueError: If any keyword is not present in ``arg_words``.
    """
    # Build the word -> row lookup once instead of rebuilding the vocabulary list
    # and scanning it linearly for every keyword. setdefault keeps the first
    # occurrence, which matches what list.index would return.
    row_of_word = {}
    for row, word in enumerate(np.asarray(arg_words).tolist()):
        row_of_word.setdefault(word, row)

    unknown = [keyword for keyword in arg_keywords if keyword not in row_of_word]
    if unknown:
        # Same exception type that list.index raised for an unknown keyword.
        raise ValueError('keywords not found in vocabulary: {}'.format(unknown))

    top_indices = [row_of_word[keyword] for keyword in arg_keywords]
    projected = arg_model.transform(X=[arg_embeddings[index] for index in top_indices])
    result_df = pd.DataFrame(data=projected, columns=['u0', 'u1'])
    result_df['word'] = arg_keywords
    # Markers are shrunk to size 1 so the text labels carry the plot.
    return scatter(data_frame=result_df, x='u0', y='u1', text='word', height=900).update_traces(marker={'size': 1})

# Fetch the fitted vocabulary once; the original re-queried the vectorizer and
# scanned the whole array for every sampled word.
feature_names = vectorizer.get_feature_names_out()
vocabulary = set(feature_names)
# Lower-case the sampled headwords so they can be compared with the vocabulary entries.
sampled_words = [item.lower() for item in sample_df['Word'].values.tolist()]
# Plot only the sampled headwords that appear in the vocabulary.
plot_words(arg_words=feature_names,
           arg_keywords=[word for word in sampled_words if word in vocabulary],
           arg_model=umap_model,
           arg_embeddings=word_embeddings).show()