In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete


In [2]:
import pandas as pd

# The dataset is a '@'-separated file whose first column is used as the row index.
filename = '/kaggle/input/marcel-proust-in-search-of-lost-time/proust_dataset_ENG.csv'
df = pd.read_csv(filepath_or_buffer=filename, sep='@', index_col=[0])

# Split every paragraph on whitespace once and reuse the word lists for both
# derived columns.
paragraph_words = df['paragraph'].str.split()
# Number of whitespace-delimited tokens per paragraph.
df['token count'] = paragraph_words.apply(len)
# First 20 tokens re-joined with single spaces: a compact label for hover text.
df['short paragraph'] = paragraph_words.apply(lambda words: ' '.join(words[:20]))
df.head()

Unnamed: 0,paragraph,volume,chapter,token count,short paragraph
0,For a long time I used to go to bed early. Som...,1,1,262,For a long time I used to go to bed early. Som...
1,I would ask myself what o’clock it could be; I...,1,1,116,I would ask myself what o’clock it could be; I...
2,I would lay my cheeks gently against the comfo...,1,1,171,I would lay my cheeks gently against the comfo...
3,"I would fall asleep, and often I would be awak...",1,1,201,"I would fall asleep, and often I would be awak..."
4,"Sometimes, too, just as Eve was created from a...",1,1,199,"Sometimes, too, just as Eve was created from a..."


In [3]:
from plotly.express import histogram

# Distribution of paragraph lengths (whitespace tokens), coloured by volume.
# The count axis is log-scaled so that rare, very long paragraphs stay visible.
histogram(
    data_frame=df,
    x='token count',
    color='volume',
    log_y=True,
)

In [4]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

COLUMN = 'paragraph'
MIN_DF = 2
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
# Truncation limit (in model tokens) applied to every document before encoding.
MAX_SEQ_LENGTH = 512

model_start = now()
model = KeyBERT(model=MODEL)
# if we set this to 512 we get about 99% of the input intact
# The limit has to be set on the wrapped SentenceTransformer: assigning
# `max_seq_length` on the KeyBERT object itself only creates an unused attribute
# and leaves the encoder truncating at the checkpoint's own (shorter) default.
# KeyBERT keeps its backend in `.model`, and the sentence-transformers backend
# exposes the actual encoder as `.embedding_model`.
model.model.embedding_model.max_seq_length = MAX_SEQ_LENGTH
raw_documents = df[COLUMN].values
# Unigram vocabulary, English stop words removed, terms must occur in >= MIN_DF documents.
vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF,)
# Embed the documents and the candidate vocabulary once so the embeddings can be
# reused below (and by later cells) without re-encoding.
document_embeddings, word_embeddings = model.extract_embeddings(docs=raw_documents, vectorizer=vectorizer, )
# NOTE: model_start precedes model construction, so this figure includes model load/download time.
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))
# The same vectorizer / stop_words / min_df are passed again so that the candidate
# vocabulary lines up with the precomputed word embeddings.
keywords = model.extract_keywords(docs=raw_documents, top_n=1, stop_words=STOP_WORDS, vectorizer=vectorizer,
                                  doc_embeddings=document_embeddings, word_embeddings=word_embeddings, min_df=MIN_DF, )
print('model time: {}'.format(now() - model_start))

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:04:45.615215
we have 4416 documents and 16264 words.
model time: 0:04:55.701980


In [5]:
from plotly.express import scatter
from umap import UMAP

umap_start = now()
# Fixed random_state with a single job so the 2-D layout is repeatable between runs.
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
# Reduce the document embeddings to two coordinates and store them on the frame.
umap_coordinates = umap_model.fit_transform(X=document_embeddings)
df[['u0', 'u1']] = umap_coordinates
# One point per paragraph, coloured by volume; hovering shows the first 20 tokens.
umap_figure = scatter(
    data_frame=df,
    x='u0',
    y='u1',
    hover_name='short paragraph',
    height=900,
    color='volume',
)
umap_figure.show()
# The reported time covers both the UMAP fit and the figure rendering above.
print(f'UMAP time: {now() - umap_start}')

UMAP time: 0:00:40.521689
