In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete.')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete.


In [2]:
import pandas as pd
from nltk.tokenize import sent_tokenize

ESSAYS = '/kaggle/input/aeon-essays-dataset/essays.csv'
USECOLS = ['title', 'description', 'essay', 'authors']
SAMPLE_SIZE = 20000   # number of sentence rows kept for the rest of the notebook
RANDOM_STATE = 2024   # seed for the shuffle so the sample is repeatable

df = pd.read_csv(filepath_or_buffer=ESSAYS, usecols=USECOLS)
# Split every essay into sentences and give each sentence its own row.
# explode() repeats the essay's index label once per sentence, so the join below
# copies title/description/authors onto every sentence of that essay. This avoids
# building a wide intermediate frame with one column per sentence position.
s = df['essay'].apply(func=sent_tokenize).explode()
s.name = 'essay'
df = df.drop(columns=['essay']).join(s)
# For performance reasons we only keep SAMPLE_SIZE sentences. Instead of a uniform
# sample (df.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)) we shuffle the rows,
# sort by title and keep the head: that restricts the data to the alphabetically
# first titles, i.e. a relatively small number of classes.
# The seed makes the shuffle repeatable, and the stable sort preserves the shuffled
# order within each title, so re-running the cell selects the same rows.
df = (
    df.sample(frac=1, random_state=RANDOM_STATE)
    .sort_values(by='title', kind='stable')
    .head(n=SAMPLE_SIZE)
)
df.head()

Unnamed: 0,title,description,authors,essay
1947,Addiction,"Back on the islands of my childhood, I’m cling...",Amy Liptrot,"On my passenger seat is a bat detector, tuned ..."
235,Addiction,The neuroscientific picture of addiction overl...,Zoey Lavallee,Dopamine circuits operate in a larger context.
1930,Addiction,"When I stopped smoking weed, my appetite shriv...",Malcolm Harris,But was it because I was smoking more appetite...
2177,Addiction,Mucking out the pigs together can be just as h...,Tobias Jones,People discover that in a real community there...
1407,Addiction,"Most addicts just stop using in time, without ...",Stacey McKenna,When my roommate and I decided to move across ...


In [3]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

# Keyword-extraction settings.
MAX_DF = 1.0
MIN_DF = 4
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
# One document per row of df, taken from the 'essay' column.
DOCS = df['essay'].values.tolist()

model_start = now()
model = KeyBERT(model=MODEL)
# Unigram vocabulary; this same vectorizer instance is handed to both KeyBERT calls below.
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=STOP_WORDS,
    min_df=MIN_DF,
    max_df=MAX_DF,
)
# Compute the embeddings once up front so extract_keywords can reuse them.
document_embeddings, word_embeddings = model.extract_embeddings(
    docs=DOCS,
    vectorizer=vectorizer,
)
embedding_elapsed = now() - model_start
print('embedding time: {}'.format(embedding_elapsed))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))
keywords = model.extract_keywords(
    docs=DOCS,
    top_n=1,
    stop_words=STOP_WORDS,
    vectorizer=vectorizer,
    doc_embeddings=document_embeddings,
    word_embeddings=word_embeddings,
    min_df=MIN_DF,
)
# Measured from model_start, so this figure includes the embedding time printed above.
print('model time: {}'.format(now() - model_start))
# Keep the first element of the first entry returned for each document;
# '-none-' marks documents for which nothing was returned.
df['keyword'] = [ranked[0][0] if len(ranked) else '-none-' for ranked in keywords]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:06:15.394309
we have 20000 documents and 9539 words.
model time: 0:06:43.903488


In [4]:
# Number of sampled rows per title, most frequent first.
df['title'].value_counts()

title
Anthropology          6537
Animals and humans    4691
Architecture          3401
Archaeology           2328
Addiction             1791
Art                    637
Ageing and death       615
Name: count, dtype: int64

We sampled from a small number of titles, so there are fewer buckets to put documents in, and by breaking the essays into sentences we should get vectors that each represent fewer words than a full essay would. As a result, we expect to see obvious clusters in the scatter plot below rather than something random.

In [5]:
import pandas as pd
from plotly.express import scatter
from umap import UMAP

# Keyword labels to leave out of the plot.
IGNORE = {'-none-'}


def _shorten(text):
    """Return text unchanged if it has at most 20 whitespace-separated words,
    otherwise its first 20 words joined by single spaces and followed by '...'."""
    words = text.split()
    return ' '.join(words[:20]) + '...' if len(words) > 20 else text


umap_start = now()
# Abbreviated sentence text, used as the hover title of each point.
df['short text'] = df['essay'].apply(func=_shorten)
# Project the document embeddings to two dimensions for plotting.
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
# Drop rows whose keyword is in IGNORE and order by title before plotting.
plot_df = df[~df['keyword'].isin(IGNORE)].sort_values(by='title')
figure = scatter(
    data_frame=plot_df,
    x='u0',
    y='u1',
    hover_name='short text',
    hover_data=['keyword', 'title', 'description', 'authors'],
    color='title',
    height=900,
)
figure.show()
# Elapsed time covers the text shortening, the UMAP fit and the plot rendering.
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:47.803004


We see more concentrated clusters in this analysis because we have shorter documents and fewer classes to put them in. However, the middle of the graph is dominated by documents to which the model assigns somewhat generic keywords, which could belong just as easily to one class as to another.