In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete


In [2]:
import pandas as pd

ESSAYS = '/kaggle/input/aeon-essays-dataset/essays.csv'
USECOLS = ['title', 'description', 'essay', 'authors', ]
df = pd.read_csv(filepath_or_buffer=ESSAYS, usecols=USECOLS)
df['title token count'] = df['title'].apply(func=lambda x: len(x.split()))
df['description token count'] = df['description'].apply(func=lambda x: len(x.split()))
df['essay token count'] = df['essay'].apply(func=lambda x: len(x.split()))
df.head()

Unnamed: 0,title,description,essay,authors,title token count,description token count,essay token count
0,Space exploration,When self-replicating craft bring life to the ...,"Some time late this century, someone will push...",Jay Olson,2,21,3608
1,History of science,"To the detriment of the public, scientists and...",Would boycotting Russian scientists be an effe...,Lorraine Daston & Peter Harrison,3,20,3372
2,Religion,"Once a centre of Afghan culture, Sufism seems ...",My introduction into the world of Afghanistan’...,Annika Schmeding,1,22,3798
3,Thinkers and theories,The intrepid logician Kurt Gödel believed in t...,"As the foremost logician of the 20th century, ...",Alexander T Englert,3,19,4168
4,Thinkers and theories,"For Rachel Bespaloff, philosophy was a sensual...",Shortly after Rachel Bespaloff’s suicide in 19...,Isabel Jacobs,3,20,2790


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2235 entries, 0 to 2234
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   title                    2235 non-null   object
 1   description              2235 non-null   object
 2   essay                    2235 non-null   object
 3   authors                  2235 non-null   object
 4   title token count        2235 non-null   int64 
 5   description token count  2235 non-null   int64 
 6   essay token count        2235 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 122.4+ KB


In [4]:
from plotly.express import histogram
histogram(data_frame=df, x='essay token count')

To capture the entire essay 90% of the time we will need a max sequence size of 4k or so.

In [5]:
histogram(data_frame=df, y='title', height=1900)

Here the title is more of a category, and clearly we have a smallish corpus that has a lot of classes of widely varying sizes. Let's try picking a subset.

In [6]:
top_df = df[df['title'].isin(df['title'].value_counts().head(n=5).index.tolist())].copy()
top_df['title'].value_counts()

title
Stories and literature    73
History                   71
Thinkers and theories     70
Ethics                    69
Biology                   62
Name: count, dtype: int64

In [7]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

MAX_DF = 1.0
MIN_DF = 4
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'

model_start = now()
model = KeyBERT(model=MODEL,)
model.max_seq_length = 4 * 1024
vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF, max_df=MAX_DF, )
document_embeddings, word_embeddings = model.extract_embeddings(docs=top_df['essay'].values.tolist(), vectorizer=vectorizer, )
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))
keywords = model.extract_keywords(docs=top_df['essay'].values.tolist(), top_n=1, stop_words=STOP_WORDS, vectorizer=vectorizer,
                                  doc_embeddings=document_embeddings, word_embeddings=word_embeddings, min_df=MIN_DF, )
print('model time: {}'.format(now() - model_start))
top_df['keyword'] = [keyword[0][0] if len(keyword) else '-none-' for keyword in keywords]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:01:21.044954
we have 345 documents and 14397 words.
model time: 0:01:25.749518


In [8]:
top_df['keyword'].value_counts().head(n=20)

keyword
historians         4
morality           4
literature         4
animals            4
plato              3
totalitarianism    2
aristotle          2
philosophy         2
martians           2
slavery            2
secularism         2
silence            2
socrates           2
shakespeare        2
ethics             2
selfish            2
whale              2
holocaust          2
philosophers       2
veganism           2
Name: count, dtype: int64

We have a lot of keywords with relatively low cardinality each.

In [9]:
import pandas as pd
from plotly.express import scatter
from umap import UMAP

umap_start = now()
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
top_df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
top_df['short text'] = top_df['essay'].apply(func=lambda x: ' '.join(x.split()[:20]) + '...')
IGNORE = {'-none-', }
scatter(data_frame=top_df[~top_df['keyword'].isin(IGNORE)], x='u0', y='u1', hover_name='short text',
        hover_data=['keyword', 'title', 'description'],
        color = 'title',
       ).show()
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:11.329655


Essays differ from e.g. scientific papers or social media messages in that they by design don't have to stick to one topic, so it probably isn't surprising that we have a fair amount of mixing here; especially since the differences among these categories are somewhat arbitrary.