In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete.')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete.


In [2]:
import pandas as pd

from nltk.tokenize import sent_tokenize

JEST = '/kaggle/input/infinite-jest-text-endnotes-and-definitions/IJ_full_body.csv'

# Load the CSV with its first column as the index; the endnote column is not used here.
df = pd.read_csv(JEST, index_col=[0])
df = df.drop(columns=['endnotes_on_page'])

# Split each text entry into sentences, then give every sentence its own row.
df['text'] = df['text'].apply(sent_tokenize)
df = df.explode('text', ignore_index=True)
df = df.dropna()

# Flatten embedded newlines and count whitespace-delimited tokens per sentence.
df['text'] = df['text'].str.replace('\n', ' ')
df['token count'] = df['text'].str.split().str.len()

# short documents tend to cluster without regard to their content
is_long_enough = df['token count'] > 4
df = df[is_long_enough]
df.head()

Unnamed: 0,page,text,chapter,token count
0,6,"YEAR OF GLAD I am seated in an office, surr...",YEAR OF GLAD,14
1,6,My posture is consciously congruent to the sh...,YEAR OF GLAD,12
2,6,This is a cold room in University Administrat...,YEAR OF GLAD,9
3,6,"Remington-hung, double-windowed against the N...",YEAR OF GLAD,26
5,6,Three faces have resolved into place above sum...,YEAR OF GLAD,27


In [3]:
# Summarise the index, columns, non-null counts and dtypes of the filtered sentence frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23636 entries, 0 to 28607
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   page         23636 non-null  int64 
 1   text         23636 non-null  object
 2   chapter      23636 non-null  object
 3   token count  23636 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 923.3+ KB


In [4]:
from plotly.express import histogram

# Show how the sentences are distributed, first by page and then by chapter.
for column in ('page', 'chapter'):
    histogram(data_frame=df, x=column).show()

Before we go any further, what do we expect? We know this novel tells stories about Canadian terror groups, about addiction and recovery, and about tennis. Let's see whether related keywords turn up.

In [5]:
# while we're building this thing let's take a sample
SAMPLE_SIZE = 5000  # number of sentences to keep while iterating
RANDOM_STATE = 2024  # fixed seed so the same rows are drawn on every run
# Cap n at len(df): sampling without replacement raises ValueError when n exceeds the row count.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)

In [6]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

MAX_DF = 1.0
MIN_DF = 5 # we have a lot of documents so we can contract our token space somewhat without fear
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
# one document per sentence row of df, in df's current row order
DOCS = df['text'].tolist()

# model_start is the reference point for both timing messages printed below,
# so 'model time' includes the embedding time.
model_start = now()
model = KeyBERT(model=MODEL)
# we may need to increase the max sequence length from the default of 128
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words=STOP_WORDS,
    min_df=MIN_DF,
    max_df=MAX_DF,
)
document_embeddings, word_embeddings = model.extract_embeddings(docs=DOCS, vectorizer=vectorizer)
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))

# Reuse the precomputed embeddings and ask for the single best keyword per document.
# NOTE(review): stop_words and min_df look redundant next to an explicit vectorizer - confirm against the KeyBERT docs.
keywords = model.extract_keywords(
    docs=DOCS,
    top_n=1,
    stop_words=STOP_WORDS,
    vectorizer=vectorizer,
    doc_embeddings=document_embeddings,
    word_embeddings=word_embeddings,
    min_df=MIN_DF,
)
print('model time: {}'.format(now() - model_start))

# Each entry is a list of (keyword, score) pairs; documents with no keyword get a placeholder.
df['keyword'] = [hits[0][0] if len(hits) > 0 else '-none-' for hits in keywords]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:01:50.070712
we have 5000 documents and 2253 words.
model time: 0:01:57.269080


In [7]:
# Relative frequency of each keyword among sentences that received one; show the ten most common.
has_keyword = df['keyword'] != '-none-'
df.loc[has_keyword, 'keyword'].value_counts(normalize=True).head(n=10)

keyword
gately     0.028983
hal        0.024666
pemulis    0.014183
marathe    0.012127
mario      0.011511
orin       0.008428
lenz       0.006989
steeply    0.005344
avril      0.004933
tennis     0.004317
Name: proportion, dtype: float64

As is typical with novels, the keyword model tends to pick up character names rather than themes, which is disappointing.

In [8]:
# Counts per keyword (placeholder rows excluded); plot the 40 most frequent with a box plot in the margin.
keyword_counts = df.loc[df['keyword'] != '-none-', 'keyword'].value_counts().to_frame().reset_index()
histogram(data_frame=keyword_counts.head(n=40), x='keyword', y='count', marginal='box')

In [9]:
import pandas as pd
from umap import UMAP


def first_words(text, limit):
    """Return the first `limit` whitespace-separated tokens of `text`, joined by single spaces."""
    return ' '.join(text.split()[:limit])


umap_start = now()
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
# Reduce the document embeddings to two coordinates for plotting.
projection = umap_model.fit_transform(X=document_embeddings)

plot_df = df.copy()
plot_df[['u0', 'u1']] = projection
# Shortened labels keep legends and hover boxes readable.
plot_df['short chapter'] = [first_words(chapter, 5) for chapter in plot_df['chapter']]
plot_df['short text'] = [first_words(sentence, 20) for sentence in plot_df['text']]
# Rows without a keyword are dropped from the plotting frame.
plot_df = plot_df[plot_df['keyword'] != '-none-']
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:43.847299


In [10]:
from plotly.colors import qualitative
from plotly.express import scatter

# 2-D embedding map colored by shortened chapter title; both axes are hidden entirely.
chapter_map = scatter(
    data_frame=plot_df,
    x='u0',
    y='u1',
    hover_name='text',
    hover_data='keyword',
    color='short chapter',
    color_discrete_sequence=qualitative.Alphabet,
    height=900,
)
chapter_map.update_xaxes(showticklabels=False, visible=False)
chapter_map.update_yaxes(showticklabels=False, visible=False)

We have 26 colors and 44 chapters, so we end up reusing some colors when we color by chapter. It doesn't make much difference as our sentence embeddings do not cluster very strongly according to their chapter most (almost all) of the time.

In [11]:
# 2-D embedding map colored by keyword; both axes are hidden entirely.
keyword_map = scatter(
    data_frame=plot_df,
    x='u0',
    y='u1',
    hover_name='short text',
    hover_data='short chapter',
    color='keyword',
    color_discrete_sequence=qualitative.Alphabet,
    height=900,
)
keyword_map.update_xaxes(showticklabels=False, visible=False)
keyword_map.update_yaxes(showticklabels=False, visible=False)

If we color by keyword we see some keyword clusters emerge; these become clearer if we restrict our attention to the top keywords. Because we have only 26 colors in our palette, let's take the top 26 keywords.

In [12]:
# Keep exactly as many keywords as the palette has colors, so each plotted keyword gets its own color.
# Deriving the count from the palette keeps the two in sync if the palette is ever swapped.
PALETTE = qualitative.Alphabet
top_keywords = plot_df['keyword'].value_counts().head(n=len(PALETTE)).index.tolist()
scatter(data_frame=plot_df[plot_df['keyword'].isin(top_keywords)], x='u0', y='u1', hover_name='short text', hover_data='short chapter', color='keyword',
        color_discrete_sequence=PALETTE, height=900
       ).update_xaxes(showticklabels=False, visible=False).update_yaxes(showticklabels=False, visible=False)