In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete.')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete.


In [2]:
import pandas as pd

def clean(arg: str) -> str:
    return arg.replace('[^A-Za-z\s]+', '')

BIDEN = '/kaggle/input/biden-2024-facebook-reactions-to-14m-jobs-claim/biden_2024_jobs.csv'
df = pd.read_csv(filepath_or_buffer=BIDEN, parse_dates=['date'])
df['commentsCount'] = df['commentsCount'].fillna(value=0).apply(func=int)
df['text'] = df['text'].fillna(value='')
df['token count'] = df['text'].str.split().str.len()
df = df[df['token count'] > 2] # Really short messages don't carry much meaning
# we need short text to display in our scatter plot
df['short text'] = df['text'].apply(func=lambda x: ' '.join(x.split()[:20]) + '...' if len(x.split()) > 20 else x)
df['clean'] = df['text'].apply(func=clean)
df.head()

Unnamed: 0,commentsCount,text,date,likesCount,token count,short text,clean
0,26,You rock Sir! A President for all Americans.,2024-01-14 20:44:25+00:00,87,8,You rock Sir! A President for all Americans.,You rock Sir! A President for all Americans.
1,20,"You will be forever be remembered, because you...",2024-01-14 20:47:30+00:00,60,22,"You will be forever be remembered, because you...","You will be forever be remembered, because you..."
2,36,"On behalf of everyone, I publicly declare that...",2024-01-14 18:30:21+00:00,73,31,"On behalf of everyone, I publicly declare that...","On behalf of everyone, I publicly declare that..."
3,4,"Mr. President, I would like to sincerely expre...",2024-01-14 18:30:17+00:00,17,100,"Mr. President, I would like to sincerely expre...","Mr. President, I would like to sincerely expre..."
4,23,Good luck mr president,2024-01-14 17:05:14+00:00,72,4,Good luck mr president,Good luck mr president


In [3]:
from plotly.express import histogram
histogram(data_frame=df, x='date', marginal='box')

These comments die off pretty quickly in time; roughly half arrive in the first six hours.

In [4]:
from plotly.express import scatter
scatter(data_frame=df, x='likesCount', y='commentsCount', hover_name='short text', log_x=True, log_y=True, trendline='ols')

This is a surprisingly high comments per likes ratio.

In [5]:
histogram(data_frame=df, x='token count', log_y=True, marginal='box')

Almost all of our messages are shorter than 60 tokens.

In [6]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

MAX_DF = 1.0
MIN_DF = 2
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
# we use the clean text for keywords even though we show a truncated original message
DOCS = df['clean'].values.tolist()

model_start = now()
model = KeyBERT(model=MODEL,)
# we will capture almost all of the content with the default max sequence length of 128
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF, max_df=MAX_DF, )
document_embeddings, word_embeddings = model.extract_embeddings(docs=DOCS, vectorizer=vectorizer, )
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))
keywords = model.extract_keywords(docs=DOCS, top_n=1, stop_words=STOP_WORDS, vectorizer=vectorizer,
                                  doc_embeddings=document_embeddings, word_embeddings=word_embeddings, min_df=MIN_DF, )
print('model time: {}'.format(now() - model_start))
df['keyword'] = [keyword[0][0] if len(keyword) else '-none-' for keyword in keywords]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:00:42.813591
we have 1091 documents and 2822 words.
model time: 0:00:44.036765


In [7]:
df['keyword'].value_counts()

keyword
biden         169
president      60
bidenomics     34
joe            31
inflation      30
             ... 
days            1
joes            1
crisis          1
leadership      1
weather         1
Name: count, Length: 406, dtype: int64

In [8]:
import pandas as pd
from umap import UMAP

IGNORE = {'-none-', }

umap_start = now()
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:11.639102


In [9]:
histogram(data_frame=df[df['keyword'] != '-none-']['keyword'].value_counts().to_frame().reset_index().head(n=40), x='keyword', y='count', marginal='box')

Our keywords are less concentrated than the quantities we see above; the distribution has a really long tail of small and occasionally mysterious-looking buckets.

In [10]:
scatter(data_frame=df, x='u0', y='u1', hover_name='short text', hover_data=['keyword'] ).update_xaxes(showticklabels=False).update_yaxes(showticklabels=False).update_xaxes(visible=False).update_yaxes(visible=False)

This is what our whole document vector plot looks like; let's see what we see if we look at top keywords.

In [11]:
keywords = [keyword for keyword in df['keyword'].value_counts().head(n=13).index.tolist() if keyword != '-none-']
scatter(data_frame=df[df['keyword'].isin(keywords)], x='u0', y='u1', hover_name='short text', color='keyword', hover_data=['keyword'] ).update_xaxes(showticklabels=False).update_yaxes(showticklabels=False).update_xaxes(visible=False).update_yaxes(visible=False)