In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete.')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete.


In [2]:
import pandas as pd

# Location of the input CSV inside the Kaggle dataset mount.
IOWA = '/kaggle/input/2024-iowa2k-trump-fb-comments-and-public-sentiment/donald_trump_iowa.csv'
df = pd.read_csv(filepath_or_buffer=IOWA, parse_dates=['date'])
# Missing comment counts are treated as zero; astype is the vectorized
# integer cast and replaces the per-element apply(int).
df['commentsCount'] = df['commentsCount'].fillna(value=0).astype(int)
# Whitespace-delimited word count of each comment; missing text counts as 0.
df['text length'] = df['text'].str.split().str.len().fillna(value=0).astype(int)
df.head()

Unnamed: 0,commentsCount,date,likesCount,text,text length
0,421,2024-01-16 04:46:27+00:00,581,I’m so proud of you tonight! I actually have h...,55
1,10,2024-01-16 03:32:47+00:00,26,Numero uno,2
2,62,2024-01-16 07:06:56+00:00,301,The best is yet to come Mr president. Congratu...,14
3,45,2024-01-16 03:33:13+00:00,371,Congratulations,1
4,179,2024-01-16 03:33:43+00:00,963,I’d vote for Trump over Desantis any day. And...,13


In [3]:
from plotly.express import histogram
# One histogram with a log-scaled count axis per numeric column, in the
# same order as before: comments, likes, then word count.
for column_name in ('commentsCount', 'likesCount', 'text length'):
    figure = histogram(data_frame=df, x=column_name, log_y=True)
    figure.show()

In [4]:
from plotly.express import scatter
# Comments against likes on log-log axes, coloured by word count, with an
# OLS trendline; hovering a point shows the comment text.
likes_vs_comments = scatter(
    data_frame=df,
    x='likesCount',
    y='commentsCount',
    log_x=True,
    log_y=True,
    color='text length',
    hover_name='text',
    trendline='ols',
)
likes_vs_comments

This is a sample with an unusually high number of comments per like. We generally expect them to be linearly related, but we don't expect there to be 0.23 comments per like on average.

In [5]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: a term must occur in at least MIN_DF documents;
# MAX_DF = 1.0 places no upper limit on document frequency.
MAX_DF = 1.0
MIN_DF = 2
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
# Missing comment text becomes an empty string so every row yields a document.
DOCS = df['text'].fillna(value='').values.tolist()

model_start = now()
model = KeyBERT(model=MODEL)
# we will capture more than 90% of the content with the default max sequence length of 128
# model.max_seq_length = 256
# NOTE(review): if a longer limit is ever needed, confirm that assigning it on
# the KeyBERT wrapper actually reaches the underlying encoder.

# Unigram vocabulary shared by the embedding step and the keyword step.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF, max_df=MAX_DF)
document_embeddings, word_embeddings = model.extract_embeddings(docs=DOCS, vectorizer=vectorizer)
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))
# stop_words / min_df are deliberately not repeated here: KeyBERT ignores them
# whenever a vectorizer is supplied, and the vectorizer already carries both.
keywords = model.extract_keywords(
    docs=DOCS,
    top_n=1,
    vectorizer=vectorizer,
    doc_embeddings=document_embeddings,
    word_embeddings=word_embeddings,
)
# This timer also starts at model_start, so it includes the embedding time above.
print('model time: {}'.format(now() - model_start))
# Each entry holds the top-ranked (keyword, score) pair for one document; an
# empty entry means no keyword was found and gets the '-none-' placeholder.
df['keyword'] = [found[0][0] if found else '-none-' for found in keywords]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:00:31.022615
we have 2000 documents and 1107 words.
model time: 0:00:32.983922


In [6]:
import pandas as pd
from umap import UMAP

# Placeholder keyword label used for comments without an extracted keyword.
IGNORE = {'-none-'}

umap_start = now()
# (disabled) shorten hover text to the first 20 words:
# df['short text'] = df['text'].apply(func=lambda x: ' '.join(x.split()[:20]) + '...' if len(x.split()) > 20 else x)
# Fixed random_state for a repeatable 2-D layout.
umap_model = UMAP(
    n_components=2,
    random_state=2024,
    n_jobs=1,
    verbose=False,
)
# Project the document embeddings to two dimensions and store both coordinates.
projection = umap_model.fit_transform(X=document_embeddings)
df[['u0', 'u1']] = projection
elapsed = now() - umap_start
print('UMAP time: {}'.format(elapsed))

UMAP time: 0:00:15.745370


In [7]:
# Number of distinct keywords, plus the ten most common keywords expressed
# as a proportion of all rows.
keyword_column = df['keyword']
top_ten_share = keyword_column.value_counts(normalize=True).head(n=10)
(keyword_column.nunique(), top_ten_share)

(348,
 keyword
 congratulations    0.2410
 president          0.1220
 iowa               0.0655
 trump              0.0505
 2024               0.0270
 love               0.0245
 bless              0.0215
 maga               0.0190
 vote               0.0180
 -none-             0.0130
 Name: proportion, dtype: float64)

In [8]:
# Counts of the 40 most frequent keywords, as a two-column frame for plotting.
top_keyword_counts = (
    df['keyword']
    .value_counts()
    .to_frame()
    .reset_index()
    .head(n=40)
)
histogram(data_frame=top_keyword_counts, x='keyword', y='count')

First let's focus on the top ten keywords, ignoring the top keyword and the no-keyword keyword.

In [9]:
from plotly.express import scatter
# From the nine most frequent keywords keep the last eight, which drops the
# single most frequent one when at least nine keywords exist.
keyword_ranking = df['keyword'].value_counts()
selected_keywords = keyword_ranking.head(n=9).tail(n=8).index.tolist()
selected_rows = df[df['keyword'].isin(selected_keywords)]
keyword_map = scatter(
    data_frame=selected_rows,
    x='u0',
    y='u1',
    hover_name='text',
    color='keyword',
)
# The UMAP coordinates carry no units, so both axes are hidden entirely.
keyword_map.update_xaxes(showticklabels=False, visible=False).update_yaxes(showticklabels=False, visible=False)

The top ten or so keywords probably tell us the most about what the corpus is saying. 

In [10]:
# Every comment in the 2-D UMAP layout; hover shows the text and its keyword.
all_points = scatter(
    data_frame=df,
    x='u0',
    y='u1',
    hover_name='text',
    hover_data=['keyword'],
)
# Both axes are hidden: only the relative position of points is meaningful.
all_points.update_xaxes(showticklabels=False, visible=False).update_yaxes(showticklabels=False, visible=False)

The congratulations keyword has a bunch of little islands related to emoji and punctuation.

In [11]:
from plotly.express import scatter
# Keep only comments whose keyword is NOT among the ten most frequent ones.
top_ten_keywords = df['keyword'].value_counts().head(n=10).index.tolist()
residual_rows = df[~df['keyword'].isin(top_ten_keywords)]
residual_map = scatter(
    data_frame=residual_rows,
    x='u0',
    y='u1',
    hover_name='text',
    color='keyword',
    height=1200,
)
# Both axes are hidden: only the relative position of points is meaningful.
residual_map.update_xaxes(showticklabels=False, visible=False).update_yaxes(showticklabels=False, visible=False)

The residual plot probably contains the most interesting comments, but their keywords are typically low-frequency: each one is shared by only a few comments.