<a href="https://www.kaggle.com/code/mikedelong/cluster-and-tag-with-keybert?scriptVersionId=157561674" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
from datetime import datetime
from time import mktime

# https://stackoverflow.com/a/6451892
def year_fraction(date):
    def since(date): # returns seconds since epoch
        return mktime(date.timetuple())

    startOfThisYear = datetime(year=date.year, month=1, day=1)
    startOfNextYear = datetime(year=1 + date.year, month=1, day=1)

    return date.year + (since(date) - since(startOfThisYear))/(since(startOfNextYear) - since(startOfThisYear))

filename = '/kaggle/input/elon-musk-and-rogan-youtubes-historic-interview/Elon Musk_Joe Rogan.csv'
usecols = ['Published At', 'Comment']
df = pd.read_csv(filepath_or_buffer=filename, parse_dates=['Published At'],
                usecols=usecols)
df = df[usecols].dropna(subset=['Comment']).drop_duplicates(ignore_index=True)
df['year_float'] = df['Published At'].apply(func=year_fraction)
df.head()

Unnamed: 0,Published At,Comment,year_float
0,2023-12-31 14:19:47+00:00,"elon had AI brain extension back then already,...",2023.998896
1,2023-12-30 06:35:01+00:00,How many times they said pit? ;-;,2023.995272
2,2023-12-30 01:28:25+00:00,Thanks for being real. All of us are doing our...,2023.994689
3,2023-12-28 20:31:59+00:00,Hard to believe two smart guys think the human...,2023.991385
4,2023-12-27 18:44:43+00:00,36:01 The biggest lie Joe ever told 🤣,2023.988441


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92185 entries, 0 to 92184
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   Published At  92185 non-null  datetime64[ns, UTC]
 1   Comment       92185 non-null  object             
 2   year_float    92185 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(1), object(1)
memory usage: 2.1+ MB


In [3]:
from plotly.express import histogram
histogram(data_frame=df, x='Published At', log_y=True)

In [4]:
!pip install --quiet KeyBERT
print('installed KeyBERT')

installed KeyBERT


In [5]:
# we can only visualize a few thousand points with good performance so let's take the most recent
sample_df = df[df['year_float'] > 2022].copy() # obviously the smaller this number the more documents we will have in our sample
print(sample_df.shape)

(11107, 3)


In [6]:
# use KeyBERT to tag using the top keyword
from arrow import now
from keybert import KeyBERT

MODEL = 'all-MiniLM-L12-v2'
keyword_model = KeyBERT(model=MODEL)

model_start = now()
document_embeddings, word_embeddings = keyword_model.extract_embeddings(docs=sample_df['Comment'].values.tolist(),)
print('got embeddings')
keywords = keyword_model.extract_keywords(docs=sample_df['Comment'].values.tolist(), top_n=1)
print('model time: {}'.format(now() - model_start))

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

got embeddings
model time: 0:10:06.009594


In [7]:
TOP_N = 26 # we have 26 colors so let's use 26 tags
sample_df['tag'] = [item[0][0] if len(item) else 'unknown' for item in keywords]
sample_df['tag'] = sample_df['tag'].apply(func=lambda x: 'unknown' if x not in sample_df['tag'].value_counts()[:TOP_N].index.tolist() else x)
# how much of the corpus did we tag?
sample_df['tag'].value_counts(normalize=True)

tag
unknown         0.575313
elon            0.156388
musk            0.060052
ai              0.034393
joe             0.033402
rogan           0.019267
pit             0.013145
tesla           0.013145
podcast         0.011434
interview       0.009453
alien           0.008013
metaverse       0.007023
tunnels         0.006392
twitter         0.006032
simulation      0.006032
love            0.005402
tunnel          0.004412
elons           0.004232
chimps          0.004232
magnet          0.003691
human           0.003421
conversation    0.003331
weed            0.003061
mars            0.002971
flame           0.002881
time            0.002881
Name: proportion, dtype: float64

In [8]:
from plotly.colors import qualitative
from plotly.express import scatter
from umap import UMAP

umap_model = UMAP(n_components=2, random_state=2023, verbose=True, n_jobs=1)
sample_df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
scatter(data_frame=sample_df, x='u0', y='u1', hover_name='Comment', height=900, color='tag', color_discrete_sequence=qualitative.Alphabet).show()

UMAP(n_jobs=1, random_state=2023, verbose=True)
Wed Jan  3 14:54:55 2024 Construct fuzzy simplicial set
Wed Jan  3 14:54:55 2024 Finding Nearest Neighbors
Wed Jan  3 14:54:55 2024 Building RP forest with 10 trees
Wed Jan  3 14:55:02 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	 5  /  13
	 6  /  13
	 7  /  13
	Stopping threshold met -- exiting after 7 iterations
Wed Jan  3 14:55:24 2024 Finished Nearest Neighbor Search
Wed Jan  3 14:55:28 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Jan  3 14:55:38 2024 Finished embedding


How did we do? We have a timestamp cluster, a non-English cluster, and an emoji cluster that we did not tag, but otherwise this looks pretty good.

In [9]:
from plotly.express import scatter
scatter(data_frame=sample_df, x='year_float', y='tag')