In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete


In [2]:
import pandas as pd
filename = '/kaggle/input/reddit-investing-new-jan07/reddit_investing_new.csv'
df = pd.read_csv(filepath_or_buffer=filename)
df['text'] = df['title'] + ' ' + df['selftext']
df['token count'] = df['text'].str.split().apply(len)
df = df[df['token count'] > 19]
df.head()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,score,text,token count
0,t3_190oxui,1704622000.0,investing,Daily General Discussion and Advice Thread - J...,Have a general question? Want to offer some c...,1.0,1.0,1.0,Daily General Discussion and Advice Thread - J...,287
1,t3_190op6z,1704621000.0,investing,Sustainable companies stocks/funds suggestions?,"I am all for sustainability, but for 3 reasons...",1.0,1.0,1.0,Sustainable companies stocks/funds suggestions...,173
2,t3_190ojl3,1704620000.0,investing,Where to follow news on cryptos ?,"Hi folk, i did a bit of trading in 2017 early ...",0.25,0.0,0.0,"Where to follow news on cryptos ? Hi folk, i d...",106
3,t3_190k9fl,1704604000.0,investing,What is the cost basis for stocks acquired fro...,Imagine the following for stock X:\n\n1) I buy...,0.5,0.0,0.0,What is the cost basis for stocks acquired fro...,97
4,t3_190jmn7,1704602000.0,investing,Dallas Fed Pres Logan said Fed might need to s...,Dallas Fed Pres Logan said that if the ON RRP ...,0.85,13.0,13.0,Dallas Fed Pres Logan said Fed might need to s...,386


In [3]:
df.shape

(879, 10)

In [4]:
from plotly.express import histogram
histogram(data_frame=df, x='created_utc')

In [5]:
histogram(data_frame=df, x='token count', log_y=True)

In [6]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

COLUMN = 'text'
MIN_DF = 3
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'

model_start = now()
model = KeyBERT(model=MODEL,)
# if we set this to 512 we get almost all of the input intact
model.max_seq_length = 512
raw_documents = df[COLUMN].values
vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF,)
document_embeddings, word_embeddings = model.extract_embeddings(docs=raw_documents, vectorizer=vectorizer, )
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))
keywords = model.extract_keywords(docs=raw_documents, top_n=1, stop_words=STOP_WORDS, vectorizer=vectorizer,
                                  doc_embeddings=document_embeddings, word_embeddings=word_embeddings, min_df=MIN_DF, )
df['keyword/weight'] = keywords
df['keyword'] = df['keyword/weight'].apply(func=lambda x: x[0][0] if len(x) else '')
print('model time: {}'.format(now() - model_start))

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:00:51.408991
we have 879 documents and 2792 words.
model time: 0:00:52.157055


In [7]:
df['keyword'].value_counts(normalize=True).head(n=20)

keyword
401k           0.070535
investing      0.059158
roth           0.053470
invest         0.052332
ira            0.035267
etfs           0.034130
vanguard       0.022753
stocks         0.020478
etf            0.018203
dividends      0.015927
portfolio      0.013652
treasury       0.012514
stock          0.012514
fund           0.011377
investment     0.011377
gains          0.011377
investments    0.011377
fidelity       0.010239
debt           0.009101
savings        0.009101
Name: proportion, dtype: float64

In [8]:
from plotly.express import histogram
histogram(data_frame=df['keyword'].value_counts(normalize=True).to_frame().reset_index().head(n=40),
          x='keyword', y='proportion')

In [9]:
df['keyword'].value_counts(normalize=True).head(n=40).sum()

0.621160409556314

In [10]:
df['keyword'].nunique()

255

In [11]:
from plotly.express import scatter
from umap import UMAP

umap_start = now()
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
scatter(data_frame=df, x='u0', y='u1', hover_name='title', height=900, color='upvote_ratio', 
       hover_data=['keyword', 'token count', ]).show()
scatter(data_frame=df, x='u0', y='u1', hover_name='title', height=900, color='keyword', 
       hover_data=['keyword', 'token count', ]).show()
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:09.083522
