In [1]:
!pip install --quiet KeyBERT
print('installed KeyBERT')

installed KeyBERT


In [2]:
import pandas as pd
from datetime import datetime
from time import mktime

# https://stackoverflow.com/a/6451892
def year_fraction(date):
    """Express a point in time as a fractional (decimal) year.

    The result is ``date.year`` plus the share of that calendar year that has
    already elapsed, e.g. noon on 2 July 2023 maps to ``2023.5``.

    The calculation uses datetime arithmetic only, so it does not depend on the
    time zone or DST rules of the machine running the notebook (the previous
    ``time.mktime`` round-trip interpreted every value as machine-local time).

    Parameters
    ----------
    date : datetime.datetime, a subclass of it, or datetime.date
        Naive or timezone-aware. A plain ``datetime.date`` is treated as
        midnight at the start of that day.

    Returns
    -------
    float
        Year plus elapsed fraction of the year, in ``[year, year + 1)``.
    """
    if not isinstance(date, datetime):
        # plain datetime.date has no time-of-day and cannot be subtracted from a datetime
        date = datetime(date.year, date.month, date.day)
    # Year boundaries carry the same tzinfo as the input, because aware and naive
    # datetimes cannot be subtracted from each other.
    tz = date.tzinfo
    this_year = datetime(year=date.year, month=1, day=1, tzinfo=tz)
    next_year = datetime(year=1 + date.year, month=1, day=1, tzinfo=tz)
    # timedelta / timedelta yields a float ratio; leap years are handled by the
    # actual length of [this_year, next_year).
    return date.year + (date - this_year) / (next_year - this_year)

filename = '/kaggle/input/tate-x-owens-interview-10000-yt-comments/tate_candace.csv'
usecols = ['Published At', 'Comment']
# Read only the timestamp and comment columns (timestamps parsed on load) and
# put them in the order listed in usecols.
df = pd.read_csv(filepath_or_buffer=filename, usecols=usecols, parse_dates=['Published At'])[usecols]
# Add the fractional year for every row, then discard rows without comment text.
df = df.assign(year=df['Published At'].apply(year_fraction)).dropna(subset='Comment')
# Whitespace-delimited token count of each remaining comment.
df['token count'] = df['Comment'].apply(lambda text: len(text.split()))
df.head()

Unnamed: 0,Published At,Comment,year,token count
0,2023-12-26 00:48:16+00:00,I honestly think its funny that he compares hi...,2023.983653,83
1,2023-12-25 22:51:42+00:00,Wow Andrew either learnt how to act abd speak ...,2023.983432,35
2,2023-12-25 22:40:40+00:00,"Merry Christmas Candice & Andrew, 3 hours of r...",2023.983411,22
3,2023-12-25 21:44:10+00:00,Thank you both for such a realistic convesatio...,2023.983303,12
4,2023-12-25 21:40:04+00:00,Very interesting conversation.,2023.983295,3


In [3]:
# Print a summary of the frame: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10095 entries, 0 to 10095
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   Published At  10095 non-null  datetime64[ns, UTC]
 1   Comment       10095 non-null  object             
 2   year          10095 non-null  float64            
 3   token count   10095 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(1), int64(1), object(1)
memory usage: 394.3+ KB


In [4]:
from plotly.express import histogram
# Histogram of the fractional 'year' column, with a log-scaled count axis (log_y=True).
histogram(data_frame=df, x='year', log_y=True)

In [5]:
# use KeyBERT to tag using the top keyword
from arrow import now
from keybert import KeyBERT

MODEL = 'all-MiniLM-L12-v2'
keyword_model = KeyBERT(model=MODEL)

model_start = now()
# Embed the comments and their candidate keywords once. document_embeddings is
# reused later in the notebook for the 2-D projection.
document_embeddings, word_embeddings = keyword_model.extract_embeddings(docs=df['Comment'].values,)
print('got embeddings')
# Hand the precomputed embeddings to extract_keywords so the model is not run
# over the whole corpus a second time. Both calls rely on KeyBERT's default
# candidates / keyphrase_ngram_range / stop_words / min_df; these must stay
# identical between the two calls for the word embeddings to line up.
keywords = keyword_model.extract_keywords(
    docs=df['Comment'].values,
    top_n=1,
    doc_embeddings=document_embeddings,
    word_embeddings=word_embeddings,
)
print('model time: {}'.format(now() - model_start))

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

got embeddings
model time: 0:09:33.956605


In [6]:
TOP_N = 26 # we have 26 colors so let's use 26 tags
# spelling variants and plurals that should be merged into one canonical tag
RESOLVE = {'candice': 'candace', 'tates': 'tate', 'interviews': 'interview', 'conservatives': 'conservative', 'candance': 'candace'}
# each entry of keywords is a list of results for one comment; item[0][0] is the
# top keyword, and an empty list means no keyword was extracted
df['top keyword'] = [item[0][0] if len(item) else 'unknown' for item in keywords]
df['tag'] = df['top keyword'].apply(func=lambda keyword: RESOLVE.get(keyword, keyword))
# all of the keywords out of the top N get retagged as unknown;
# the top-N set is computed once here instead of once per row
top_tags = df['tag'].value_counts().index[:TOP_N]
df['tag'] = df['tag'].where(df['tag'].isin(top_tags), 'unknown')
# how much of the corpus did we tag?
df['tag'].value_counts(normalize=True)

tag
unknown         0.535315
tate            0.142744
candace         0.094007
interview       0.087469
andrew          0.041803
conversation    0.012481
islam           0.009510
podcast         0.006934
conservative    0.005944
therapy         0.005745
muslim          0.005250
owens           0.004557
christian       0.004458
trafficking     0.004458
respect         0.004359
love            0.004160
harry           0.004061
masculinity     0.003665
porn            0.003566
pimp            0.003566
covid           0.003269
women           0.003071
video           0.002873
truth           0.002377
amazing         0.002179
men             0.002179
Name: proportion, dtype: float64

In [7]:
# Proportion of comments per value of the 'top keyword' column, limited to the 30 largest.
df['top keyword'].value_counts(normalize=True,).nlargest(n=30)

top keyword
tate             0.138088
candace          0.085587
interview        0.082813
andrew           0.041803
unknown          0.020505
conversation     0.012481
islam            0.009510
podcast          0.006934
candice          0.006142
therapy          0.005745
muslim           0.005250
interviews       0.004656
tates            0.004656
owens            0.004557
christian        0.004458
trafficking      0.004458
respect          0.004359
love             0.004160
harry            0.004061
masculinity      0.003665
porn             0.003566
pimp             0.003566
conservatives    0.003368
covid            0.003269
women            0.003071
video            0.002873
conservative     0.002576
truth            0.002377
candance         0.002278
men              0.002179
Name: proportion, dtype: float64

In [8]:
from plotly.colors import qualitative
from plotly.express import scatter
from umap import UMAP

# Reduce the document embeddings to two dimensions with a fixed random_state.
umap_model = UMAP(
    n_components=2,
    random_state=2023,
    verbose=True,
    n_jobs=1,
)
df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)

# One point per comment, colored by tag; hovering shows the comment text.
figure = scatter(
    data_frame=df,
    x='u0',
    y='u1',
    hover_name='Comment',
    height=900,
    color='tag',
    color_discrete_sequence=qualitative.Alphabet,
)
figure.show()

UMAP(n_jobs=1, random_state=2023, verbose=True)
Wed Jan  3 17:31:16 2024 Construct fuzzy simplicial set
Wed Jan  3 17:31:16 2024 Finding Nearest Neighbors
Wed Jan  3 17:31:16 2024 Building RP forest with 10 trees
Wed Jan  3 17:31:22 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	 5  /  13
	 6  /  13
	Stopping threshold met -- exiting after 6 iterations
Wed Jan  3 17:31:42 2024 Finished Nearest Neighbor Search
Wed Jan  3 17:31:46 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Jan  3 17:31:55 2024 Finished embedding
