In [1]:
!pip install --quiet KeyBERT
print('installed KeyBERT')

installed KeyBERT


In [2]:
import pandas as pd

filename = '/kaggle/input/tate-x-owens-interview-10000-yt-comments/tate_candace.csv'
usecols = ['Published At', 'Comment']
df = pd.read_csv(filepath_or_buffer=filename, usecols=usecols, parse_dates=['Published At'])
df = df[usecols]
df = df.dropna(subset='Comment')
df.head()

Unnamed: 0,Published At,Comment
0,2023-12-26 00:48:16+00:00,I honestly think its funny that he compares hi...
1,2023-12-25 22:51:42+00:00,Wow Andrew either learnt how to act abd speak ...
2,2023-12-25 22:40:40+00:00,"Merry Christmas Candice & Andrew, 3 hours of r..."
3,2023-12-25 21:44:10+00:00,Thank you both for such a realistic convesatio...
4,2023-12-25 21:40:04+00:00,Very interesting conversation.


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10095 entries, 0 to 10095
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   Published At  10095 non-null  datetime64[ns, UTC]
 1   Comment       10095 non-null  object             
dtypes: datetime64[ns, UTC](1), object(1)
memory usage: 236.6+ KB


In [4]:
from re import compile
from re import UNICODE

# https://stackoverflow.com/a/49986645
# this still has some gaps; it's about an 85% solution
PATTERN = compile(pattern = "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "]+", flags = UNICODE)

def timestamp(arg: str) -> bool:
    arg = arg.replace(',', '')
    if ':' in arg and arg.replace(':', '').isnumeric():
        return True
    return False

def clean(arg: str) -> str:
    pieces = arg.split()
    pieces = [item for item in pieces if not timestamp(item)]
    return PATTERN.sub(r'', ' '.join(pieces))

df['clean'] = df['Comment'].apply(clean).apply(str.strip)
df['token count'] = df['clean'].apply(func=lambda x: len(x.split()))

df = df.dropna(subset='clean')
# remove all the one-token comments and shuffle
df = df[df['token count'] > 1].sample(frac=1.0)
df.shape

(9652, 4)

In [5]:
from plotly.express import histogram
histogram(data_frame=df, x='token count', log_y=True)

In [6]:
# use KeyBERT to tag using the top keyword
from arrow import now
from keybert import KeyBERT

MODEL = 'all-MiniLM-L12-v2'
model = KeyBERT(model=MODEL)

model_start = now()
document_embeddings, word_embeddings = model.extract_embeddings(docs=df['Comment'].values,)
print('{} got embeddings.'.format(now().time()))
keywords = model.extract_keywords(docs=df['Comment'].values, top_n=1)
print('model time: {}'.format(now() - model_start))

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

18:46:42.396037 got embeddings.
model time: 0:08:19.349290


In [7]:
TOP_N = 26 # we have 26 colors so let's use 26 tags
# we are folding together some similar tags somewhat arbitrarily
RESOLVE = {'candice': 'candace', 'tates': 'andrew', 'interviews': 'interview', 'conservatives': 'conservative', 'candance': 'candace',
          'islamic': 'islam', 'muslim': 'islam', 'owens': 'candace', 'tate': 'andrew', 'christians': 'christian'}
df['top keyword'] = [item[0][0] if len(item) else 'unknown' for item in keywords]
df['tag'] = df['top keyword'].apply(func=lambda x: x if x not in RESOLVE.keys() else RESOLVE[x])
# all of the keywords out of the top N get retagged as unknown
df['tag'] = df['tag'].apply(func=lambda x: 'unknown' if x not in df['tag'].value_counts()[:TOP_N].index.tolist() else x)
# how much of the corpus did we tag?
df['tag'].value_counts(normalize=True)

tag
unknown         0.506009
andrew          0.192603
candace         0.102880
interview       0.091380
islam           0.016370
conversation    0.013054
podcast         0.007252
christian       0.006734
conservative    0.006216
therapy         0.006009
trafficking     0.004662
respect         0.004351
love            0.004351
harry           0.004248
masculinity     0.003833
porn            0.003730
pimp            0.003626
covid           0.003419
women           0.003212
video           0.003005
men             0.002279
romania         0.002176
prince          0.002176
tucker          0.002176
truth           0.002176
batman          0.002072
Name: proportion, dtype: float64

In [8]:
df['top keyword'].value_counts(normalize=True,).nlargest(n=TOP_N + len(RESOLVE))

top keyword
tate             0.144012
candace          0.089412
interview        0.086511
andrew           0.043722
conversation     0.013054
islam            0.009946
unknown          0.007356
podcast          0.007252
candice          0.006320
therapy          0.006009
muslim           0.005491
interviews       0.004869
tates            0.004869
owens            0.004766
christian        0.004662
trafficking      0.004662
love             0.004351
respect          0.004351
harry            0.004248
masculinity      0.003833
porn             0.003730
pimp             0.003626
conservatives    0.003523
covid            0.003419
women            0.003212
video            0.003005
conservative     0.002694
candance         0.002383
men              0.002279
romania          0.002176
prince           0.002176
tucker           0.002176
truth            0.002176
batman           0.002072
christians       0.002072
intelligent      0.002072
Name: proportion, dtype: float64

In [9]:
from plotly.colors import qualitative
from plotly.express import scatter
from umap import UMAP

umap_model = UMAP(n_components=2, random_state=2023, verbose=True, n_jobs=1)
df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
scatter(data_frame=df, x='u0', y='u1', hover_name='Comment', hover_data = ['clean', 'top keyword'],
        height=900, color='tag', color_discrete_sequence=qualitative.Alphabet).show()

UMAP(n_jobs=1, random_state=2023, verbose=True)
Wed Jan  3 18:51:29 2024 Construct fuzzy simplicial set
Wed Jan  3 18:51:29 2024 Finding Nearest Neighbors
Wed Jan  3 18:51:29 2024 Building RP forest with 10 trees
Wed Jan  3 18:51:35 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	 5  /  13
	 6  /  13
	Stopping threshold met -- exiting after 6 iterations
Wed Jan  3 18:51:53 2024 Finished Nearest Neighbor Search
Wed Jan  3 18:51:57 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan  3 18:52:17 2024 Finished embedding


In [10]:
sorted(df['tag'].unique().tolist())

['andrew',
 'batman',
 'candace',
 'christian',
 'conservative',
 'conversation',
 'covid',
 'harry',
 'interview',
 'islam',
 'love',
 'masculinity',
 'men',
 'pimp',
 'podcast',
 'porn',
 'prince',
 'respect',
 'romania',
 'therapy',
 'trafficking',
 'truth',
 'tucker',
 'unknown',
 'video',
 'women']