In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete.')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete.


In [2]:
import pandas as pd
from glob import glob

PATHNAME = '/kaggle/input/sentiment-analysis-datasets/*.csv'

# Sort the matches so the row order does not depend on directory listing order,
# and fail with a clear message instead of pandas' generic
# "No objects to concatenate" when the pattern matches nothing.
input_files = sorted(glob(pathname=PATHNAME))
if not input_files:
    raise FileNotFoundError('no CSV files match {}'.format(PATHNAME))

# Stack the files row-wise. Joining them column-wise (axis=1) would duplicate
# the 'Sentiment' and 'text' columns as soon as more than one file matches,
# which makes df['text'] a DataFrame and breaks the .str accessor below.
# ignore_index gives the combined frame one unique RangeIndex.
df = pd.concat(objs=[pd.read_csv(filepath_or_buffer=input_file) for input_file in input_files],
               axis=0, ignore_index=True)[['Sentiment', 'text']]
# Number of whitespace-separated tokens in each text.
df['token count'] = df['text'].str.split().str.len()
df.head()

Unnamed: 0,Sentiment,text,token count
0,Awe,Awe-inspired by the grandeur of an ancient cat...,10
1,Awe,Awe-struck by the grandeur of an ancient cathe...,10
2,Awe,"Walking the Great Wall of China, each step a t...",14
3,Euphoria,Euphoria floods in as the final puzzle piece c...,11
4,Euphoria,Euphoria floods in as the final puzzle piece f...,10


In [3]:
# (number of distinct sentiment labels, number of rows in the frame)
(df['Sentiment'].nunique(), df.shape[0])

(191, 732)

In [4]:
# Row counts of the 20 most frequent sentiment labels (value_counts sorts descending).
df['Sentiment'].value_counts().iloc[:20]

Sentiment
Positive         45
Joy              44
Excitement       37
Contentment      19
Neutral          18
Gratitude        18
Curiosity        16
Serenity         15
Happy            14
Despair          11
Nostalgia        11
Loneliness        9
Sad               9
Awe               9
Hopeful           9
Grief             9
Embarrassed       8
Confusion         8
Acceptance        8
Determination     7
Name: count, dtype: int64

In [5]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

MAX_DF = 1.0
MIN_DF = 2
MAX_SEQ_LENGTH = 64
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
DOCS = df['text'].values.tolist()

# The timer starts before the model is constructed, so both timings printed
# below include the model download/load.
model_start = now()
model = KeyBERT(model=MODEL,)
# KeyBERT itself never reads a `max_seq_length` attribute, so assigning it on
# the KeyBERT object is a silent no-op. The truncation limit belongs to the
# SentenceTransformer that KeyBERT wraps in its backend.
model.model.embedding_model.max_seq_length = MAX_SEQ_LENGTH
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF, max_df=MAX_DF, )
# Embed the documents and the vectorizer's vocabulary once; the results are
# handed to extract_keywords below and reused by later cells.
document_embeddings, word_embeddings = model.extract_embeddings(docs=DOCS, vectorizer=vectorizer, )
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))
# The vectorizer/stop_words/min_df arguments mirror the extract_embeddings
# call so the precomputed word embeddings line up with the candidate words.
keywords = model.extract_keywords(docs=DOCS, top_n=1, stop_words=STOP_WORDS, vectorizer=vectorizer,
                                  doc_embeddings=document_embeddings, word_embeddings=word_embeddings, min_df=MIN_DF, )
# 'model time' is cumulative: it is measured from model_start as well.
print('model time: {}'.format(now() - model_start))
# Each entry is a list of (word, score) pairs; keep the top word, or a
# placeholder when no candidate word was found for the document.
df['keyword'] = [keyword[0][0] if keyword else '-none-' for keyword in keywords]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:00:17.034932
we have 732 documents and 1016 words.
model time: 0:00:17.658418


In [6]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering

N_CLUSTERS = 3
# Cluster the document embeddings and record each document's cluster label.
clustering_model = AgglomerativeClustering(n_clusters=N_CLUSTERS)
clustering_model.fit(X=document_embeddings)
df['cluster'] = clustering_model.labels_.tolist()
# Count the sentiment labels whose documents land in more than one cluster.
clusters_per_sentiment = df.groupby(by='Sentiment')['cluster'].nunique()
count = int((clusters_per_sentiment > 1).sum())
print(N_CLUSTERS, count)


3 39


In [7]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,Sentiment,text,token count,keyword,cluster
0,Awe,Awe-inspired by the grandeur of an ancient cat...,10,cathedral,0
1,Awe,Awe-struck by the grandeur of an ancient cathe...,10,cathedral,0
2,Awe,"Walking the Great Wall of China, each step a t...",14,walking,0
3,Euphoria,Euphoria floods in as the final puzzle piece c...,11,euphoria,2
4,Euphoria,Euphoria floods in as the final puzzle piece f...,10,euphoria,2


In [8]:
import pandas as pd
from plotly.express import scatter
from umap import UMAP

IGNORE = {'-none-', }
MAX_HOVER_WORDS = 20


def shorten(text, max_words=MAX_HOVER_WORDS):
    """Abbreviate a text for use as a hover label.

    Returns ``text`` unchanged when it has at most ``max_words``
    whitespace-separated words; otherwise returns the first ``max_words``
    words joined by single spaces, followed by '...'.
    """
    tokens = text.split()
    if len(tokens) > max_words:
        return ' '.join(tokens[:max_words]) + '...'
    return text


umap_start = now()
df['short text'] = df['text'].apply(func=shorten)
# Project the document embeddings to 2-D; random_state is fixed for repeatable layouts.
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)

# Documents whose keyword is the placeholder are left out of the plots.
plot_df = df[~df['keyword'].isin(IGNORE)]
# Same projection three times, coloured by a different column each time.
for color_column in ('cluster', 'Sentiment', 'keyword'):
    scatter(data_frame=plot_df, x='u0', y='u1', hover_name='short text',
            hover_data=['keyword', 'Sentiment', 'cluster'],
            color=color_column, height=800,
            size='token count').show()

print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:19.397899


In [9]:
# For every sentiment label (in order of first appearance), list the clusters
# its documents were assigned to.
sentiment_column = df['Sentiment']
for sentiment in sentiment_column.unique():
    in_sentiment = sentiment_column == sentiment
    print(sentiment, df.loc[in_sentiment, 'cluster'].unique())

Awe [0 2]
Euphoria [2 0]
Exploration [0]
Gratitude [0]
Serenity [2 0]
Grandeur [0]
Admiration [0]
Connection [0]
Inspiration [2 0]
Hypnotic [0]
Amazement [0]
Journey [0]
Harmony [0]
Adventure [0]
CulinaryOdyssey [0]
Culinary Adventure [0]
Disgust [1 0]
Positive [0 2]
Indifference [2 0]
Loneliness [1]
Anxiety [2]
Confusion [1]
Numbness [1]
Regret [1]
Frustration [1 0]
Ambivalence [1]
Bitterness [1]
Melancholy [2 1 0]
Fearful [2]
Hopeful [2]
Inspired [2]
Apprehensive [1 2]
Dismissive [2 1]
Frustrated [1]
Hate [1]
Calmness [2]
Shame [1]
Empowerment [0]
Acceptance [1 0]
Empathetic [1]
Jealous [1]
Amusement [0]
Reverence [0]
Hope [0 2]
Blessed [0]
Surprise [0]
Excitement [0 2 1]
Contentment [2 0]
Joy [0 2]
Curiosity [2 0]
Bad [0 2]
Sad [1 2 0]
Neutral [0 1 2]
Arousal [0]
Determination [1 0]
Immersion [0]
Desolation [1]
Happy [0]
Adoration [0]
Free-spirited [2]
Nostalgia [0 2]
Grief [0 1]
Thrilling Journey [0]
Embarrassed [0]
Tenderness [0]
Elation [0]
Contemplation [2 0]
Appreciation [0]
Pr

In [10]:
# Explore which vocabulary words the embedding model considers related.
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between the word embeddings. Columns are labelled with
# the vectorizer's feature names; rows keep the default integer index, which
# the loop below uses as a position into `words`.
similarity = linear_kernel(X=word_embeddings)
words_df = pd.DataFrame(data=similarity, columns=vectorizer.get_feature_names_out())
# Keep only medium-strong to strong scores and drop self-similarity: every
# value outside the open interval (0.66, 0.9999) becomes NaN.
in_band = (words_df > 0.66) & (words_df < 0.9999)
words_df = words_df[in_band]
words = list(words_df.columns)

for index, row in words_df.iterrows():
    # Skip words within two positions of the current word in the vocabulary.
    related_words = [words[position] for position, is_kept in enumerate(row.notnull())
                     if is_kept and abs(position - index) > 2]
    if related_words:
        print(index, words[index], related_words)

5 accomplished ['achieved', 'completing', 'finished']
6 accomplishment ['achievement', 'achievements', 'achieving', 'success']
7 achieve ['achieving']
8 achieved ['accomplished', 'achieving']
9 achievement ['accomplishment']
10 achievements ['accomplishment']
11 achieving ['accomplishment', 'achieve', 'achieved', 'success']
12 act ['acts']
13 action ['acts']
17 acts ['act', 'action']
19 admiration ['appreciation']
20 admiring ['gazing']
27 afternoon ['evening', 'morning']
28 age ['old']
30 air ['atmosphere']
37 amused ['laughter']
38 ancient ['archaeological', 'historical']
39 anticipated ['upcoming']
41 anticipation ['excitement']
44 appreciating ['gratefulness', 'thankfulness']
45 appreciation ['admiration', 'gratefulness', 'gratitude', 'thankfulness']
47 archaeological ['ancient', 'historic', 'historical']
51 arousal ['emotion', 'emotions', 'enthusiasm', 'excitement']
52 art ['artistry', 'creativity', 'painting', 'paints']
54 artistic ['creative', 'creativity', 'painted', 'paints']
