In [1]:
import pandas as pd
ZEPHYR = '/kaggle/input/q-and-a-with-zephyr-7b/zephyr.json'
df = pd.read_json(lines=True, path_or_buf=ZEPHYR)
df['prompt token count'] = df['prompt'].str.split().str.len()
df['response token count'] = df['response'].str.split().str.len()
df.head()

Unnamed: 0,prompt,response,prompt token count,response token count
0,Delve into the intricate ways Histochemistry i...,"Histochemistry, the study of the chemical comp...",10,546
1,How does Chemical engineering intersect with t...,Chemical engineering plays a significant role ...,11,226
2,What are the implications of Outline of paraps...,"The ""Outline of parapsychology"" is a scientifi...",11,276
3,How does Phytopathology influence the developm...,"Phytopathology, the scientific study of plant ...",9,107
4,Delve into the detailed ethical dilemmas posed...,The insurance industry has witnessed significa...,11,477


In [2]:
df.shape

(179759, 4)

In [3]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete.')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete.


In [4]:
df.nunique()

prompt                  168394
response                179759
prompt token count          22
response token count      1198
dtype: int64

In [5]:
prompt_df = df[['prompt']].drop_duplicates(ignore_index=True).copy().sample(n=20000, random_state=2024)

In [6]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

MAX_DF = 1.0
MIN_DF = 10 # we have a lot of documents so we can contract our token space somewhat without fear
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
# we use the clean text for keywords even though we show a truncated original message
DOCS = prompt_df['prompt'].values.tolist()

model_start = now()
model = KeyBERT(model=MODEL,)
# we will capture almost all of the content with the default max sequence length of 128
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF, max_df=MAX_DF, )
document_embeddings, word_embeddings = model.extract_embeddings(docs=DOCS, vectorizer=vectorizer, )
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))
keywords = model.extract_keywords(docs=DOCS, top_n=1, stop_words=STOP_WORDS, vectorizer=vectorizer,
                                  doc_embeddings=document_embeddings, word_embeddings=word_embeddings, min_df=MIN_DF, )
print('model time: {}'.format(now() - model_start))
prompt_df['keyword'] = [keyword[0][0] if len(keyword) else '-none-' for keyword in keywords]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:02:06.832614
we have 20000 documents and 1333 words.
model time: 0:02:16.788852


In [7]:
prompt_df['keyword'].value_counts(normalize=True).head(n=20)

keyword
sociology         0.02585
archaeology       0.01870
psychology        0.01360
anthropology      0.01340
ethical           0.01290
cultural          0.01270
ethics            0.01230
economics         0.01190
history           0.01090
chemistry         0.00970
quantum           0.00935
climate           0.00885
engineering       0.00885
geography         0.00860
economic          0.00770
astronomy         0.00720
governance        0.00645
technology        0.00615
environmental     0.00595
sustainability    0.00595
Name: proportion, dtype: float64

In [8]:
from plotly.express import histogram
histogram(data_frame=prompt_df[prompt_df['keyword'] != '-none-']['keyword'].value_counts().to_frame().reset_index().head(n=40), x='keyword', y='count', marginal='box')

In [9]:
import pandas as pd
from umap import UMAP

IGNORE = {'-none-', }

umap_start = now()
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
prompt_df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:31.452819


In [10]:
prompt_df.head()

Unnamed: 0,prompt,keyword,u0,u1
34551,What are the implications of Digital Marketing...,marketing,8.747697,9.8256
54975,How does Jewish studies interact with the envi...,environment,9.287416,11.309435
100544,What are some common myths and truths in Socio...,religion,10.103828,11.93854
144655,What are the key skills needed to excel in Anc...,egypt,11.451433,13.415709
47362,Detail the impact of Health informatics on sha...,informatics,7.49374,11.22527


In [11]:
from plotly.express import scatter
top_keywords = prompt_df[prompt_df['keyword'] != '-none-']['keyword'].value_counts().head(n=12).index.tolist()
top_df = prompt_df[prompt_df['keyword'].isin(top_keywords)]
scatter(data_frame=top_df, x='u0', y='u1', hover_name='prompt', color='keyword' ).update_xaxes(showticklabels=False).update_yaxes(showticklabels=False).update_xaxes(visible=False).update_yaxes(visible=False)

Let's take our top prompts with top keywords and go back to our original data to get the responses.

In [12]:
top_response_df = top_df.merge(right=df, on='prompt', how='inner').drop(columns=['prompt token count', 'response token count'])
top_response_df.head()

Unnamed: 0,prompt,keyword,u0,u1,response
0,How does Sociology of the family shape the fut...,sociology,7.003085,12.610957,The Sociology of the Family does not directly ...
1,Explain the role of Faculty (division) in the ...,ethical,7.651914,11.657876,Faculty (division) plays a crucial role in sha...
2,Investigate the role of Biblical archaeology i...,archaeology,11.706668,12.929948,The role of Biblical archaeology in the future...
3,How does Dendrochronology influence the evolut...,cultural,9.645062,11.472548,"Dendrochronology, which is the scientific stud..."
4,Explore the detailed ethical challenges and de...,ethical,11.781575,9.587606,Linear programming (LP) is a mathematical opti...


And use the model above to get the response keywords.

In [13]:
time_start = now()
RESPONSES = top_response_df['response'].values.tolist()
response_document_embeddings, response_word_embeddings = model.extract_embeddings(docs=RESPONSES, vectorizer=vectorizer, )

response_keyword = model.extract_keywords(docs=RESPONSES, 
                                                              top_n=1, stop_words=STOP_WORDS, vectorizer=vectorizer, 
                                                              doc_embeddings=response_document_embeddings, 
                                                              word_embeddings=response_word_embeddings, min_df=MIN_DF, )
top_response_df['response keyword'] = [response_keyword[0][0] if len(keyword) else '-none-' for keyword in response_keyword]
print('got response keywords in {}'.format(now() - time_start))

got response keywords in 0:02:53.747509


How often do we get the same keywords for both prompt and response?

In [14]:
top_response_df.head()

Unnamed: 0,prompt,keyword,u0,u1,response,response keyword
0,How does Sociology of the family shape the fut...,sociology,7.003085,12.610957,The Sociology of the Family does not directly ...,"(sociology, 0.5444)"
1,Explain the role of Faculty (division) in the ...,ethical,7.651914,11.657876,Faculty (division) plays a crucial role in sha...,"(sociology, 0.5444)"
2,Investigate the role of Biblical archaeology i...,archaeology,11.706668,12.929948,The role of Biblical archaeology in the future...,"(sociology, 0.5444)"
3,How does Dendrochronology influence the evolut...,cultural,9.645062,11.472548,"Dendrochronology, which is the scientific stud...","(sociology, 0.5444)"
4,Explore the detailed ethical challenges and de...,ethical,11.781575,9.587606,Linear programming (LP) is a mathematical opti...,"(sociology, 0.5444)"
