In [199]:
# pip install sentence-transformers

In [200]:
# pip install hdbscan

In [252]:
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import warnings
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import hdbscan
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as shc
warnings.filterwarnings("ignore")
import datetime as dt
import os
import re
from tqdm import tqdm
# import inflect
# import pycld2 as cld2
import regex

The query below pulls the search terms from the trends final report. I'm filtering by category and date. I would suggest pulling the data by category because pulling everything gives a lot of search terms. You can also play with the date and see if a different date gives better results. 

In [253]:
%%bigquery df
-- Search terms from the trends final report, restricted to English fiction rows
-- (fiction = True, nonfiction = False, title_care = False) with start_date on or
-- after 2024-06-01. The result is stored in the notebook variable `df`.
SELECT search_term, kids, ya, adult,fiction, nonfiction, religious,christian,unknown,lemmatized_search_term, pct_rank
FROM hc-data-prod-analytics.ds_prod.ds_trends_final_report
WHERE title_care=False
AND nonfiction = False
AND fiction = True
AND language = 'english'
AND start_date >= '2024-06-01';

Query is running:   0%|          |

Downloading:   0%|          |

In [254]:
df.head()

Unnamed: 0,search_term,kids,ya,adult,fiction,nonfiction,religious,christian,unknown,lemmatized_search_term,pct_rank
0,the last one left,False,False,True,True,False,False,False,False,last leave one,0.567624
1,a a dark,False,False,True,True,False,False,False,False,dark,0.652423
2,a.a. dark,False,False,True,True,False,False,False,False,a.a dark,0.685457
3,zombie apocalypse books,False,False,True,True,False,False,False,False,apocalypse zombie,0.709801
4,tortured poets department piano book,False,False,True,True,False,False,False,False,department piano poet torture,0.638179


In [255]:
from google.cloud import bigquery

# BigQuery client pinned to the US location. No project is passed, so the client
# uses its default project, which is echoed below.
client = bigquery.Client(location="US")
print(f"Client creating using default project: {client.project}")

Client creating using default project: hc-data-prod-analytics


In [256]:
# Distinct search terms, in order of first appearance in `df`.
terms = df['search_term'].unique().tolist()
terms[:5]

['the last one left',
 'a a dark',
 'a.a. dark',
 'zombie apocalypse books',
 'tortured poets department piano book']

In [257]:
len(terms)

5245

In [258]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices holding at most ``n`` items.

    ``l`` must support ``len()`` and slicing. Every slice has ``n`` items except
    possibly the last one, which holds whatever remains.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

This query is pretty slow. It pulls the products associated with the search terms returned by the previous query. It's slow because it cycles through the search terms in chunks (3000 terms per query in the cell below) rather than sending them all at once. You can try increasing the chunk size, or even try all of the terms in a single query. I cycle through them because the query crashed when I tried to send everything at once. It might not crash if you apply stricter filters to the previous query.

In [259]:
%%time
products = []
for chunk in chunks(terms,3000):
    chunk = '("'+'","'.join(chunk)+'")'   
    q="""
    SELECT clickeditemname, searchterm 
    FROM `hc-data-prod-analytics.datascience.amazon_weekly_search_term_full`
    WHERE searchterm IN {}
    """.format(chunk)
    products.append(client.query(q,location="US").to_dataframe())

CPU times: user 711 ms, sys: 246 ms, total: 957 ms
Wall time: 11 s


In [260]:
# products

In [261]:
# Stack the per-chunk query results into a single frame. This rebinds `products`
# from the list built by the query loop to a DataFrame.
products = pd.concat(products)
products.head()

Unnamed: 0,clickeditemname,searchterm
0,The Complete Isaac Asimov's Foundation Series ...,issac asimov books
1,Foundation,issac asimov books
2,The Robot Series ( 4 Book Set ),issac asimov books
3,"My Hero Academia, Vol. 1 (1)",my hero manga
4,"My Hero Academia, Vol. 30 (30)",my hero manga


In [262]:
# Collapse to one row per search term, holding the list of every clicked item
# name recorded for that term; the search term becomes the index.
clicked_by_term = products.groupby('searchterm')['clickeditemname'].apply(list)
products = clicked_by_term.to_frame()

In [263]:
# Attach the clicked-item list to every row of `df`. `products` is keyed by search
# term in its index, and the left join keeps all rows of `df`.
df = df.merge(products, how='left', left_on='search_term', right_on=products.index)
df.head()

Unnamed: 0,search_term,kids,ya,adult,fiction,nonfiction,religious,christian,unknown,lemmatized_search_term,pct_rank,clickeditemname
0,the last one left,False,False,True,True,False,False,False,False,last leave one,0.567624,"[The Only One Left: A Novel, The Only One Left..."
1,a a dark,False,False,True,True,False,False,False,False,dark,0.652423,"[24690 (24690 series Book 1), 24690, 24690: 24..."
2,a.a. dark,False,False,True,True,False,False,False,False,a.a dark,0.685457,"[24690 (24690 series Book 1), 24690, 24690: 24..."
3,zombie apocalypse books,False,False,True,True,False,False,False,False,apocalypse zombie,0.709801,[The Light We Lost : A Post-Apocalyptic Surviv...
4,tortured poets department piano book,False,False,True,True,False,False,False,False,department piano poet torture,0.638179,[Taylor Swift - The Tortured Poets Department:...


In [264]:
df.shape

(7215, 12)

In [265]:
# df.drop_duplicates(inplace=True)

In [266]:
# reset_index() keeps the previous row labels in a new 'index' column; the rows
# are then ordered alphabetically by search term.
df = df.reset_index()
df = df.sort_values(by='search_term')
df.head()

Unnamed: 0,index,search_term,kids,ya,adult,fiction,nonfiction,religious,christian,unknown,lemmatized_search_term,pct_rank,clickeditemname
6162,6162,#1 best seller,False,False,True,True,False,False,False,False,1 seller well,0.017437,[BAGSMART Toiletry Bag Travel Bag with Hanging...
2953,2953,0.00,False,False,True,True,False,False,False,False,0.00,0.689108,[The American Sign Language Alphabet – A Proje...
3084,3084,0.00 free kindle books,False,False,True,True,False,False,False,False,0.00 free,0.907845,"[The House of Closed Doors, Find Free Kindle B..."
2960,2960,0.00 free kindle books romance,False,False,True,True,False,False,False,False,0.00 free romance,0.477479,[Overruled by Love: A Small Town Romance (Boys...
6092,6092,0.99,False,False,True,True,False,False,False,False,0.99,0.514151,[The Complete Father Brown Mysteries ($.99 Mys...


In [267]:
df.isna().sum()

index                     0
search_term               0
kids                      0
ya                        0
adult                     0
fiction                   0
nonfiction                0
religious                 0
christian                 0
unknown                   0
lemmatized_search_term    0
pct_rank                  0
clickeditemname           0
dtype: int64

In [268]:
# Discard any row that has a missing value in any column.
df = df.dropna()

In [269]:
df.isna().sum()

index                     0
search_term               0
kids                      0
ya                        0
adult                     0
fiction                   0
nonfiction                0
religious                 0
christian                 0
unknown                   0
lemmatized_search_term    0
pct_rank                  0
clickeditemname           0
dtype: int64

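I tried both the Universal Sentence Encoder and SBERT for the embeddings. I'm not really sure whether one is better than the other; the SBERT model is the one used below, and the Universal Sentence Encoder version is left commented out.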

In [270]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode each distinct search term once; the rows of `embed_df` are labelled
# with the term they embed.
unique_terms = df['search_term'].unique()
embeddings = model.encode(unique_terms.tolist())
embed_df = pd.DataFrame(embeddings, index=unique_terms)

In [271]:
# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# embeddings = embed(newdf['search_term'].unique().tolist())
# len(embeddings), type(embeddings), print(embeddings.shape)

In [272]:
# embed_df = pd.DataFrame(embeddings.numpy())
# embed_df.index = newdf['search_term'].unique()

In [273]:
embed_df.shape

(5245, 384)

transforming the clicked items into a binary matrix

In [274]:
from sklearn.preprocessing import MultiLabelBinarizer
# Binary indicator matrix of clicked items: one row per row of `df`, one column per
# distinct clicked item name. sparse_output=True returns a SciPy sparse matrix.
mlb = MultiLabelBinarizer(sparse_output=True)
mlb_array = mlb.fit_transform(df['clickeditemname']) 

In [275]:
# Wrap the sparse click matrix in a DataFrame, label each row with its search
# term, and keep only the first row seen for every term.
mlb_array = pd.DataFrame.sparse.from_spmatrix(mlb_array)
mlb_array['search_term'] = df['search_term'].values
mlb_array = (
    mlb_array
    .drop_duplicates(subset='search_term')
    .set_index('search_term')
    .rename_axis(None)  # leave the index unnamed
)
print(mlb_array.shape)

(5245, 84285)


In [276]:
mlb_array.shape, embed_df.shape

((5245, 84285), (5245, 384))

merging search term embeddings and clicked items

In [277]:
# Join the embedding columns and the click-indicator columns on the search term.
# Both keys are passed as arrays, so pandas emits the join key as a 'key_0' column;
# it becomes the row label and is then removed from the feature columns.
sim_df = embed_df.merge(mlb_array, left_on=embed_df.index, right_on=mlb_array.index)
sim_df.index = sim_df['key_0'].values
sim_df = sim_df.drop(columns='key_0')

In [278]:
# %%time
# sim = cosine_distances(sim_df)
# len(sim[0])

In [279]:
# semantic_sim = cosine_similarity(embed_df)
# clickeditems_sim = cosine_similarity(mlb_array)
# combined_sim = (semantic_sim * .8 + clickeditems_sim *.3)  # NOTE: these weights sum to 1.1, not 1
# combined_sim_df = pd.DataFrame(combined_sim, index = embed_df.index, columns = embed_df.index)

In [280]:
%%time
sim2 = cosine_similarity(sim_df)

CPU times: user 2min 34s, sys: 26.9 s, total: 3min 1s
Wall time: 12.3 s


In [281]:
# Converting the similarity matrix to a dataframe.
# `sim2` was computed from `sim_df`, so its rows/columns follow the row order of
# `sim_df`. Label it with `sim_df.index` (not `embed_df.index`) so the labels stay
# correct even if the merge that built `sim_df` reordered or dropped terms.
sims_df = pd.DataFrame(sim2, index=sim_df.index, columns=sim_df.index)
sims_df.index.name = None  # so reset_index() names the new column 'index'
new_sims_df = sims_df.reset_index().rename(columns={'index': 'Search Term'})

# Long format: one row per (Search Term, Compared Term) pair with its score.
melt = new_sims_df.melt(id_vars=["Search Term"], var_name="Compared Term", value_name="Similarity Score")

looking at the cosine similarity by grouping by search term and seeing what the top hits are

Setting a single threshold on the similarity score didn't give the best results, since it doesn't account for how much the scores vary from one search term to another.
So I tried setting the threshold per search term based on percentiles instead, and have been testing different ones out.

In [282]:
# threshold = .20
# melting = new_sims_df.melt(id_vars=["Search Term"], var_name="Compared Term", value_name="Similarity Score")
# peripheral = melting[melting["Similarity Score"] >= threshold]
# grouped_peripheral = peripheral.groupby("Search Term")

# setting a threshold based on quantiles/percentiles
def dynamic_threshold(df, quantile=0.99):
    """Return each search term's similarity-score cutoff at ``quantile``.

    Parameters
    ----------
    df : DataFrame with 'Search Term' and 'Similarity Score' columns.
    quantile : float, quantile of the scores to compute within each search term.

    Returns
    -------
    Series indexed by 'Search Term' and named 'Similarity Score', holding the
    requested quantile of that term's scores.
    """
    scores_by_term = df.groupby('Search Term')['Similarity Score']
    return scores_by_term.quantile(quantile)

thresholds = dynamic_threshold(melt, quantile=0.99) # testing different percentiles

# Look up each row's cutoff through its 'Search Term' and keep the pairs whose
# score reaches that term's own cutoff.
row_cutoff = melt['Search Term'].map(thresholds)
filtered_scores = melt[melt['Similarity Score'] >= row_cutoff]


grouped_terms = filtered_scores.groupby("Search Term")

keyword = "enemies to lovers romance"  # testing different search terms

if keyword not in grouped_terms.groups:
    compared_df = pd.DataFrame(columns=["Search Term", "Compared Term", "Similarity Score"])
    print(f"No peripheral words found for {keyword}")
else:
    # Closest matches first
    compared_df = (
        grouped_terms.get_group(keyword)
        .reset_index(drop=True)
        .sort_values(by="Similarity Score", ascending=False)
    )

compared_df.head(20)

Unnamed: 0,Search Term,Compared Term,Similarity Score
15,enemies to lovers romance,enemies to lovers romance,1.0
16,enemies to lovers romance,enemies to lovers romance books,0.489985
18,enemies to lovers romance,enemy to lovers romance books,0.447245
17,enemies to lovers romance,enemies to lovers romance fantasy,0.432283
27,enemies to lovers romance,mm enemies to lovers romance,0.421334
13,enemies to lovers romance,enemies to lovers dark romance,0.394696
51,enemies to lovers romance,why choose enemies to lovers,0.390998
12,enemies to lovers romance,enemies to lovers books spicy,0.35269
4,enemies to lovers romance,angsty romance betrayal,0.322899
14,enemies to lovers romance,enemies to lovers free books,0.321018


Here I'm testing the threshold by taking the trends table below, left-joining it onto the thresholded cosine-similarity pairs, and grouping by trend.
This way you can see the search terms that are already mapped to the trend, along with the unmapped search terms that fall into the trend based on the threshold set.

In [283]:
%%bigquery df_trends
-- Existing search term -> trend mapping, stored in the notebook variable `df_trends`.
-- The outputs further down show the same search term listed under several trends.
SELECT * FROM `hc-data-prod-analytics.oss.bi_dashboard_trends_living_searchterms`

Query is running:   0%|          |

Downloading:   0%|          |

In [284]:
df_trends.head()

Unnamed: 0,searchterm,trend
0,tiktok cookbook,Booktok
1,book tok,Booktok
2,booktok books 2024,Booktok
3,tiktok cookbook 2023,Booktok
4,booktok journal,Booktok


In [297]:
# `dynamic_threshold` is already defined in the keyword-exploration cell earlier in
# the notebook; it is reused here instead of being declared a second time (a second
# `def` would silently shadow the first one).
thresholds = dynamic_threshold(melt, quantile=0.99)

# Keep only the pairs whose score reaches the cutoff of their own 'Search Term'.
filtered = melt.join(thresholds, on='Search Term', rsuffix='_threshold')
filtered = filtered[filtered['Similarity Score'] >= filtered['Similarity Score_threshold']].drop('Similarity Score_threshold', axis=1)
# merging table with the cosine sim matrix
# (left join: a search term listed under several trends yields one row per trend)
filtered_with_trends = filtered.merge(df_trends, left_on='Search Term', right_on='searchterm', how='left')
filtered_with_trends['trend'] = filtered_with_trends['trend'].fillna('Unmapped') # setting 'Unmapped' in the trend column for all the search terms not already in the trends table 
filtered_with_trends.head() # Search Term is the cosine sim matrix and searchterm is from the trends table

Unnamed: 0,Search Term,Compared Term,Similarity Score,searchterm,trend
0,#1 best seller,#1 best seller,1.0,,Unmapped
1,best sellers nyt,#1 best seller,0.466641,,Unmapped
2,booker prize,#1 best seller,0.242559,,Unmapped
3,chick lit best sellers,#1 best seller,0.426865,,Unmapped
4,kindle top 100 best sellers,#1 best seller,0.464779,,Unmapped


In [298]:
# indicates whether the search term was originally unmapped
# (vectorised comparison instead of a per-element lambda)
filtered_with_trends['Unmapped/Mapped'] = np.where(
    filtered_with_trends['trend'] == 'Unmapped', 'Unmapped', 'Mapped'
)

# Look up the trend(s) of the compared term as well. A compared term listed under
# several trends yields one row per trend.
compared_trends = df_trends.rename(columns={'searchterm': 'Compared Term', 'trend': 'Compared Trend'})
filtered_with_trends = filtered_with_trends.merge(compared_trends, on='Compared Term', how='left', suffixes=('', '_compared'))
filtered_with_trends['Compared Trend'] = filtered_with_trends['Compared Trend'].fillna('Unmapped')

# indicates whether the compared term was originally unmapped
filtered_with_trends['Compared Term Unmapped/Mapped'] = np.where(
    filtered_with_trends['Compared Trend'] == 'Unmapped', 'Unmapped', 'Mapped'
)

# Final Trend: the search term's own trend when it has one, otherwise the trend
# of the compared term. Series.where keeps the value where the condition holds
# and takes the replacement elsewhere, avoiding a slow row-wise apply(axis=1).
search_term_is_mapped = filtered_with_trends['trend'] != 'Unmapped'
filtered_with_trends['Final Trend'] = filtered_with_trends['trend'].where(
    search_term_is_mapped, filtered_with_trends['Compared Trend']
)
grouped_peripheral = filtered_with_trends.groupby("Final Trend")

In [299]:
# Within every Final Trend, order the pairs by descending similarity, then stack
# the groups back into one frame with a fresh 0..n-1 index.
results = [
    group.sort_values(by="Similarity Score", ascending=False)
    for _, group in grouped_peripheral
]
final_df = pd.concat(results).reset_index(drop=True)


trend = "Enemies to Lovers Romance"  # testing different trends

# Boolean selection returns an empty frame with the full column set when the
# trend is absent, so later cells can still reference every column. The copy
# lets later cells modify `peripheral_df` without touching `final_df`.
peripheral_df = final_df[final_df['Final Trend'] == trend].copy()
if peripheral_df.empty:
    # This message previously referenced an undefined `example_trend` (NameError).
    print(f"No peripheral words found for {trend}")

print(peripheral_df.shape)
peripheral_df.head()

(900, 9)


Unnamed: 0,Search Term,Compared Term,Similarity Score,searchterm,trend,Unmapped/Mapped,Compared Trend,Compared Term Unmapped/Mapped,Final Trend
100001,enemies to lovers romance,enemies to lovers romance,1.0,enemies to lovers romance,Enemies to Lovers Romance,Mapped,Romcom Books,Mapped,Enemies to Lovers Romance
100002,enemies to lovers romance,enemies to lovers romance,1.0,enemies to lovers romance,Enemies to Lovers Romance,Mapped,Romantic Comedy,Mapped,Enemies to Lovers Romance
100003,enemies to lovers romance,enemies to lovers romance,1.0,enemies to lovers romance,Enemies to Lovers Romance,Mapped,All Romance,Mapped,Enemies to Lovers Romance
100004,enemies to lovers romance,enemies to lovers romance,1.0,enemies to lovers romance,Enemies to Lovers Romance,Mapped,Enemies to Lovers Romance,Mapped,Enemies to Lovers Romance
100005,enemies to lovers romance books,enemies to lovers romance books,1.0,enemies to lovers romance books,Enemies to Lovers Romance,Mapped,Romcom Books,Mapped,Enemies to Lovers Romance


In [300]:
# Scores are rounded to 4 decimals before the `< 1` filter, so any pair whose
# score rounds to 1.0 (e.g. a term compared with itself) is excluded from df2.
peripheral_df = peripheral_df.assign(**{'Similarity Score': peripheral_df['Similarity Score'].round(4)})
df2 = peripheral_df.loc[peripheral_df['Similarity Score'] < 1]
df2.head()

Unnamed: 0,Search Term,Compared Term,Similarity Score,searchterm,trend,Unmapped/Mapped,Compared Trend,Compared Term Unmapped/Mapped,Final Trend
100026,enemies to lovers romance fantasy,mm enemies to lovers romance,0.706,enemies to lovers romance fantasy,Enemies to Lovers Romance,Mapped,Romcom Books,Mapped,Enemies to Lovers Romance
100027,enemies to lovers romance fantasy,mm enemies to lovers romance,0.706,enemies to lovers romance fantasy,Enemies to Lovers Romance,Mapped,All Romance,Mapped,Enemies to Lovers Romance
100028,enemies to lovers romance books,enemy to lovers romance books,0.6504,enemies to lovers romance books,Enemies to Lovers Romance,Mapped,Romcom Books,Mapped,Enemies to Lovers Romance
100029,enemies to lovers romance books,enemy to lovers romance books,0.6504,enemies to lovers romance books,Enemies to Lovers Romance,Mapped,All Romance,Mapped,Enemies to Lovers Romance
100030,enemies to lovers romance fantasy,enemy to lovers romance books,0.6407,enemies to lovers romance fantasy,Enemies to Lovers Romance,Mapped,Romcom Books,Mapped,Enemies to Lovers Romance


In [301]:
# peripheral_df.to_csv('98th Percentile Sports Romance.csv')

looking at the compared terms not mapped to the trend we're looking at

In [302]:
# Compared terms that are themselves already mapped to the trend under inspection.
# Uses `trend` (set in the cell that builds `peripheral_df`) instead of repeating the
# trend name, so this filter cannot drift when a different trend is tested.
mapped_to_trend = peripheral_df.loc[peripheral_df['Compared Trend'] == trend, 'Compared Term'].unique()

In [303]:
# keep the compared terms that are NOT mapped to the trend being inspected but
# ARE mapped to some other trend (fully unmapped terms are handled separately)
outside_trend = ~df2['Compared Term'].isin(mapped_to_trend)
mapped_elsewhere = df2['Compared Trend'].ne('Unmapped')
df3 = df2.loc[outside_trend & mapped_elsewhere]
print(df3.shape)
print(df3['Compared Term'].nunique())
# df3.to_csv('98th Percentile Sports Romance - Compared Terms Not Mapped to Original Trend.csv')

(633, 9)
128


In [304]:
df3.head()

Unnamed: 0,Search Term,Compared Term,Similarity Score,searchterm,trend,Unmapped/Mapped,Compared Trend,Compared Term Unmapped/Mapped,Final Trend
100026,enemies to lovers romance fantasy,mm enemies to lovers romance,0.706,enemies to lovers romance fantasy,Enemies to Lovers Romance,Mapped,Romcom Books,Mapped,Enemies to Lovers Romance
100027,enemies to lovers romance fantasy,mm enemies to lovers romance,0.706,enemies to lovers romance fantasy,Enemies to Lovers Romance,Mapped,All Romance,Mapped,Enemies to Lovers Romance
100028,enemies to lovers romance books,enemy to lovers romance books,0.6504,enemies to lovers romance books,Enemies to Lovers Romance,Mapped,Romcom Books,Mapped,Enemies to Lovers Romance
100029,enemies to lovers romance books,enemy to lovers romance books,0.6504,enemies to lovers romance books,Enemies to Lovers Romance,Mapped,All Romance,Mapped,Enemies to Lovers Romance
100030,enemies to lovers romance fantasy,enemy to lovers romance books,0.6407,enemies to lovers romance fantasy,Enemies to Lovers Romance,Mapped,Romcom Books,Mapped,Enemies to Lovers Romance


looking at the already mapped search terms compared to the unmapped search terms that fall within the threshold

In [309]:
# Pairs whose compared term is not mapped to any trend yet.
compared_is_unmapped = peripheral_df['Compared Term Unmapped/Mapped'].eq('Unmapped')
df4 = peripheral_df.loc[compared_is_unmapped]
# df4.to_csv('98th Percentile Sports Romance - Compared Terms Not Mapped to Any Trend.csv')
df4

Unnamed: 0,Search Term,Compared Term,Similarity Score,searchterm,trend,Unmapped/Mapped,Compared Trend,Compared Term Unmapped/Mapped,Final Trend
100041,enemies to lovers romance fantasy,why choose enemies to lovers,0.5744,enemies to lovers romance fantasy,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance
100050,enemies to lovers free books,friends to lovers books,0.564,enemies to lovers free books,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance
100086,enemies to lovers free books,little library of banned books,0.5491,enemies to lovers free books,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance
100113,enemies to lovers free books,free novels to read,0.5368,enemies to lovers free books,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance
100115,enemies to lovers romance fantasy,romantic thrillers,0.534,enemies to lovers romance fantasy,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance
100138,enemies to lovers free books,free books to buy,0.5219,enemies to lovers free books,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance
100147,enemies to lovers free books,full free books,0.5189,enemies to lovers free books,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance
100150,enemies to lovers free books,lulu dean's library of banned books,0.5164,enemies to lovers free books,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance
100152,enemies to lovers free books,buy for free books,0.5161,enemies to lovers free books,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance
100161,enemies to lovers free books,folk horror books,0.5152,enemies to lovers free books,Enemies to Lovers Romance,Mapped,Unmapped,Unmapped,Enemies to Lovers Romance


In [306]:
df4['Search Term'].unique() #these are the search terms that are already mapped to the trend

array(['enemies to lovers romance fantasy',
       'enemies to lovers free books', 'enemies to lovers books spicy',
       'enemies to lovers dark romance',
       'enemies to lovers romance books', 'enemies to lovers romance'],
      dtype=object)

In [307]:
print(df4['Compared Term'].nunique())
df4['Compared Term'].unique() # unique unmapped terms

26


array(['why choose enemies to lovers', 'friends to lovers books',
       'little library of banned books', 'free novels to read',
       'romantic thrillers', 'free books to buy', 'full free books',
       "lulu dean's library of banned books", 'buy for free books',
       'folk horror books', 'thriller books on sale',
       'diary free kindle books',
       'clean and wholesome romantic suspense ebooks',
       'the strangers book', 'free christian mystery books',
       'free sexy romance', 'disturbing books', 'books used cheap',
       'witches secret love',
       'free kindle books mystery and suspense detective series',
       'best romantic suspense books', 'lesbian mystery and thriller',
       'spicy books for women booktok', 'spicy book',
       'sexy romance books best sellers 2024', 'flirting enemy'],
      dtype=object)

In [308]:
df_trends.trend.unique()

array(['Booktok', 'Fantasy', 'Box Sets', 'Cookbook', 'July 4th',
       'Literary', 'Stoicism', 'Astrology', 'Book Club', 'Earth Day',
       'Romantasy', 'AAPI Month', 'Cozy Books', 'Gift Books',
       'Juneteenth', 'All Romance', 'Beach Books', 'Board Books',
       'Craft Books', 'Deluxe Book', 'Pride Month', 'Winter Read',
       'World War 2', 'Young Adult', 'Baby Romance', 'Banned Books',
       'Bedtime Book', 'Black Friday', 'Budget Books', 'Cozy Fantasy',
       'Cozy Mystery', 'Cute Animals', 'Dark Romance', 'Easter Books',
       "Father's Day", 'Horror Books', 'Memorial Day', "Mother's Day",
       'Nerd Romance', 'Poetry Month', 'Romcom Books', 'Stress Books',
       'Veterans Day', 'Witchy Books', 'Alpha Romance', 'Amish Romance',
       'Angst Romance', 'Anxiety Books', 'Bully Romance', 'Canning Books',
       'Clean Romance', 'Cocktail Book', 'Crime Romance', 'Divorce Books',
       'Drawing Books', 'Easy Cookbook', 'Graphic Novel', 'Grilling Book',
       'Harem Roman