In [None]:
import pandas as pd

# Read the trends CSV, keep only rows whose 'location' is one of the selected
# regions, and reduce the result to its distinct 'category' values.
df = (
    pd.read_csv('data/trends.csv')
    .loc[
        lambda frame: frame['location'].isin(
            [
                'United Kingdom',
                'Scotland',
                'Australia',
                'Ireland',
                'New Zealand',
                'Wales',
            ]
        ),
        ['category'],
    ]
    .drop_duplicates()
)

# Only one column remains, so .size equals the number of distinct categories.
df.size

In [None]:
# Build the shuffled list of unique category names to embed, taken from the
# first column of the first 100000 rows of children_cats.csv.
df_wiki = (
    # nrows stops parsing after 100000 rows instead of loading the whole file
    # and slicing it afterwards with head().
    pd.read_csv('data/children_cats.csv', nrows=100000)
    .iloc[:, 0]
    # Missing values cannot be tokenized later, so drop them up front.
    .dropna()
    .astype(str)
    # Literal (non-regex) replacement of underscores with spaces.
    .str.replace('_', ' ', regex=False)
    # De-duplicate AFTER normalising, so that "a_b" and "a b" collapse into a
    # single entry instead of surviving as two identical names.
    .drop_duplicates()
    .rename('category')
    .to_frame()
    # Fixed seed: the shuffled order is reproducible across kernel restarts.
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Only one column remains, so .size equals the number of unique categories.
df_wiki.size

In [None]:
from transformers import BertTokenizer, BertModel
import torch  
from tqdm import tqdm  

import numpy as np

# Compute one mean-pooled BERT embedding per row of df_wiki['category'] and
# store it in a new 'embedding' column (one 1-D NumPy array per row).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # inference only; make the no-dropout mode explicit

batch_size = 1024
category_texts = df_wiki['category'].tolist()
# Per-batch results are collected here and concatenated once at the end;
# calling np.append inside the loop recopies the whole array on every batch.
batch_embeddings = []

for start in tqdm(range(0, len(category_texts), batch_size), desc="Processing batches"):
    batch_texts = category_texts[start:start + batch_size]

    tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    tokens = {key: val.to(device) for key, val in tokens.items()}

    with torch.no_grad():
        outputs = model(**tokens)

    last_hidden_states = outputs.last_hidden_state

    # Average over real tokens only. padding=True pads every text to the
    # longest one in the batch; a plain mean over dim=1 would mix the [PAD]
    # positions in and make an embedding depend on its batch neighbours.
    token_mask = tokens['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
    summed_states = (last_hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    mean_embeddings = summed_states / token_counts

    batch_embeddings.append(mean_embeddings.cpu().numpy())

if batch_embeddings:
    # Cast to float64 to keep the dtype the accumulated array had previously.
    mean_embeddings_np = np.concatenate(batch_embeddings, axis=0).astype(np.float64)
else:
    # Empty input: keep a well-formed (0, hidden_size) array.
    mean_embeddings_np = np.empty((0, model.config.hidden_size))

df_wiki['embedding'] = list(mean_embeddings_np)

In [None]:
# Display the trends frame as the cell output for a quick visual check.
df

In [None]:
# Write the wiki categories and their embeddings to disk without the row index.
# NOTE(review): the 'embedding' cells hold NumPy arrays, which CSV stores as
# their text form — confirm that whatever reads this file parses them back.
df_wiki.to_csv(path_or_buf='data/wiki_categories.csv_100k', index=False)

In [None]:
# Write the trends frame to disk without the row index.
df.to_csv(path_or_buf='data/google_trends.csv', index=False)