In [65]:
import pandas as pd

# Read the trends CSV, keep only rows whose `location` is one of the listed
# English-speaking locations, and reduce the result to the distinct values
# of the `category` column (original row index is preserved).
df = (
    pd.read_csv('data/trends.csv')
    .loc[
        lambda trends: trends['location'].isin(
            ['United Kingdom', 'Scotland', 'Australia', 'Ireland', 'New Zealand', 'Wales']
        ),
        ['category'],
    ]
    .drop_duplicates()
)

# `size` is rows x columns; with a single column it equals the number of
# distinct categories.
df.size

147

In [42]:
df_wiki = pd.read_csv('data/children_cats.csv')

# Whatever the first CSV column is called, expose it under the name `category`.
df_wiki['category'] = df_wiki.iloc[:, 0]

# Replace underscores with spaces. regex=False treats '_' as a literal
# character, so the result does not depend on the pandas-version default.
df_wiki['category'] = df_wiki['category'].str.replace('_', ' ', regex=False)

# Deduplicate AFTER normalising, so names that differ only by '_' vs ' '
# collapse into a single category.
df_wiki = df_wiki[['category']].drop_duplicates()

# shuffle -- a fixed random_state keeps the row order (and anything written
# out from this frame) identical across Restart & Run All.
df_wiki = df_wiki.sample(frac=1, random_state=42).reset_index(drop=True)

# count the number of categories (single column, so size == number of rows)
df_wiki.size


885620

In [66]:
# TODO: Too long to run, also do we need it?
# NOTE(review): the saved progress output for this cell shows a single batch,
# which does not match the row count recorded for df_wiki -- the cell may last
# have been executed against a different frame; confirm before relying on it.
import numpy as np
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

batch_size = 1024  # Number of category strings encoded per forward pass

# Preallocate the full (rows, hidden_size) result once. Growing an array with
# np.append inside the loop recopies everything accumulated so far on every
# batch, which is quadratic in the number of rows. The default float64 dtype
# matches what the previous np.empty((0, 768)) accumulator produced.
mean_embeddings_np = np.empty((len(df_wiki), model.config.hidden_size))

# Process the data in batches
for i in tqdm(range(0, len(df_wiki), batch_size), desc="Processing batches"):
    batch_df = df_wiki.iloc[i:i + batch_size]

    # Tokenize and encode the texts; padding=True pads every text to the
    # longest one in this batch.
    tokens = tokenizer(batch_df['category'].tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**tokens)

    # Output embeddings from the last layer: (batch, seq_len, hidden_size)
    last_hidden_states = outputs.last_hidden_state

    # Pooling strategy: mean over REAL tokens only. A plain mean over dim=1
    # also averages the padding positions, which makes a text's embedding
    # depend on how long the other texts in its batch happen to be.
    attention_mask = tokens['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
    token_counts = attention_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    mean_embeddings = (last_hidden_states * attention_mask).sum(dim=1) / token_counts

    # Write this batch's rows into their slot of the preallocated array
    mean_embeddings_np[i:i + len(batch_df)] = mean_embeddings.numpy()

# Assign the embeddings back to the DataFrame (one 1-D array per row)
df_wiki['embedding'] = list(mean_embeddings_np)

Processing batches: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it]


In [61]:
# Display the `df` frame as the cell's output.
# NOTE(review): the saved output below lists an `embedding` column, but the
# visible cells only attach embeddings to `df_wiki` -- presumably left over
# from an earlier, out-of-order run; confirm with Restart & Run All.
df

Unnamed: 0,category,embedding
155,Men,"[-0.3348294496536255, 0.20340462028980255, -0...."
160,Searches,"[-0.2169940024614334, -0.3081307113170624, 0.0..."
165,Women,"[-0.23026470839977264, 0.2526428997516632, -0...."
205,Popular Movies,"[0.2829076051712036, -0.00320279598236084, -0...."
210,Popular Queries,"[0.1685190200805664, 0.1572684794664383, -0.07..."
...,...,...
26640,DIY,"[0.3040526211261749, -0.25561073422431946, -0...."
26645,Fitness,"[0.26824596524238586, -0.4710530638694763, -0...."
26655,Lyrics,"[-0.2225637137889862, 0.07541773468255997, -0...."
26690,When...?,"[0.2923111319541931, -0.4870285391807556, 0.02..."


In [22]:
# Write df_wiki to CSV, omitting the row index.
df_wiki.to_csv(path_or_buf='data/wiki_categories.csv', index=False)

In [67]:
# Write df to CSV, omitting the row index.
df.to_csv(path_or_buf='data/google_trends.csv', index=False)