In [1]:
import pandas as pd

# Locations whose trend rows are kept (the English-speaking subset used below).
ENGLISH_SPEAKING_LOCATIONS = [
    'United Kingdom',
    'Scotland',
    'Australia',
    'Ireland',
    'New Zealand',
    'Wales',
]

trends_raw = pd.read_csv('data/trends.csv')

# Select the distinct categories seen in English-speaking locations.
# The row filter and the column projection are done in a single .loc so the
# raw frame stays available and the cell is idempotent on re-run.
df = (
    trends_raw
    .loc[trends_raw['location'].isin(ENGLISH_SPEAKING_LOCATIONS), ['category']]
    .drop_duplicates()
)

# Count the number of categories.
# len() counts rows; DataFrame.size counts cells (rows x columns) and would
# only match the category count while the frame has exactly one column.
len(df)

147

In [2]:
# Number of leading CSV rows to use, and the seed that makes the shuffle repeatable.
WIKI_SAMPLE_ROWS = 50000
SHUFFLE_SEED = 42

# nrows stops parsing after the rows we need instead of loading the whole
# file and discarding everything past the first WIKI_SAMPLE_ROWS rows.
wiki_raw = pd.read_csv('data/children_cats.csv', nrows=WIKI_SAMPLE_ROWS)

# The category name is taken from the first column of the file.
# Underscores are replaced with spaces BEFORE de-duplicating, so that names
# which only differ by '_' vs ' ' collapse into a single category.
wiki_categories = (
    wiki_raw.iloc[:, 0]
    .str.replace('_', ' ', regex=False)
    .rename('category')
)

df_wiki = (
    wiki_categories
    .to_frame()
    .drop_duplicates()
    # shuffle (seeded so Restart & Run All yields the same order every time)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Count the number of categories (rows, not rows x columns as .size would give).
len(df_wiki)


100000

In [None]:
# TODO: Too long to run, also do we need it?
#
# Computes one BERT embedding per row of df_wiki['category'] and stores it in
# df_wiki['embedding'] (one numpy vector per row).
import numpy as np
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # make inference mode explicit (no dropout)

batch_size = 1024  # Set your desired batch size
hidden_size = model.config.hidden_size  # embedding width (768 for bert-base)

# Per-batch results are collected in a list and concatenated once at the end;
# np.append inside the loop would re-copy the whole array on every batch.
batch_embeddings = []

# Process the data in batches
for i in tqdm(range(0, len(df_wiki), batch_size), desc="Processing batches"):
    batch_df = df_wiki.iloc[i:i + batch_size]

    # Tokenize and encode the texts
    tokens = tokenizer(batch_df['category'].tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)
    tokens = {key: val.to(device) for key, val in tokens.items()}

    # Forward pass, get hidden states
    with torch.no_grad():
        outputs = model(**tokens)

    # Take the output embeddings from the last layer
    last_hidden_states = outputs.last_hidden_state

    # Pooling strategy: mean over the REAL tokens only. padding=True pads every
    # text to the longest one in the batch, so an unmasked mean would mix in
    # padding positions and make a text's embedding depend on its batch mates.
    token_mask = tokens['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
    summed = (last_hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against divide-by-zero
    mean_embeddings = summed / token_counts

    batch_embeddings.append(mean_embeddings.cpu().numpy())

# The leading empty float64 array keeps the result well-defined (shape
# (0, hidden_size)) when df_wiki is empty and keeps the float64 dtype the
# accumulated array had before.
mean_embeddings_np = np.concatenate([np.empty((0, hidden_size)), *batch_embeddings], axis=0)

# Assign the embeddings back to the DataFrame
df_wiki['embedding'] = list(mean_embeddings_np)

In [None]:
# Display the trends categories frame as this cell's output.
df

In [None]:
# Persist the wiki categories frame as CSV, leaving out the index column.
# NOTE(review): if the embedding cell was run, the 'embedding' column holds
# numpy arrays, which CSV can only store as text — confirm that is intended.
wiki_output_path = 'data/wiki_categories.csv_50k'
df_wiki.to_csv(wiki_output_path, index=False)

In [None]:
# Persist the trends categories frame as CSV, leaving out the index column.
trends_output_path = 'data/google_trends.csv'
df.to_csv(trends_output_path, index=False)