This notebook is meant to run on Google Colab with a GPU runtime. It computes a sentence embedding for each event, which we can then use for topic modeling with BERTopic.

In [1]:
# Install Meerkat, which is imported below as `mk`.
! pip install meerkat-ml



In [2]:
# Keep pydantic below 2.0.
# NOTE(review): presumably required for meerkat-ml compatibility — TODO confirm.
! pip install "pydantic<2.0"



In [1]:
import meerkat as mk
from tqdm import tqdm

In [3]:
# Mount Google Drive at /content/drive so the notebook can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Set your working directory to a folder in your Google Drive. This way, if your notebook times out,
# your files will still be saved in your Google Drive!
import os

# the base Google Drive directory (shared drives, under the /content/drive mount point)
root_dir = "/content/drive/Shared drives/"

# choose where you want your project files to be saved (path relative to root_dir)
project_folder = "CESTA Fellows 2023-24/CESTA Events Data/topic_modeling"

def create_and_set_working_directory(project_folder, base_dir=None):
  """Make `project_folder` the current working directory, creating it if needed.

  Args:
    project_folder: Folder path relative to the base directory. It may be
      nested; any missing intermediate folders are created as well.
    base_dir: Directory that contains `project_folder`. When omitted, the
      module-level `root_dir` is used.

  Raises:
    OSError: If the folder cannot be created or entered.
  """
  if base_dir is None:
    base_dir = root_dir
  target_dir = os.path.join(base_dir, project_folder)

  # check if your project folder exists. if not, it will be created.
  # makedirs (unlike mkdir) also creates missing parent folders, and
  # exist_ok=True tolerates the folder appearing between the check and the call.
  if not os.path.isdir(target_dir):
    os.makedirs(target_dir, exist_ok=True)
    print(target_dir + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(target_dir)

# Create the project folder if necessary and switch into it, so the relative
# file paths used in the following cells resolve inside the project folder.
create_and_set_working_directory(project_folder)

In [8]:
import pandas as pd

# Load the CSV file with entities using pandas
input_file_path = 'event_data_edited.csv'
df = pd.read_csv(input_file_path)

# Text columns that together describe an event, in the order they are joined
text_columns = ['Title', 'Meta Description', 'H1 Content', 'Paragraph Content']

# Combine the relevant text columns into one space-separated string per row.
# Missing cells are replaced with '' first: str(NaN) would otherwise insert the
# literal word "nan" into the text that is embedded later on.
df['Combined Text'] = (
    df[text_columns]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

# Save the DataFrame with the combined text column to a new CSV
combined_text_file_path = 'event_data_with_combined_text.csv'
df.to_csv(combined_text_file_path, index=False)

In [9]:
# Reload the combined-text CSV as a Meerkat DataFrame. Note that this rebinds
# `df`, which was a pandas DataFrame in the previous cell.
df = mk.from_csv("event_data_with_combined_text.csv")

In [17]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Tokenizer and encoder are loaded from the same pretrained checkpoint;
# the encoder is then moved onto the GPU.
checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
model = model.to("cuda")

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Model output whose first element holds the token embeddings.
        attention_mask: Mask with one fewer dimension than the token embeddings;
            it is broadcast over the last (embedding) dimension and used as a
            per-token weight.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total over
        dimension 1 (clamped to at least 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    # Stretch the mask over the embedding dimension so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    weight_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / weight_total



# Encode the texts in mini-batches so that GPU memory use is bounded by
# BATCH_SIZE instead of growing with the size of the whole dataset.
BATCH_SIZE = 128
texts = df["Combined Text"].tolist()

batch_embeddings = []
for start in tqdm(range(0, len(texts), BATCH_SIZE)):
    encoded_input = tokenizer(
        texts[start:start + BATCH_SIZE],
        padding=True, truncation=True, return_tensors='pt'
    ).to("cuda")

    with torch.no_grad():
        model_output = model(**encoded_input)

    pooled = mean_pooling(model_output, encoded_input['attention_mask'])
    # L2-normalize each embedding and move it off the GPU right away so only
    # one batch of activations lives on the device at a time.
    batch_embeddings.append(F.normalize(pooled, p=2, dim=1).to("cpu"))

# Stack the per-batch results back into a single (num_texts, dim) NumPy array
embeddings = torch.cat(batch_embeddings, dim=0).numpy()

# Add embeddings to Meerkat DataFrame
df["emb"] = list(embeddings)

# embs = []
# for batch in tqdm(df.batch(batch_size=128), total=len(df) // 128):
#   encoded_input = tokenizer(
#       batch["Combined Text"].to_numpy().tolist(),
#       padding=True, truncation=True, return_tensors='pt'
#   ).to("cuda")
#   with torch.no_grad():
#     model_output = model(**encoded_input)


#   emb = mean_pooling(model_output, encoded_input['attention_mask'])
#   emb = F.normalize(emb, p=2, dim=1)
#   embs.append(emb.to("cpu"))

# df["emb"] = torch.concat(embs, axis=0).numpy()


In [20]:
# Sanity-check the dimensions of the embedding matrix.
print(f"Shape of embeddings: {embeddings.shape}")

Shape of embeddings: (407, 384)


In [19]:
# Preview the first few entries of the new `emb` column.
print(df["emb"].head())

[array([-8.019...dtype=float32), array([-2.689...dtype=float32), array([ 1.239...dtype=float32), array([ 1.301...dtype=float32), array([-8.518...dtype=float32)]


In [21]:
# Persist the Meerkat DataFrame (including the `emb` column) to the
# `sentence-embs.mk` directory inside the current working directory.
df.write("sentence-embs.mk")

In [22]:
# Bundle the written directory into a single gzip-compressed archive.
# NOTE(review): the recorded output shows "file changed as we read it" warnings,
# presumably because the Drive mount was still syncing the files just written —
# TODO confirm the archive is complete.
!tar -czvf sentence-embs.mk.tar.gz sentence-embs.mk/

sentence-embs.mk/
sentence-embs.mk/mgr/
sentence-embs.mk/mgr/blocks/
sentence-embs.mk/mgr/blocks/136192260892000/
sentence-embs.mk/mgr/blocks/136192260892000/data.feather
tar: sentence-embs.mk/mgr/blocks/136192260892000/data.feather: file changed as we read it
sentence-embs.mk/mgr/blocks/136192260892000/meta.yaml
tar: sentence-embs.mk/mgr/blocks/136192260892000/meta.yaml: file changed as we read it
sentence-embs.mk/mgr/blocks/136192260886768/
sentence-embs.mk/mgr/blocks/136192260886768/data.npy
tar: sentence-embs.mk/mgr/blocks/136192260886768/data.npy: file changed as we read it
sentence-embs.mk/mgr/blocks/136192260886768/meta.yaml
tar: sentence-embs.mk/mgr/blocks/136192260886768/meta.yaml: file changed as we read it
sentence-embs.mk/mgr/columns/
sentence-embs.mk/mgr/meta.yaml
tar: sentence-embs.mk/mgr/meta.yaml: file changed as we read it
sentence-embs.mk/meta.yaml
tar: sentence-embs.mk/meta.yaml: file changed as we read it
