# Import Statements

In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from torch.utils.data import DataLoader, Dataset
import torch
import glob

# Preprocessing

In [3]:
# Load the raw search export into a DataFrame.
raw_searches_path = "./data/CUBE_WEBANALYTICS_SEARCHES_202410101536.csv"
data = pd.read_csv(raw_searches_path)

In [4]:
# Preprocessing
# The whole clean-up is one chain that ends in .assign(), so the cleaned column
# is written into a fresh frame instead of being set on a filtered slice
# (the pattern that triggers pandas' SettingWithCopyWarning).
data = (
    data
    # One row per (session, search term) pair.
    .drop_duplicates(subset=['SESSION_ID', 'SEARCH_TERM'])
    # Drop terms containing an http(s) URL or starting with 'ecli:'.
    .loc[lambda frame: ~frame['SEARCH_TERM'].str.contains(r'http[s]?://|^ecli:', na=False)]
    # Drop terms containing an '@'.
    .loc[lambda frame: ~frame['SEARCH_TERM'].str.contains(r'@', na=False)]
    .assign(
        SEARCH_TERM=lambda frame: (
            frame['SEARCH_TERM']
            # Remove tabs, newlines, backslashes, double quotes, parentheses and commas.
            .str.replace(r'[\t\n\\\"(),]', '', regex=True)
            # Remove the listed short function words (whole-word, case-sensitive).
            .str.replace(r'\b(de|het|een|van|en|in|op|voor|bij|uit|ter|zake|terzake)\b', '', regex=True)
            # Prefix '20' to a two-digit number directly followed by '/' or ':'
            # (presumably expanding two-digit years -- TODO confirm).
            .str.replace(r'(\b\d{2})([/:])', r'20\1\2', regex=True)
            # Normalise 'artikel' and a bare 'art' to 'art.'.
            .str.replace(r'\bartikel\b', 'art.', regex=True)
            .str.replace(r'\bart(?!\.)\b', 'art.', regex=True)
            # Abbreviate 'met betrekking tot' to 'm.b.t.'.
            .str.replace(r'\bmet betrekking tot\b', 'm.b.t.', regex=True)
            # Collapse whitespace runs left behind by the removals and trim the ends.
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )
    )
)
data.to_csv('./data/data.csv', index=False)

In [5]:
df = pd.read_csv('./data/data.csv')
# Empty SEARCH_TERM cells are read back from the CSV as NaN. Fill them with ''
# before casting, otherwise astype(str) turns them into the literal word 'nan',
# which would then be embedded like a real query.
search_terms = df['SEARCH_TERM'].fillna('').astype(str).tolist()

# Vectorization

In [6]:
# WORD2VEC MODEL
# Model and Vectorization
# Word2Vec expects every sentence to be a list of tokens. Passing the raw strings
# makes it iterate each string character by character, so split on whitespace first.
tokenized_terms = [term.split() for term in search_terms]
model = Word2Vec(sentences=tokenized_terms, vector_size=100, window=5, min_count=1, workers=4)
model.save('./model/model_word2vec.model')

# One vector per search term: the mean of its in-vocabulary token vectors.
# Terms without any known token get a zero vector, so np.mean is never called
# on an empty list.
word2vec_vectors = []
for tokens in tokenized_terms:
    known_vectors = [model.wv[token] for token in tokens if token in model.wv.key_to_index]
    if known_vectors:
        word2vec_vectors.append(np.mean(known_vectors, axis=0))
    else:
        word2vec_vectors.append(np.zeros(model.vector_size))

# Save Vectors
df_vectors = pd.DataFrame(word2vec_vectors)
df_vectors.to_csv('./data/word2vec_vectors.csv', index=False)
word2vec_df = df_vectors

In [None]:
# BERT
# Define a Dataset class for efficient batching
class SearchTermDataset(Dataset):
    """Map-style dataset that exposes a sequence of search terms by position."""

    def __init__(self, terms):
        # The sequence is stored as given (no copy); length and indexing delegate to it.
        self.terms = terms

    def __getitem__(self, idx):
        """Return the term stored at position ``idx``."""
        return self.terms[idx]

    def __len__(self):
        """Return the number of terms held by the dataset."""
        return len(self.terms)

# Create Dataset and DataLoader
dataset = SearchTermDataset(search_terms)
batch_size = 100
# shuffle=False keeps the batches in the same order as search_terms.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Use the GPU when one is available; fall back to the CPU instead of crashing
# on an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to process and save vectors incrementally
def process_and_save_vectors(dataloader, tokenizer, model, output_prefix="bert_vectors"):
    """Embed every batch with ``model`` and write one CSV of vectors per batch.

    Args:
        dataloader: Iterable yielding batches of texts that ``tokenizer`` accepts.
        tokenizer: Callable that returns a mapping of tensors when called with
            ``padding=True, truncation=True, return_tensors='pt'``.
        model: Torch module whose output exposes ``last_hidden_state``.
        output_prefix: Prefix of the per-batch files, which are named
            ``{output_prefix}_batch_{i}.csv``.

    Returns:
        The first-token vectors of all batches stacked into one array, or
        ``None`` when the dataloader yields no batches.
    """
    model.eval()

    # Run on whatever device the model already lives on, so the function works
    # for both GPU and CPU models instead of forcing the inputs onto CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_vectors = []
    for batch_idx, batch in enumerate(dataloader):
        # Tokenize batch and move the tensors next to the model
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

        # Forward pass without gradient tracking
        with torch.no_grad():
            model_output = model(**encoded_input)
        # Keep only the vector at sequence position 0 (the CLS token)
        batch_vectors = model_output.last_hidden_state[:, 0, :].cpu().numpy()

        # Save batch to disk
        batch_df = pd.DataFrame(batch_vectors)
        batch_df.to_csv(f"{output_prefix}_batch_{batch_idx}.csv", index=False)

        # Also collect in memory so the caller can use the return value directly
        all_vectors.append(batch_vectors)

        # Release cached GPU memory; only meaningful when running on CUDA
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    return np.vstack(all_vectors) if all_vectors else None

# Process data and save vectors
process_and_save_vectors(dataloader, tokenizer, model)

# List exactly the batch files this run wrote, already in batch order.
# Globbing the working directory would also pick up stale batch files left by
# an earlier run with more batches and append their rows to the result.
batch_files = [f"bert_vectors_batch_{batch_idx}.csv" for batch_idx in range(len(dataloader))]

# Read and concatenate all batch files
all_data = pd.concat((pd.read_csv(file) for file in batch_files), ignore_index=True)

# Save the combined DataFrame to a single CSV file
all_data.to_csv("./data/bert_vectors.csv", index=False)

print("All files have been combined into 'bert_vectors.csv'.")

In [None]:
# ROBBERT
# Define a Dataset class for efficient batching
class SearchTermDataset(Dataset):
    """Wrap a sequence of search terms so a ``DataLoader`` can batch them."""

    def __init__(self, terms):
        # Stored by reference; no copy is made.
        self.terms = terms

    def __getitem__(self, idx):
        """Look up a single term by its position."""
        term = self.terms[idx]
        return term

    def __len__(self):
        """Number of terms available for batching."""
        return len(self.terms)

# Create Dataset and DataLoader
dataset = SearchTermDataset(search_terms)
batch_size = 100
# shuffle=False keeps the batches in the same order as search_terms.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Load pre-trained RobBERT (Dutch RoBERTa) model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('pdelobelle/robbert-v2-dutch-base')
# Use the GPU when one is available; fall back to the CPU instead of crashing
# on an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobertaModel.from_pretrained('pdelobelle/robbert-v2-dutch-base').to(device)

# Function to process and save vectors incrementally
def process_and_save_vectors(dataloader, tokenizer, model, output_prefix="robbert_vectors"):
    """Embed every batch with ``model`` and write one CSV of vectors per batch.

    Args:
        dataloader: Iterable yielding batches of texts that ``tokenizer`` accepts.
        tokenizer: Callable that returns a mapping of tensors when called with
            ``padding=True, truncation=True, return_tensors='pt'``.
        model: Torch module whose output exposes ``last_hidden_state``.
        output_prefix: Prefix of the per-batch files, which are named
            ``{output_prefix}_batch_{i}.csv``.

    Returns:
        The first-token vectors of all batches stacked into one array, or
        ``None`` when the dataloader yields no batches.
    """
    model.eval()

    # Run on whatever device the model already lives on, so the function works
    # for both GPU and CPU models instead of forcing the inputs onto CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_vectors = []
    for batch_idx, batch in enumerate(dataloader):
        # Tokenize batch and move the tensors next to the model
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

        # Forward pass through the model without gradient tracking
        with torch.no_grad():
            model_output = model(**encoded_input)
        # Keep only the vector at sequence position 0 (the first, CLS-style token)
        batch_vectors = model_output.last_hidden_state[:, 0, :].cpu().numpy()

        # Save batch to disk
        batch_df = pd.DataFrame(batch_vectors)
        batch_df.to_csv(f"{output_prefix}_batch_{batch_idx}.csv", index=False)

        # Also collect in memory so the caller can use the return value directly
        all_vectors.append(batch_vectors)

        # Release cached GPU memory; only meaningful when running on CUDA
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    return np.vstack(all_vectors) if all_vectors else None

# Process data and save vectors
process_and_save_vectors(dataloader, tokenizer, model)

# List exactly the batch files this run wrote, already in batch order.
# Globbing the working directory would also pick up stale batch files left by
# an earlier run with more batches and append their rows to the result.
batch_files = [f"robbert_vectors_batch_{batch_idx}.csv" for batch_idx in range(len(dataloader))]

# Read and concatenate all batch files
all_data = pd.concat((pd.read_csv(file) for file in batch_files), ignore_index=True)

# Save the combined DataFrame to a single CSV file
all_data.to_csv("./data/robbert_vectors.csv", index=False)

print("All files have been combined into 'robbert_vectors.csv'.")

Some weights of RobertaModel were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Clustering & Topic Modelling