# Fine-tuning BERT for URL classification

In [4]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


### Data preprocessing

In [5]:
import pandas as pd
from urllib.parse import urlparse

# Function to preprocess and extract domain from URLs
def extract_domain(url):
    """Return the normalised host part of ``url`` for domain comparison.

    Args:
        url: A URL string, with or without a scheme. Any non-string value
            (e.g. NaN or None) is treated as "no URL".

    Returns:
        The lower-cased host without a leading ``www.``, or ``""`` when no
        domain can be extracted.
    """
    # Check if the URL is not a string (e.g., NaN or None)
    if not isinstance(url, str):
        return ""  # Return an empty string to indicate no domain

    # Strip whitespace and wrapping quotation marks *before* parsing;
    # a leading '"' prevents urlparse from recognising the scheme.
    cleaned = url.strip().strip('"').strip()
    if not cleaned:
        return ""

    parsed_url = urlparse(cleaned)
    # Fallback to path if netloc is empty (e.g., scheme-less "example.com/page")
    domain = parsed_url.netloc or parsed_url.path
    domain = domain.replace('"', '')
    # Keep only the host: everything after the first '/' is a path, not a domain
    domain = domain.lstrip('/').split('/', 1)[0]
    # Host names are case-insensitive, so compare them in lower case
    domain = domain.lower()
    # Remove 'www.' only as a prefix so hosts that merely contain it stay intact
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

# Read the two input tables: the reference data (with the known URL and the
# search query) and the scraped search results.
dataset_query = pd.read_csv('dataset_incl_query.csv')
dataset_scraped = pd.read_csv('search_results_DDG.csv')

# Join both tables on 'EntityNumber', keeping only the reference columns needed later
merged_dataset = dataset_query[['EntityNumber', 'URL', 'SearchQuery']].merge(
    dataset_scraped,
    on='EntityNumber',
)

# Reduce the reference URL and each of the five scraped URLs to their domain
merged_dataset['CorrectDomain'] = merged_dataset['URL'].apply(extract_domain)
for i in range(1, 6):
    merged_dataset[f'URL{i}Domain'] = merged_dataset[f'URL{i}'].apply(extract_domain)

# Prepare labels: one binary flag per scraped result (1 = its domain equals the
# correct domain, 0 = it does not). Several results may share the correct
# domain, so a row can contain more than one 1.
def mark_correct_labels(row):
    """Build the five-element binary label list for one merged row.

    Args:
        row: Mapping-like row exposing 'CorrectDomain' and 'URL1Domain' ..
            'URL5Domain'.

    Returns:
        A list of five ints; position ``i - 1`` is 1 when ``URL{i}Domain``
        equals the correct domain and 0 otherwise.
    """
    correct_domain = row['CorrectDomain']
    # An empty correct domain means there is nothing to match against.
    # Without this guard every empty scraped slot would compare equal
    # ("" == "") and be labelled as a correct hit.
    if not correct_domain:
        return [0] * 5
    return [int(row[f'URL{i}Domain'] == correct_domain) for i in range(1, 6)]

# Label every merged row (axis=1 hands the function one row at a time)
merged_dataset['Labels'] = merged_dataset.apply(mark_correct_labels, axis=1)

# Print the first rows with the extracted domains and their labels for inspection
print(merged_dataset[[
    'EntityNumber', 'SearchQuery', 'CorrectDomain',
    'URL1Domain', 'URL2Domain', 'URL3Domain', 'URL4Domain', 'URL5Domain',
    'Labels',
]].head())


   EntityNumber                                        SearchQuery  \
0  0201.310.929                                      IGL 3600 Genk   
1  0202.239.951                           PROXIMUS 1030 Schaarbeek   
2  0203.201.340             Nationale Bank van België 1000 Brussel   
3  0206.460.639  Intergemeentelijk Samenwerkingsverband van het...   
4  0206.653.946  Rijksinstituut voor Ziekte- en Invaliditeitsve...   

           CorrectDomain             URL1Domain      URL2Domain  \
0  extranet.iglimburg.be           iglimburg.be  intergalva.com   
1           proximus.com           proximus.com     proximus.be   
2                 nbb.be                 nbb.be          nbb.be   
3           interwaas.be  erfgoedcelwaasland.be         vvsg.be   
4          inami.fgov.be          riziv.fgov.be   riziv.fgov.be   

      URL3Domain               URL4Domain         URL5Domain           Labels  
0   mapcarta.com       roamtechnology.com         geruro.com  [0, 0, 0, 0, 0]  
1    proximus.be

In [6]:
# Drop the raw URL columns now that their domains have been extracted.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
merged_dataset = merged_dataset.drop(
    columns=['URL', 'URL1', 'URL2', 'URL3', 'URL4', 'URL5'],
    errors='ignore',
)
print(merged_dataset.head())

   EntityNumber                                        SearchQuery  \
0  0201.310.929                                      IGL 3600 Genk   
1  0202.239.951                           PROXIMUS 1030 Schaarbeek   
2  0203.201.340             Nationale Bank van België 1000 Brussel   
3  0206.460.639  Intergemeentelijk Samenwerkingsverband van het...   
4  0206.653.946  Rijksinstituut voor Ziekte- en Invaliditeitsve...   

           CorrectDomain             URL1Domain      URL2Domain  \
0  extranet.iglimburg.be           iglimburg.be  intergalva.com   
1           proximus.com           proximus.com     proximus.be   
2                 nbb.be                 nbb.be          nbb.be   
3           interwaas.be  erfgoedcelwaasland.be         vvsg.be   
4          inami.fgov.be          riziv.fgov.be   riziv.fgov.be   

      URL3Domain               URL4Domain         URL5Domain           Labels  
0   mapcarta.com       roamtechnology.com         geruro.com  [0, 0, 0, 0, 0]  
1    proximus.be

### Data preparation for BERT

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

class URLDomainDataset(Dataset):
    """Torch dataset pairing a search query with its candidate domains.

    Each item encodes the query as the first segment and the candidate
    domains (joined by the tokenizer's separator token) as the second
    segment, so the model can see the candidates whose labels it predicts.

    Args:
        queries: Sequence of search-query strings.
        domains: Sequence of per-query sequences of candidate domain strings.
        labels: Sequence of per-query binary label sequences, one flag per
            candidate domain.
        tokenizer: Callable tokenizer accepting ``(text, text_pair, **kwargs)``
            and returning a mapping with 'input_ids' and 'attention_mask'.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, queries, domains, labels, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.queries = queries
        self.domains = domains  # List of lists containing domains for each query
        self.labels = labels    # List of lists containing binary labels for each domain
        self.max_len = max_len

    def __len__(self):
        """Return the number of queries in the dataset."""
        return len(self.queries)

    @staticmethod
    def _as_text(value):
        """Return ``value`` if it is a string, otherwise an empty string (e.g. for NaN)."""
        return value if isinstance(value, str) else ""

    def __getitem__(self, idx):
        """Return input ids, attention mask and float labels for item ``idx``."""
        query = self._as_text(self.queries[idx])
        labels = torch.tensor(self.labels[idx], dtype=torch.float)

        # Join the candidates in their original order so that label position i
        # lines up with the i-th domain in the text; empty slots stay in place
        # between separators.
        separator = getattr(self.tokenizer, 'sep_token', None) or '[SEP]'
        candidates = f' {separator} '.join(
            self._as_text(domain) for domain in self.domains[idx]
        )

        # Calling the tokenizer directly is the supported API for sentence
        # pairs; truncation applies to the pair as a whole.
        encoded = self.tokenizer(
            query,
            candidates,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoded['input_ids'].squeeze(0),  # drop the batch dim added by return_tensors='pt'
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': labels,
        }



  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from sklearn.model_selection import train_test_split
import numpy as np
# Feature matrix: the search query followed by the five candidate domains
X = merged_dataset[
    ['SearchQuery', 'URL1Domain', 'URL2Domain', 'URL3Domain', 'URL4Domain', 'URL5Domain']
].values
# Label matrix: the per-row label lists stacked into one array
y = np.array(merged_dataset['Labels'].tolist())

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
# Full-dataset views (every merged row, before any train/validation split)
queries = merged_dataset['SearchQuery'].tolist()
domains = merged_dataset[[f'URL{i}Domain' for i in range(1, 6)]].values.tolist()
labels = merged_dataset['Labels'].tolist()

# Split views: column 0 of X holds 'SearchQuery', columns 1..5 hold
# 'URL1Domain' .. 'URL5Domain'
train_queries = X_train[:, 0].tolist()
train_domains = list(X_train[:, 1:])
train_labels = y_train

val_queries = X_val[:, 0].tolist()
val_domains = list(X_val[:, 1:])
val_labels = y_val

# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Datasets for the two splits
train_dataset = URLDomainDataset(train_queries, train_domains, train_labels, tokenizer)
val_dataset = URLDomainDataset(val_queries, val_domains, val_labels, tokenizer)

# Dataset and loader over the *full* data.
# NOTE(review): these include the validation rows; confirm they are never used for training.
dataset = URLDomainDataset(queries, domains, labels, tokenizer)
loader = DataLoader(dataset, batch_size=8, shuffle=True)


In [10]:
# Mini-batch size shared by both loaders; tune it to the model and the available memory
batch_size = 8

# Shuffle the training data; keep the validation order fixed
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)

### Model definition

In [13]:
from transformers import BertForSequenceClassification, AdamW
import torch
from torch.optim.lr_scheduler import StepLR
import os

# Use a GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BERT with a 5-output classification head: one independent yes/no output per
# candidate URL position (multi-label, since several candidates can be correct)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=5,
    problem_type="multi_label_classification",
).to(device)

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are pinned to the values
# the transformers implementation used by default (1e-6 and 0.0), because
# torch's own defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Multiply the learning rate by `gamma` every `step_size` scheduler steps.
# NOTE(review): the training loop steps the scheduler once per epoch and runs
# 3 epochs, so with step_size=7 the rate never changes — confirm this is intended.
scheduler = StepLR(optimizer, step_size=7, gamma=0.1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Training loop and evaluation

In [15]:
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, precision_score, recall_score
import numpy as np
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR

loss_func = BCEWithLogitsLoss()
def evaluate_model_extended(model, data_loader, device):
    """Score ``model`` on ``data_loader`` with micro-averaged multi-label metrics.

    Args:
        model: Model whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a dict of tensors with
            'input_ids', 'attention_mask' and 'labels'.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(hamming_loss, f1_micro, precision_micro, recall_micro)``.
    """
    model.eval()
    predicted_rows, target_rows = [], []

    with torch.no_grad():
        for batch in data_loader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(
                input_ids=on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
            ).logits
            # Turn logits into probabilities, then round them to hard 0/1 predictions
            probabilities = torch.sigmoid(logits)
            predicted_rows.extend(probabilities.round().cpu().numpy())
            target_rows.extend(on_device['labels'].cpu().numpy())

    # All averaged scores use micro averaging over every (sample, label) pair
    micro = {'average': 'micro'}
    return (
        hamming_loss(target_rows, predicted_rows),
        f1_score(target_rows, predicted_rows, **micro),
        precision_score(target_rows, predicted_rows, **micro),
        recall_score(target_rows, predicted_rows, **micro),
    )

epochs = 3
best_hamming_loss = float('inf')   # lower is better; inf guarantees the first epoch is saved
model_save_path = 'best_model_state.bin'

for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True, desc=f'Epoch {epoch+1}/{epochs}')
    for batch in loop:
        # Move every tensor of the batch (inputs and labels) to the target device
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        # Forward pass without labels: passing them would make the model compute
        # its own loss in addition to loss_func below, doing the work twice.
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        # Labels are already on `device` from the transfer above
        loss = loss_func(outputs.logits, batch['labels'])
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

    scheduler.step()  # Update the learning rate once per epoch

    # Evaluation step
    hamming_loss_val, f1_micro, precision_micro, recall_micro = evaluate_model_extended(model, val_loader, device)
    print(f'\nValidation Metrics after Epoch {epoch + 1}:')
    print(f'Hamming Loss: {hamming_loss_val}')
    print(f'F1-Score (Micro): {f1_micro}')
    print(f'Precision (Micro): {precision_micro}')
    print(f'Recall (Micro): {recall_micro}')

    # Model checkpointing: keep the weights of the epoch with the lowest validation Hamming loss
    if hamming_loss_val < best_hamming_loss:
        best_hamming_loss = hamming_loss_val
        print("Hamming Loss improved, saving model...")
        torch.save(model.state_dict(), model_save_path)

Epoch 1/3:   0%|          | 0/1724 [00:41<?, ?it/s]

### Save model

In [None]:
# Export the best validation checkpoint rather than whatever weights the last
# epoch left in memory: the training loop writes the best state to model_save_path.
# NOTE(review): a checkpoint file left over from an earlier run would also be
# picked up here — delete it before retraining with a different setup.
if os.path.exists(model_save_path):
    model.load_state_dict(torch.load(model_save_path, map_location=device))
model.save_pretrained("./BERT_model/model")
tokenizer.save_pretrained("./BERT_model/tokenizer")