In [1]:
from urllib.parse import urlparse, unquote
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import torch
import pandas as pd
import re

In [2]:
import torch

# Select the compute device: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Location of the input CSV; the next cell reads its 'url' column.
# NOTE(review): absolute, machine-specific path — must be edited to run anywhere else.
path = '/home/dataq/Documents/Research/public/mlwl/outputs/sample.csv'

In [4]:
# Load only the 'url' column of the CSV at `path`, decoding the file as UTF-8.
url_list = pd.read_csv(path, usecols=['url'], encoding='utf-8')
url_list.shape  # (rows, columns) of the loaded frame

(61, 1)

In [6]:
# Display the loaded URL frame.
url_list

Unnamed: 0,url
0,/image/60844/productModel/200x200
1,/image/61474/productModel/200x200
2,/product/31893/62100/سشوار-خانگی-پرنسلی-مدل-PR...
3,/m/product/32574/62991/ماشین-اصلاح-صورت-پرنسلی...
4,/product/10075/13903/مایکروفر-رومیزی-سامسونگ-م...
...,...
56,/m/updateVariation?__amp_source_origin=https:/...
57,/product/29080?model=58289
58,/m/updateVariation?__amp_source_origin=https:/...
59,/product/30472/60169/عطر-و-ادکلن-زنانه-دیور-مد...


In [None]:
def url_for_training(url_list):
    """Return the distinct digit-masked URL patterns found in ``url_list['url']``.

    Every run of digits in a URL is replaced by the literal ``<NUM>`` so that URLs
    differing only in numeric ids collapse to the same pattern.

    Parameters
    ----------
    url_list : pandas.DataFrame
        Frame with a ``'url'`` column of strings. Missing values are skipped.

    Returns
    -------
    list of str
        Unique masked URLs in first-seen order. The order is deterministic across
        kernel sessions, unlike ``list(set(...))`` whose string ordering depends on
        hash randomisation; that keeps the downstream embedding/clustering row
        order reproducible.
    """
    masked = (re.sub(r'\d+', '<NUM>', url) for url in url_list['url'].dropna())
    # dict keys preserve insertion order, giving an ordered de-duplication.
    return list(dict.fromkeys(masked))

# Build the de-duplicated, digit-masked URL list that the following cells tokenize and embed.
train_url_list = url_for_training(url_list)
len(train_url_list)  # number of distinct masked URL patterns

In [None]:
# Step 1: Clean and tokenize URLs
def tokenize_url(url):
    """Split a URL's path and query string into a flat list of non-empty tokens.

    The path and query are percent-decoded, then cut on URL punctuation
    (slashes, dashes, underscores, ``= & ? . +`` and all bracket kinds).
    Path tokens come first, followed by query tokens; the fragment, scheme
    and host are not used.
    """
    parts = urlparse(url)

    # Same delimiter set for both components: / - _ = & ? . + ( ) [ ] < > { }
    splitter = re.compile(r"[\/\-\_\=\&\?\.\+\(\)\[\]\<\>\{\}]")

    pieces = splitter.split(unquote(parts.path).strip("/"))
    pieces.extend(splitter.split(unquote(parts.query)))

    # Adjacent delimiters (and empty components) yield empty strings; drop them.
    return list(filter(None, pieces))

# Re-join each URL's tokens with single spaces so every masked URL becomes one plain-text string.
tokenized_urls = [" ".join(tokenize_url(url)) for url in train_url_list]

In [None]:
# Step 2: Load BERT
# Load the tokenizer and the encoder once each; the encoder is moved straight to
# the selected device instead of being instantiated twice.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(device)
# Inference only: put the module in evaluation mode.
model.eval()

In [None]:
# Step 3: Get BERT embeddings
def get_url_embedding(text):
    """Embed text with the loaded BERT model by mean-pooling its last hidden states.

    The mean is weighted by the tokenizer's attention mask, so padding positions
    (present only when several texts of different lengths are passed together)
    do not dilute the embedding. For a single string the mask is all ones and the
    result is the plain mean over its tokens.

    Parameters
    ----------
    text : str or list of str
        One text, or a batch of texts, to encode. Inputs longer than the
        tokenizer's maximum length are truncated.

    Returns
    -------
    numpy.ndarray
        A 1-D vector of size ``hidden_size`` for a single text (or a one-element
        list), otherwise a 2-D array with one row per input text. Always on CPU.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():  # inference only; no autograd graph needed
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp guards against a division by zero should a row contain no real tokens.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze(0).cpu().numpy()  # back to CPU

# Embed the tokenized URLs one at a time and stack the vectors into one array,
# one row per URL in the same order as tokenized_urls.
embeddings = np.array([get_url_embedding(url) for url in tokenized_urls])

In [None]:
# Step 4: KMeans clustering
# KMeans raises a ValueError when asked for more clusters than there are samples,
# so the requested 400 clusters are capped at the number of embedded URLs.
n_clusters = min(400, len(embeddings))
# Fixed random_state so centroid initialisation is repeatable.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(embeddings)

In [None]:
# Step 5: Write clustered URLs to a text file
# Group the masked URLs by their KMeans label; a cluster that received no URL keeps an empty list.
clustered_urls = {i: [] for i in range(n_clusters)}
for idx, label in enumerate(labels):
    clustered_urls[label].append(train_url_list[idx])

# The URLs contain non-ASCII text, so the encoding is pinned to UTF-8 rather than
# left to the platform default (consistent with the CSV export of the same data).
with open("clustered_urls.txt", "w", encoding="utf-8") as f:
    for cluster, urls in clustered_urls.items():
        f.write(f"\nCluster {cluster}:\n")
        for url in urls:
            f.write(f"  {url}\n")


In [None]:
# Step 6: Save clustered URLs to CSV
# One row per masked URL with its assigned cluster id, ordered by cluster.
df_label = (
    pd.DataFrame({"masked": train_url_list, "cluster": labels})
    .sort_values(by='cluster')
)
# Persist the mapping without the index column.
df_label.to_csv("clustered_urls.csv", index=False, encoding="utf-8")

In [None]:
# Display the masked-URL → cluster table (sorted by cluster id).
df_label

In [None]:
# Give every raw URL its digit-masked form, the key shared with df_label['masked'].
url_list['masked'] = url_list['url'].map(lambda raw: re.sub(r'\d+', '<NUM>', raw))
url_list

In [None]:
# Attach each URL's cluster id through its masked form.
# A 'cluster' column left over from an earlier run of this cell is dropped first;
# otherwise the merge would emit cluster_x / cluster_y and later 'cluster' lookups
# would fail. validate= asserts that each masked pattern maps to exactly one cluster,
# so the left join can never duplicate URL rows.
url_list = (
    url_list
    .drop(columns='cluster', errors='ignore')
    .merge(df_label[['masked', 'cluster']], on='masked', how='left', validate='many_to_one')
)
url_list

In [None]:
# Number of original URLs assigned to each cluster (rows without a cluster are not counted).
url_list['cluster'].value_counts()