This notebook is a copy of the script that was run on Google Colab (used for TPU access during training).
The parameters from the best checkpoints are too large to be accessed here, so re-training is required in order to use them. They are saved locally.

This trial fine-tunes the BERT-base-uncased model on a dummy dataset, generated via Copilot, of RTL violation descriptions paired with reasonable tag names.

In [None]:
%pip install qdrant_client
%pip install datasets
%pip install transformers

In [None]:
# Colab only: make Google Drive available under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import json
import pandas as pd
import numpy as np

In [None]:
# Load the raw violation records and keep only the two columns used downstream.
# NOTE(review): this is an absolute /workspaces path while the notebook mounts
# Colab Drive -- confirm the file is reachable from the runtime being used.
DATASET_PATH = "/workspaces/ViolationClassification_withBERT_Summer2025/UPDATED_combined_multilabel_rtl_violations_dataset.json"

with open(DATASET_PATH, "r") as dataset_file:
    raw_data = json.load(dataset_file)

all_df = pd.DataFrame(raw_data).loc[:, ["Description", "Tag"]]
print(all_df.shape)
all_df.head()

(1000, 2)


Unnamed: 0,Description,Tag
0,Improper reset logic causes signal to remain c...,"[RESET_ISSUE, STUCK_SIGNAL]"
1,Circular logic path eliminated but does not im...,"[COMB_LOOP, UNUSED_LOGIC, BUFFER_USAGE]"
2,Macro used in RTL block without proper expansion,[MACRO_USAGE]
3,Synthesis tool identified unused registered logic,"[REGISTER_USAGE, UNUSED_LOGIC]"
4,FSM condition causes unreachable state,"[FSM_PROBLEM, CONDITION_ISSUE]"


In [None]:
# Collect the distinct tag names found across every row's tag list.
all_tags = {tag for tag_list in all_df["Tag"] for tag in tag_list}
num_tags = len(all_tags)
print(f"Total number of unique tags: {num_tags}")

Total number of unique tags: 42


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [None]:
# Hold out 20% of the rows for validation with a fixed seed.
# NOTE(review): stratify is given the tag-list column, so rows are presumably
# grouped by their exact tag combination -- confirm this is the intended stratum.
train_df, val_df = train_test_split(
    all_df,
    test_size=0.2,
    random_state=42,
    stratify=all_df["Tag"],
)

In [None]:
# Give both splits the "text" / "label" column names used by the rest of the pipeline.
column_renames = {"Description": "text", "Tag": "label"}
train_df = train_df.rename(columns=column_renames)
val_df = val_df.rename(columns=column_renames)

In [None]:
# Binarizer for multilabel encodings of tags
# ONLY needed for mlb clf, not required for embeddings approach
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the label vocabulary on the FULL dataset's tag column, not on the training
# split alone. The model head is sized from the tags of the whole dataset
# (num_tags), so the multi-hot vectors must have one slot per tag in that same
# set; a tag that happened to land only in the validation split would otherwise
# be dropped and the label width would disagree with the classifier.
mlb = MultiLabelBinarizer()
mlb.fit(all_df["Tag"])

# Encode each split's tag lists as multi-hot lists (one 0/1 entry per tag class).
train_df["label"] = mlb.transform(train_df["label"]).tolist()
val_df["label"] = mlb.transform(val_df["label"]).tolist()

In [None]:
# Length of one encoded label vector, taken from the first training row.
first_label_vector = train_df["label"].iloc[0]
print(len(first_label_vector))

42


In [None]:
# Sanity-check the training split: (rows, columns), then the first encoded rows.
print(train_df.shape)
train_df.head()

(800, 2)


Unnamed: 0,text,label
881,Real number declared in procedural logic,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
242,casez used with overlapping wildcard patterns,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
626,Improper use of casez leads to ambiguous condi...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
717,Synth tool warning: initial block detected,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
158,Signal name does not comply with naming standards,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Sanity-check the validation split: (rows, columns), then the first encoded rows.
print(val_df.shape)
val_df.head()

(200, 2)


Unnamed: 0,text,label
323,Macro used in procedural block not supported b...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
855,Feedback loop in sequential logic causes hazard,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
506,Signal stuck due to missing input change,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
991,Clock signal not routed to flip-flop,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
970,Combinational logic produces constant output,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Convert both splits to Dataset objects, discarding the shuffled pandas index first.
train_dataset, val_dataset = (
    Dataset.from_pandas(split_df.reset_index(drop=True))
    for split_df in (train_df, val_df)
)

print(type(train_dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [None]:
# Have both datasets hand back torch tensors when indexed.
train_dataset, val_dataset = (
    train_dataset.with_format("torch"),
    val_dataset.with_format("torch"),
)

In [None]:
def _keep_text(example):
    """Return the example's text under the same "text" key."""
    return {"text": example["text"]}


# NOTE(review): this rewrites "text" with its own value, so it looks like a
# no-op pass over the data -- confirm whether it can be removed.
train_dataset = train_dataset.map(_keep_text)
val_dataset = val_dataset.map(_keep_text)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Inspect the first training example and check the type of its label field.
first_example = train_dataset[0]
print(first_example)
print(type(first_example["label"]))
print(first_example["label"])

{'text': 'Real number declared in procedural logic', 'label': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}
<class 'torch.Tensor'>
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [None]:
from transformers import BertTokenizerFast, BertModel, BertForSequenceClassification, Trainer, TrainingArguments
from torch.nn import BCEWithLogitsLoss
import torch

In [None]:
# Tokenizer for the base checkpoint used in this notebook.
BASE_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(BASE_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def tokenize_and_format(example):
    """Tokenize one example's text and attach its labels as floats.

    The text is padded/truncated to exactly 128 tokens. The values of the
    example's ``label`` entry are copied into a new ``labels`` entry, each
    converted with ``float()``.

    Args:
        example: mapping with a ``"text"`` entry and an iterable ``"label"`` entry.

    Returns:
        The tokenizer's encoding with an added ``"labels"`` list of floats.
    """
    float_labels = list(map(float, example["label"]))
    tokenized = tokenizer(
        example["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    tokenized["labels"] = float_labels
    return tokenized

# Tokenize every example of each split.
tokenized_train = train_dataset.map(tokenize_and_format)
tokenized_val = val_dataset.map(tokenize_and_format)

# Expose only the model inputs and targets, as torch tensors.
MODEL_COLUMNS = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (tokenized_train, tokenized_val):
    tokenized_split.set_format("torch", columns=MODEL_COLUMNS)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Check the first example's target vector and its dtype after formatting.
first_labels = tokenized_train[0]["labels"]
print(first_labels)
print(first_labels.dtype)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.])
torch.float32


In [None]:
from transformers import DataCollatorWithPadding

# Batch collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# BERT with a classification head sized to the number of tags and configured
# for multi-label targets.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_tags,
    problem_type="multi_label_classification",
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

In [None]:
# Sanity check: does at least one training row carry at least one positive tag?
has_positive_label = any(any(label_vector) for label_vector in train_df["label"])
print("Any labels:", has_positive_label)

Any labels: True


In [None]:
# Training configuration: evaluate and checkpoint once per epoch, log every 10 steps.
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Without an explicit metric, "best" means lowest validation loss, which can
    # pick a different epoch than the one with the best F1. Select on the micro
    # F1 reported by compute_metrics instead (the Trainer adds the "eval_" prefix
    # itself), where larger is better.
    metric_for_best_model="f1_micro",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=10,
)

# Custom metrics for multilabel
def compute_metrics(pred, threshold=0.3):
    """Compute micro/macro multi-label metrics from raw logits.

    Args:
        pred: evaluation output exposing ``predictions`` (raw logits, one
            column per tag) and ``label_ids`` (multi-hot targets).
        threshold: sigmoid-probability cut-off at or above which a tag counts
            as predicted. Defaults to 0.3, the value this notebook has always
            applied.

    Returns:
        dict with ``f1_micro``, ``f1_macro``, ``precision_micro`` and
        ``recall_micro``. Scores that would divide by zero are reported as 0.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        # When the model returns extra outputs (e.g. hidden states) the
        # predictions arrive as a tuple with the logits first.
        logits = logits[0]
    labels = pred.label_ids

    # Independent sigmoid per tag, then binarize at the threshold.
    probs = 1 / (1 + np.exp(-logits))
    preds = (probs >= threshold).astype(int)

    return {
        "f1_micro": f1_score(labels, preds, average="micro", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "precision_micro": precision_score(labels, preds, average="micro", zero_division=0),
        "recall_micro": recall_score(labels, preds, average="micro", zero_division=0),
    }

# Initialize trainer
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases and triggers a warning on this call.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Confirm the per-example target vector has one entry per tag before training.
print(tokenized_train[0]['labels'].shape)

torch.Size([42])


In [None]:
# Run fine-tuning; the returned training summary is displayed as the cell's result.
trainer.train()

  return LooseVersion(v) >= LooseVersion(check)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnandinibohra777[0m ([33mnandinibohra777-self[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
  self.scope.user = {"email": email}


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Precision Micro,Recall Micro
1,0.2178,0.199463,0.0,0.0,0.0,0.0
2,0.1684,0.16223,0.0,0.0,0.0,0.0
3,0.1587,0.155619,0.0,0.0,0.0,0.0
4,0.1511,0.147621,0.0,0.0,0.0,0.0
5,0.1422,0.13708,0.060423,0.020704,1.0,0.031153
6,0.1317,0.126162,0.155172,0.068065,1.0,0.084112
7,0.1224,0.11726,0.314136,0.153502,0.983607,0.186916
8,0.1123,0.111143,0.518349,0.262782,0.982609,0.352025
9,0.1104,0.107228,0.638655,0.352708,0.980645,0.47352
10,0.106,0.106027,0.627119,0.339534,0.980132,0.461059


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=500, training_loss=0.15896440577507018, metrics={'train_runtime': 649.6506, 'train_samples_per_second': 12.314, 'train_steps_per_second': 0.77, 'total_flos': 526411100160000.0, 'train_loss': 0.15896440577507018, 'epoch': 10.0})

In [None]:
# Best performance on epoch 9, save that checkpoint
# (checkpoint-450: the run took 500 steps over 10 epochs, i.e. 50 steps per epoch).
# Replace with checkpoint path when saving after training
# Replace with saved model folder pathway when loading for use
BEST_CHECKPOINT_DIR = "/content/drive/MyDrive/Colab Notebooks/checkpoint-450"

# output_hidden_states=True makes the reloaded model return its hidden states,
# which the embedding cells further down read.
model = BertForSequenceClassification.from_pretrained(
    BEST_CHECKPOINT_DIR,
    output_hidden_states=True,
)
tokenizer = BertTokenizerFast.from_pretrained(BEST_CHECKPOINT_DIR)

In [None]:
# Save the model and tokenizer side by side in one folder.
# Run ONCE to save model
EXPORT_DIR = "/content/drive/MyDrive/Colab Notebooks/emb_model/1000_model_epoch9"
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

('/content/drive/MyDrive/Colab Notebooks/emb_model/1000_model_epoch9/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/emb_model/1000_model_epoch9/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/emb_model/1000_model_epoch9/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/emb_model/1000_model_epoch9/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/emb_model/1000_model_epoch9/tokenizer.json')

In [None]:
from google.colab import files

# Trigger a browser download of the saved artifact.
# NOTE(review): this path ("mlb_model_epoch9") is not the folder written by the
# save cell ("emb_model/1000_model_epoch9") -- confirm it points at the intended file.
DOWNLOAD_PATH = "/content/drive/MyDrive/Colab Notebooks/mlb_model_epoch9"
files.download(DOWNLOAD_PATH)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Turn on evaluation mode before extracting embeddings.
# The call returns the model, so the cell also displays its architecture.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Parallel lists: entry i of each list comes from the same dataset row.
all_descriptions = list(all_df["Description"])
all_tags = list(all_df["Tag"])

# Tokenize all descriptions as one padded batch, using the tokenizer that was
# loaded alongside the fine-tuned checkpoint.
inputs = tokenizer(
    all_descriptions,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

In [None]:
# Approach #1
# For cls token
# Per sequence the encoder output is laid out as
# [ [CLS] whole-sentence embedding | one embedding per word piece | [SEP] ]
with torch.no_grad():
    outputs = model.bert(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )
    # Keep only position 0 (the [CLS] token) of every sequence, giving one
    # vector per description: (num_texts, hidden_size). Taking the whole last
    # hidden state would return every token's embedding instead, and squeezing
    # it would drop the batch axis for a single input.
    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

In [None]:
# Approach #2
# For mean pooling
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    hidden_states = outputs.hidden_states
    last_hidden = hidden_states[-1]
    attention_mask = inputs["attention_mask"]

    # mask out padding tokens, take mean over actual tokens.
    # The mask is cast to the hidden-state dtype so the sum and the clamp below
    # work on floats rather than mixing an integer tensor with a float bound.
    mask_expanded = (
        attention_mask.unsqueeze(-1).expand(last_hidden.size()).to(last_hidden.dtype)
    )
    sum_embeddings = torch.sum(last_hidden * mask_expanded, dim=1)
    # Guard against division by zero for a row with no real tokens.
    sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
    # Result is always (num_texts, hidden_size). No squeeze here: it would
    # collapse the batch axis when only one description is embedded.
    mean_embeddings = (sum_embeddings / sum_mask).cpu().numpy()

  return forward_call(*args, **kwargs)


In [None]:
# Sanity checks on the pooled embeddings.
first_embedding = mean_embeddings[0]
print(first_embedding.shape) # (768,)
print(mean_embeddings.dtype) # Should be float32

print(first_embedding[:10]) # First 10 dims of first embedding
print(np.isnan(mean_embeddings).any()) # Should be False
print(np.all(mean_embeddings == 0)) # Should be False

# Count non-zero embeddings: a row counts if any of its components is non-zero
nonzero_rows = (mean_embeddings != 0).any(axis=1).sum()
total_rows = mean_embeddings.shape[0]
print(f"{nonzero_rows} out of {total_rows} embeddings are non-zero")


(768,)
float32
[ 0.0957358   0.51149696 -0.9294634  -0.9774256   1.371013    0.99696416
  0.57825214  0.6574653  -0.7069744  -0.21315512]
False
False
1000 out of 1000 embeddings are non-zero


In [None]:
# Prepare payloads: one row per description, with a sequential integer id.
record_count = len(all_descriptions)
payloads = pd.DataFrame(
    {
        "id": range(record_count),
        "description": all_descriptions,
        "tag": all_tags,
    }
)

payloads.head()

Unnamed: 0,id,description,tag
0,0,Improper reset logic causes signal to remain c...,"[RESET_ISSUE, STUCK_SIGNAL]"
1,1,Circular logic path eliminated but does not im...,"[COMB_LOOP, UNUSED_LOGIC, BUFFER_USAGE]"
2,2,Macro used in RTL block without proper expansion,[MACRO_USAGE]
3,3,Synthesis tool identified unused registered logic,"[REGISTER_USAGE, UNUSED_LOGIC]"
4,4,FSM condition causes unreachable state,"[FSM_PROBLEM, CONDITION_ISSUE]"


In [None]:
# One dict per row, keyed by column name: {"id": ..., "description": ..., "tag": ...}.
payload_dicts = payloads.to_dict("records")

In [None]:
from qdrant_client import QdrantClient, models
from qdrant_client.models import VectorParams, Distance

In [None]:
from google.colab import userdata

In [None]:
# Qdrant connection. The URL and API key are fetched through Colab's `userdata`
# under the names "QDRANT_DB_URL" / "QDRANT_API_KEY" rather than being written here.
qclient = QdrantClient(
    url=userdata.get("QDRANT_DB_URL"),
    api_key=userdata.get("QDRANT_API_KEY")
)

In [None]:
# Name of the Qdrant collection used by the create, upload and search cells below.
collection_name = "1000_mean_pooling_embeddings"

In [None]:
# Vector size for the collection: the number of components in one embedding row.
embedding_dim = mean_embeddings.shape[1]
print(embedding_dim)

768


In [None]:
# Start from an empty collection. `recreate_collection` is deprecated in
# qdrant-client (the call emits a warning), so do the same thing explicitly:
# drop the collection if it already exists, then create it.
if qclient.collection_exists(collection_name):
    qclient.delete_collection(collection_name)

# Cosine distance over vectors of the embedding size computed above.
qclient.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=embedding_dim,
        distance=Distance.COSINE
    )
)

  qclient.recreate_collection(


True

In [None]:
# Pair each payload row with the embedding at the same position. The record id
# is the payload's own "id"; only description and tag are stored as payload.
PAYLOAD_FIELDS = ("description", "tag")

records = [
    models.Record(
        id=int(payload["id"]),
        vector=mean_embeddings[row_idx],
        payload={field: payload[field] for field in PAYLOAD_FIELDS},
    )
    for row_idx, payload in enumerate(payload_dicts)
]


In [None]:
# `upload_records` is deprecated in qdrant-client (the call emits a warning);
# `upload_points` is its replacement and takes PointStruct objects. Convert the
# prepared records here, turning each vector into a plain list of floats.
points = [
    models.PointStruct(
        id=record.id,
        vector=np.asarray(record.vector).tolist(),
        payload=record.payload,
    )
    for record in records
]

# wait=True blocks until the points are stored, so the search cell that
# follows does not query a partially filled collection.
qclient.upload_points(
    collection_name=collection_name,
    points=points,
    wait=True,
)

  qclient.upload_records(


In [None]:
# Run if restarting notebook: reload the saved model and tokenizer from the
# export folder and switch the model to evaluation mode.
SAVED_MODEL_DIR = "/content/drive/MyDrive/Colab Notebooks/emb_model/1000_model_epoch9"

model = BertForSequenceClassification.from_pretrained(
    SAVED_MODEL_DIR,
    output_hidden_states=True,
)
tokenizer = BertTokenizerFast.from_pretrained(SAVED_MODEL_DIR)
model.eval()

In [None]:
# Prepare inputs for querying

input_desc = "Flop and CDC synchronization"
inputs = tokenizer(input_desc, padding=True, truncation=True, return_tensors="pt")


# Mean pooling approach for input embeddings (same recipe as for the stored vectors)
with torch.no_grad():
    output = model(**inputs, return_dict=True, output_hidden_states=True)

    hidden_states = output.hidden_states
    attention_mask = inputs["attention_mask"]
    last_hidden = hidden_states[-1]

    # Broadcast the token mask across the hidden dimension so masked-out
    # positions contribute nothing, then average over the remaining tokens.
    mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size())
    sum_embeddings = (last_hidden * mask_expanded).sum(dim=1)
    sum_mask = mask_expanded.sum(dim=1).clamp(min=1e-9)
    pooled = sum_embeddings / sum_mask

    # Single query: squeeze away the batch axis to get a flat vector.
    query_vector = pooled.squeeze().cpu().numpy()

In [None]:
# Confirm the query embedding is a single flat float vector before searching.
print(query_vector.shape) # (768,)
print(query_vector.dtype) # Should be float32

(768,)
float32


In [None]:
# Query Qdrant for New Text

# `search` is deprecated in qdrant-client (the call emits a warning);
# `query_points` is the replacement. It returns a response object whose
# `.points` attribute holds the scored hits, each with `.payload` and `.score`.
results = qclient.query_points(
    collection_name=collection_name,
    query=query_vector.tolist(),
    limit=5,
    score_threshold=0.6,  # adjust threshold
).points

print("\n")
print("Input Description: "+input_desc)
print("---------------------------------------------")

print("Suggested Tags")
if not results:
    print("No results found. Adjust threshold or refine query.")

# Show the stored tag list of every neighbour together with its similarity score.
for result in results:
    print(f"Tag: {result.payload['tag']}\tScore: {result.score:.3f}")


  results = qclient.search(




Input Description: Flop and CDC synchronization
---------------------------------------------
Suggested Tags
Tag: ['FLIP_FLOP']	Score: 0.802
Tag: ['FLIP_FLOP']	Score: 0.802
Tag: ['FLIP_FLOP']	Score: 0.802
Tag: ['FLIP_FLOP']	Score: 0.786
Tag: ['FLIP_FLOP']	Score: 0.786
