<a href="https://colab.research.google.com/github/lwal123/google_colab/blob/main/prepare_train_test_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This file is prepared for running on Google colab

In [1]:
# Complete hardware summary
!echo "=== GPU INFO ==="
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
!echo "=== CPU INFO ==="
!lscpu | grep "Model name"
!echo "=== MEMORY INFO ==="
!free -h
!echo "=== DISK INFO ==="
!df -h /

=== GPU INFO ===
name, memory.total [MiB], memory.free [MiB]
Tesla T4, 15360 MiB, 15095 MiB
=== CPU INFO ===
Model name:                           Intel(R) Xeon(R) CPU @ 2.20GHz
=== MEMORY INFO ===
               total        used        free      shared  buff/cache   available
Mem:            12Gi       2.4Gi       7.1Gi       1.0Mi       3.2Gi        10Gi
Swap:             0B          0B          0B
=== DISK INFO ===
Filesystem      Size  Used Avail Use% Mounted on
overlay         113G   39G   74G  35% /


In [2]:
# Google Colab's secret management system.

from google.colab import userdata
import os

os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

In [7]:
!pip install scikit-learn pandas datasets accelerate sentence_transformers  numpy




In [45]:
# Set the environment variable to disable parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [46]:
!pip uninstall wandb -y

[0m

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
import numpy as np
import random
import datasets
import accelerate
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import BatchSemiHardTripletLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

In [4]:
# ----------------------------Load form HuggingFace
dataset = datasets.load_dataset("aaa961/vscode_bugs_duplicates")
df = dataset['train'].to_pandas()

# filter only row with duplicates
# 670 rows
df = df[df["has_duplicates"] == 1]

README.md:   0%|          | 0.00/675 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/34.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32596 [00:00<?, ? examples/s]

In [5]:

# -----------------------------Split into train/test/eval sets with stratification

# Collapse to cluster-level
clusters = df.drop_duplicates("Cluster")[["Cluster", "num_duplicates"]]

# Separate rare categories
rare_clusters = clusters.groupby("num_duplicates").filter(
    lambda g: len(g) == 1 or len(g) == 2
)
common_clusters = clusters.drop(rare_clusters.index)

# Step 1: Split into train (70%) and temp (30%), stratified by num_duplicates on common clusters
train_clusters, temp_clusters = train_test_split(
    common_clusters,
    test_size=0.3,
    stratify=common_clusters["num_duplicates"],
    random_state=42,
)

# Step 2: Split temp into test (15%) and eval (15%),
# NO stratified - to  ensure we don't get error during stratification fro rare clusters
test_clusters, eval_clusters = train_test_split(
    temp_clusters,
    test_size=0.5,
    # stratify=temp_clusters["num_duplicates"],
    random_state=42,
)

# Add rare ones to training (so they're at least represented somewhere)
train_clusters = pd.concat([train_clusters, rare_clusters])


# Map back to original dataframe
train_df = df[df["Cluster"].isin(train_clusters["Cluster"])]
test_df = df[df["Cluster"].isin(test_clusters["Cluster"])]
eval_df = df[df["Cluster"].isin(eval_clusters["Cluster"])]

# --- Debugging ---
print(
    "Train size (number of rows):",
    len(train_df),
    "Num of clusters:",
    train_clusters.shape[0],
)
print(
    "Test size (number of rows):",
    len(test_df),
    "Num of clusters:",
    test_clusters.shape[0],
)
print(
    "Eval size (number of rows):",
    len(eval_df),
    "Num of clusters:",
    eval_clusters.shape[0],
)

print("\nCluster distribution in train:")
print(train_clusters["num_duplicates"].value_counts(normalize=True))

print("\nCluster distribution in test:")
print(test_clusters["num_duplicates"].value_counts(normalize=True))

print("\nCluster distribution in eval:")
print(eval_clusters["num_duplicates"].value_counts(normalize=True))

Train size (number of rows): 476 Num of clusters: 209
Test size (number of rows): 95 Num of clusters: 44
Eval size (number of rows): 96 Num of clusters: 44

Cluster distribution in train:
num_duplicates
2.0    0.827751
3.0    0.119617
4.0    0.028708
6.0    0.009569
7.0    0.009569
5.0    0.004785
Name: proportion, dtype: float64

Cluster distribution in test:
num_duplicates
2.0    0.840909
3.0    0.159091
Name: proportion, dtype: float64

Cluster distribution in eval:
num_duplicates
2.0    0.863636
3.0    0.090909
4.0    0.045455
Name: proportion, dtype: float64


In [8]:
# data format: {"sentence": "text", "label": int}
# where text is "description all" and label is cluster id


train_dataset = Dataset.from_pandas(
    train_df[["Description_all", "Cluster"]].rename(
        columns={"Description_all": "sentence", "Cluster": "label"}
    ).reset_index(drop=True)
)

test_dataset = Dataset.from_pandas(
    test_df[["Description_all", "Cluster"]].rename(
        columns={"Description_all": "sentence", "Cluster": "label"}
    ).reset_index(drop=True)
)

In [9]:
train_dataset

Dataset({
    features: ['sentence', 'label'],
    num_rows: 476
})

In [10]:
# ----------------------------Prepare data for evaluation
# data format: {anchor: "text", positive: "text", negative: "text"}
# where each triplet: anchor is "description all" , positive is  "description all" of the same cluster, and negative is  "description all" of a different cluster



def create_triplet_dict_for_evaluation(test_df, random_state=42):
    """
    Create triplet dictionary for TripletEvaluator where:
    - anchor: one Description_all text
    - positive: one other Description_all text from same cluster
    - negative: one Description_all text from a different cluster

    Args:
        test_df: DataFrame with columns including 'Description_all', 'Cluster'
        random_state: Random seed for reproducibility

    Returns:
        dict: {"anchor": [texts], "positive": [texts], "negative": [texts]}
              all flat lists of same length
    """
    random.seed(random_state)
    np.random.seed(random_state)

    anchors = []
    positives = []
    negatives = []

    clustered_data = test_df.groupby("Cluster")
    all_clusters = list(test_df["Cluster"].unique())

    print(f"Processing {len(clustered_data)} clusters...")

    for cluster_id, cluster_group in clustered_data:
        # Skip clusters with only one item (no positives possible)
        if len(cluster_group) < 2:
            continue

        cluster_descriptions = cluster_group["Description_all"].tolist()
        other_clusters = [c for c in all_clusters if c != cluster_id]

        # Each item in cluster becomes an anchor
        for i, anchor_text in enumerate(cluster_descriptions):
            # Candidates for positives (same cluster, not the anchor)
            positive_candidates = [
                desc for j, desc in enumerate(cluster_descriptions) if j != i
            ]
            if not positive_candidates or not other_clusters:
                continue

            # Pick exactly one positive and one negative
            positive_text = random.choice(positive_candidates)

            neg_cluster = random.choice(other_clusters)
            neg_candidates = test_df[test_df["Cluster"] == neg_cluster][
                "Description_all"
            ].tolist()
            negative_text = random.choice(neg_candidates)

            # Add triplet
            anchors.append(anchor_text)
            positives.append(positive_text)
            negatives.append(negative_text)

    result = {"anchor": anchors, "positive": positives, "negative": negatives}

    print(f"Generated triplet dictionary:")
    print(f"  - {len(anchors)} triplets total")

    return result


In [11]:
# -----------------------------Prepare data for loss function (as Hugging face Datasets)
# data format for triplet loss: {"anchor": "text", "positive": "text", "negative": "text"}


train_triplet = create_triplet_dict_for_evaluation(train_df)
test_triplet = create_triplet_dict_for_evaluation(test_df)
eval_triplet = create_triplet_dict_for_evaluation(eval_df)



Processing 209 clusters...
Generated triplet dictionary:
  - 476 triplets total
Processing 44 clusters...
Generated triplet dictionary:
  - 95 triplets total
Processing 44 clusters...
Generated triplet dictionary:
  - 96 triplets total


In [12]:
# ----------------------create triplet dictionary for evaluation

TRAIN_EVALUATOR_NAME = "bge-base-en-train"
EVAL_EVALUATOR_NAME = "bge-base-en-eval"


train_evaluator = TripletEvaluator(
    anchors=train_triplet["anchor"],
    positives=train_triplet["positive"],
    negatives=train_triplet["negative"],
    name=TRAIN_EVALUATOR_NAME,
)


test_evaluator = TripletEvaluator(
    anchors=test_triplet["anchor"],
    positives=test_triplet["positive"],
    negatives=test_triplet["negative"],
    name=TRAIN_EVALUATOR_NAME,
)


eval_evaluator = TripletEvaluator(
    anchors=eval_triplet["anchor"],
    positives=eval_triplet["positive"],
    negatives=eval_triplet["negative"],
    name=TRAIN_EVALUATOR_NAME,
)

len(eval_triplet["anchor"])
len(eval_triplet["positive"])
len(eval_triplet["negative"])


96

In [13]:
# ----------------------------Get model and fine-tuning parameters


# To resolve the warning from huggingface/tokenizers about parallelism:
# 1. Avoid using `tokenizers` before the fork if possible.
# 2. Explicitly set the environment variable TOKENIZERS_PARALLELISM to either 'true' or 'false'.
import os

# Set the environment variable to disable parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

TEST_SIZE = 0.2
BASE_MODEL_NAME = "BAAI/bge-base-en"
FINETUNED_MODEL_NAME = "aaa961/finetuned-bge-base-en"

MODEL_OUTPUT_DIR = "./models/bge-base-en"


model = SentenceTransformer(BASE_MODEL_NAME)
loss = BatchSemiHardTripletLoss(model)

#! after installing accelarator (uv add accelarator) you must restart kernel
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=MODEL_OUTPUT_DIR,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    report_to="none"
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# ---------------------------- Evaluate model before fine-tuning
# why only cosine???
# it's accurecy is 0.96 it's very big , something wrong ????????
# {'bge-base-en-train_cosine_accuracy': 0.96875}
eval_evaluator(model)


{'bge-base-en-train_cosine_accuracy': 0.96875}

In [15]:

# 11 min
# {'bge-base-en-train_cosine_accuracy': 0.9453781247138977}
train_evaluator(model)


{'bge-base-en-train_cosine_accuracy': 0.9453781247138977}

In [16]:
# ---------------------------- Fine-tune model
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss=loss,
    evaluator=train_evaluator,
)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [17]:

trainer.train()

Step,Training Loss,Validation Loss,Bge-base-en-train Cosine Accuracy
100,0.0,,0.0


TrainOutput(global_step=150, training_loss=0.0, metrics={'train_runtime': 232.402, 'train_samples_per_second': 10.241, 'train_steps_per_second': 0.645, 'total_flos': 0.0, 'train_loss': 0.0, 'epoch': 5.0})

In [19]:
train_evaluator(model)


{'bge-base-en-train_cosine_accuracy': 0.0}

In [20]:
emb = model.encode(train_df["Description_all"].tolist())
print(np.mean(np.linalg.norm(emb, axis=1)))  # should be ~1.0 if normalized
print(np.std(emb, axis=0))  # if near 0, embeddings collapsed

nan
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan na

In [21]:
train_df

Unnamed: 0,Issue id,Summary,Created,Resolved,Description,Resolution,Duplicates,Cluster,num_duplicates,has_duplicates,Description_all
210,218628,Branch list is sometimes out of order,2024-06-26 23:08:22+00:00,2024-07-03 14:11:59+00:00,\r\nType: <b>Bug</b>\r\n\r\n1. Open a workspac...,completed,"[214626, 218628]",299.0,2.0,1,Branch list is sometimes out of order \r\nType...
355,214626,Git Branch Picker Race Condition,2024-06-07 16:10:17+00:00,2024-07-05 13:29:58+00:00,If I paste the branch too quickly and then pre...,completed,"[214626, 218628]",299.0,2.0,1,Git Branch Picker Race Condition If I paste th...
423,213730,Ctrl+I stopped working after first hold+talk+r...,2024-05-28 21:07:05+00:00,2024-05-29 07:28:38+00:00,Testing #213355\r\n\r\nScreencast shows that i...,not_planned,"[213637, 213730]",298.0,2.0,1,Ctrl+I stopped working after first hold+talk+r...
451,213637,`Ctrl + I` does not work when chat input field...,2024-05-28 09:13:33+00:00,2024-05-30 22:32:44+00:00,Testing #213355\r\n\r\n`Ctrl + I` works in the...,completed,"[213637, 213730]",298.0,2.0,1,`Ctrl + I` does not work when chat input field...
543,212508,After updating to version 1.89 my $HOME path i...,2024-05-11 15:03:55+00:00,,Type: <b>Bug</b>\r\n\r\nAfter updating to the ...,,"[212487, 212508]",297.0,2.0,1,After updating to version 1.89 my $HOME path i...
...,...,...,...,...,...,...,...,...,...,...,...
21020,61474,Expanding emmet abbreviations inside JSX inlin...,2018-10-22 08:53:47+00:00,,Issue Type: <b>Bug</b>\r\n\r\nExpanding emmet ...,,"[61474, 115839, 115880, 116151]",36.0,4.0,1,Expanding emmet abbreviations inside JSX inlin...
23979,43928,The bgcolor of selected text in highlighted se...,2018-02-18 12:43:46+00:00,,### Issue Type\r\nBug\r\n\r\n### Description\r...,reopened,"[43928, 179238]",241.0,2.0,1,The bgcolor of selected text in highlighted se...
24379,41574,Can we get a few more pixels for clicking the ...,2018-01-13 16:43:00+00:00,,Hello!\r\n\r\nI'm routinely frustrated when tr...,,"[41574, 127025]",93.0,2.0,1,Can we get a few more pixels for clicking the ...
24527,40262,"Bottom left ""action"" (gear icon) menu not scro...",2017-12-15 06:20:28+00:00,2022-08-04 13:32:52+00:00,- VSCode Version: Code 1.19.0 (816be6780ca8bd0...,completed,"[40262, 110453]",19.0,2.0,1,"Bottom left ""action"" (gear icon) menu not scro..."
