# 🏌️‍♂️ Fine-Tune Embedding Model (Colab Ready) For the Golf Caddie AI


In [1]:
# 1. Install dependencies
# (Uncomment the next line if running in Colab)
!pip install sentence-transformers datasets torch pandas



### Environment Variables

In [2]:
# 2. Prompt for the base embedding model name.
# The name is also exported as the EMBEDDING_MODEL environment variable.
import os

# Strip stray whitespace from a pasted name so it does not end up in the
# model identifier.
EMBEDDING_MODEL = input("Enter the base embedding model (e.g., thenlper/gte-small): ").strip()
if not EMBEDDING_MODEL:
    # Fail here with a clear message instead of later, when the model is loaded.
    raise ValueError("No embedding model name was entered; re-run this cell and provide one.")
os.environ["EMBEDDING_MODEL"] = EMBEDDING_MODEL

Enter the base embedding model (e.g., thenlper/gte-small): BAAI/bge-base-en-v1.5


### Upload JSON

In [3]:
# 3. Upload your JSON file (records must have 'query' and 'ideal_answer' fields)
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: ask for a path on the local filesystem instead.
    JSON_PATH = input("Enter the path to your JSON file: ").strip()
else:
    uploaded = files.upload()
    if not uploaded:
        # Nothing came back (e.g. the upload dialog was cancelled).
        raise RuntimeError("No file was uploaded; re-run this cell and choose a JSON file.")
    # The keys of `uploaded` are the uploaded file names; use the first one.
    JSON_PATH = next(iter(uploaded))


Saving golden_shot_dataset.json to golden_shot_dataset (1).json


### Load sentence pairs

In [4]:
# 4. Load sentence pairs (query, ideal_answer) and wrap them as InputExamples
import pandas as pd
from sentence_transformers import InputExample

# Text prefixes prepended to each side of a training pair.
# NOTE(review): "query: " / "passage: " is the E5-style convention; other model
# families may expect different (or no) prefixes — confirm these suit the chosen
# base model, and use the same prefixes at inference time.
QUERY_PREFIX = "query: "
PASSAGE_PREFIX = "passage: "
REQUIRED_COLUMNS = ['query', 'ideal_answer']

raw_df = pd.read_json(JSON_PATH)

# Report missing fields with a clear message.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in raw_df.columns]
if missing_columns:
    raise KeyError(f"{JSON_PATH} is missing required column(s): {missing_columns}")

# Keep only the two text columns and drop rows where either one is null.
df = raw_df[REQUIRED_COLUMNS].dropna()

train_samples = [
    InputExample(texts=[f"{QUERY_PREFIX}{query}", f"{PASSAGE_PREFIX}{answer}"])
    for query, answer in zip(df['query'], df['ideal_answer'])
]
if not train_samples:
    raise ValueError(f"No usable (query, ideal_answer) pairs found in {JSON_PATH}")
print(f"Loaded {len(train_samples)} training pairs")


Loaded 40 training pairs


### Build the model

In [5]:
# 5. Build the model from the base checkpoint chosen earlier
from sentence_transformers import SentenceTransformer, models, losses
from torch.utils.data import DataLoader

MODEL_NAME = EMBEDDING_MODEL

# Load by name so the checkpoint's own sentence-transformers configuration
# (pooling mode, any normalization layer, max sequence length) is used. Building
# Transformer + default Pooling by hand forces mean pooling, which does not
# match every model (e.g. checkpoints that ship CLS pooling).
model = SentenceTransformer(MODEL_NAME)

# Keep the individual modules available under their previous names.
# NOTE(review): assumes the loaded pipeline starts with Transformer, Pooling —
# confirm for unusual checkpoints.
word_embedding_model = model[0]
pooling_model = model[1]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Set Up Training

In [6]:
# 6. Set up training and fine-tune
# MultipleNegativesRankingLoss treats the other passages in a batch as
# negatives, so the batch size affects the difficulty of the task.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)

num_epochs = 1  # You can increase this later
OUTPUT_DIR = f"./{MODEL_NAME.replace('/', '_')}_finetuned"

# Scale warmup to the length of the run. A fixed 100-step warmup is longer than
# the whole run on a small dataset (e.g. 40 pairs / batch 16 = 3 steps per
# epoch), so the learning rate would never finish ramping up. Use ~10% of the
# total steps, with a minimum of one.
total_steps = len(train_dataloader) * num_epochs
warmup_steps = max(1, total_steps // 10)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=OUTPUT_DIR
)
print(f"✅ Model saved to {OUTPUT_DIR}")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtamark22[0m ([33mtamark22-tamark-designs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


✅ Model saved to ./BAAI_bge-base-en-v1.5_finetuned


### Push to Hugging Face Hub

In [10]:
# 7. (Optional) Push to Hugging Face Hub
import os
from huggingface_hub import notebook_login

# Target repository ("<user-or-org>/<repo-name>"). Set the HF_REPO_ID
# environment variable to push somewhere else; the default is the repository
# this notebook was first published to.
HF_REPO_ID = os.environ.get("HF_REPO_ID", 'mwalker22/BAAI_bge-base-en-v1.5_finetuned')

# NOTE(review): notebook_login() renders a login widget — confirm the login has
# completed before the push below runs (re-run the cell after logging in if the
# push fails with an authentication error).
notebook_login()
model.push_to_hub(HF_REPO_ID)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

'https://huggingface.co/mwalker22/BAAI_bge-base-en-v1.5_finetuned/commit/ddf8ad7dd3a5d363edee9c81536e3ba73848212e'