In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import re
import streamlit as st

# Instantiate the sentence-transformers model "all-MiniLM-L6-v2".
# NOTE(review): in the cells visible here, `model` is only referenced by the
# commented-out SentenceTransformer cell; the active cell uses OpenAI
# embeddings instead — confirm whether this load is still needed.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# 📘 Generate OpenAI Embeddings for SHL Catalog

import pandas as pd
import numpy as np
import re
import os
from openai import OpenAI
from tqdm import tqdm
import time
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process
# environment, then read the API key exactly once.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail fast with a clear message: a missing/empty key would otherwise only
    # surface later as request failures inside the embedding loop.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )

# Create the OpenAI client from the key read above (previously the key was
# stored in OPENAI_API_KEY but then looked up a second time).
client = OpenAI(api_key=OPENAI_API_KEY)

# Load the cleaned catalog. The location can be overridden with the
# SHL_CATALOG_CSV environment variable so the notebook is runnable on other
# machines; the original absolute path remains the default.
csv_path = os.getenv(
    "SHL_CATALOG_CSV",
    "/Users/parakhchaudhary/SHL_Recommendation_System/dataset/dataset_with_minutes.csv",
)
df = pd.read_csv(csv_path)

# Helper: turn a raw duration value into a labelled "minutes" phrase
def format_duration(text):
    """Return ``"Time required: <n> minutes"`` for the first integer in ``text``.

    ``text`` is converted with ``str()`` before searching, so non-string
    values (numbers, ``None``, NaN) are accepted. When the string form holds
    no run of digits, an empty string is returned.
    """
    found = re.search(r"(\d+)", str(text))
    if not found:
        return ""
    # int() normalises the digits (e.g. drops leading zeros).
    minutes = int(found.group(1))
    return f"Time required: {minutes} minutes"

# Build the labelled, pipe-separated string that gets embedded for one row
def build_embedding_text(row):
    """Compose the embedding input text for a single catalog row.

    Reads ``name``, ``test_type``, ``remote_testing`` and ``adaptive_support``
    from ``row`` and renders each as ``"<Label>: <value>"``. The phrase
    produced by ``format_duration(row["Assessment Length"])`` is appended only
    when it is non-empty. All pieces are joined with ``" | "``.
    """
    labelled_columns = (
        ("Test Name", "name"),
        ("Test Types", "test_type"),
        ("Remote", "remote_testing"),
        ("Adaptive", "adaptive_support"),
    )
    segments = [f"{label}: {row[column]}" for label, column in labelled_columns]

    duration_segment = format_duration(row["Assessment Length"])
    if duration_segment:
        segments.append(duration_segment)

    return " | ".join(segments)

print("🔄 Building embedding strings...")
df["text_for_embedding"] = df.apply(build_embedding_text, axis=1)

# Retry policy for embedding requests. The retry count is bounded so that a
# permanent failure (bad API key, invalid model, exhausted quota, ...) stops
# the cell with the real error instead of looping forever.
MAX_EMBEDDING_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 3


def _embed_with_retry(text, max_attempts=MAX_EMBEDDING_ATTEMPTS,
                      delay_seconds=RETRY_DELAY_SECONDS):
    """Request the "text-embedding-ada-002" embedding for ``text``.

    Args:
        text: The string to embed.
        max_attempts: Total number of requests to try (must be >= 1).
        delay_seconds: Pause between a failed attempt and the next one.

    Returns:
        ``response.data[0].embedding`` from the first successful request.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        Exception: The error from the final attempt is re-raised unchanged
            once every attempt has failed.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    for attempt in range(1, max_attempts + 1):
        try:
            response = client.embeddings.create(
                model="text-embedding-ada-002",
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            if attempt == max_attempts:
                # Out of attempts: surface the underlying error to the caller.
                raise
            print(f"❌ Error (retrying in {delay_seconds}s):", e)
            time.sleep(delay_seconds)


# Generate embeddings with OpenAI (one request per catalog row, in row order)
embeddings = []
print("🔄 Generating OpenAI embeddings...")

for text in tqdm(df["text_for_embedding"].tolist()):
    embeddings.append(_embed_with_retry(text))

# Save results: embeddings as a .npy array, catalog (with the new
# text_for_embedding column) as CSV.
np.save("dataset_openai.npy", embeddings)
df.to_csv("dataset_openai.csv", index=False)

print("✅ Saved OpenAI embeddings to 'dataset_openai.npy'")
print("✅ Saved updated catalog to 'dataset_openai.csv'")


🔄 Building embedding strings...
🔄 Generating OpenAI embeddings...


100%|██████████| 377/377 [04:27<00:00,  1.41it/s]

✅ Saved OpenAI embeddings to 'dataset_openai.npy'
✅ Saved updated catalog to 'dataset_openai.csv'





In [None]:

# # Load the cleaned catalog
# df = pd.read_csv("/Users/parakhchaudhary/SHL_Recommendation_System/dataset/dataset_with_minutes.csv")

# # Load SentenceTransformer model
# model = SentenceTransformer("all-MiniLM-L6-v2")

# # Helper: Clean and format the time duration field
# def format_duration(text):
#     match = re.search(r"(\d+)", str(text))
#     return f"Time required: {int(match.group(1))} minutes" if match else ""

# # Build embedding string with field labels
# def build_embedding_text(row):
#     parts = [
#         f"Test Name: {row['name']}",
#         f"Test Types: {row['test_type']}",
#         f"Remote: {row['remote_testing']}",
#         f"Adaptive: {row['adaptive_support']}",
#     ]
#     time_str = format_duration(row["Assessment Length"])
#     if time_str:
#         parts.append(time_str)
#     return " | ".join(parts)

# # Create embedding input column
# print("🔄 Building embedding strings...")
# df["text_for_embedding"] = df.apply(build_embedding_text, axis=1)

# # Generate embeddings
# print("🔄 Generating embeddings...")
# embeddings = model.encode(df["text_for_embedding"].tolist(), show_progress_bar=True)

# # Save embeddings and updated CSV
# np.save("dataset_final.npy", embeddings)
# df.to_csv("dataset_final.csv", index=False)

# print("✅ Embeddings saved to 'dataset_final.npy'")
# print("✅ Updated catalog saved to 'dataset_final.csv'")




🔄 Building embedding strings...
🔄 Generating embeddings...


Batches: 100%|██████████| 12/12 [00:02<00:00,  5.95it/s]

✅ Embeddings saved to 'shl_embeddings_with_time.npy'
✅ Updated catalog saved to 'shl_catalog_with_time.csv'



