In [1]:
!pip install -q beautifulsoup4 requests pandas flask-ngrok pyngrok sentence-transformers faiss-cpu flask flask-restful uvicorn


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import requests, time, re, pandas as pd, numpy as np, pickle, faiss
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from sentence_transformers import SentenceTransformer


In [3]:
# Landing page of the SHL product catalog; product links are discovered from it.
BASE_URL = "https://www.shl.com/solutions/products/product-catalog/"

# Request headers sent with every HTTP call made by this notebook.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
}

def get_soup(url):
    """Download ``url`` and return the page parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``HEADERS`` and a 20-second
    timeout. A 4xx/5xx response raises via ``raise_for_status``.
    """
    response = requests.get(url, headers=HEADERS, timeout=20)
    response.raise_for_status()
    html = response.text
    return BeautifulSoup(html, "html.parser")

def crawl_shl_catalog(delay=1):
    """Scrape the product pages linked from the catalog landing page.

    Only anchors present on the single page at ``BASE_URL`` are followed (no
    pagination is walked). Each linked ``/product-catalog/view/`` page is
    fetched, and pages whose text mentions "pre-packaged" are skipped.

    Parameters
    ----------
    delay : int or float, default 1
        Seconds to sleep before each product-page request.

    Returns
    -------
    pandas.DataFrame
        One row per scraped product with columns ``name``, ``url``, ``desc``
        (paragraph text, at most 1500 characters) and ``text`` (full page
        text). The columns exist even when nothing was scraped, so callers
        can index them unconditionally.

    Raises
    ------
    requests.RequestException
        If the landing page itself cannot be fetched. Failures on individual
        product pages are reported and skipped instead.
    """
    soup = get_soup(BASE_URL)
    links = {
        urljoin(BASE_URL, a["href"])
        for a in soup.find_all("a", href=True)
        if "/product-catalog/view/" in a["href"]
    }

    records = []
    # Sorted so the crawl order (and therefore the row order) is deterministic.
    for link in sorted(links):
        try:
            time.sleep(delay)
            psoup = get_soup(link)
        except requests.RequestException as exc:
            # Best-effort crawl: one bad page must not abort the whole run,
            # but the failure is reported rather than silently dropped.
            print(f"[crawl_shl_catalog] skipped {link}: {exc}")
            continue

        text = psoup.get_text(" ", strip=True)
        if "pre-packaged" in text.lower():
            continue

        heading = psoup.find("h1") or psoup.find("h2") or psoup.title
        if heading is None:
            print(f"[crawl_shl_catalog] skipped {link}: no <h1>/<h2>/<title> found")
            continue

        title = heading.get_text(strip=True)
        desc = " ".join(p.get_text(" ", strip=True) for p in psoup.find_all("p"))[:1500]
        records.append({"name": title, "url": link, "desc": desc, "text": text})

    return pd.DataFrame(records, columns=["name", "url", "desc", "text"])

def classify_type(t):
    """Tag a piece of text as "K", "P" or "U" by keyword lookup.

    The text is lower-cased and searched for plain substrings (no word
    boundaries). Technical keywords are checked first and yield "K";
    behavioural keywords yield "P"; anything else is "U".
    """
    lowered = t.lower()
    technical_keywords = (
        "sql", "python", "java", "javascript", "css", "html", "data", "excel",
        "automation", "coding", "developer", "analysis", "engineer", "tableau",
        "selenium",
    )
    behavioural_keywords = (
        "personality", "behaviour", "behavior", "communication", "leadership",
        "interpersonal", "opq", "team", "values", "culture", "motivation",
    )
    for keyword in technical_keywords:
        if keyword in lowered:
            return "K"
    for keyword in behavioural_keywords:
        if keyword in lowered:
            return "P"
    return "U"

# Crawl the catalog, tag each product, and build the lower-cased text blob
# that the token checks search through.
df_products = crawl_shl_catalog()

type_tags = df_products["text"].apply(classify_type)
df_products["type_tag"] = type_tags

search_parts = df_products["name"] + " " + df_products["desc"] + " " + df_products["text"]
df_products["__search_text__"] = search_parts.str.lower()

# Snapshot of the raw crawl, before rows from the Excel dataset are merged in.
df_products.to_csv("shl_catalog_scraped.csv", index=False)


In [4]:
import os
from google.colab import files

# Prompt for an upload only when the labelled workbook is not already on disk.
excel_path = "/content/Gen_AI Dataset (1).xlsx"
workbook_present = os.path.exists(excel_path)
if not workbook_present:
    uploaded = files.upload()

df_excel = pd.read_excel(excel_path)

# Distinct, non-null assessment URLs from the workbook, as plain strings.
url_column = df_excel["Assessment_url"].dropna()
unique_urls = pd.Series(url_column.unique()).astype(str).tolist()

# URLs the crawl has not collected yet.
existing = set(df_products["url"])
to_add = list(filter(lambda u: u not in existing, unique_urls))

def fetch_page(u):
    """Fetch one product page and return it as a catalog record.

    Parameters
    ----------
    u : str
        URL of the page to fetch.

    Returns
    -------
    dict
        Keys ``name``, ``url``, ``desc`` (paragraph text, at most 2000
        characters) and ``text`` (page text, at most 8000 characters).
        This is best-effort: if the request fails or the page has no
        ``<h1>``/``<h2>``/``<title>``, a placeholder record is returned with
        the URL as its name and empty ``desc``/``text``, so the URL still
        appears in the catalog.
    """
    placeholder = {"name": u, "url": u, "desc": "", "text": ""}

    try:
        # Same request settings (headers, timeout, status check) as the crawler.
        s = get_soup(u)
    except requests.RequestException as exc:
        # Report instead of silently swallowing; Ctrl-C is no longer trapped.
        print(f"[fetch_page] using placeholder for {u}: {exc}")
        return placeholder

    heading = s.find("h1") or s.find("h2") or s.title
    if heading is None:
        print(f"[fetch_page] using placeholder for {u}: no <h1>/<h2>/<title> found")
        return placeholder

    title = heading.get_text(strip=True)
    desc = " ".join(p.get_text(" ", strip=True) for p in s.find_all("p"))[:2000]
    text = s.get_text(" ", strip=True)[:8000]
    return {"name": title, "url": u, "desc": desc, "text": text}

# Fetch every dataset URL that the crawl missed and merge it into the catalog.
new_rows = [fetch_page(u) for u in to_add]

# Explicit columns keep the frame well-formed when there is nothing to add;
# without them an empty `new_rows` yields a column-less frame and the lookups
# below raise KeyError.
df_new = pd.DataFrame(new_rows, columns=["name", "url", "desc", "text"])
df_new["__search_text__"] = (df_new["name"] + " " + df_new["desc"] + " " + df_new["text"]).str.lower()
df_new["type_tag"] = df_new["text"].apply(classify_type)

# Crawled rows come first, so on a duplicate URL the crawled version is kept.
df_products = (
    pd.concat([df_products, df_new], ignore_index=True)
    .drop_duplicates(subset=["url"])
    .reset_index(drop=True)
)
df_products.to_csv("shl_catalog_augmented.csv", index=False)


In [5]:
# Embed "<name>. <desc>" for every product and index the vectors for
# inner-product search over L2-normalised embeddings.
model = SentenceTransformer("all-MiniLM-L6-v2")
corpus = df_products["name"].str.cat(df_products["desc"], sep=". ").tolist()
embeddings = model.encode(corpus, show_progress_bar=True, convert_to_numpy=True)

# normalize_L2 works in place, so the saved array holds the normalised vectors.
faiss.normalize_L2(embeddings)
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)

# Persist the vectors and the catalog frame.
np.save("embeddings.npy", embeddings)
with open("df_products.pkl", "wb") as f:
    pickle.dump(df_products, f)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Technology keywords looked for in queries and in product text/URL/name.
TECH_TOKENS = [
    "python", "sql", "javascript", "java",
    "excel", "tableau", "selenium", "automation",
    "react", "node", "css", "html",
]

# Behavioural keywords (available for diagnostics such as diag_counts).
BEH_TOKENS = [
    "personality", "behaviour", "behavior", "interpersonal",
    "communication", "leadership", "team", "culture",
]

def diag_counts(df,tokens):
    """Count, per token, how many rows mention it as a whole word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``__search_text__`` column of strings.
    tokens : iterable of str
        Tokens to look for; each is matched literally between word
        boundaries, so "java" does not count a row that only has "javascript".

    Returns
    -------
    dict
        ``{token: number_of_matching_rows}`` with plain ``int`` values.
    """
    search_text = df["__search_text__"]
    counts = {}
    for t in tokens:
        # The previous pattern used r"\\b", which is a literal backslash
        # followed by "b" rather than a word boundary, so nothing matched.
        pattern = r"\b" + re.escape(t) + r"\b"
        # na=False: rows with missing text count as "no match".
        counts[t] = int(search_text.str.contains(pattern, regex=True, na=False).sum())
    return counts

def query_tokens(q):
    """Return the TECH_TOKENS that occur as whole words in the query ``q``.

    Matching is case-insensitive (the query is lower-cased first) and the
    result keeps the order of ``TECH_TOKENS``.
    """
    q = q.lower()
    # The previous pattern used r"\\b", which is a literal backslash followed
    # by "b" rather than a word boundary, so this function always returned [].
    return [t for t in TECH_TOKENS if re.search(r"\b" + re.escape(t) + r"\b", q)]

def token_strict_recommend(q,top_k=10):
    """Recommend up to ``top_k`` catalog products for the free-text query ``q``.

    Steps:
      1. Embed the query with ``model``, L2-normalise it and retrieve up to
         200 nearest neighbours from the FAISS ``index``.
      2. For each neighbour, collect the TECH_TOKENS found as substrings in
         its search text, URL or name, and add a bonus of
         ``0.45 * min(1, n_tokens / 2)`` to the similarity.
      3. If the query names any tech token (see ``query_tokens``), products
         sharing at least one of those tokens are ranked first. Remaining
         slots are filled from the other candidates, after a 0.5 penalty for
         candidates that carry no tech token at all. Otherwise candidates are
         simply ranked by combined score.

    Parameters
    ----------
    q : str
        Query text.
    top_k : int, default 10
        Maximum number of recommendations.

    Returns
    -------
    list of dict
        Each with ``assessment_name``, ``assessment_url`` and ``score``
        (a plain Python float, so the result can be JSON-serialised).
        Empty when ``top_k`` is not positive or the index holds no vectors.
    """
    # Nothing to search / nothing requested: avoid a zero-k FAISS search.
    if top_k <= 0 or index.ntotal == 0:
        return []

    q_tokens = query_tokens(q)
    q_emb = model.encode([q], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, min(200, index.ntotal))

    cands = []
    for sim, idx in zip(D[0], I[0]):
        if idx < 0:  # FAISS pads missing neighbours with -1
            continue
        prod = df_products.iloc[idx]
        txt = prod["__search_text__"]
        nm = prod["name"]
        url = prod["url"]
        nm_low = nm.lower()
        url_low = url.lower()
        # NOTE(review): substring match, so "java" also fires on "javascript".
        prod_toks = [t for t in TECH_TOKENS if t in txt or t in url_low or t in nm_low]
        prod_score = min(1.0, len(prod_toks) / 2.0)
        comb = sim + 0.45 * prod_score
        cands.append({"name": nm, "url": url, "sim": sim, "prod_toks": prod_toks, "score": comb})

    by_score = lambda c: c["score"]

    if q_tokens:
        wanted = set(q_tokens)
        # Single-pass partition replaces the quadratic `c not in out` scan.
        strict, rest = [], []
        for c in cands:
            (strict if wanted.intersection(c["prod_toks"]) else rest).append(c)

        out = sorted(strict, key=by_score, reverse=True)[:top_k]
        if len(out) < top_k:
            # Not enough token matches: back-fill, demoting token-less products.
            for c in rest:
                if not c["prod_toks"]:
                    c["score"] -= 0.5
            rest.sort(key=by_score, reverse=True)
            out += rest[:top_k - len(out)]
    else:
        out = sorted(cands, key=by_score, reverse=True)[:top_k]

    return [
        {"assessment_name": r["name"], "assessment_url": r["url"], "score": float(r["score"])}
        for r in out
    ]


In [7]:
# Smoke-test the recommender on one sample hiring query.
q = "Looking to hire mid-level professionals who are proficient in Python, SQL and Java Script. Need an assessment package max 60 minutes."
recs = token_strict_recommend(q, top_k=10)
for i, r in enumerate(recs, start=1):
    name = r["assessment_name"]
    link = r["assessment_url"]
    shown_score = round(r["score"], 3)
    print(i, "-", name, "|", link, "| score:", shown_score)


1 - Automata Selenium | https://www.shl.com/solutions/products/product-catalog/view/automata-selenium/ | score: 0.917
2 - JavaScript (New) | https://www.shl.com/solutions/products/product-catalog/view/javascript-new/ | score: 0.869
3 - HTML/CSS (New) | https://www.shl.com/solutions/products/product-catalog/view/htmlcss-new/ | score: 0.783
4 - Python (New) | https://www.shl.com/solutions/products/product-catalog/view/python-new/ | score: 0.779
5 - Automata - Fix (New) | https://www.shl.com/solutions/products/product-catalog/view/automata-fix-new/ | score: 0.734
6 - Core Java (Entry Level) (New) | https://www.shl.com/solutions/products/product-catalog/view/core-java-entry-level-new/ | score: 0.677
7 - SQL Server Analysis Services (SSAS) (New) | https://www.shl.com/solutions/products/product-catalog/view/sql-server-analysis-services-%28ssas%29-%28new%29/ | score: 0.666
8 - SQL Server (New) | https://www.shl.com/solutions/products/product-catalog/view/sql-server-new/ | score: 0.662
9 - Cor

In [8]:
import pandas as pd, os
# Use the unlabeled test set when it is present; otherwise fall back to a
# single built-in sample query.
if not os.path.exists("unlabeled_test.csv"):
    test_queries = [
        "Looking to hire mid-level professionals who are proficient in Python, SQL and Java Script. Need an assessment package max 60 minutes."
    ]
else:
    test_df = pd.read_csv("unlabeled_test.csv")
    test_queries = test_df["Query"].astype(str).tolist()

# One output row per (query, recommended URL) pair, in ranking order.
rows = []
for q in test_queries:
    recs = token_strict_recommend(q, top_k=10)
    rows.extend({"Query": q, "Assessment_url": r["assessment_url"]} for r in recs)

submission = pd.DataFrame(rows)
submission.to_csv("recommendations_final.csv", index=False)
