In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import faiss
import numpy as np
import json
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange





In [23]:
# URL pattern for paginated pages
base_url = "https://www.shl.com/solutions/products/product-catalog/?start={}&type=2"
page_offsets = [i for i in range(0, 144, 12)]
urls = [base_url.format(offset) for offset in page_offsets]

# Storage for all scraped data
data = []

# Detects green circle = "Yes"
def has_green_dot(cell):
    return "Yes" if cell.find("span", class_="catalogue__circle -yes") else "No"

# Extract extra details from individual assessment page
def extract_assessment_details(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        def get_section_text(heading):
            section = soup.find("h4", string=heading)
            return section.find_next("p").get_text(strip=True) if section else ""

        def get_test_type():
            span = soup.select_one('span.product-catalogue__key')
            return span.text.strip() if span else ""

        def get_remote_testing_detail():
            remote_span = soup.select_one('p:has(> span.catalogue__circle.-yes)')
            return "Yes" if remote_span else "No"

        completion_raw = get_section_text("Assessment length")
        completion_time = re.search(r"\d+", completion_raw)
        completion_time = completion_time.group(0) if completion_time else ""

        return {
            "Description": get_section_text("Description"),
            "Job Levels": get_section_text("Job levels"),
            "Languages": get_section_text("Languages"),
            "Completion Time (mins)": completion_time,
            "Test Type (Detail)": get_test_type(),
            "Remote Testing (Detail)": get_remote_testing_detail()
        }
    except Exception as e:
        print(f"❌ Error scraping {url}: {e}")
        return {
            "Description": "",
            "Job Levels": "",
            "Languages": "",
            "Completion Time (mins)": "",
            "Test Type (Detail)": "",
            "Remote Testing (Detail)": ""
        }

# Loop through all catalog pages
for page_num, url in enumerate(urls, start=1):
    print(f"🔎 Scraping Page {page_num}: {url}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    tables = soup.find_all("table")
    if not tables:
        print(f"⚠️ No tables found on Page {page_num}")
        continue

    rows = tables[0].find_all("tr")[1:]  # Skip table header

    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 4:
            continue

        name = cols[0].get_text(strip=True)
        link_tag = cols[0].find("a")
        link = link_tag["href"] if link_tag else None
        full_link = f"https://www.shl.com{link}" if link else ""

        remote_testing = has_green_dot(cols[1])
        adaptive = has_green_dot(cols[2])
        test_type = cols[3].get_text(strip=True)

        # Get additional info from the assessment page
        details = extract_assessment_details(full_link)

        # Append row to the data
        data.append({
            "Assessment Name": name,
            "Link": full_link,
            "Remote Testing": remote_testing,
            "Adaptive/IRT": adaptive,
            "Test Type": test_type,
            "Description": details["Description"],
            "Job Levels": details["Job Levels"],
            "Languages": details["Languages"],
            "Completion Time (mins)": details["Completion Time (mins)"],
            "Test Type (Detail)": details["Test Type (Detail)"],
            "Remote Testing (Detail)": details["Remote Testing (Detail)"],
            "Page": page_num
        })
        time.sleep(0.5)  # polite delay

# Save final result to CSV
df = pd.DataFrame(data)
df.to_csv("shl_prepackaged_solutions_detailed.csv", index=False)
print("✅ Data saved to 'shl_prepackaged_solutions_detailed.csv'")


🔎 Scraping Page 1: https://www.shl.com/solutions/products/product-catalog/?start=0&type=2
🔎 Scraping Page 2: https://www.shl.com/solutions/products/product-catalog/?start=12&type=2
🔎 Scraping Page 3: https://www.shl.com/solutions/products/product-catalog/?start=24&type=2
🔎 Scraping Page 4: https://www.shl.com/solutions/products/product-catalog/?start=36&type=2
🔎 Scraping Page 5: https://www.shl.com/solutions/products/product-catalog/?start=48&type=2
🔎 Scraping Page 6: https://www.shl.com/solutions/products/product-catalog/?start=60&type=2
🔎 Scraping Page 7: https://www.shl.com/solutions/products/product-catalog/?start=72&type=2
🔎 Scraping Page 8: https://www.shl.com/solutions/products/product-catalog/?start=84&type=2
🔎 Scraping Page 9: https://www.shl.com/solutions/products/product-catalog/?start=96&type=2
🔎 Scraping Page 10: https://www.shl.com/solutions/products/product-catalog/?start=108&type=2
🔎 Scraping Page 11: https://www.shl.com/solutions/products/product-catalog/?start=120&typ

In [41]:
data=pd.read_csv('shl_prepackaged_solutions_detailed.csv')
data.head()

Unnamed: 0,Assessment Name,Link,Remote Testing,Adaptive/IRT,Test Type,Description,Job Levels,Languages,Completion Time (mins),Test Type (Detail),Remote Testing (Detail),Page
0,Account Manager Solution,https://www.shl.com/solutions/products/product...,Yes,Yes,CPAB,The Account Manager solution is an assessment ...,"Mid-Professional,","English (USA),",49.0,C,Yes,1
1,Administrative Professional - Short Form,https://www.shl.com/solutions/products/product...,Yes,Yes,AKP,The Administrative Professional solution is fo...,"Entry-Level,","English (USA),",36.0,A,Yes,1
2,Agency Manager Solution,https://www.shl.com/solutions/products/product...,Yes,Yes,ABPS,The Agency Manager solution is for mid-level s...,"Front Line Manager, Manager, Supervisor,","English (USA),",51.0,A,Yes,1
3,Apprentice + 8.0 Job Focused Assessment,https://www.shl.com/solutions/products/product...,Yes,No,BP,The Apprentice + 8.0 Job-Focused Assessment is...,"General Population, Graduate, Entry-Level,","English International, German,",30.0,B,Yes,1
4,Apprentice 8.0 Job Focused Assessment,https://www.shl.com/solutions/products/product...,Yes,No,BP,The Apprentice 8.0 Job-Focused Assessment is a...,"Entry-Level, General Population, Graduate,","English International, German, French,",20.0,B,Yes,1


In [43]:
data.isnull().sum()

Assessment Name            0
Link                       0
Remote Testing             0
Adaptive/IRT               0
Test Type                  0
Description                0
Job Levels                 1
Languages                  2
Completion Time (mins)     3
Test Type (Detail)         0
Remote Testing (Detail)    0
Page                       0
dtype: int64

In [45]:
data['Job Levels'].fillna("Unknown", inplace=True)
data['Languages'].fillna("English", inplace=True)
data.dropna(subset=['Completion Time (mins)'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Job Levels'].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Languages'].fillna("English", inplace=True)


In [49]:
data.isnull().sum()

Assessment Name            0
Link                       0
Remote Testing             0
Adaptive/IRT               0
Test Type                  0
Description                0
Job Levels                 0
Languages                  0
Completion Time (mins)     0
Test Type (Detail)         0
Remote Testing (Detail)    0
Page                       0
dtype: int64

In [53]:
data['combined_text'] = (
    "Assessment Name: " + data['Assessment Name'] + ". " +
    "Description: " + data['Description'] + ". " +
    "Test Type: " + data['Test Type'] + ". " +
    "Job Levels: " + data['Job Levels'].fillna('') + ". " +
    "Languages: " + data['Languages'].fillna('') + ". " +
    "Completion Time: " + data['Completion Time (mins)'].astype(str) + " minutes."
)


In [57]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = model.encode(data['combined_text'].tolist(), show_progress_bar=True)


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [59]:
# Convert embeddings to numpy array
embedding_matrix = np.vstack(embeddings).astype("float32")

# Build FAISS index
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

# Save metadata (like original assessment row info)
metadata = df.to_dict(orient="records")


In [61]:
faiss.write_index(index, "shl_index.faiss")
# Later...
# index = faiss.read_index("shl_index.faiss")


In [8]:
# Load model and FAISS index
model = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.read_index("shl_index.faiss")

# Load metadata
data = pd.read_csv("shl_prepackaged_solutions_detailed.csv")  # or however you load your main dataframe
metadata = data.to_dict(orient="records")

def recommend_assessments(query, top_k=10):
    # Step 1: Embed the user query
    query_embedding = model.encode([query])[0].astype("float32")  # FAISS needs float32

    # Step 2: Search the FAISS index
    distances, indices = index.search(np.array([query_embedding]), top_k)

    # Step 3: Get top-k results from metadata
    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx < len(metadata):  # safeguard
            record = metadata[idx]
            result = {
                "Assessment Name": record["Assessment Name"],
                "URL": record["Link"],  # if this column exists
                "Remote Testing Support": record.get("Remote Testing", "N/A"),
                "Adaptive/IRT Support": record.get("Adaptive/IRT", "N/A"),
                "Duration": f"{record.get('Completion Time (mins)', 'N/A')} mins",
                "Test Type": record.get("Test Type", "N/A"),
                "Similarity Score": f"{1 - dist:.4f}"  # Optional
            }
            results.append(result)
    return results


In [11]:
query = "Looking to hire mid-level professionals who are proficient in Python, SQL and Java Script. Need an assessment package that can test all skills with max duration of 60 minutes."
recommendations = recommend_assessments(query)

for rec in recommendations:
    print(json.dumps(rec, indent=2))


{
  "Assessment Name": "Industrial Professional and Skilled 7.1 Solution",
  "URL": "https://www.shl.com/solutions/products/product-catalog/view/industrial-professional-and-skilled-7-1-solution/",
  "Remote Testing Support": "Yes",
  "Adaptive/IRT Support": "No",
  "Duration": "49.0 mins",
  "Test Type": "AB",
  "Similarity Score": "0.2868"
}
{
  "Assessment Name": "Industrial Professional and Skilled 7.1 (Americas)",
  "URL": "https://www.shl.com/solutions/products/product-catalog/view/industrial-professional-and-skilled-7-1-%28americas%29/",
  "Remote Testing Support": "Yes",
  "Adaptive/IRT Support": "No",
  "Duration": "49.0 mins",
  "Test Type": "AB",
  "Similarity Score": "0.2766"
}
{
  "Assessment Name": "Support Supervisor Solution",
  "URL": "https://www.shl.com/solutions/products/product-catalog/view/support-supervisor-solution/",
  "Remote Testing Support": "Yes",
  "Adaptive/IRT Support": "Yes",
  "Duration": "39.0 mins",
  "Test Type": "APSB",
  "Similarity Score": "0.1531