# Testing

## Single car pulling 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from typing import Dict, Any
import random
import json

# Configure headers to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
}


In [68]:
def scrape_car_details(url: str) -> Dict[str, Any]:
    """
    Scrape car details from SGCarMart listing page
    
    Args:
        url (str): URL of the car listing page
        
    Returns:
        Dict[str, Any]: Dictionary containing car details
    """
    try:
        # Add random delay to be respectful to the server
        time.sleep(random.uniform(1, 3))
        
        # Make the request
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Initialize dictionary to store car details
        car_details = {}
        
        # Find all detail rows
        detail_rows = soup.find_all('div', class_='row_info')
        
        # Extract information from each row
        for row in detail_rows:
            # Find label and value
            label_elem = row.find('div', class_='label')
            value_elem = row.find('div', class_='value')
            
            if label_elem and value_elem:
                label = label_elem.text.strip()
                value = value_elem.text.strip()
                
                # Clean up the label and value
                label = label.replace(':', '').strip()
                value = value.replace('info', '').strip()
                
                # Store in dictionary
                car_details[label] = value
        
        # Extract price separately as it might be in a different location
        price_elem = soup.find('div', class_='price')
        if price_elem:
            car_details['Price'] = price_elem.text.strip()
            
        return car_details
        
    except Exception as e:
        print(f"Error scraping car details: {str(e)}")
        return {}


In [73]:
# Test the scraper with a sample car listing URL
test_url = "https://www.sgcarmart.com/used_cars/info.php?ID=1439810"

# Test the scraper
car_info = scrape_car_details(test_url)

# Pretty print the results
print(json.dumps(car_info, indent=2, ensure_ascii=False))


{}


In [70]:
url = "https://www.sgcarmart.com/used-cars/info/toyota-prius-plus-hybrid-1439810?dl=4309"

# Make the request
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')



In [16]:
# Let's breakdown the main headers of the soup object (HTML) slowly

# 1. Find all main header tags. Common main headers are <h1>, <h2>, <h3>
h1_headers = soup.find_all('h1')
h2_headers = soup.find_all('h2')
h3_headers = soup.find_all('h3')

print("H1 Headers:")
for h1 in h1_headers:
    print("-", h1.get_text(strip=True))

print("\nH2 Headers:")
for h2 in h2_headers:
    print("-", h2.get_text(strip=True))

print("\nH3 Headers:")
for h3 in h3_headers:
    print("-", h3.get_text(strip=True))

H1 Headers:
- Toyota Prius Plus Hybrid 1.8A

H2 Headers:
- Toyota Prius Discussions
- Related Discussions
- Products & Services
- Alternative Cars
- Recommended For You
- NCD Builder Insurance
- Seller Information
- Resources
- Leasing Option
- Pricing Summary
- Free Car Servicing
- Upfront Payment

H3 Headers:


In [19]:
# From this header (<h1>), extract and then print all body content after it

h1_headers = soup.find_all('h1')
if h1_headers:
    h1 = h1_headers[0]
    print("H1 Header:")
    print("-", h1.get_text(strip=True))

    # Get all the siblings after the h1 until end of body
    print("\nBody content after <h1>:")
    # Iterate over all <div> tags that appear after the <h1>
    for div in h1.find_all_next("div"):
        text = div.get_text(strip=True)
        if text:
            print("-", text)

H1 Header:
- Toyota Prius Plus Hybrid 1.8A

Body content after <h1>:
- ShortlistShareCopy LinkFacebook ShareWhatsApp ShareMoreReport ErrorPrintable VersionPost an Ad
- ShareCopy LinkFacebook ShareWhatsApp Share
- Copy LinkFacebook ShareWhatsApp Share
- MoreReport ErrorPrintable VersionPost an Ad
- Report ErrorPrintable VersionPost an Ad
- OverviewFinancialSimilarLeasingAccessoriesResearchPhotosMap
- OverviewFinancialSimilarLeasingAccessoriesResearchPhotosMap
- Overview
- Financial
- Similar
- Leasing
- Accessories
- Research
- Photos
- Map
- Loan Calculator
- Loan Calculator
- Toyota Prius DiscussionsRelated DiscussionsView All5G Toyota Prius[Official] 2021 2nd Generation Toyota Aqua / Prius CAn Early Look at the 2023 Toyota Prius Prime XSE PremiumToyota Prius Alpha/PlusInside neglected Prius engine after 500,000 kmShare your thoughts on TelegramAbove discussions are extracted frommycarforum.comYou may login to Mycarforum using your Sgcarmart username & password.
- Toyota Prius Discuss

In [20]:
# List all possible div class attributes found in the soup

divs = soup.find_all("div")
div_classes = set()
for div in divs:
    cls = div.get("class")
    if cls:
        # cls is a list, so turn it into a space-separated string for uniqueness
        div_classes.add(" ".join(cls))

print("All unique div classes:")
for cls in sorted(div_classes):
    print("-", cls)

All unique div classes:
- GoogleAds_ads_container__G6sKu gam GoogleAds_desktop_ads__Cj7vd
- GoogleAds_google_ads_container__nvFye
- Styles_dropdown_menu__MK439 styles_dropdown_menu__1sDZj dropdown-menu
- below_footer_container
- bottomfooter styles_bottomFooter__YjRyp
- clear
- d-flex align-items-center gap-2 styles_showDetailSeperator__9bY0w
- d-flex gap-3 mt-3 overflow-hidden
- d-none d-lg-flex row row-cols-5 m-auto f14
- dropdown
- global_desktop_content_width styles_footercont_container__mJ_p4
- google_ads_section undefined
- input-group
- listing_breadcrumb_container__SleKE
- listing_category_dropdown__mTgvY dropdown
- listing_gam_container__aFeVC styles_infoGamContainer__frpAM
- loader_placeholder d-flex gap-2 placeholder-glow
- loader_placeholder my-3 placeholder-glow
- loader_placeholder placeholder-glow
- loader_placeholder styles_left_container_columns__O_8cF placeholder-glow
- loader_placeholder styles_left_container_columns__qaivk placeholder-glow
- loader_placeholder style

In [41]:
# Pull only the <script> tag(s) below the element with id="S:10"
# Assumes BeautifulSoup 'soup' object is available

target_div = soup.find(id="S:10")
if target_div:
    scripts = []
    # Find the next sibling(s), and look for script tags
    next_tag = target_div.find_next_sibling()
    while next_tag:
        if next_tag.name == "script":
            scripts.append(next_tag)
        next_tag = next_tag.find_next_sibling()
    # Print all script tags found, prettified for inspection
    for idx, script in enumerate(scripts):
        print(f"\n--- Script {idx+1} ---\n{script.prettify()}\n")
else:
    print("No element with id='S:10' found.")


--- Script 1 ---
<script>
 $RC("B:10","S:10")
</script>



--- Script 2 ---
<script>
 self.__next_f.push([1,"175:I[75520,[\"8038\",\"static/chunks/7ce798d6-6de03778c6a05922.js\",\"3676\",\"static/chunks/870fdd6f-7acee1cd9ffc0f14.js\",\"4609\",\"static/chunks/4609-da3c93e670782c6b.js\",\"8030\",\"static/chunks/8030-aa02dc5fc25f0f2a.js\",\"8667\",\"static/chunks/8667-30fec0a99c1514f1.js\",\"1809\",\"static/chunks/1809-77fb8dc5f61c4d7a.js\",\"5814\",\"static/chunks/5814-4eddf4e6d73020de.js\",\"6666\",\"static/chunks/6666-3eb94ca4395b7c81.js\",\"7393\",\"static/chunks/7393-fbc801d4de042bbb.js\",\"8603\",\"static/chunks/8603-3f8fd4c127b77587.js\",\"6182\",\"static/chunks/6182-f42a3e1c7d0c2bfa.js\",\"7891\",\"static/chunks/7891-3cc6e31dbbdb335a.js\",\"6434\",\"static/chunks/6434-f1ba83ef6f4d8ff1.js\",\"3654\",\"static/chunks/3654-1c249fb13a450700.js\",\"3035\",\"static/chunks/3035-60fa41caddb1a9a2.js\",\"7123\",\"static/chunks/7123-616712f8b10ea192.js\",\"1543\",\"static/chunks/app/(routes)

In [47]:
# Extract and print only the <script> tag below the element with id="S:10" that is the 10th <script> in order
# Assumes BeautifulSoup 'soup' object is available

target_div = soup.find(id="S:10")
if target_div:
    scripts = []
    # Find the next sibling(s), and look for script tags
    next_tag = target_div.find_next_sibling()
    while next_tag:
        if next_tag.name == "script":
            scripts.append(next_tag)
        next_tag = next_tag.find_next_sibling()
    if len(scripts) >= 10:
        script10 = scripts[9]
        print(f"\n--- Script 10 ---\n{script10.prettify()}\n")
    else:
        print(f"Less than 10 <script> tags found after element with id='S:10'.")
else:
    print("No element with id='S:10' found.")


--- Script 10 ---
<script>
 self.__next_f.push([1,"178:{\"success\":true,\"data\":{\"aid\":1439810,\"car_model\":\"Toyota Prius Plus Hybrid 1.8A\",\"depreciation\":\"$$16,900 /yr\",\"coe\":\"$$39,000\",\"reg_date\":\"16-Nov-2020\",\"original_reg_date\":null,\"lifespan\":null,\"mileage\":\"101,482 km (20.5k /yr)\",\"manufactured\":2020,\"road_tax\":\"$$976 /yr\",\"transmission\":\"Auto\",\"dereg_value\":\"$$35,128 as of today\",\"omv\":\"$$27,507\",\"arf\":\"$$20,510\",\"engine_cap\":\"1,798 cc\",\"drive_range\":\"N.A.\",\"fuel_type\":\"Petrol-Electric\",\"power\":\"100.0 kW (134 bhp)\",\"curb_weight\":\"1,500 kg\",\"owners\":\"2\",\"type_of_vehicle\":{\"text\":\"MPV\",\"link\":\"https://www.sgcarmart.com/used-cars/listing?vts[]=10\"},\"status\":\"Available for sale\",\"features\":\"Fuel efficient 1.8l 4 cylinders 16v DOHC dual VVT-I engine with electric motor.\",\"new_car_text\":\"View specs of the \u003ca href=\\\"https://www.sgcarmart.com/new_cars/newcars_pastcars.php?PCM=1\u0026MOD

In [60]:
script10

<script>self.__next_f.push([1,"178:{\"success\":true,\"data\":{\"aid\":1439810,\"car_model\":\"Toyota Prius Plus Hybrid 1.8A\",\"depreciation\":\"$$16,900 /yr\",\"coe\":\"$$39,000\",\"reg_date\":\"16-Nov-2020\",\"original_reg_date\":null,\"lifespan\":null,\"mileage\":\"101,482 km (20.5k /yr)\",\"manufactured\":2020,\"road_tax\":\"$$976 /yr\",\"transmission\":\"Auto\",\"dereg_value\":\"$$35,128 as of today\",\"omv\":\"$$27,507\",\"arf\":\"$$20,510\",\"engine_cap\":\"1,798 cc\",\"drive_range\":\"N.A.\",\"fuel_type\":\"Petrol-Electric\",\"power\":\"100.0 kW (134 bhp)\",\"curb_weight\":\"1,500 kg\",\"owners\":\"2\",\"type_of_vehicle\":{\"text\":\"MPV\",\"link\":\"https://www.sgcarmart.com/used-cars/listing?vts[]=10\"},\"status\":\"Available for sale\",\"features\":\"Fuel efficient 1.8l 4 cylinders 16v DOHC dual VVT-I engine with electric motor.\",\"new_car_text\":\"View specs of the \u003ca href=\\\"https://www.sgcarmart.com/new_cars/newcars_pastcars.php?PCM=1\u0026MOD=Toyota+Prius\u0026VT

In [62]:
import re

# Convert script10 to string
script10_str = str(script10)

# Define wanted keys and their small caps versions (underscore, lower)
wanted_keys = [
    "Transmission",
    "Fuel Type",
    "Engine Capacity",
    "Curb Weight",
    "Power",
    "Road Tax",
    "Deregistration Value",
    "COE",
    "OMV",
    "ARF"
]
wanted_keys_small = [k.replace(" ", "_").lower() for k in wanted_keys]

results_smallcaps = {}

def clean_value(key, value):
    """
    Clean up the extracted value for any of the wanted keys.
    For numeric/currency-like values: remove non-numeric, non-dot, non-minus chars.
    For others: just strip whitespace.
    """
    # Always try to normalize money/numbers (contains $ or numbers or unit suffix)
    # If it contains digits, possibly with currency symbols or known suffixes, try to extract the main number
    if any(char.isdigit() for char in value):
        # Remove currency markers ($), commas, spaces, slashes, "yr", "km", "cc", etc.
        clean = value
        # Remove "$", ",", spaces
        clean = re.sub(r'[,$]', '', clean)
        # Remove common unit suffixes/spaces (will still keep decimals and / where relevant)
        clean = re.sub(r'\s*(/yr|yr|km|cc|as of today|/)', '', clean, flags=re.IGNORECASE)
        # Remove any leftover non-numeric except dot
        clean = re.sub(r'[^\d.]', '', clean)
        return clean
    else:
        return value.strip()

# For each key, search explicitly for its JSON-like property
for orig, key in zip(wanted_keys, wanted_keys_small):
    # Match e.g.: "road_tax": "$$976", allowing for optional spaces and optional quotes, case-insensitive key
    pat = re.compile(
        r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^,"\'}<\n]+)', 
        re.IGNORECASE
    )
    match = pat.search(script10_str)
    if match:
        value = match.group(1).strip()
        clean = clean_value(key, value)
        results_smallcaps[key] = clean

print(results_smallcaps)

{}


In [66]:
import codecs

# Replace common JavaScript escape sequences (i.e., unescape the string)
def clean_js_escapes(s):
    # Decode unicode escapes and remove unnecessary backslashes from quotes etc.
    s = s.encode('utf-8').decode('unicode_escape')
    s = s.replace('\\/', '/')
    s = s.replace('\\\\', '\\')
    return s

cleaned_script10_str = clean_js_escapes(script10_str)

# For each key in wanted_keys_small, extract and clean the value
results_smallcaps_cleaned = {}
for orig, key in zip(wanted_keys, wanted_keys_small):
    # Allow comma inside the value (for thousands in numbers)
    pat = re.compile(
        r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^"\'}<\n]+)', 
        re.IGNORECASE
    )
    match = pat.search(cleaned_script10_str)
    if match:
        value = match.group(1).strip()
        clean = clean_value(key, value)
        results_smallcaps_cleaned[key] = clean

print(results_smallcaps_cleaned)

{'transmission': 'Auto', 'fuel_type': 'Petrol-Electric', 'curb_weight': '1500', 'power': '100.0134', 'road_tax': '976', 'coe': '39000', 'omv': '27507', 'arf': '20510'}


In [118]:
url = "https://www.sgcarmart.com/used_cars/info.php?ID=1442856"

# Make the request
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the first <script> tag in the page containing "success", "coe", and "depreciation"
script = next(
    (tag for tag in soup.find_all("script")
     if "success" in tag.text and "coe" in tag.text and "depreciation" in tag.text),
    None
)

# Replace common JavaScript escape sequences (i.e., unescape the string)
def clean_js_escapes(s):
    # Decode unicode escapes and remove unnecessary backslashes from quotes etc.
    s = s.encode('utf-8').decode('unicode_escape')
    s = s.replace('\\/', '/')
    s = s.replace('\\\\', '\\')
    return s

cleaned_script10_str = clean_js_escapes(script)

import re

def get_carmodel(cleaned_script10_str):
    match = re.search(r'"car_model"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

def get_type_of_vehicle(cleaned_script10_str):
    """
    Extract the type_of_vehicle value from string, handling both object and string representations.
    Returns either the dictionary (parsed from JSON) or the plain value if it's a simple string.
    Expected forms:
      - ... "type_of_vehicle":{"text":"Mid-Sized Sedan", ... }
      - ... "type_of_vehicle":"Mid-Sized Sedan"
    """
    import re
    import json

    # Try to match the type_of_vehicle dictionary first (non-greedy for value)
    dict_match = re.search(
        r'"type_of_vehicle"\s*:\s*\{(.*?)\}', cleaned_script10_str, re.IGNORECASE | re.DOTALL
    )
    if dict_match:
        dict_str = '{' + dict_match.group(1) + '}'
        # Remove trailing commas (incomplete objects), then try to parse
        dict_str = re.sub(r',\s*\}$', '}', dict_str)
        try:
            # Fix possible unescaped quotes and parse as JSON
            # Replace single quotes with double only if not inside a string value
            dict_str_cleaned = dict_str.replace('\\"', '"').replace("'", '"')
            # Only keep up to the last closing brace to avoid over-read
            if dict_str_cleaned.count('{') != dict_str_cleaned.count('}'):
                dict_str_cleaned = dict_str_cleaned + '}'
            type_obj = json.loads(dict_str_cleaned)
            # If 'text' key is present, return its value
            if isinstance(type_obj, dict) and 'text' in type_obj:
                return type_obj['text'].strip()
            return type_obj
        except Exception:
            pass  # Fall through to the next approach

    # Try to match a simple quoted string value
    simple_match = re.search(
        r'"type_of_vehicle"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE
    )
    if simple_match:
        return simple_match.group(1).strip()

    # Fallback: try unquoted value (rare case)
    simple_unquoted = re.search(
        r'"type_of_vehicle"\s*:\s*([^,"\}\r\n]+)', cleaned_script10_str, re.IGNORECASE
    )
    if simple_unquoted:
        return simple_unquoted.group(1).strip()

    return None

carmodel = get_carmodel(cleaned_script10_str)
type_of_vehicle = get_type_of_vehicle(cleaned_script10_str)

def extract_car_details(cleaned_script10_str, clean_value_func):
    wanted_keys = [
        "Transmission",
        "Fuel Type",
        "Engine Capacity",
        "Curb Weight",
        "Power",
        "Road Tax",
        "Deregistration Value",
        "COE",
        "OMV",
        "ARF",
        "reg_date",
        "mileage",
        "owners",
        "dealer",
        "dereg_value",
        "engine_cap"
    ]
    wanted_keys_small = [k.replace(" ", "_").lower() for k in wanted_keys]

    results_smallcaps_cleaned = {}
    for orig, key in zip(wanted_keys, wanted_keys_small):
        pat = re.compile(
            r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^"\'}<\n]+)', 
            re.IGNORECASE
        )
        if key == "mileage":
            matches = pat.findall(cleaned_script10_str)
            # Use the second mileage if there are at least two matches, else skip or fallback to first
            if len(matches) >= 2:
                value = matches[1].strip()
            elif len(matches) == 1:
                value = matches[0].strip()
            else:
                value = None
            if value is not None:
                # Remove parentheses and their contents for mileage if matched (though uncommon, for safety)
                value_noparens = re.sub(r'\([^\)]*\)', '', value).strip()
                clean = clean_value_func(key, value_noparens)
                results_smallcaps_cleaned[key] = clean
        else:
            match = pat.search(cleaned_script10_str)
            if match:
                value = match.group(1).strip()
                # Always remove parentheses and their contents from the extracted value
                value_noparens = re.sub(r'\([^\)]*\)', '', value).strip()
                clean = clean_value_func(key, value_noparens)
                results_smallcaps_cleaned[key] = clean
    return results_smallcaps_cleaned

# Usage:
results_smallcaps_cleaned = extract_car_details(cleaned_script10_str, clean_value)

# Add carmodel and url into the dictionary
results_smallcaps_cleaned['carmodel'] = carmodel

results_smallcaps_cleaned['type_of_vehicle'] = type_of_vehicle

results_smallcaps_cleaned['url'] = url


print(results_smallcaps_cleaned)

{'transmission': 'Auto', 'fuel_type': 'Petrol', 'curb_weight': '1295', 'power': '95.3', 'road_tax': '738', 'coe': '37000', 'omv': '11398', 'arf': '11398', 'reg_date': '302018', 'mileage': '95500', 'owners': '1', 'dealer': 'Platinum Motoring', 'dereg_value': '16145', 'engine_cap': '1591', 'carmodel': 'Kia Cerato K3 1.6A', 'type_of_vehicle': 'Mid-Sized Sedan', 'url': 'https://www.sgcarmart.com/used_cars/info.php?ID=1442856'}


In [101]:
cleaned_script10_str

'self.__next_f.push([1,"1e:["$","main",null,{"children":["$undefined",["$","$L2a",null,{"src":"https://ps.eyeota.net/pixel?pid=om4o6lu&t=ajs&cat=usedpassengercars&cat=usedpricebelow50k&cat=usedreg2016to2020&cat=usedpetrol&cat=usedkia&cat=usedcarintent","strategy":"afterInteractive"}],[["$","div",null,{"className":"$undefined"}],false,["$","$2f",null,{"fallback":["$","$L22",null,{}],"children":"$L14a"}],"$undefined",["$","div",null,{"className":"styles_mobileAds__at6xI","children":["$","$L141",null,{"from_layout":false,"infoUrlData":{"aid":1442856,"year":"2018","make":"Kia","model":"Cerato","make_model":"Kia Cerato","carmodel":"Kia Cerato K3 1.6A","correct_url":true,"info_url":"https://www.sgcarmart.com/used-cars/info/kia-cerato-k3-16a-1442856","dealer_code":3283,"dealer":"Platinum Motoring","listing_url":"https://www.sgcarmart.com/used-cars/listing?dl=3283","show_map_tab":1,"show_image_tab":1,"show_leasing_tab":0,"show_financial_tab":1,"show_test_drive_button":1,"show_ecics_widget":1,"

In [111]:
cleaned_script10_str

'<script>self.__next_f.push([1,"1e:["$","main",null,{"children":["$undefined",["$","$L2a",null,{"src":"https://ps.eyeota.net/pixel?pid=om4o6lu&t=ajs&cat=usedpassengercars&cat=usedpricebelow50k&cat=usedreg2016to2020&cat=usedpetrol&cat=usedkia&cat=usedcarintent","strategy":"afterInteractive"}],[["$","div",null,{"className":"$undefined"}],false,["$","$2f",null,{"fallback":["$","$L22",null,{}],"children":"$L149"}],"$undefined",["$","div",null,{"className":"styles_mobileAds__at6xI","children":["$","$L141",null,{"from_layout":false,"infoUrlData":{"aid":1442856,"year":"2018","make":"Kia","model":"Cerato","make_model":"Kia Cerato","carmodel":"Kia Cerato K3 1.6A","correct_url":true,"info_url":"https://www.sgcarmart.com/used-cars/info/kia-cerato-k3-16a-1442856","dealer_code":3283,"dealer":"Platinum Motoring","listing_url":"https://www.sgcarmart.com/used-cars/listing?dl=3283","show_map_tab":1,"show_image_tab":1,"show_leasing_tab":0,"show_financial_tab":1,"show_test_drive_button":1,"show_ecics_wid

In [114]:
import re

def get_carmodel(cleaned_script10_str):
    match = re.search(r'"car_model"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

def get_type_of_vehicle(cleaned_script10_str):
    """
    Extract the type_of_vehicle value from string, handling both object and string representations.
    Returns either the dictionary (parsed from JSON) or the plain value if it's a simple string.
    Expected forms:
      - ... "type_of_vehicle":{"text":"Mid-Sized Sedan", ... }
      - ... "type_of_vehicle":"Mid-Sized Sedan"
    """
    import re
    import json

    # Try to match the type_of_vehicle dictionary first (non-greedy for value)
    dict_match = re.search(
        r'"type_of_vehicle"\s*:\s*\{(.*?)\}', cleaned_script10_str, re.IGNORECASE | re.DOTALL
    )
    if dict_match:
        dict_str = '{' + dict_match.group(1) + '}'
        # Remove trailing commas (incomplete objects), then try to parse
        dict_str = re.sub(r',\s*\}$', '}', dict_str)
        try:
            # Fix possible unescaped quotes and parse as JSON
            # Replace single quotes with double only if not inside a string value
            dict_str_cleaned = dict_str.replace('\\"', '"').replace("'", '"')
            # Only keep up to the last closing brace to avoid over-read
            if dict_str_cleaned.count('{') != dict_str_cleaned.count('}'):
                dict_str_cleaned = dict_str_cleaned + '}'
            type_obj = json.loads(dict_str_cleaned)
            # If 'text' key is present, return its value
            if isinstance(type_obj, dict) and 'text' in type_obj:
                return type_obj['text'].strip()
            return type_obj
        except Exception:
            pass  # Fall through to the next approach

    # Try to match a simple quoted string value
    simple_match = re.search(
        r'"type_of_vehicle"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE
    )
    if simple_match:
        return simple_match.group(1).strip()

    # Fallback: try unquoted value (rare case)
    simple_unquoted = re.search(
        r'"type_of_vehicle"\s*:\s*([^,"\}\r\n]+)', cleaned_script10_str, re.IGNORECASE
    )
    if simple_unquoted:
        return simple_unquoted.group(1).strip()

    return None

carmodel = get_carmodel(cleaned_script10_str)
type_of_vehicle = get_type_of_vehicle(cleaned_script10_str)

carmodel, type_of_vehicle

('Kia Cerato K3 1.6A', 'Mid-Sized Sedan')

In [103]:
import re

def get_carmodel(cleaned_script10_str):
    match = re.search(r'"car_model"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

carmodel = get_carmodel(cleaned_script10_str)
carmodel

'Kia Cerato K3 1.6A'

In [87]:
# Find the first <script> tag in the page containing "success", "coe", and "depreciation"
script = next(
    (tag for tag in soup.find_all("script")
     if "success" in tag.text and "coe" in tag.text and "depreciation" in tag.text),
    None
)
if script:
    print("\n--- First <script> tag containing 'success', 'coe', and 'depreciation' ---\n")
    print(script.prettify())
else:
    print('No <script> tag found in the page containing "success", "coe", and "depreciation"')


--- First <script> tag containing 'success', 'coe', and 'depreciation' ---

<script>
 self.__next_f.push([1,"1e:[\"$\",\"main\",null,{\"children\":[\"$undefined\",[\"$\",\"$L2a\",null,{\"src\":\"https://ps.eyeota.net/pixel?pid=om4o6lu\u0026t=ajs\u0026cat=usedpassengercars\u0026cat=usedpricebelow50k\u0026cat=usedreg2016to2020\u0026cat=usedpetrol\u0026cat=usedkia\u0026cat=usedcarintent\",\"strategy\":\"afterInteractive\"}],[[\"$\",\"div\",null,{\"className\":\"$undefined\"}],false,[\"$\",\"$2f\",null,{\"fallback\":[\"$\",\"$L22\",null,{}],\"children\":\"$L148\"}],\"$undefined\",[\"$\",\"div\",null,{\"className\":\"styles_mobileAds__at6xI\",\"children\":[\"$\",\"$L141\",null,{\"from_layout\":false,\"infoUrlData\":{\"aid\":1442856,\"year\":\"2018\",\"make\":\"Kia\",\"model\":\"Cerato\",\"make_model\":\"Kia Cerato\",\"carmodel\":\"Kia Cerato K3 1.6A\",\"correct_url\":true,\"info_url\":\"https://www.sgcarmart.com/used-cars/info/kia-cerato-k3-16a-1442856\",\"dealer_code\":3283,\"dealer\":\"P

In [92]:
# Replace common JavaScript escape sequences (i.e., unescape the string)
def clean_js_escapes(s):
    # Decode unicode escapes and remove unnecessary backslashes from quotes etc.
    s = s.encode('utf-8').decode('unicode_escape')
    s = s.replace('\\/', '/')
    s = s.replace('\\\\', '\\')
    return s

cleaned_script10_str = clean_js_escapes(script)

# For each key in wanted_keys_small, extract and clean the value
results_smallcaps_cleaned = {}
for orig, key in zip(wanted_keys, wanted_keys_small):
    # Allow comma inside the value (for thousands in numbers)
    pat = re.compile(
        r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^"\'}<\n]+)', 
        re.IGNORECASE
    )
    match = pat.search(cleaned_script10_str)
    if match:
        value = match.group(1).strip()
        clean = clean_value(key, value)
        results_smallcaps_cleaned[key] = clean

print(results_smallcaps_cleaned)

{'transmission': 'Auto', 'fuel_type': 'Petrol', 'curb_weight': '1295', 'power': '95.3127', 'road_tax': '738', 'coe': '37000', 'omv': '11398', 'arf': '11398'}


## Main page scraping


Idea is to take main page - get all the single car links 

In [150]:
import requests
from bs4 import BeautifulSoup

main_url = "https://www.sgcarmart.com/used-cars/listing?pr2=60000&cts[]=18&vts[]=12&vts[]=13&vts[]=9&vts[]=10&vts[]=11&vts[]=8&vts[]=7&vts[]=3&vts[]=2&mil1=&mil2=100000&own_c=%3C&own=2&"

import time
from tqdm import tqdm

# Fetch the page content with delays to avoid being rate-limited.
all_html = ""
max_pages = 22  # For testing: only fetch 10 pages for now

for page in tqdm(range(1, max_pages + 1), desc="Fetching pages"):
    paged_url = main_url + f"&page={page}"
    try:
        res = requests.get(paged_url, headers=headers)
        res.raise_for_status()
        html = res.text
        # Heuristic: stop if there's no "info.php?ID=" anymore (no more cars)
        if "/used_cars/info.php?ID=" not in html:
            print(f"No more car listings found after page {page}. Stopping.")
            break
        all_html += html
    except requests.HTTPError as e:
        print(f"Error fetching page {page}: {e}")
        break
    time.sleep(2)  # Stagger requests by 2 seconds

html = all_html

# Use BeautifulSoup to parse the HTML
soup = BeautifulSoup(html, "html.parser")

# Let's explore the page structure first
print("\n--- Title of the page ---")
print(soup.title)

print("\n--- First 1000 characters of pretty HTML ---")
print(soup.prettify()[:1000])

print("\n--- All <a> tags on the page? (first 10 shown) ---")
a_tags = soup.find_all("a", href=True)
for a in a_tags[:10]:
    print(a, "\n   -->", a['href'])

print("\n--- First 5 car info links found ---")
# Try to extract info links
car_links = []
for a in a_tags:
    href = a['href']
    if href.startswith("/used_cars/info.php?ID="):
        full_url = "https://www.sgcarmart.com" + href
        if full_url not in car_links:  # Prevent duplicates
            car_links.append(full_url)
for link in car_links[:5]:
    print(link)

print(f"\nTotal car links found: {len(car_links)}")


Fetching pages: 100%|██████████| 22/22 [00:45<00:00,  2.09s/it]



--- Title of the page ---
<title>Browse 390  Used Cars in Singapore - Page 1 - Sgcarmart</title>

--- First 1000 characters of pretty HTML ---
<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=1140, initial-scale=0" name="viewport"/>
  <link data-precedence="next" href="/used-cars/_next/static/css/d3df112486f97f47.css" rel="stylesheet"/>
  <link data-precedence="next" href="/used-cars/_next/static/css/0ca7b5ce0e99316f.css" rel="stylesheet"/>
  <link data-precedence="next" href="/used-cars/_next/static/css/48eb0e870d760499.css" rel="stylesheet"/>
  <link data-precedence="next" href="/used-cars/_next/static/css/d316907b9f536ae4.css" rel="stylesheet"/>
  <link data-precedence="next" href="/used-cars/_next/static/css/bda3ef3308e87bbd.css" rel="stylesheet"/>
  <link data-precedence="next" href="/used-cars/_next/static/css/39df27ad69bac7eb.css" rel="stylesheet"/>
  <link data-precedence="next" href="/used-cars/_next/static/css/94d3da771ff1ae0c.css" re

In [151]:
import re

# Convert the soup object to a string so we can search through the HTML
html_str = str(soup)

# Find all occurrences of URLs like "/used_cars/info.php?ID=<number>"
matches = re.findall(r'/used_cars/info\.php\?ID=(\d+)', html_str)

# Create a mapping from full URL to the ID value
car_link_id_dict = {}
for idnum in matches:
    url = f"https://www.sgcarmart.com/used_cars/info.php?ID={idnum}"
    car_link_id_dict[url] = idnum

print("Example:", "https://www.sgcarmart.com/used_cars/info.php?ID=1439521", "->", car_link_id_dict.get("https://www.sgcarmart.com/used_cars/info.php?ID=1439521"))

Example: https://www.sgcarmart.com/used_cars/info.php?ID=1439521 -> 1439521


In [154]:
set(matches)

{'1393980',
 '1397283',
 '1399300',
 '1402691',
 '1403745',
 '1411072',
 '1411727',
 '1430271',
 '1431672',
 '1431794',
 '1436823',
 '1439521',
 '1443784',
 '1445659',
 '1445702'}

# Final

## Single car pulling

In [None]:
url = "https://www.sgcarmart.com/used_cars/info.php?ID=1442856"

# Make the request
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the first <script> tag in the page containing "success", "coe", and "depreciation"
script = next(
    (tag for tag in soup.find_all("script")
     if "success" in tag.text and "coe" in tag.text and "depreciation" in tag.text),
    None
)

# Replace common JavaScript escape sequences (i.e., unescape the string)
def clean_js_escapes(s):
    # Decode unicode escapes and remove unnecessary backslashes from quotes etc.
    s = s.encode('utf-8').decode('unicode_escape')
    s = s.replace('\\/', '/')
    s = s.replace('\\\\', '\\')
    return s

cleaned_script10_str = clean_js_escapes(script)

import re

def get_carmodel(cleaned_script10_str):
    match = re.search(r'"car_model"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

def get_type_of_vehicle(cleaned_script10_str):
    """
    Extract the type_of_vehicle value from string, handling both object and string representations.
    Returns either the dictionary (parsed from JSON) or the plain value if it's a simple string.
    Expected forms:
      - ... "type_of_vehicle":{"text":"Mid-Sized Sedan", ... }
      - ... "type_of_vehicle":"Mid-Sized Sedan"
    """
    import re
    import json

    # Try to match the type_of_vehicle dictionary first (non-greedy for value)
    dict_match = re.search(
        r'"type_of_vehicle"\s*:\s*\{(.*?)\}', cleaned_script10_str, re.IGNORECASE | re.DOTALL
    )
    if dict_match:
        dict_str = '{' + dict_match.group(1) + '}'
        # Remove trailing commas (incomplete objects), then try to parse
        dict_str = re.sub(r',\s*\}$', '}', dict_str)
        try:
            # Fix possible unescaped quotes and parse as JSON
            # Replace single quotes with double only if not inside a string value
            dict_str_cleaned = dict_str.replace('\\"', '"').replace("'", '"')
            # Only keep up to the last closing brace to avoid over-read
            if dict_str_cleaned.count('{') != dict_str_cleaned.count('}'):
                dict_str_cleaned = dict_str_cleaned + '}'
            type_obj = json.loads(dict_str_cleaned)
            # If 'text' key is present, return its value
            if isinstance(type_obj, dict) and 'text' in type_obj:
                return type_obj['text'].strip()
            return type_obj
        except Exception:
            pass  # Fall through to the next approach

    # Try to match a simple quoted string value
    simple_match = re.search(
        r'"type_of_vehicle"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE
    )
    if simple_match:
        return simple_match.group(1).strip()

    # Fallback: try unquoted value (rare case)
    simple_unquoted = re.search(
        r'"type_of_vehicle"\s*:\s*([^,"\}\r\n]+)', cleaned_script10_str, re.IGNORECASE
    )
    if simple_unquoted:
        return simple_unquoted.group(1).strip()

    return None

carmodel = get_carmodel(cleaned_script10_str)
type_of_vehicle = get_type_of_vehicle(cleaned_script10_str)

def extract_car_details(cleaned_script10_str, clean_value_func):
    wanted_keys = [
        "Transmission",
        "Fuel Type",
        "Engine Capacity",
        "Curb Weight",
        "Power",
        "Road Tax",
        "Deregistration Value",
        "COE",
        "OMV",
        "ARF",
        "reg_date",
        "mileage",
        "owners",
        "dealer",
        "dereg_value",
        "engine_cap"
    ]
    wanted_keys_small = [k.replace(" ", "_").lower() for k in wanted_keys]

    results_smallcaps_cleaned = {}
    for orig, key in zip(wanted_keys, wanted_keys_small):
        pat = re.compile(
            r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^"\'}<\n]+)', 
            re.IGNORECASE
        )
        if key == "mileage":
            matches = pat.findall(cleaned_script10_str)
            # Use the second mileage if there are at least two matches, else skip or fallback to first
            if len(matches) >= 2:
                value = matches[1].strip()
            elif len(matches) == 1:
                value = matches[0].strip()
            else:
                value = None
            if value is not None:
                # Remove parentheses and their contents for mileage if matched (though uncommon, for safety)
                value_noparens = re.sub(r'\([^\)]*\)', '', value).strip()
                clean = clean_value_func(key, value_noparens)
                results_smallcaps_cleaned[key] = clean
        else:
            match = pat.search(cleaned_script10_str)
            if match:
                value = match.group(1).strip()
                # Always remove parentheses and their contents from the extracted value
                value_noparens = re.sub(r'\([^\)]*\)', '', value).strip()
                clean = clean_value_func(key, value_noparens)
                results_smallcaps_cleaned[key] = clean
    return results_smallcaps_cleaned

# Usage:
results_smallcaps_cleaned = extract_car_details(cleaned_script10_str, clean_value)

# Add carmodel and url into the dictionary
results_smallcaps_cleaned['carmodel'] = carmodel

results_smallcaps_cleaned['type_of_vehicle'] = type_of_vehicle

results_smallcaps_cleaned['url'] = url


print(results_smallcaps_cleaned)

{'transmission': 'Auto', 'fuel_type': 'Petrol', 'curb_weight': '1295', 'power': '95.3', 'road_tax': '738', 'coe': '37000', 'omv': '11398', 'arf': '11398', 'reg_date': '302018', 'mileage': '95500', 'owners': '1', 'dealer': 'Platinum Motoring', 'dereg_value': '16145', 'engine_cap': '1591', 'carmodel': 'Kia Cerato K3 1.6A', 'type_of_vehicle': 'Mid-Sized Sedan', 'url': 'https://www.sgcarmart.com/used_cars/info.php?ID=1442856'}


In [None]:
## 