# Testing

## Single car pulling 

In [57]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from typing import Dict, Any
import random
import json

# Configure headers to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
}


In [58]:
def scrape_car_details(url: str) -> Dict[str, Any]:
    """
    Scrape car details from SGCarMart listing page
    
    Args:
        url (str): URL of the car listing page
        
    Returns:
        Dict[str, Any]: Dictionary containing car details
    """
    try:
        # Add random delay to be respectful to the server
        time.sleep(random.uniform(1, 3))
        
        # Make the request
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Initialize dictionary to store car details
        car_details = {}
        
        # Find all detail rows
        detail_rows = soup.find_all('div', class_='row_info')
        
        # Extract information from each row
        for row in detail_rows:
            # Find label and value
            label_elem = row.find('div', class_='label')
            value_elem = row.find('div', class_='value')
            
            if label_elem and value_elem:
                label = label_elem.text.strip()
                value = value_elem.text.strip()
                
                # Clean up the label and value
                label = label.replace(':', '').strip()
                value = value.replace('info', '').strip()
                
                # Store in dictionary
                car_details[label] = value
        
        # Extract price separately as it might be in a different location
        price_elem = soup.find('div', class_='price')
        if price_elem:
            car_details['Price'] = price_elem.text.strip()
            
        return car_details
        
    except Exception as e:
        print(f"Error scraping car details: {str(e)}")
        return {}


In [73]:
# Test the scraper with a sample car listing URL
test_url = "https://www.sgcarmart.com/used_cars/info.php?ID=1439810"

# Test the scraper
car_info = scrape_car_details(test_url)

# Pretty print the results
print(json.dumps(car_info, indent=2, ensure_ascii=False))


{}


In [59]:
url = "https://www.sgcarmart.com/used-cars/info/toyota-prius-plus-hybrid-1439810?dl=4309"

# Make the request
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')



In [16]:
# Let's breakdown the main headers of the soup object (HTML) slowly

# 1. Find all main header tags. Common main headers are <h1>, <h2>, <h3>
h1_headers = soup.find_all('h1')
h2_headers = soup.find_all('h2')
h3_headers = soup.find_all('h3')

print("H1 Headers:")
for h1 in h1_headers:
    print("-", h1.get_text(strip=True))

print("\nH2 Headers:")
for h2 in h2_headers:
    print("-", h2.get_text(strip=True))

print("\nH3 Headers:")
for h3 in h3_headers:
    print("-", h3.get_text(strip=True))

H1 Headers:
- Toyota Prius Plus Hybrid 1.8A

H2 Headers:
- Toyota Prius Discussions
- Related Discussions
- Products & Services
- Alternative Cars
- Recommended For You
- NCD Builder Insurance
- Seller Information
- Resources
- Leasing Option
- Pricing Summary
- Free Car Servicing
- Upfront Payment

H3 Headers:


In [19]:
# From this header (<h1>), extract and then print all body content after it

h1_headers = soup.find_all('h1')
if h1_headers:
    h1 = h1_headers[0]
    print("H1 Header:")
    print("-", h1.get_text(strip=True))

    # Get all the siblings after the h1 until end of body
    print("\nBody content after <h1>:")
    # Iterate over all <div> tags that appear after the <h1>
    for div in h1.find_all_next("div"):
        text = div.get_text(strip=True)
        if text:
            print("-", text)

H1 Header:
- Toyota Prius Plus Hybrid 1.8A

Body content after <h1>:
- ShortlistShareCopy LinkFacebook ShareWhatsApp ShareMoreReport ErrorPrintable VersionPost an Ad
- ShareCopy LinkFacebook ShareWhatsApp Share
- Copy LinkFacebook ShareWhatsApp Share
- MoreReport ErrorPrintable VersionPost an Ad
- Report ErrorPrintable VersionPost an Ad
- OverviewFinancialSimilarLeasingAccessoriesResearchPhotosMap
- OverviewFinancialSimilarLeasingAccessoriesResearchPhotosMap
- Overview
- Financial
- Similar
- Leasing
- Accessories
- Research
- Photos
- Map
- Loan Calculator
- Loan Calculator
- Toyota Prius DiscussionsRelated DiscussionsView All5G Toyota Prius[Official] 2021 2nd Generation Toyota Aqua / Prius CAn Early Look at the 2023 Toyota Prius Prime XSE PremiumToyota Prius Alpha/PlusInside neglected Prius engine after 500,000 kmShare your thoughts on TelegramAbove discussions are extracted frommycarforum.comYou may login to Mycarforum using your Sgcarmart username & password.
- Toyota Prius Discuss

In [20]:
# List all possible div class attributes found in the soup

divs = soup.find_all("div")
div_classes = set()
for div in divs:
    cls = div.get("class")
    if cls:
        # cls is a list, so turn it into a space-separated string for uniqueness
        div_classes.add(" ".join(cls))

print("All unique div classes:")
for cls in sorted(div_classes):
    print("-", cls)

All unique div classes:
- GoogleAds_ads_container__G6sKu gam GoogleAds_desktop_ads__Cj7vd
- GoogleAds_google_ads_container__nvFye
- Styles_dropdown_menu__MK439 styles_dropdown_menu__1sDZj dropdown-menu
- below_footer_container
- bottomfooter styles_bottomFooter__YjRyp
- clear
- d-flex align-items-center gap-2 styles_showDetailSeperator__9bY0w
- d-flex gap-3 mt-3 overflow-hidden
- d-none d-lg-flex row row-cols-5 m-auto f14
- dropdown
- global_desktop_content_width styles_footercont_container__mJ_p4
- google_ads_section undefined
- input-group
- listing_breadcrumb_container__SleKE
- listing_category_dropdown__mTgvY dropdown
- listing_gam_container__aFeVC styles_infoGamContainer__frpAM
- loader_placeholder d-flex gap-2 placeholder-glow
- loader_placeholder my-3 placeholder-glow
- loader_placeholder placeholder-glow
- loader_placeholder styles_left_container_columns__O_8cF placeholder-glow
- loader_placeholder styles_left_container_columns__qaivk placeholder-glow
- loader_placeholder style

In [41]:
# Pull only the <script> tag(s) below the element with id="S:10"
# Assumes BeautifulSoup 'soup' object is available

target_div = soup.find(id="S:10")
if target_div:
    scripts = []
    # Find the next sibling(s), and look for script tags
    next_tag = target_div.find_next_sibling()
    while next_tag:
        if next_tag.name == "script":
            scripts.append(next_tag)
        next_tag = next_tag.find_next_sibling()
    # Print all script tags found, prettified for inspection
    for idx, script in enumerate(scripts):
        print(f"\n--- Script {idx+1} ---\n{script.prettify()}\n")
else:
    print("No element with id='S:10' found.")


--- Script 1 ---
<script>
 $RC("B:10","S:10")
</script>



--- Script 2 ---
<script>
 self.__next_f.push([1,"175:I[75520,[\"8038\",\"static/chunks/7ce798d6-6de03778c6a05922.js\",\"3676\",\"static/chunks/870fdd6f-7acee1cd9ffc0f14.js\",\"4609\",\"static/chunks/4609-da3c93e670782c6b.js\",\"8030\",\"static/chunks/8030-aa02dc5fc25f0f2a.js\",\"8667\",\"static/chunks/8667-30fec0a99c1514f1.js\",\"1809\",\"static/chunks/1809-77fb8dc5f61c4d7a.js\",\"5814\",\"static/chunks/5814-4eddf4e6d73020de.js\",\"6666\",\"static/chunks/6666-3eb94ca4395b7c81.js\",\"7393\",\"static/chunks/7393-fbc801d4de042bbb.js\",\"8603\",\"static/chunks/8603-3f8fd4c127b77587.js\",\"6182\",\"static/chunks/6182-f42a3e1c7d0c2bfa.js\",\"7891\",\"static/chunks/7891-3cc6e31dbbdb335a.js\",\"6434\",\"static/chunks/6434-f1ba83ef6f4d8ff1.js\",\"3654\",\"static/chunks/3654-1c249fb13a450700.js\",\"3035\",\"static/chunks/3035-60fa41caddb1a9a2.js\",\"7123\",\"static/chunks/7123-616712f8b10ea192.js\",\"1543\",\"static/chunks/app/(routes)

In [47]:
# Extract and print only the <script> tag below the element with id="S:10" that is the 10th <script> in order
# Assumes BeautifulSoup 'soup' object is available

target_div = soup.find(id="S:10")
if target_div:
    scripts = []
    # Find the next sibling(s), and look for script tags
    next_tag = target_div.find_next_sibling()
    while next_tag:
        if next_tag.name == "script":
            scripts.append(next_tag)
        next_tag = next_tag.find_next_sibling()
    if len(scripts) >= 10:
        script10 = scripts[9]
        print(f"\n--- Script 10 ---\n{script10.prettify()}\n")
    else:
        print(f"Less than 10 <script> tags found after element with id='S:10'.")
else:
    print("No element with id='S:10' found.")


--- Script 10 ---
<script>
 self.__next_f.push([1,"178:{\"success\":true,\"data\":{\"aid\":1439810,\"car_model\":\"Toyota Prius Plus Hybrid 1.8A\",\"depreciation\":\"$$16,900 /yr\",\"coe\":\"$$39,000\",\"reg_date\":\"16-Nov-2020\",\"original_reg_date\":null,\"lifespan\":null,\"mileage\":\"101,482 km (20.5k /yr)\",\"manufactured\":2020,\"road_tax\":\"$$976 /yr\",\"transmission\":\"Auto\",\"dereg_value\":\"$$35,128 as of today\",\"omv\":\"$$27,507\",\"arf\":\"$$20,510\",\"engine_cap\":\"1,798 cc\",\"drive_range\":\"N.A.\",\"fuel_type\":\"Petrol-Electric\",\"power\":\"100.0 kW (134 bhp)\",\"curb_weight\":\"1,500 kg\",\"owners\":\"2\",\"type_of_vehicle\":{\"text\":\"MPV\",\"link\":\"https://www.sgcarmart.com/used-cars/listing?vts[]=10\"},\"status\":\"Available for sale\",\"features\":\"Fuel efficient 1.8l 4 cylinders 16v DOHC dual VVT-I engine with electric motor.\",\"new_car_text\":\"View specs of the \u003ca href=\\\"https://www.sgcarmart.com/new_cars/newcars_pastcars.php?PCM=1\u0026MOD

In [62]:
import re

# Convert script10 to string
script10_str = str(script10)

# Define wanted keys and their small caps versions (underscore, lower)
wanted_keys = [
    "Transmission",
    "Fuel Type",
    "Engine Capacity",
    "Curb Weight",
    "Power",
    "Road Tax",
    "Deregistration Value",
    "COE",
    "OMV",
    "ARF"
]
wanted_keys_small = [k.replace(" ", "_").lower() for k in wanted_keys]

results_smallcaps = {}

def clean_value(key, value):
    """
    Clean up the extracted value for any of the wanted keys.
    For numeric/currency-like values: remove non-numeric, non-dot, non-minus chars.
    For others: just strip whitespace.
    """
    # Always try to normalize money/numbers (contains $ or numbers or unit suffix)
    # If it contains digits, possibly with currency symbols or known suffixes, try to extract the main number
    if any(char.isdigit() for char in value):
        # Remove currency markers ($), commas, spaces, slashes, "yr", "km", "cc", etc.
        clean = value
        # Remove "$", ",", spaces
        clean = re.sub(r'[,$]', '', clean)
        # Remove common unit suffixes/spaces (will still keep decimals and / where relevant)
        clean = re.sub(r'\s*(/yr|yr|km|cc|as of today|/)', '', clean, flags=re.IGNORECASE)
        # Remove any leftover non-numeric except dot
        clean = re.sub(r'[^\d.]', '', clean)
        return clean
    else:
        return value.strip()

# For each key, search explicitly for its JSON-like property
for orig, key in zip(wanted_keys, wanted_keys_small):
    # Match e.g.: "road_tax": "$$976", allowing for optional spaces and optional quotes, case-insensitive key
    pat = re.compile(
        r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^,"\'}<\n]+)', 
        re.IGNORECASE
    )
    match = pat.search(script10_str)
    if match:
        value = match.group(1).strip()
        clean = clean_value(key, value)
        results_smallcaps[key] = clean

print(results_smallcaps)

{}


In [66]:
import codecs

# Replace common JavaScript escape sequences (i.e., unescape the string)
def clean_js_escapes(s):
    # Decode unicode escapes and remove unnecessary backslashes from quotes etc.
    s = s.encode('utf-8').decode('unicode_escape')
    s = s.replace('\\/', '/')
    s = s.replace('\\\\', '\\')
    return s

cleaned_script10_str = clean_js_escapes(script10_str)

# For each key in wanted_keys_small, extract and clean the value
results_smallcaps_cleaned = {}
for orig, key in zip(wanted_keys, wanted_keys_small):
    # Allow comma inside the value (for thousands in numbers)
    pat = re.compile(
        r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^"\'}<\n]+)', 
        re.IGNORECASE
    )
    match = pat.search(cleaned_script10_str)
    if match:
        value = match.group(1).strip()
        clean = clean_value(key, value)
        results_smallcaps_cleaned[key] = clean

print(results_smallcaps_cleaned)

{'transmission': 'Auto', 'fuel_type': 'Petrol-Electric', 'curb_weight': '1500', 'power': '100.0134', 'road_tax': '976', 'coe': '39000', 'omv': '27507', 'arf': '20510'}


In [118]:
url = "https://www.sgcarmart.com/used_cars/info.php?ID=1442856"

# Make the request
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the first <script> tag in the page containing "success", "coe", and "depreciation"
script = next(
    (tag for tag in soup.find_all("script")
     if "success" in tag.text and "coe" in tag.text and "depreciation" in tag.text),
    None
)

# Replace common JavaScript escape sequences (i.e., unescape the string)
def clean_js_escapes(s):
    # Decode unicode escapes and remove unnecessary backslashes from quotes etc.
    s = s.encode('utf-8').decode('unicode_escape')
    s = s.replace('\\/', '/')
    s = s.replace('\\\\', '\\')
    return s

cleaned_script10_str = clean_js_escapes(script)

import re

def get_carmodel(cleaned_script10_str):
    match = re.search(r'"car_model"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

def get_type_of_vehicle(cleaned_script10_str):
    """
    Extract the type_of_vehicle value from string, handling both object and string representations.
    Returns either the dictionary (parsed from JSON) or the plain value if it's a simple string.
    Expected forms:
      - ... "type_of_vehicle":{"text":"Mid-Sized Sedan", ... }
      - ... "type_of_vehicle":"Mid-Sized Sedan"
    """
    import re
    import json

    # Try to match the type_of_vehicle dictionary first (non-greedy for value)
    dict_match = re.search(
        r'"type_of_vehicle"\s*:\s*\{(.*?)\}', cleaned_script10_str, re.IGNORECASE | re.DOTALL
    )
    if dict_match:
        dict_str = '{' + dict_match.group(1) + '}'
        # Remove trailing commas (incomplete objects), then try to parse
        dict_str = re.sub(r',\s*\}$', '}', dict_str)
        try:
            # Fix possible unescaped quotes and parse as JSON
            # Replace single quotes with double only if not inside a string value
            dict_str_cleaned = dict_str.replace('\\"', '"').replace("'", '"')
            # Only keep up to the last closing brace to avoid over-read
            if dict_str_cleaned.count('{') != dict_str_cleaned.count('}'):
                dict_str_cleaned = dict_str_cleaned + '}'
            type_obj = json.loads(dict_str_cleaned)
            # If 'text' key is present, return its value
            if isinstance(type_obj, dict) and 'text' in type_obj:
                return type_obj['text'].strip()
            return type_obj
        except Exception:
            pass  # Fall through to the next approach

    # Try to match a simple quoted string value
    simple_match = re.search(
        r'"type_of_vehicle"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE
    )
    if simple_match:
        return simple_match.group(1).strip()

    # Fallback: try unquoted value (rare case)
    simple_unquoted = re.search(
        r'"type_of_vehicle"\s*:\s*([^,"\}\r\n]+)', cleaned_script10_str, re.IGNORECASE
    )
    if simple_unquoted:
        return simple_unquoted.group(1).strip()

    return None

carmodel = get_carmodel(cleaned_script10_str)
type_of_vehicle = get_type_of_vehicle(cleaned_script10_str)

def extract_car_details(cleaned_script10_str, clean_value_func):
    wanted_keys = [
        "Transmission",
        "Fuel Type",
        "Engine Capacity",
        "Curb Weight",
        "Power",
        "Road Tax",
        "Deregistration Value",
        "COE",
        "OMV",
        "ARF",
        "reg_date",
        "mileage",
        "owners",
        "dealer",
        "dereg_value",
        "engine_cap"
    ]
    wanted_keys_small = [k.replace(" ", "_").lower() for k in wanted_keys]

    results_smallcaps_cleaned = {}
    for orig, key in zip(wanted_keys, wanted_keys_small):
        pat = re.compile(
            r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^"\'}<\n]+)', 
            re.IGNORECASE
        )
        if key == "mileage":
            matches = pat.findall(cleaned_script10_str)
            # Use the second mileage if there are at least two matches, else skip or fallback to first
            if len(matches) >= 2:
                value = matches[1].strip()
            elif len(matches) == 1:
                value = matches[0].strip()
            else:
                value = None
            if value is not None:
                # Remove parentheses and their contents for mileage if matched (though uncommon, for safety)
                value_noparens = re.sub(r'\([^\)]*\)', '', value).strip()
                clean = clean_value_func(key, value_noparens)
                results_smallcaps_cleaned[key] = clean
        else:
            match = pat.search(cleaned_script10_str)
            if match:
                value = match.group(1).strip()
                # Always remove parentheses and their contents from the extracted value
                value_noparens = re.sub(r'\([^\)]*\)', '', value).strip()
                clean = clean_value_func(key, value_noparens)
                results_smallcaps_cleaned[key] = clean
    return results_smallcaps_cleaned

# Usage:
results_smallcaps_cleaned = extract_car_details(cleaned_script10_str, clean_value)

# Add carmodel and url into the dictionary
results_smallcaps_cleaned['carmodel'] = carmodel

results_smallcaps_cleaned['type_of_vehicle'] = type_of_vehicle

results_smallcaps_cleaned['url'] = url


print(results_smallcaps_cleaned)

{'transmission': 'Auto', 'fuel_type': 'Petrol', 'curb_weight': '1295', 'power': '95.3', 'road_tax': '738', 'coe': '37000', 'omv': '11398', 'arf': '11398', 'reg_date': '302018', 'mileage': '95500', 'owners': '1', 'dealer': 'Platinum Motoring', 'dereg_value': '16145', 'engine_cap': '1591', 'carmodel': 'Kia Cerato K3 1.6A', 'type_of_vehicle': 'Mid-Sized Sedan', 'url': 'https://www.sgcarmart.com/used_cars/info.php?ID=1442856'}


In [114]:
import re

def get_carmodel(cleaned_script10_str):
    match = re.search(r'"car_model"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

def get_type_of_vehicle(cleaned_script10_str):
    """
    Extract the type_of_vehicle value from string, handling both object and string representations.
    Returns either the dictionary (parsed from JSON) or the plain value if it's a simple string.
    Expected forms:
      - ... "type_of_vehicle":{"text":"Mid-Sized Sedan", ... }
      - ... "type_of_vehicle":"Mid-Sized Sedan"
    """
    import re
    import json

    # Try to match the type_of_vehicle dictionary first (non-greedy for value)
    dict_match = re.search(
        r'"type_of_vehicle"\s*:\s*\{(.*?)\}', cleaned_script10_str, re.IGNORECASE | re.DOTALL
    )
    if dict_match:
        dict_str = '{' + dict_match.group(1) + '}'
        # Remove trailing commas (incomplete objects), then try to parse
        dict_str = re.sub(r',\s*\}$', '}', dict_str)
        try:
            # Fix possible unescaped quotes and parse as JSON
            # Replace single quotes with double only if not inside a string value
            dict_str_cleaned = dict_str.replace('\\"', '"').replace("'", '"')
            # Only keep up to the last closing brace to avoid over-read
            if dict_str_cleaned.count('{') != dict_str_cleaned.count('}'):
                dict_str_cleaned = dict_str_cleaned + '}'
            type_obj = json.loads(dict_str_cleaned)
            # If 'text' key is present, return its value
            if isinstance(type_obj, dict) and 'text' in type_obj:
                return type_obj['text'].strip()
            return type_obj
        except Exception:
            pass  # Fall through to the next approach

    # Try to match a simple quoted string value
    simple_match = re.search(
        r'"type_of_vehicle"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE
    )
    if simple_match:
        return simple_match.group(1).strip()

    # Fallback: try unquoted value (rare case)
    simple_unquoted = re.search(
        r'"type_of_vehicle"\s*:\s*([^,"\}\r\n]+)', cleaned_script10_str, re.IGNORECASE
    )
    if simple_unquoted:
        return simple_unquoted.group(1).strip()

    return None

carmodel = get_carmodel(cleaned_script10_str)
type_of_vehicle = get_type_of_vehicle(cleaned_script10_str)

carmodel, type_of_vehicle

('Kia Cerato K3 1.6A', 'Mid-Sized Sedan')

In [103]:
import re

def get_carmodel(cleaned_script10_str):
    match = re.search(r'"car_model"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

carmodel = get_carmodel(cleaned_script10_str)
carmodel

'Kia Cerato K3 1.6A'

In [87]:
# Find the first <script> tag in the page containing "success", "coe", and "depreciation"
script = next(
    (tag for tag in soup.find_all("script")
     if "success" in tag.text and "coe" in tag.text and "depreciation" in tag.text),
    None
)
if script:
    print("\n--- First <script> tag containing 'success', 'coe', and 'depreciation' ---\n")
    print(script.prettify())
else:
    print('No <script> tag found in the page containing "success", "coe", and "depreciation"')


--- First <script> tag containing 'success', 'coe', and 'depreciation' ---

<script>
 self.__next_f.push([1,"1e:[\"$\",\"main\",null,{\"children\":[\"$undefined\",[\"$\",\"$L2a\",null,{\"src\":\"https://ps.eyeota.net/pixel?pid=om4o6lu\u0026t=ajs\u0026cat=usedpassengercars\u0026cat=usedpricebelow50k\u0026cat=usedreg2016to2020\u0026cat=usedpetrol\u0026cat=usedkia\u0026cat=usedcarintent\",\"strategy\":\"afterInteractive\"}],[[\"$\",\"div\",null,{\"className\":\"$undefined\"}],false,[\"$\",\"$2f\",null,{\"fallback\":[\"$\",\"$L22\",null,{}],\"children\":\"$L148\"}],\"$undefined\",[\"$\",\"div\",null,{\"className\":\"styles_mobileAds__at6xI\",\"children\":[\"$\",\"$L141\",null,{\"from_layout\":false,\"infoUrlData\":{\"aid\":1442856,\"year\":\"2018\",\"make\":\"Kia\",\"model\":\"Cerato\",\"make_model\":\"Kia Cerato\",\"carmodel\":\"Kia Cerato K3 1.6A\",\"correct_url\":true,\"info_url\":\"https://www.sgcarmart.com/used-cars/info/kia-cerato-k3-16a-1442856\",\"dealer_code\":3283,\"dealer\":\"P

In [92]:
# Replace common JavaScript escape sequences (i.e., unescape the string)
def clean_js_escapes(s):
    # Decode unicode escapes and remove unnecessary backslashes from quotes etc.
    s = s.encode('utf-8').decode('unicode_escape')
    s = s.replace('\\/', '/')
    s = s.replace('\\\\', '\\')
    return s

cleaned_script10_str = clean_js_escapes(script)

# For each key in wanted_keys_small, extract and clean the value
results_smallcaps_cleaned = {}
for orig, key in zip(wanted_keys, wanted_keys_small):
    # Allow comma inside the value (for thousands in numbers)
    pat = re.compile(
        r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^"\'}<\n]+)', 
        re.IGNORECASE
    )
    match = pat.search(cleaned_script10_str)
    if match:
        value = match.group(1).strip()
        clean = clean_value(key, value)
        results_smallcaps_cleaned[key] = clean

print(results_smallcaps_cleaned)

{'transmission': 'Auto', 'fuel_type': 'Petrol', 'curb_weight': '1295', 'power': '95.3127', 'road_tax': '738', 'coe': '37000', 'omv': '11398', 'arf': '11398'}


## Main page scraping


Idea is to take main page - get all the single car links 

In [24]:
from bs4 import BeautifulSoup
import requests

# Example main listing page URL (can change parameters as needed)
main_url = "https://www.sgcarmart.com/used-cars/listing?pr1=45001&pr2=50000&cat=18"

# Configure headers to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
}

# Fetch the page content
response = requests.get(main_url)
soup = BeautifulSoup(response.text, "html.parser")


In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import re

driver = webdriver.Chrome()
driver.get("https://www.sgcarmart.com/used-cars/listing?pr1=45001&pr2=50000&cat=18")

# wait for the JS to render the listings
time.sleep(5)

soup = BeautifulSoup(driver.page_source, "html.parser")
listing_divs = soup.find_all("div", id=re.compile("^listing_\\d+$"))
print([d["id"] for d in listing_divs])

driver.quit()


In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import re

driver = webdriver.Chrome()
driver.get("https://www.sgcarmart.com/used-cars/listing?pr1=45001&pr2=50000&cat=18")

# Wait for JS to render listings
time.sleep(5)

# Parse rendered HTML
soup = BeautifulSoup(driver.page_source, "html.parser")

# Find all divs with ids like listing_0, listing_1, ...
listing_divs = soup.find_all("div", id=re.compile(r"^listing_\d+$"))

all_links = []

for div in listing_divs:
    # Find all <a> tags with the desired class
    links = div.find_all("a", class_="styles_text_link__wBaHL")
    for a in links:
        href = a.get("href")
        if href and href.startswith("https"):
            all_links.append(href)

driver.quit()

# Remove duplicates if any
all_links = list(set(all_links))

# Print or save results
for link in all_links:
    print(link)


https://www.sgcarmart.com/used-cars/info/bmw-x3-sdrive20i-m-sport-1434792/?dl=1079
https://www.sgcarmart.com/used-cars/info/bmw-1-series-116d-1434055/?dl=2633&utm_content=SLeligible
https://www.sgcarmart.com/used-cars/info/lexus-es250-sunroof-1430189/?dl=2277&utm_content=SLeligible
https://www.sgcarmart.com/used-cars/info/honda-odyssey-24a-absolute-1445575/?dl=1009
https://www.sgcarmart.com/used-cars/info/lexus-es250-luxury-sunroof-1445530/?dl=4073
https://www.sgcarmart.com/used-cars/info/toyota-sienta-hybrid-15a-1440175/?dl=1198
https://www.sgcarmart.com/used-cars/info/mercedes-benz-a-class-a180-style-1445110/?dl=3863
https://www.sgcarmart.com/used-cars/info/audi-a3-sedan-10a-1436071/?dl=4645&utm_content=SLeligible
https://www.sgcarmart.com/used-cars/info/bmw-x3-sdrive20i-sunroof-1445797/?dl=4935
https://www.sgcarmart.com/used-cars/info/kia-niro-hybrid-16a-1445813/?dl=3414
https://www.sgcarmart.com/used-cars/info/kia-cerato-16a-ex-1435258/?dl=4636&utm_content=SLeligible
https://www.sg

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re, time

URL = "https://www.sgcarmart.com/used-cars/listing?pr1=30001&pr2=50000&cts[]=18&vts[]=12&vts[]=13&vts[]=9&vts[]=10&vts[]=11&vts[]=8&vts[]=7&vts[]=3&vts[]=2&mil1=&mil2=100000&own_c=%3C&own=3&color[]=white&color[]=black&color[]=red&color[]=grey&color[]=silver&avl=a&ord=PRC_ASC"

def extract_links_from_page(html):
    soup = BeautifulSoup(html, "html.parser")
    listing_divs = soup.find_all("div", id=re.compile(r"^listing_\d+$"))
    links = []
    for div in listing_divs:
        for a in div.find_all("a", class_="styles_text_link__wBaHL"):
            href = a.get("href")
            if href and href.startswith("https"):
                links.append(href)
    return links

# --- driver setup (headless optional) ---
options = webdriver.ChromeOptions()
# options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)

driver.get(URL)

all_links = set()
page_idx = 1
MAX_PAGES = 20  # safety guard

while page_idx <= MAX_PAGES:
    # wait for listings to appear
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='listing_']")))
    # collect links on this page
    all_links.update(extract_links_from_page(driver.page_source))

    # locate the Next button in the desktop paginator
    # the hashed class names can change, so match by container id + "right_control" substring
    next_btn = driver.find_element(By.CSS_SELECTOR,
        '#desktopPaginationContainer button[class*="right_control"]')

    # stop if Next is disabled
    if "disabled" in next_btn.get_attribute("class") or not next_btn.is_enabled():
        break

    # remember something on current page to detect change
    old_first = driver.find_element(By.CSS_SELECTOR, "div[id^='listing_']")

    # click next (JS click is more reliable)
    driver.execute_script("arguments[0].click();", next_btn)

    # wait for page to change (old listings become stale)
    wait.until(EC.staleness_of(old_first))
    page_idx += 1

driver.quit()

# results
for href in sorted(all_links):
    print(href)
print(f"\nTotal unique links: {len(all_links)}")


https://www.sgcarmart.com/used-cars/info/audi-a3-sedan-10a-1434872/?dl=3597&utm_content=SLeligible
https://www.sgcarmart.com/used-cars/info/audi-a3-sedan-10a-1439396/?dl=3382
https://www.sgcarmart.com/used-cars/info/audi-a3-sedan-10a-1440081/?dl=3597
https://www.sgcarmart.com/used-cars/info/audi-a4-14a-tfsi-1407079/?dl=1000&utm_content=SLeligible
https://www.sgcarmart.com/used-cars/info/audi-a4-14a-tfsi-1437136/?dl=3163&utm_content=SLeligible
https://www.sgcarmart.com/used-cars/info/audi-a6-18a-tfsi-1440002/?dl=2943
https://www.sgcarmart.com/used-cars/info/audi-q2-10a-tfsi-1435448/?dl=4769&utm_content=SLeligible
https://www.sgcarmart.com/used-cars/info/bmw-1-series-116d-1442830/?dl=4919
https://www.sgcarmart.com/used-cars/info/bmw-2-series-216i-1434979/?dl=1418&utm_content=SLeligible
https://www.sgcarmart.com/used-cars/info/bmw-2-series-220i-1124035/?dl=2349
https://www.sgcarmart.com/used-cars/info/bmw-3-series-318i-1429396/?dl=3758&utm_content=SLeligible
https://www.sgcarmart.com/used

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re

def collect_listing_links(main_url, max_pages=20):
    """
    Crawl all paginated listing links from sgcarmart Used Cars search results pages.

    Args:
        main_url (str): The URL of the first result page to start scraping from.
        max_pages (int): Maximum number of pages to paginate through (guard against infinite loops).

    Returns:
        Set[str]: A set containing all unique car listing URLs found.
    """
    def extract_links_from_page(html):
        soup = BeautifulSoup(html, "html.parser")
        listing_divs = soup.find_all("div", id=re.compile(r"^listing_\d+$"))
        links = []
        for div in listing_divs:
            for a in div.find_all("a", class_="styles_text_link__wBaHL"):
                href = a.get("href")
                if href and href.startswith("https"):
                    links.append(href)
        return links

    options = webdriver.ChromeOptions()
    # Uncomment if running on server: options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 20)

    driver.get(main_url)

    all_links = set()
    page_idx = 1

    try:
        while page_idx <= max_pages:
            # Wait for listings to load
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='listing_']")))
            # Collect links from current page
            all_links.update(extract_links_from_page(driver.page_source))

            # Find next button in paginator (desktop)
            next_btn = driver.find_element(By.CSS_SELECTOR,
                '#desktopPaginationContainer button[class*="right_control"]')

            # Stop if Next is disabled
            if "disabled" in next_btn.get_attribute("class") or not next_btn.is_enabled():
                break

            # Remember a listing element to detect page change
            old_first = driver.find_element(By.CSS_SELECTOR, "div[id^='listing_']")

            # Click Next (using JS for reliability)
            driver.execute_script("arguments[0].click();", next_btn)

            # Wait for listings to change (old page should be stale)
            wait.until(EC.staleness_of(old_first))
            page_idx += 1
    finally:
        driver.quit()

    return all_links


main_url = "https://www.sgcarmart.com/used-cars/listing?pr1=30001&pr2=50000&cts[]=18&vts[]=12&vts[]=13&vts[]=9&vts[]=10&vts[]=11&vts[]=8&vts[]=7&vts[]=3&vts[]=2&mil1=&mil2=100000&own_c=%3C&own=3&color[]=white&color[]=black&color[]=red&color[]=grey&color[]=silver&avl=a&ord=PRC_ASC"
collect_listing_links(main_url, max_pages = 5)


{'https://www.sgcarmart.com/used-cars/info/audi-a3-sedan-10a-1439396/?dl=3382',
 'https://www.sgcarmart.com/used-cars/info/bmw-1-series-116d-1442830/?dl=4919',
 'https://www.sgcarmart.com/used-cars/info/bmw-x1-sdrive18i-1434526/?dl=1000',
 'https://www.sgcarmart.com/used-cars/info/bmw-x1-sdrive20i-1442150/?dl=4413',
 'https://www.sgcarmart.com/used-cars/info/bmw-x3-sdrive20i-sunroof-1384377/?dl=4536',
 'https://www.sgcarmart.com/used-cars/info/citroen-ds3-cabrio-12a-1428853/?dl=1034&utm_content=SLeligible',
 'https://www.sgcarmart.com/used-cars/info/citroen-grand-c4-picasso-1441181/?dl=4815',
 'https://www.sgcarmart.com/used-cars/info/honda-accord-20a-vti-s-1409448/?dl=1079',
 'https://www.sgcarmart.com/used-cars/info/honda-accord-20a-vti-s-1443001/?dl=2098',
 'https://www.sgcarmart.com/used-cars/info/honda-civic-15a-vtec-1436871/?dl=8008',
 'https://www.sgcarmart.com/used-cars/info/honda-civic-16a-vti-1424147/?dl=2296&utm_content=SLeligible',
 'https://www.sgcarmart.com/used-cars/info

## creating new columns for analysis

In [186]:
import pandas as pd

data_dir = './data/'
filename = 'carlist_20251025.csv'
df = pd.read_csv(data_dir + filename)

In [187]:
df[df['transmission'].isnull()].to_csv(data_dir + 'removed_rows.csv', index = True)

In [188]:
df = df[~df['transmission'].isnull()].copy(deep = True)

df['reg_date'] = pd.to_datetime(df['reg_date'], format='%d-%b-%y')

In [189]:
from datetime import datetime

today = pd.Timestamp(datetime.today().date())

def years_months_left(reg_date, lifespan_years=10):
    # Calculate end-of-lifespan date
    end_date = reg_date + pd.DateOffset(years=lifespan_years)
    delta = end_date - today
    if delta.days < 0:
        return "Expired"
    years = delta.days // 365
    months = (delta.days % 365) // 30
    return f"{years} yr {months} mth"

df['years_months_left'] = df['reg_date'].apply(years_months_left)

In [190]:
df.sort_values('years_months_left', ascending=False)

Unnamed: 0,price,mileage,dealer,reg_date,carmodel,type_of_vehicle,url,transmission,fuel_type,curb_weight,power,road_tax,coe,omv,arf,owners,dereg_value,engine_cap,years_months_left
208,49888,91000,Tos Auto Pte Ltd,2019-05-13,Renault Grand Scenic Diesel 1.5A dCi,MPV,https://www.sgcarmart.com/used-cars/info/renau...,Auto,Diesel,1540.0,81.0,1048.0,29159.0,25440.0,17616.0,2.0,21796.0,1461.0,3 yr 6 mth
12,45800,83400,Car Design Motor,2019-04-30,Mitsubishi Attrage 1.2A Sports,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,26659.0,13515.0,5000.0,2.0,12614.0,1193.0,3 yr 6 mth
28,45800,83400,Car Design Motor,2019-04-30,Mitsubishi Attrage 1.2A Sports,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,26659.0,13515.0,5000.0,2.0,12614.0,1193.0,3 yr 6 mth
221,44888,88000,Platinum Motoring,2019-04-23,Mitsubishi Attrage 1.2A,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,26309.0,13554.0,5000.0,2.0,12440.0,1193.0,3 yr 6 mth
517,44888,88000,Platinum Motoring,2019-04-23,Mitsubishi Attrage 1.2A,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,26309.0,13554.0,5000.0,2.0,12440.0,1193.0,3 yr 6 mth
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,34800,87634,Carteala,2016-10-12,BMW X1 sDrive20i,SUV,https://www.sgcarmart.com/used-cars/info/bmw-x...,Auto,Petrol,1485.0,141.0,1210.0,55501.0,33667.0,39134.0,2.0,24904.0,1998.0,0 yr 11 mth
332,32800,76000,D Cars Pte Ltd,2016-08-31,Volkswagen Golf GTI 5DR Sunroof,Sports Car,https://www.sgcarmart.com/used-cars/info/volks...,Auto,Petrol,1370.0,162.0,1194.0,56500.0,26999.0,29799.0,1.0,19683.0,1984.0,0 yr 10 mth
109,45800,97383,Starise Automobile,2016-08-25,Mercedes-Benz C-Class C250 AMG Line,Luxury Sedan,https://www.sgcarmart.com/used-cars/info/merce...,Auto,Petrol,1480.0,155.0,1202.0,57010.0,45559.0,55783.0,1.0,32624.0,1991.0,0 yr 10 mth
36,32800,76000,D Cars Pte Ltd,2016-08-31,Volkswagen Golf GTI 5DR Sunroof,Sports Car,https://www.sgcarmart.com/used-cars/info/volks...,Auto,Petrol,1370.0,162.0,1194.0,56500.0,26999.0,29799.0,1.0,19683.0,1984.0,0 yr 10 mth


In [191]:
import pandas as pd
import numpy as np
import re
import yaml

# ---- helpers ----
def parse_money(x):
    x = re.sub(r"[^0-9.]", "", str(x)) if pd.notna(x) else ""
    return float(x) if x else np.nan

# ---- load PQP params ----
with open("params.yaml", "r") as f:
    params = yaml.safe_load(f)

# default COE category (A/B/C etc.)
default_cat = params["pqp"].get("default_cat", "A")
PQP10 = float(params["pqp"]["categories"][default_cat]["ten_year"])
PQP5  = float(params["pqp"]["categories"][default_cat]["five_year"])

# ---- extract only what we need ----
df["ARF_val"] = df["arf"].apply(parse_money)

# (1) Dereg at COE finish → 50% of ARF (PARF rebate)
df["dereg_val_at_10y"] = (df["ARF_val"] * 0.5).round(0)

# (2) Renewal costs (from YAML)
df["pqp_est_10y"] = PQP10
df["pqp_est_5y"]  = PQP5

# (3) Extend net values
# How much you'd "gain or lose" compared to what you'll get back at 10 years
df["extend_net_value_10y"] = df["dereg_val_at_10y"] - df["pqp_est_10y"]
df["extend_net_value_5y"]  = df["dereg_val_at_10y"] - df["pqp_est_5y"]

# (4) calculate the consumption worth till coe finishes
df['cost_minus_dereg'] = df['price'] - df['dereg_val_at_10y']
# monthly consumption worth 
df['monthly_consumption_worth'] = df['cost_minus_dereg'] / (df['years_months_left'].apply(lambda x: int(x.split()[0])*12 + int(x.split()[2])))


df.sort_values('extend_net_value_10y', ascending=False)

Unnamed: 0,price,mileage,dealer,reg_date,carmodel,type_of_vehicle,url,transmission,fuel_type,curb_weight,power,road_tax,coe,omv,arf,owners,dereg_value,engine_cap,years_months_left,ARF_val,dereg_val_at_10y,pqp_est_10y,pqp_est_5y,extend_net_value_10y,extend_net_value_5y,cost_minus_dereg,monthly_consumption_worth
481,44888,92000,Garwen Motoring,2016-01-28,Jaguar XF 2.0A TSS,Luxury Sedan,https://www.sgcarmart.com/used-cars/info/jagua...,Auto,Petrol,1660.0,177.0,1212.0,50089.0,54979.0,70963.0,1.0,36771.0,1999.0,0 yr 3 mth,70963.0,35482.0,122000.0,61000.0,-86518.0,-25518.0,9406.0,3135.333333
185,44888,92000,Garwen Motoring,2016-01-28,Jaguar XF 2.0A TSS,Luxury Sedan,https://www.sgcarmart.com/used-cars/info/jagua...,Auto,Petrol,1660.0,177.0,1212.0,50089.0,54979.0,70963.0,1.0,36771.0,1999.0,0 yr 3 mth,70963.0,35482.0,122000.0,61000.0,-86518.0,-25518.0,9406.0,3135.333333
290,49000,30000,Clutch Auto Pte Ltd,2016-01-11,Mercedes-Benz E-Class E250 AMG Edition E Sunroof,Luxury Sedan,https://www.sgcarmart.com/used-cars/info/merce...,Auto,Petrol,1680.0,155.0,1202.0,60001.0,52959.0,67327.0,2.0,34929.0,1991.0,0 yr 2 mth,67327.0,33664.0,122000.0,61000.0,-88336.0,-27336.0,15336.0,7668.000000
586,49000,30000,Clutch Auto Pte Ltd,2016-01-11,Mercedes-Benz E-Class E250 AMG Edition E Sunroof,Luxury Sedan,https://www.sgcarmart.com/used-cars/info/merce...,Auto,Petrol,1680.0,155.0,1202.0,60001.0,52959.0,67327.0,2.0,34929.0,1991.0,0 yr 2 mth,67327.0,33664.0,122000.0,61000.0,-88336.0,-27336.0,15336.0,7668.000000
294,47000,96000,Autohaven Pte Ltd,2016-04-26,Mercedes-Benz E-Class E250 CGI Coupe,Sports Car,https://www.sgcarmart.com/used-cars/info/merce...,Auto,Petrol,1635.0,155.0,1202.0,46502.0,47726.0,58817.0,1.0,31727.0,1991.0,0 yr 6 mth,58817.0,29408.0,122000.0,61000.0,-92592.0,-31592.0,17592.0,2932.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,46800,69000,Happy Motoring Pte Ltd,2018-12-28,Nissan Note 1.2A,Hatchback,https://www.sgcarmart.com/used-cars/info/nissa...,Auto,Petrol,1048.0,58.0,508.0,23568.0,13438.0,5000.0,1.0,10734.0,1198.0,3 yr 2 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,44300.0,1165.789474
551,39000,86233,Vin,2017-08-21,Toyota Sienta Hybrid 1.5A X,MPV,https://www.sgcarmart.com/used-cars/info/toyot...,Auto,Petrol-Electric,1380.0,73.0,682.0,46778.0,26750.0,5000.0,2.0,11260.0,1496.0,1 yr 10 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,36500.0,1659.090909
541,31000,94000,Monster Motors Alliance Pte Ltd,2017-10-12,Honda Shuttle Hybrid 1.5A,Stationwagon,https://www.sgcarmart.com/used-cars/info/honda...,Auto,Petrol-Electric,1190.0,101.0,682.0,48109.0,20217.0,5000.0,2.0,12187.0,1496.0,1 yr 11 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,28500.0,1239.130435
20,38800,98070,VINCAR Pte Ltd,2018-11-30,Mitsubishi Attrage 1.2A,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,28199.0,13355.0,5000.0,1.0,11988.0,1193.0,3 yr 1 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,36300.0,981.081081


In [168]:
df[df['years_months_left'] > '2 yr 0 mth'].sort_values(
    ['years_months_left', 'dereg_val_at_10y', 'monthly_consumption_worth'],
    ascending=[False, False, True]
).head(20)

Unnamed: 0,price,mileage,dealer,reg_date,carmodel,type_of_vehicle,url,transmission,fuel_type,curb_weight,power,road_tax,coe,omv,arf,owners,dereg_value,engine_cap,years_months_left,ARF_val,dereg_val_at_10y,pqp_est_10y,pqp_est_5y,extend_net_value_10y,extend_net_value_5y,cost_minus_dereg,monthly_consumption_worth
208,49888,91000,Tos Auto Pte Ltd,2019-05-13,Renault Grand Scenic Diesel 1.5A dCi,MPV,https://www.sgcarmart.com/used-cars/info/renau...,Auto,Diesel,1540.0,81.0,1048.0,29159.0,25440.0,17616.0,2.0,21796.0,1461.0,3 yr 6 mth,17616.0,8808.0,122000.0,61000.0,-113192.0,-52192.0,41080.0,978.095238
504,49888,91000,Tos Auto Pte Ltd,2019-05-13,Renault Grand Scenic Diesel 1.5A dCi,MPV,https://www.sgcarmart.com/used-cars/info/renau...,Auto,Diesel,1540.0,81.0,1048.0,29159.0,25440.0,17616.0,2.0,21796.0,1461.0,3 yr 6 mth,17616.0,8808.0,122000.0,61000.0,-113192.0,-52192.0,41080.0,978.095238
221,44888,88000,Platinum Motoring,2019-04-23,Mitsubishi Attrage 1.2A,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,26309.0,13554.0,5000.0,2.0,12440.0,1193.0,3 yr 6 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,42388.0,1009.238095
517,44888,88000,Platinum Motoring,2019-04-23,Mitsubishi Attrage 1.2A,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,26309.0,13554.0,5000.0,2.0,12440.0,1193.0,3 yr 6 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,42388.0,1009.238095
12,45800,83400,Car Design Motor,2019-04-30,Mitsubishi Attrage 1.2A Sports,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,26659.0,13515.0,5000.0,2.0,12614.0,1193.0,3 yr 6 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,43300.0,1030.952381
28,45800,83400,Car Design Motor,2019-04-30,Mitsubishi Attrage 1.2A Sports,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,26659.0,13515.0,5000.0,2.0,12614.0,1193.0,3 yr 6 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,43300.0,1030.952381
324,45800,83400,Car Design Motor,2019-04-30,Mitsubishi Attrage 1.2A Sports,Mid-Sized Sedan,https://www.sgcarmart.com/used-cars/info/mitsu...,Auto,Petrol,940.0,59.0,506.0,26659.0,13515.0,5000.0,2.0,12614.0,1193.0,3 yr 6 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,43300.0,1030.952381
260,49800,52000,2,2019-03-28,Citroen C4 Cactus 1.2A PureTech,SUV,https://www.sgcarmart.com/used-cars/info/citro...,Auto,Petrol,1050.0,81.0,508.0,26659.0,17603.0,17603.0,1.0,20564.0,1199.0,3 yr 5 mth,17603.0,8802.0,122000.0,61000.0,-113198.0,-52198.0,40998.0,999.95122
556,49800,52000,2,2019-03-28,Citroen C4 Cactus 1.2A PureTech,SUV,https://www.sgcarmart.com/used-cars/info/citro...,Auto,Petrol,1050.0,81.0,508.0,26659.0,17603.0,17603.0,1.0,20564.0,1199.0,3 yr 5 mth,17603.0,8802.0,122000.0,61000.0,-113198.0,-52198.0,40998.0,999.95122
287,46500,82200,Iterly Motor Pte Ltd,2019-03-26,Nissan Note 1.2A,Hatchback,https://www.sgcarmart.com/used-cars/info/nissa...,Auto,Petrol,1048.0,58.0,508.0,26659.0,13467.0,5000.0,1.0,12358.0,1198.0,3 yr 5 mth,5000.0,2500.0,122000.0,61000.0,-119500.0,-58500.0,44000.0,1073.170732


# Final

## main url to pull all possible links to suitable car types

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re

def collect_listing_links(main_url, max_pages=20):
    """
    Crawl all paginated listing links from sgcarmart Used Cars search results pages.

    Args:
        main_url (str): The URL of the first result page to start scraping from.
        max_pages (int): Maximum number of pages to paginate through (guard against infinite loops).

    Returns:
        Set[str]: A set containing all unique car listing URLs found.
    """
    def extract_links_from_page(html):
        soup = BeautifulSoup(html, "html.parser")
        listing_divs = soup.find_all("div", id=re.compile(r"^listing_\d+$"))
        links = []
        for div in listing_divs:
            for a in div.find_all("a", class_="styles_text_link__wBaHL"):
                href = a.get("href")
                if href and href.startswith("https"):
                    links.append(href)
        return links

    options = webdriver.ChromeOptions()
    # Uncomment if running on server: options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 20)

    driver.get(main_url)

    all_links = set()
    page_idx = 1

    try:
        while page_idx <= max_pages:
            # Wait for listings to load
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='listing_']")))
            # Collect links from current page
            all_links.update(extract_links_from_page(driver.page_source))

            # Find next button in paginator (desktop)
            next_btn = driver.find_element(By.CSS_SELECTOR,
                '#desktopPaginationContainer button[class*="right_control"]')

            # Stop if Next is disabled
            if "disabled" in next_btn.get_attribute("class") or not next_btn.is_enabled():
                break

            # Remember a listing element to detect page change
            old_first = driver.find_element(By.CSS_SELECTOR, "div[id^='listing_']")

            # Click Next (using JS for reliability)
            driver.execute_script("arguments[0].click();", next_btn)

            # Wait for listings to change (old page should be stale)
            wait.until(EC.staleness_of(old_first))
            page_idx += 1
    finally:
        driver.quit()

    return all_links

## Single car pulling

In [None]:
link = 'https://www.sgcarmart.com/used-cars/info/audi-a3-sedan-10a-1439396/?dl=3382'

# Configure headers to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
}

# Make the request
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the <script> tag containing relevant data
script = next(
    (tag for tag in soup.find_all("script")
        if "success" in tag.text and "coe" in tag.text and "depreciation" in tag.text),
    None
)

def clean_js_escapes(s):
        # Decode unicode escapes and remove unnecessary backslashes from quotes etc.
        s = s.encode('utf-8').decode('unicode_escape')
        s = s.replace('\\/', '/')
        s = s.replace('\\\\', '\\')
        return s

cleaned_script10_str = clean_js_escapes(script.text if hasattr(script, 'text') else script)


In [None]:
def get_regdate(cleaned_script10_str):
        match = re.search(r'"reg_date"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
        if match:
            return match.group(1)
        return None
    
    
get_carmodel(str(cleaned_script10_str))

'16-Nov-2020'

In [142]:
def extract_sgcarmart_car_details(url):
    import requests
    from bs4 import BeautifulSoup
    import re
    import json

    # Configure headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
    }

    # Make the request
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the <script> tag containing relevant data
    script = next(
        (tag for tag in soup.find_all("script")
         if "success" in tag.text and "coe" in tag.text and "depreciation" in tag.text),
        None
    )
    
    def clean_value(key, value):
        """
        Clean up the extracted value for any of the wanted keys.
        For numeric/currency-like values: remove non-numeric, non-dot, non-minus chars.
        For others: just strip whitespace.
        """
        # Always try to normalize money/numbers (contains $ or numbers or unit suffix)
        # If it contains digits, possibly with currency symbols or known suffixes, try to extract the main number
        if any(char.isdigit() for char in value):
            # Remove currency markers ($), commas, spaces, slashes, "yr", "km", "cc", etc.
            clean = value
            # Remove "$", ",", spaces
            clean = re.sub(r'[,$]', '', clean)
            # Remove common unit suffixes/spaces (will still keep decimals and / where relevant)
            clean = re.sub(r'\s*(/yr|yr|km|cc|as of today|/)', '', clean, flags=re.IGNORECASE)
            # Remove any leftover non-numeric except dot
            clean = re.sub(r'[^\d.]', '', clean)
            return clean
        else:
            return value.strip()
    
    def clean_js_escapes(s):
        # Decode unicode escapes and remove unnecessary backslashes from quotes etc.
        s = s.encode('utf-8').decode('unicode_escape')
        s = s.replace('\\/', '/')
        s = s.replace('\\\\', '\\')
        return s

    cleaned_script10_str = clean_js_escapes(script.text if hasattr(script, 'text') else script)

    def get_carmodel(cleaned_script10_str):
        match = re.search(r'"car_model"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
        if match:
            return match.group(1)
        return None

    def get_regdate(cleaned_script10_str):
        match = re.search(r'"reg_date"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE)
        if match:
            return match.group(1)
        return None
    
    def get_type_of_vehicle(cleaned_script10_str):
        # Handles both object and string representations
        dict_match = re.search(
            r'"type_of_vehicle"\s*:\s*\{(.*?)\}', cleaned_script10_str, re.IGNORECASE | re.DOTALL
        )
        if dict_match:
            dict_str = '{' + dict_match.group(1) + '}'
            dict_str = re.sub(r',\s*\}$', '}', dict_str)
            try:
                dict_str_cleaned = dict_str.replace('\\"', '"').replace("'", '"')
                if dict_str_cleaned.count('{') != dict_str_cleaned.count('}'):
                    dict_str_cleaned = dict_str_cleaned + '}'
                type_obj = json.loads(dict_str_cleaned)
                if isinstance(type_obj, dict) and 'text' in type_obj:
                    return type_obj['text'].strip()
                return type_obj
            except Exception:
                pass
        simple_match = re.search(
            r'"type_of_vehicle"\s*:\s*"([^"]+)"', cleaned_script10_str, re.IGNORECASE
        )
        if simple_match:
            return simple_match.group(1).strip()
        simple_unquoted = re.search(
            r'"type_of_vehicle"\s*:\s*([^,"\}\r\n]+)', cleaned_script10_str, re.IGNORECASE
        )
        if simple_unquoted:
            return simple_unquoted.group(1).strip()
        return None

    def extract_car_details(cleaned_script10_str, clean_value_func):
        wanted_keys = [
            "price","Transmission", "Fuel Type", "Engine Capacity", "Curb Weight", "Power",
            "Road Tax", "Deregistration Value", "COE", "OMV", "ARF",
            "mileage", "owners", "dealer", "dereg_value", "engine_cap"
        ]
        wanted_keys_small = [k.replace(" ", "_").lower() for k in wanted_keys]
        results_smallcaps_cleaned = {}
        for orig, key in zip(wanted_keys, wanted_keys_small):
            pat = re.compile(
                r'["\']?' + re.escape(key) + r'["\']?\s*:\s*["\']?([^"\'}<\n]+)',
                re.IGNORECASE
            )
            if key == "mileage":
                matches = pat.findall(cleaned_script10_str)
                if len(matches) >= 2:
                    value = matches[1].strip()
                elif len(matches) == 1:
                    value = matches[0].strip()
                else:
                    value = None
                if value is not None:
                    value_noparens = re.sub(r'\([^\)]*\)', '', value).strip()
                    clean = clean_value_func(key, value_noparens)
                    results_smallcaps_cleaned[key] = clean
            else:
                match = pat.search(cleaned_script10_str)
                if match:
                    value = match.group(1).strip()
                    value_noparens = re.sub(r'\([^\)]*\)', '', value).strip()
                    clean = clean_value_func(key, value_noparens)
                    results_smallcaps_cleaned[key] = clean
        return results_smallcaps_cleaned

    carmodel = get_carmodel(cleaned_script10_str)
    type_of_vehicle = get_type_of_vehicle(cleaned_script10_str)
    regdate = get_regdate(cleaned_script10_str)
    
    results_smallcaps_cleaned = extract_car_details(cleaned_script10_str, clean_value)
    results_smallcaps_cleaned['reg_date'] = regdate
    results_smallcaps_cleaned['carmodel'] = carmodel
    results_smallcaps_cleaned['type_of_vehicle'] = type_of_vehicle
    results_smallcaps_cleaned['url'] = url
    
    
    return results_smallcaps_cleaned

In [143]:
details = extract_sgcarmart_car_details('https://www.sgcarmart.com/used-cars/info/audi-a3-sedan-10a-1439396/?dl=3382')
details

{'price': '37998',
 'transmission': 'Auto',
 'fuel_type': 'Petrol',
 'curb_weight': '1280',
 'power': '85.0',
 'road_tax': '392',
 'coe': '41761',
 'omv': '24479',
 'arf': '16271',
 'mileage': '91030',
 'owners': '2',
 'dealer': 'CarQuotes SG',
 'dereg_value': '17244',
 'engine_cap': '999',
 'reg_date': '21-Oct-2017',
 'carmodel': 'Audi A3 Sedan 1.0A TFSI S-tronic',
 'type_of_vehicle': 'Luxury Sedan',
 'url': 'https://www.sgcarmart.com/used-cars/info/audi-a3-sedan-10a-1439396/?dl=3382'}

## Reading dataframe and appending

In [145]:
def get_or_append_carlist_df(details, data_folder="data"):
    """
    If a CSV file exists in the data_folder, append the new details as a row.
    If none exists, create a new one with the first details.
    """
    import os
    import pandas as pd
    from datetime import datetime

    os.makedirs(data_folder, exist_ok=True)
    csv_files = [f for f in os.listdir(data_folder) if f.startswith("carlist_") and f.endswith(".csv")]

    if csv_files:
        # Use most recent file
        csv_files_sorted = sorted(
            csv_files,
            key=lambda fn: fn.split("_")[1].replace(".csv", ""),
            reverse=True
        )
        csv_path = os.path.join(data_folder, csv_files_sorted[0])
        df = pd.read_csv(csv_path)

        # Append new details
        df_new = pd.DataFrame([details])
        df = pd.concat([df, df_new], ignore_index=True)
        df.to_csv(csv_path, index=False)
    else:
        # No CSV exists, create new one
        timestamp = datetime.now().strftime("%Y%m%d")
        csv_path = os.path.join(data_folder, f"carlist_{timestamp}.csv")
        df = pd.DataFrame([details])
        df.to_csv(csv_path, index=False)

    return df

get_or_append_carlist_df(details)

Unnamed: 0,price,transmission,fuel_type,curb_weight,power,road_tax,coe,omv,arf,mileage,owners,dealer,dereg_value,engine_cap,reg_date,carmodel,type_of_vehicle,url
0,37998,Auto,Petrol,1280,85.0,392,41761,24479,16271,91030,2,CarQuotes SG,17244,999,21-Oct-2017,Audi A3 Sedan 1.0A TFSI S-tronic,Luxury Sedan,https://www.sgcarmart.com/used-cars/info/audi-...


# Pipeline

In [None]:
from tqdm import tqdm

main_url = "https://www.sgcarmart.com/used-cars/listing?pr1=30001&pr2=50000&cts[]=18&vts[]=12&vts[]=13&vts[]=9&vts[]=10&vts[]=11&vts[]=8&vts[]=7&vts[]=3&vts[]=2&mil1=&mil2=100000&own_c=%3C&own=3&color[]=white&color[]=black&color[]=red&color[]=grey&color[]=silver&avl=a&ord=PRC_ASC"
list_of_cars = collect_listing_links(main_url, max_pages = 15)

for cars in tqdm(list_of_cars, desc="Processing cars", unit="car"):
    import time

    max_retries = 5
    for attempt in range(max_retries):
        details = extract_sgcarmart_car_details(cars)
        if len(details) == 18:
            break
        time.sleep(1)
    else:
        print(f"Warning: Details for {cars} could not be extracted properly after {max_retries} attempts. Got: {details}")
        
    get_or_append_carlist_df(details)

Processing cars: 100%|██████████| 297/297 [03:41<00:00,  1.34car/s]


In [220]:
import os
import pandas as pd
import numpy as np
import re
import yaml
from datetime import datetime

def process_carlist_data(data_folder='data', params_path='params.yaml'):
    # Get most recent carlist CSV in the folder
    csv_files = [f for f in os.listdir(data_folder) if f.startswith("carlist_") and f.endswith(".csv")]
    if not csv_files:
        raise FileNotFoundError("No carlist CSV files found in the given data folder.")
    csv_files_sorted = sorted(
        csv_files,
        key=lambda fn: fn.split("_")[1].replace(".csv", ""),
        reverse=True
    )
    csv_path = os.path.join(data_folder, csv_files_sorted[0])
    print(f"Processing {csv_path}")
    
    df = pd.read_csv(csv_path)

    df = df[~df['transmission'].isnull()].copy(deep=True)

    # Parse registration date
    df['reg_date'] = pd.to_datetime(df['reg_date'], format='%d-%b-%y')

    today = pd.Timestamp(datetime.today().date())

    def years_months_left(reg_date, lifespan_years=10):
        end_date = reg_date + pd.DateOffset(years=lifespan_years)
        delta = end_date - today
        if delta.days < 0:
            return "Expired"
        years = delta.days // 365
        months = (delta.days % 365) // 30
        return f"{years} yr {months} mth"

    df['years_months_left'] = df['reg_date'].apply(years_months_left)

    # helpers
    def parse_money(x):
        x = re.sub(r"[^0-9.]", "", str(x)) if pd.notna(x) else ""
        return float(x) if x else np.nan

    # Load PQP params from yaml
    with open(params_path, "r") as f:
        params = yaml.safe_load(f)

    default_cat = params["pqp"].get("default_cat", "A")
    PQP10 = float(params["pqp"]["categories"][default_cat]["ten_year"])
    PQP5  = float(params["pqp"]["categories"][default_cat]["five_year"])

    # value calculations
    df["ARF_val"] = df["arf"].apply(parse_money)
    df["dereg_val_at_10y"] = (df["ARF_val"] * 0.5).round(0)
    df["pqp_est_10y"] = PQP10
    df["pqp_est_5y"]  = PQP5
    df["extend_net_value_10y"] = df["dereg_val_at_10y"] - df["pqp_est_10y"]
    df["extend_net_value_5y"]  = df["dereg_val_at_10y"] - df["pqp_est_5y"]
    df['cost_minus_dereg'] = df['price'] - df['dereg_val_at_10y']
    # monthly consumption worth 
    def months_left_string_to_int(x):
        if x == "Expired":
            return np.nan
        try:
            return int(x.split()[0])*12 + int(x.split()[2])
        except Exception:
            return np.nan
    months_left = df['years_months_left'].apply(months_left_string_to_int)
    df['monthly_consumption_worth'] = df['cost_minus_dereg'] / months_left

    df = df.sort_values('extend_net_value_10y', ascending=False).reset_index(drop=True)
    
    df = df.drop_duplicates().reset_index(drop=True)
    
    return df


processed_df = process_carlist_data()

Processing data\carlist_20251025.csv
