In [1]:
# import libraries
from bs4 import BeautifulSoup
import requests as req
import pickle
import re
import pandas as pd
import numpy as np
import functools
import operator
import datetime
import os

In [2]:
# define a function which scrape url based on number of pages

# subfunction 1
def get_carModelUrl(url):
    html = req.get(url)
    soup = BeautifulSoup(html.content, 'lxml')
    href = soup.body.find('div', {'id':'content'}).find('form', {'name':'searchform'}).next_sibling.next_sibling.find_all(href=re.compile(r"(^(info.php).*)"), string=True)
    return href

# subfunction 2
def get_keypairs(hrefString):
    
    idValue = re.search("(?<=\?ID=)\d+(?=\&)", str(hrefString))
    dlValue = re.search("(?<=\;DL=)\d+(?=(\"|\&))", str(hrefString))
    
    return idValue[0].strip(), dlValue[0].strip()

# main function 1: get the webpage url
def scrape_url(page):
    
    #convert page to list of car per page
    pageToList = [(i+1)*100 for i in range(page)]
    
    #create BeautifulSoup object
    urlList = [f"https://www.sgcarmart.com/used_cars/listing.php?BRSR={car}&RPG=100" for car in pageToList]
    carModelUrlList = list(map(get_carModelUrl, urlList))
    
    #flatten the list
    carModelUrlList = functools.reduce(operator.iconcat, carModelUrlList, [])
    
    #get keypairs(id-dl)
    keypairsList = list(map(get_keypairs, carModelUrlList))
    
    #reformat the webpage url
    carModelUrlList = list(map(lambda x: f"https://www.sgcarmart.com/used_cars/info.php?ID={x[0]}&DL={x[1]}", keypairsList))
    
    return carModelUrlList

In [20]:
# run main function 1: get webpage url based o
carModelUrls = scrape_url(20)
%time

CPU times: total: 0 ns
Wall time: 719 µs


In [3]:
# define a function to scrape features from SGCar Mart
def feature_scraping(url):
    print(url)
    # Get the html page
    html = req.get(url)
    soup = BeautifulSoup(html.content, 'lxml')
    filename = f"Y:/Documents/GitHub/CarSmartConsultancy/Data/scrape_data/html_files/car_html_content_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.html"
    with open(filename, "w") as f:
        # traverse paragraphs from soup
        f.write(str(soup))
    return soup

In [22]:
features = list(map(feature_scraping, carModelUrls))

https://www.sgcarmart.com/used_cars/info.php?ID=1166804&DL=3403
https://www.sgcarmart.com/used_cars/info.php?ID=1146395&DL=3403
https://www.sgcarmart.com/used_cars/info.php?ID=1190809&DL=2189
https://www.sgcarmart.com/used_cars/info.php?ID=1190807&DL=3562
https://www.sgcarmart.com/used_cars/info.php?ID=1190806&DL=4052
https://www.sgcarmart.com/used_cars/info.php?ID=1185439&DL=3946
https://www.sgcarmart.com/used_cars/info.php?ID=1182534&DL=4162
https://www.sgcarmart.com/used_cars/info.php?ID=1182307&DL=4162
https://www.sgcarmart.com/used_cars/info.php?ID=1180925&DL=2410
https://www.sgcarmart.com/used_cars/info.php?ID=1173419&DL=4162
https://www.sgcarmart.com/used_cars/info.php?ID=1132256&DL=3403
https://www.sgcarmart.com/used_cars/info.php?ID=1190805&DL=4225
https://www.sgcarmart.com/used_cars/info.php?ID=1190803&DL=4060
https://www.sgcarmart.com/used_cars/info.php?ID=1188087&DL=3142
https://www.sgcarmart.com/used_cars/info.php?ID=1186449&DL=1264
https://www.sgcarmart.com/used_cars/info

In [4]:
# Functions defined to scrape each features
def get_model(soup):
    model = soup.body.find('div', {'id':'contentblank'}).select('div:nth-of-type(2)')[0].select('div:nth-of-type(1)')[0].text.strip()
    return model

def get_price(soup):
    pr = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).find('tr').find_all('strong')[1]
    pr = str(pr).partition('/')[0].partition('>')[-1].partition('<')[0].strip()
    return pr

def get_depreciation(soup):
    dep = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(2)')[0].select('td:nth-of-type(2)')[0].find_all(string=re.compile("$"))
    dep = dep[0].replace(" ", "").strip().partition('/')[0]
    return dep

def get_registrationDate(soup):
    regDate = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(2)')[0].select('td:nth-of-type(4)')[0].find_all(string=True)
    reg = regDate[0].strip()
    return reg

def get_mileage(soup):
    try:
        mi = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('div:nth-of-type(1)')[0].find('div', {'class':'row_info'}).string
        mi = mi.strip()
        return mi
    except:
        return ''

def get_roadTax(soup):
    try:
        roadTax = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].find('td').select('div:nth-of-type(1)')[0].next_sibling.next_sibling.find('div', {'class':'row_info'}).string
        roadTax = roadTax.strip()
        return roadTax
    except:
        return ''

def get_deregistrationValue(soup):
    try:    
        dereg = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].find('td').select('div:nth-of-type(3)')[0].find('div', {'class':'row_info'}).find(string=re.compile("$"))
        dereg = dereg.partition("as")[0].strip()
        return dereg
    except:
        return ''

def get_coe(soup):
    try:
        coe = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].find('td').select('div:nth-of-type(4)')[0].find('div', {'class':'row_info'}).string
        coe = coe.strip()
        return coe
    except:
        return ''
    
def get_engineCap(soup):
    try:
        eng = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].find('td').select('div:nth-of-type(5)')[0].find('div', {'class':'row_info'}).string
        eng = eng.strip()
        return eng
    except:
        return ''
    
def get_curbWeight(soup):
    try:
        weight = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].find('td').select('div:nth-of-type(6)')[0].find('div', {'class':'row_info'}).string
        weight = weight.strip()
        return weight
    except:
        return ''
    
def get_manufacturedYear(soup):
    try:
        year = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(1)')[0].find('div', {'class':'row_info'}).string
        year = year.strip()
        return year
    except:
        return ''
    
def get_transmission(soup):
    try:
        trans = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].find('div', {'class':'eachInfo'}).next_sibling.next_sibling.find('div', {'class':'row_info'}).string
        trans = trans.strip()
        return trans
    except:
        return ''

def get_omv(soup):
    try:
        tag = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(3)')[0].find('div', {'class':'row_title'}).text.strip()
        if tag == 'OMV':
            omv = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(3)')[0].find('div', {'class':'row_info'}).string
            omv = omv.strip()
        elif tag == 'Fuel Type':
            omv = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(4)')[0].find('div', {'class':'row_info'}).string
            omv = omv.strip()
        else:
            omv = 'NA'
        return omv
    except:
        return ''
    
def get_arf(soup):
    try:
        tag = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(4)')[0].find('div', {'class':'row_title'}).text.strip()
        if tag == 'ARF':
            arf = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(4)')[0].find('div', {'class':'row_info'}).string
            arf = arf.strip()
        elif tag == 'OMV':
            arf = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(5)')[0].find('div', {'class':'row_info'}).string
            arf = arf.strip()
        else:
            arf = 'NA'
        return arf
    except:
        return ''
    
def get_power(soup):
    try:
        tag = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(5)')[0].find('div', {'class':'row_title'}).text.strip()
        if tag == 'Power':
            power = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(5)')[0].find('div', {'class':'row_info'}).string
            power = power.strip()
        elif tag == 'ARF':
            power = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(6)')[0].find('div', {'class':'row_info'}).string
            power = power.strip()
        else:
            power = 'NA'
        return power
    except:
        return ''
    
def get_number_of_owner(soup):
    try:
        tag = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(6)')[0].find('div', {'class':'row_title'}).text.strip()
        if tag == 'No. of Owners':
            owner = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(6)')[0].find('div', {'class':'row_info'}).string
            owner = owner.strip()
        elif tag == 'Power':
            owner = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].select('div:nth-of-type(7)')[0].find('div', {'class':'row_info'}).string
            owner = owner.strip()
        else:
            owner = 'NA'
        return owner
    except:
        return ''
    
def get_type(soup):
    try:
        typ = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(4)')[0].find('a').text
        typ = typ.strip()
    except:
        typ = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(3)')[0].find('td').select('div:nth-of-type(7)')[0].find('div', {'class':'row_info'}).text
        typ = typ.strip()   
    return typ

def get_all_other_info(soup):
    try:
        row = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr')
        # list of other info should return in this sequence [features, accessories, descriptions, category, status]
        other_info = ['NA', 'NA', 'NA', 'NA', 'NA']
    
        # get the rows info
        for i in range(5):
            position = i + 5
            row = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(%d)'%position)[0].find('td').text
            cleaned_row = ' '.join([t for t in (row.replace('\r', '').replace('\n','').strip().split(" ")) if len(t) > 0])
            if (cleaned_row.split(" ")[0] == 'Features'):
                other_info[0] = cleaned_row
            elif (cleaned_row.split(" ")[0] == 'Accessories'):
                other_info[1] = cleaned_row
            elif (cleaned_row.split(" ")[0] == 'Description'):
                other_info[2] = cleaned_row
            elif (cleaned_row.split(" ")[0] == 'Category'):
                other_info[3] = cleaned_row
            elif (cleaned_row.split(" ")[0] == 'Status'):
                other_info[4] = cleaned_row
            
        # somehow category and status data might not be read in do continue with the following
        if (other_info[1] == 'NA'):
            cat = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(7)')[0].find('td').text
            cleaned_cat = ' '.join([t for t in (cat.replace('\r', '').replace('\n','').replace('Status', '').strip().split(" ")) if len(t) > 0])
            other_info[3] = cleaned_cat
        
            stat = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(8)')[0].find('td').text
            cleaned_stat = ' '.join([t for t in (stat.replace('\r', '').replace('\n','').replace('Status', '').strip().split(" ")) if len(t) > 0])
            other_info[4] = cleaned_stat
        else:
            cat = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(8)')[0].find('td').text
            cleaned_cat = ' '.join([t for t in (cat.replace('\r', '').replace('\n','').replace('Status', '').strip().split(" ")) if len(t) > 0])
            other_info[3] = cleaned_cat
        
            stat = soup.body.find('div', {'id':'main_left'}).find('table', {'id':'carInfo'}).select('tr:nth-of-type(9)')[0].find('td').text
            cleaned_stat = ' '.join([t for t in (stat.replace('\r', '').replace('\n','').replace('Status', '').strip().split(" ")) if len(t) > 0])
            other_info[4] = cleaned_stat
        
        return other_info
    except:
        return '', '', '', '', ''


def get_postUpdate_date(soup):
    posted = ''
    updated = ''
    try:
        date = soup.body.find('div', {'id':'usedcar_postdate'}).get_text(strip=True).replace('\xa0', '').replace('\r', '').replace('\n', ''). replace('\t', '')
        posted = re.search('(?<=Posted on:).*?(?=\|)', date)[0].strip()
        updated = re.search('(?<=Updated on:).*', date)[0].strip()
    except:
        pass
    
    return posted, updated

In [6]:
# scrape html data from text file
# read a text file in format of .txt which contains html text
def feature_scraping(filePath):

    # read the file
    with open(filePath, 'r') as file:
        html_content = file.read()
        
    print(filePath)

    # read the html content with BS4
    soup = BeautifulSoup(html_content)

    # scrape standard features
    model = get_model(soup)
    price = get_price(soup)
    depreciation = get_depreciation(soup)
    reg_date = get_registrationDate(soup)
    manufactured_year = get_manufacturedYear(soup)
    mileage = get_mileage(soup)
    road_tax = get_roadTax(soup)
    transmission = get_transmission(soup)
    dereg_value = get_deregistrationValue(soup)
    omv = get_omv(soup)
    coe = get_coe(soup)
    arf = get_arf(soup)
    engine_cap = get_engineCap(soup)
    power = get_power(soup)
    curb_weight = get_curbWeight(soup)
    number_of_owner = get_number_of_owner(soup)
    types = get_type(soup)

    # scrape other features
    other_info  = get_all_other_info(soup)
    features = other_info[0]
    accessories = other_info[1]
    descriptions = other_info[2]
    category = other_info[3]
    status = other_info[4]

    return model, price, depreciation, reg_date, manufactured_year, mileage, road_tax, transmission, dereg_value, omv, coe, arf, engine_cap, power, curb_weight, number_of_owner, types, features, accessories, descriptions, category, status

In [7]:
# scrape html data from text file
# read a text file in format of .txt which contains html text
def feature_scraping(filePath):

    # read the file
    with open(filePath, 'r') as file:
        html_content = file.read()
        
    print(filePath)

    # read the html content with BS4
    soup = BeautifulSoup(html_content)

    # scrape standard features
    model = get_model(soup)
    price = get_price(soup)
    depreciation = get_depreciation(soup)
    reg_date = get_registrationDate(soup)
    manufactured_year = get_manufacturedYear(soup)
    mileage = get_mileage(soup)
    road_tax = get_roadTax(soup)
    transmission = get_transmission(soup)
    dereg_value = get_deregistrationValue(soup)
    omv = get_omv(soup)
    coe = get_coe(soup)
    arf = get_arf(soup)
    engine_cap = get_engineCap(soup)
    power = get_power(soup)
    curb_weight = get_curbWeight(soup)
    number_of_owner = get_number_of_owner(soup)
    types = get_type(soup)

    # scrape other features
    other_info  = get_all_other_info(soup)
    features = other_info[0]
    accessories = other_info[1]
    descriptions = other_info[2]
    category = other_info[3]
    status = other_info[4]
    
    # scrape posted and updated date
    posted_date = get_postUpdate_date(soup)[0]
    updated_date = get_postUpdate_date(soup)[1]

    return model, price, depreciation, reg_date, manufactured_year, mileage, road_tax, transmission, dereg_value, omv, coe, arf, engine_cap, power, curb_weight, number_of_owner, types, features, accessories, descriptions, category, status, posted_date, updated_date

In [10]:
def scrape_directory(directory_path):
    results = []
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.html'):
            file_path = os.path.join(directory_path, file_name)
            if os.path.isfile(file_path):
                result = feature_scraping(file_path)
                results.append(result)
            else:
                print(f'Error: {file_path} is not a valid file path')
    return results

In [11]:
# combine the feature scraping function with the pathList
# NOTE: do replace [path] in the map function with a list of file directory\
directory_path = r'Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\html_files\0417_0418'
%time sgCarMart_features = scrape_directory(directory_path)
#%time sgCarMart_features = list(map(scrape_directory, [directory_path]))

Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\html_files\0417_0418\car_html_content_2023-04-17_22-57-05.html
Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\html_files\0417_0418\car_html_content_2023-04-17_22-55-38.html
Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\html_files\0417_0418\car_html_content_2023-04-18_02-12-22.html
Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\html_files\0417_0418\car_html_content_2023-04-18_01-23-25.html
Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\html_files\0417_0418\car_html_content_2023-04-18_01-21-18.html
Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\html_files\0417_0418\car_html_content_2023-04-17_23-20-39.html
Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\html_files\0417_0418\car_html_content_2023-04-18_00-54-19.html
Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\html_files\0417_0418\car_html_content_2023-04-18_00-56-24.html
Y:\Documents\GitHub\CarSmartConsultancy\

In [13]:
# transform to dataframe
df = pd.DataFrame(sgCarMart_features,
                 columns = ['model', 'price', 'depreciation', 'registration_date', 'manufactured_year', 'mileage', 'road_tax', 'transmission', 'deregistration_value', 'omv', 'coe', 'arf', 'engine_cap', 'power', 'curb_weight', 'number_of_owner', 'types', 'features', 'accessories', 'descriptions', 'category', 'status', 'posted_date', 'updated_date'])
df.head()

Unnamed: 0,model,price,depreciation,registration_date,manufactured_year,mileage,road_tax,transmission,deregistration_value,omv,...,curb_weight,number_of_owner,types,features,accessories,descriptions,category,status,posted_date,updated_date
0,Mercedes-Benz Viano CDI (COE till 09/2027),"$88,000","$19,740",21-Nov-2012,2012,"206,227 km (19.8k /yr)",N.A.,Auto,"$24,523","$46,658",...,"2,280 kg",2,Bus/Mini Bus,,,,,,15-Apr-2023,15-Apr-2023
1,Audi S4 3.0A TFSI Quattro S-tronic (COE till 0...,"$108,800","$17,060",06-Oct-2009,2009,"166,000 km (12.3k /yr)","$3,332 /yr",Auto,"$25,472","$62,346",...,"1,685 kg",More than 6,Sports Car,Features Powerful 3.0L V6 supercharged engine ...,"Accessories Upgraded head unit, leather seats ...",Description Unit super well taken care by curr...,"CategoryCOE Car, Premium Ad Car",Available for sale. Shortlist this car to get ...,15-Apr-2023,15-Apr-2023
2,Subaru Forester 2.0i-L Sunroof,N.A,N.A,28-Apr-2017,2017,N.A.,"$1,208 /yr",Auto,N.A.,"$14,556",...,"1,591 kg",1,SUV,Features 2.0l 4 Cylinders DOHC Horizontally Op...,"Accessories Leather Electric Seats, Panoramic ...",Description Fully agent maintained. Free 3 yea...,CategoryPARF Car,SOLD,24-Mar-2023,16-Apr-2023
3,Toyota Hiace 2.8A DX,"$90,800","$11,580",18-Feb-2021,2020,N.A.,N.A.,Auto,"$26,900","$35,445",...,"1,800 kg",1,Van,Features View specs of the Toyota Hiace,,Description Hiace auto diesel with rear aircon.,CategoryPremium Ad Car,Available for sale. Shortlist this car to get ...,03-Apr-2023,10-Apr-2023
4,Toyota Mark X 2.5A (COE till 05/2026),"$44,000","$14,090",08-Jun-2006,2006,"172,000 km (10.2k /yr)","$2,697 /yr",Auto,"$14,369","$29,495",...,"1,500 kg",4,Luxury Sedan,"Features No dealers, no consignment. Negotiabl...","Accessories 211hp, 260nm torque, 4.1 final dri...","Description Easy maintenance 4GR-FSE, custom i...","CategoryCOE Car, Direct Owner Sale",Available for sale. Shortlist this car to get ...,04-Apr-2023,15-Apr-2023


In [None]:
# write the scraped data to a CSV file
df.to_csv('Y:\Documents\GitHub\CarSmartConsultancy\Data\scrape_data\sgCarMart_features+2dates.csv', index=False)