In [14]:
from selenium import webdriver
import time
import pandas
import csv
from datetime import datetime
from selenium.webdriver.firefox.options import Options

In [2]:
details_mapping = {
    "COLOUR": "EXTERIOR COLOUR",
    "VARIANT": "BADGE",
    "KMS SHOWING": "ODOMETER",
    "MOTOR TYPE": "ENGINE",
    "GEARBOX MAKE/MODEL": "TRANSMISSION",
    "STEERING TYPE": "STEERING",
    "ENGINE": "ENGINE NUMBER",
    "MOTOR TYPE": "ENGINE"}

In [8]:
RESEARCH_BASE_URL = "https://www.slatteryauctions.com.au/search/cars-commercial-vehicles-motorcycles"
EXECUTABLE_PATH = "/home/saronida/lib/geckodriver"

In [15]:
options = Options()
options.add_argument('--headless')

In [16]:
driver = webdriver.Firefox(executable_path=EXECUTABLE_PATH, options=options)
#driver = webdriver.PhantomJS()

In [17]:
driver.set_page_load_timeout(30)
driver.get(RESEARCH_BASE_URL)

In [18]:
time.sleep(10)
item_cards = driver.find_elements_by_class_name("item-card__inner")

In [8]:
len(item_cards)

104

In [9]:
cars_urls = [item_card.find_element_by_tag_name("a").get_attribute("href") for item_card in item_cards]

In [10]:
len(cars_urls)

104

In [11]:
#first_example_url = "https://www.slatteryauctions.com.au/product/4459/950529/2017-lamborghini-aventador-lp-750-4-car-auction/2017-lamborghini-aventador"

In [12]:
#first_example_url

In [13]:
def parse_details(description_details, details, asset_conditions, extra, initial_details):
    """ Parses vehicle's details box. """

    parsed_details = initial_details
    
    for detail in description_details:
        key = detail.split(":")[0]
        value = detail.split(":")[1]
        if value != "":
            parsed_details[key] = value
            
    for detail in details:
        parsed_details[detail.find_element_by_tag_name("th").text] = detail.find_element_by_tag_name("td").text
    
    for detail in asset_conditions:
        parsed_details[detail.find_element_by_tag_name("th").text] = detail.find_element_by_tag_name("td").text
    
    features_list = []
    for feature in extra:
        if "text-success" in feature.find_element_by_tag_name("td").find_element_by_tag_name("i").get_attribute("class"):
            features_list.append(feature.find_element_by_tag_name("th").text)
    parsed_details["VEHICLE FEATURES"] = ",".join(features_list)
    
    parsed_details_df = pandas.DataFrame.from_dict(parsed_details, orient="index", columns=["value"])
    parsed_details_df["key"] = parsed_details_df.index
    parsed_details_df.reset_index(inplace=True, drop=True)
    return parsed_details_df

In [14]:
def alter_details(parsed_details_df):
    """ Alters details to match mapping format. """

    parsed_details_df = parsed_details_df[~pandas.isnull(parsed_details_df.key)]
    parsed_details_df["key"] = parsed_details_df["key"].apply(lambda key: key.replace(":", "").strip().upper())
    parsed_details_df["key"] = parsed_details_df["key"].apply(
        lambda key: details_mapping[key] if key in details_mapping.keys() else key)
    parsed_details_df.drop_duplicates(subset ="key", inplace = True)
    return parsed_details_df

In [16]:
all_cars = []
for i, example_url in enumerate(cars_urls):
    print("Scrapping %d out of %d ...", (i, len(cars_urls)))
    driver.get(example_url)
    link = example_url
    title = driver.find_elements_by_class_name("product-details__title")[1].text
    initial_details = {
        "TITLE": title, "LINK": link, "TIMESTAMP": int(datetime.timestamp(datetime.now()))}
    initial_details["DEALER NAME"] = "Slattery Auctions"
    location = driver.find_element_by_class_name("snapshot__details").find_elements_by_tag_name("li")[2].text
    initial_details["LOCATION"] = location
    #price = driver.find_element_by_class_name("starting-bid").find_element_by_class_name("ng-binding").text
    product_details_comments_features = driver.find_element_by_id("product-details")
    description = product_details_comments_features.find_elements_by_class_name("table-product-details")[0]
    description_items = description.text.split("\n")
    description_details = [item for item in description_items if ":" in item and len(item.split(":"))==2]
    details_block = product_details_comments_features.find_elements_by_class_name("table-product-details")[1]
    details = details_block.find_elements_by_tag_name("tr")
    asset_conditions_block = product_details_comments_features.find_elements_by_class_name("table-product-details")[2]
    asset_conditions = asset_conditions_block.find_elements_by_tag_name("tr")
    extra_block = product_details_comments_features.find_elements_by_class_name("table-product-details")[3]
    extra = extra_block.find_elements_by_tag_name("tr")
    parsed_details_df = parse_details(description_details, details, asset_conditions, extra, initial_details)
    parsed_details_df = alter_details(parsed_details_df)
    tmp_dict = parsed_details_df.to_dict(orient="list")
    parsed_details = dict(zip(tmp_dict["key"], tmp_dict["value"]))
    all_cars.append(parsed_details)

Scrapping %d out of %d ... (0, 104)
Scrapping %d out of %d ... (1, 104)
Scrapping %d out of %d ... (2, 104)
Scrapping %d out of %d ... (3, 104)
Scrapping %d out of %d ... (4, 104)
Scrapping %d out of %d ... (5, 104)
Scrapping %d out of %d ... (6, 104)
Scrapping %d out of %d ... (7, 104)
Scrapping %d out of %d ... (8, 104)
Scrapping %d out of %d ... (9, 104)
Scrapping %d out of %d ... (10, 104)
Scrapping %d out of %d ... (11, 104)
Scrapping %d out of %d ... (12, 104)
Scrapping %d out of %d ... (13, 104)
Scrapping %d out of %d ... (14, 104)
Scrapping %d out of %d ... (15, 104)
Scrapping %d out of %d ... (16, 104)
Scrapping %d out of %d ... (17, 104)
Scrapping %d out of %d ... (18, 104)
Scrapping %d out of %d ... (19, 104)
Scrapping %d out of %d ... (20, 104)
Scrapping %d out of %d ... (21, 104)
Scrapping %d out of %d ... (22, 104)
Scrapping %d out of %d ... (23, 104)
Scrapping %d out of %d ... (24, 104)
Scrapping %d out of %d ... (25, 104)
Scrapping %d out of %d ... (26, 104)
Scrapping %

In [17]:
len(all_cars)

104

In [19]:
import json
with open('../results/slattery.json', 'w') as f:
    for dic in all_cars:
        json.dump(dic, f) 
        f.write("\n")