In [54]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import undetected_chromedriver as uc
import pandas
import time
from datetime import datetime
import random
import os

In [55]:
# Sleeps for the passed seconds plus a random fraction of a second
def random_wait_ms(seconds):
    """Pause execution for ``seconds`` plus a small random extra delay.

    The extra delay is ``random.randint(1, 100) / 100``, i.e. between 0.01
    and 1.00 seconds in 0.01-second steps. The total wait is printed before
    sleeping and a confirmation line is printed afterwards.

    Args:
        seconds: Base number of seconds to wait.

    Returns:
        None.
    """
    # Random fraction of a second added on top of the base delay
    jitter = random.randint(1, 100) / 100
    wait_time = seconds + jitter

    print(f"\nWaiting {wait_time} seconds...")
    time.sleep(wait_time)
    print('Proceeding\n')

In [56]:
# Define helper function for parsing page soup into car tuples appended to a list
def map_soup_to_cars(page_soup, cars_array):
    """Extract the car listings from one result page and append them to ``cars_array``.

    Every ``div`` whose ``id`` starts with ``mainBlock`` is treated as one
    listing. The listing's attributes are read from the ``data-*`` attributes
    and ``href`` of its ``a`` element with class ``childVifUrl tricky_link``.

    Blocks without that ``a`` element are skipped instead of aborting the
    whole scrape, and a missing description element yields an empty
    description.

    Args:
        page_soup: Parsed page; must provide ``find_all`` returning elements
            that provide ``find`` (e.g. a ``BeautifulSoup`` object).
        cars_array: List that the extracted tuples are appended to (mutated
            in place).

    Returns:
        The same ``cars_array`` object, extended with one tuple per listing:
        ``(car_id, car_manufacturer, car_model, car_price, car_year,
        car_mileage, seller, short_description, nettiauto_url)``.

    Raises:
        KeyError: If a found summary element lacks one of the expected
            attributes.
    """
    main_blocks = page_soup.find_all('div', id=lambda x: x and x.startswith("mainBlock"))

    cars_found = 0
    blocks_skipped = 0

    for block in main_blocks:
        summary_element = block.find('a', 'childVifUrl tricky_link')
        if summary_element is None:
            # Without the summary link there is nothing to read the car data from
            blocks_skipped += 1
            continue

        car_id = summary_element['data-id']
        car_manufacturer = summary_element['data-make']
        car_model = summary_element['data-model']
        car_mileage = summary_element['data-mileage']
        car_price = summary_element['data-price']
        car_year = summary_element['data-year']
        seller = summary_element['data-postedby']
        nettiauto_url = summary_element['href']

        description_element = block.find('div', class_=lambda c: c and c.startswith('checkLnesFlat '))
        if description_element is None:
            # Keep the listing even when the free-text description is absent
            short_description = ''
        else:
            short_description = description_element.text.strip(' \n')

        cars_array.append((car_id, car_manufacturer, car_model, car_price,
                           car_year, car_mileage, seller, short_description, nettiauto_url))
        cars_found += 1

    print(f"{cars_found} cars found from page")
    if blocks_skipped:
        print(f"{blocks_skipped} blocks skipped (no listing link found)")

    return cars_array
    

In [57]:
def post_process_dataframe(dataframe):
    """Return a timestamped, de-duplicated copy of the scraped cars.

    A ``timestamp`` column holding the current local time (one value shared
    by all rows) is added, then rows are de-duplicated on ``car_id`` keeping
    the last occurrence, and the index is reset.

    The input frame is not modified; the result is a new DataFrame.

    Args:
        dataframe: DataFrame with at least a ``car_id`` column.

    Returns:
        New DataFrame with a ``timestamp`` column, unique ``car_id`` values
        and a fresh 0..n-1 index.
    """
    # drop duplicates, "ohituskaista" ads create couple of duplicates
    cars_unique_df = (
        dataframe
        .assign(timestamp=datetime.now())  # assign() works on a copy, leaving the caller's frame untouched
        .drop_duplicates(subset=['car_id'], keep='last')
        .reset_index(drop=True)
    )

    return cars_unique_df

In [58]:
def write_dataframe_to_csv(dataframe, folder, file, root_path='/users/markusk/git/nettiauto'):
    """Write ``dataframe`` to a timestamped CSV file under ``root_path/folder``.

    The target folder is created when missing. The file name is
    ``{file}_{YYYYmmddTHHMMSS}.csv`` using the current local time, and the
    CSV is written with a header row, comma separator and no index column.

    Args:
        dataframe: DataFrame to write.
        folder: Folder path relative to ``root_path``.
        file: File name prefix (without timestamp or extension).
        root_path: Base directory for all output. Defaults to the path that
            was previously hard-coded, so existing callers are unaffected;
            pass another directory to run on a different machine.

    Returns:
        None.
    """
    folder_path = os.path.join(root_path, folder)

    folder_exists = os.path.isdir(folder_path)
    # exist_ok avoids a failure if the folder appears between the check and the call
    os.makedirs(folder_path, exist_ok=True)

    if not folder_exists:
        print('Created folder: ', folder_path)
    else:
        print(folder_path, 'folder already exists.')

    file_name = f"{file}_{datetime.now().strftime('%Y%m%dT%H%M%S')}.csv"

    full_path = os.path.join(folder_path, file_name)

    dataframe.to_csv(full_path, header=True, sep=',', index=False)
    print(f"{full_path} created!")
    

In [59]:
# Scrape targets: one dict per car model with the keys
# manufacturer, model, file_path, file_name_prefix and url.
# file_path / file_name_prefix are handed to write_dataframe_to_csv,
# url is the first search-result page fetched for the model.
targets = [
    dict(zip(("manufacturer", "model", "file_path", "file_name_prefix", "url"), row))
    for row in (
        ("Tesla", "Model Y", "data/tesla/modely", "tesla_modely",
         "https://www.nettiauto.com/tesla/model-y?id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("Skoda", "EnyaQ", "data/skoda/enyaq", "skoda_enyaq",
         "https://www.nettiauto.com/skoda/enyaq?id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("Kia", "EV 6", "data/kia/ev6", "kia_ev6",
         "https://www.nettiauto.com/kia/ev6?id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("Ford", "Mach-E", "data/ford/mache", "ford_mache",
         "https://www.nettiauto.com/ford/mustang-mach-e?id_vehicle_type=1&id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("Audi", "Q4 e-tron", "data/audi/q4etron", "audi_q4etron",
         "https://www.nettiauto.com/audi/q4-e-tron?id_vehicle_type=1&id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("Mercedes-Benz", "EQE", "data/mb/eqe", "mb_eqe",
         "https://www.nettiauto.com/mercedes-benz/eqe?id_vehicle_type=1&id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("Porsche", "Taycan", "data/porsche/taycan", "porsche_taycan",
         "https://www.nettiauto.com/porsche/taycan?id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("Tesla", "Model 3", "data/tesla/model3", "tesla_model3",
         "https://www.nettiauto.com/tesla/model-3?id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("BMW", "iX3", "data/bmw/ix3", "bmw_ix3",
         "https://www.nettiauto.com/bmw/ix3?id_vehicle_type=1&id_car_type=5&id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("Hyundai", "Ioniq 5", "data/hyundai/ioniq5", "hyundai_ioniq5",
         "https://www.nettiauto.com/hyundai/ioniq-5?id_vehicle_type=1&id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
        ("Polestar", "2", "data/polestar/2", "polestar_2",
         "https://www.nettiauto.com/polestar/2?id_vehicle_type=1&id_country[]=73&chargingPowerFrom=&chargingPowerTo="),
    )
]

In [60]:
#setup
# Builds a chromedriver Service (Homebrew install path) and a ChromeOptions
# object with automation-related switches turned off, presumably to make the
# browser look less like an automated session.
# NOTE(review): neither `service` nor `options` is passed to the
# `uc.Chrome(use_subprocess=True)` call in the next cell, so as far as this
# notebook shows they have no effect — confirm whether they are still needed.
service = Service("/opt/homebrew/bin/chromedriver")
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--disable-blink-features=AutomationControlled")


In [61]:
# Landing page opened first, before any search-result URL is fetched
start_url = 'https://www.nettiauto.com/'

print(f"Beginning to scrape! Opening {start_url}")

# Start an undetected_chromedriver Chrome session. `driver` is a notebook-level
# variable reused by the following cells and closed by driver.quit() at the end.
driver = uc.Chrome(use_subprocess=True)
driver.get(start_url)


Beginning to scrape! Opening https://www.nettiauto.com/


In [62]:
# Wait a few seconds after opening the landing page before looking for the
# cookie consent button
random_wait_ms(3)

print('Accepting cookies')
# find_elements returns an empty list instead of raising when nothing matches,
# so this cell no longer fails when the consent button is absent (for example
# when the cell is re-run after the banner has already been dismissed).
accept_cookies_elements = driver.find_elements('id', 'almacmp-modalConfirmBtn')
if accept_cookies_elements:
    accept_cookies_element = accept_cookies_elements[0]
    accept_cookies_element.click()
else:
    print('Cookie consent button not found, continuing without clicking')


Waiting 3.7 seconds...
Proceeding

Accepting cookies


In [63]:

# Main scraping loop: for every entry in `targets`, walk through its paginated
# result pages, collect the listings, post-process them and write one CSV.
for target in targets:
    # Pause a random 3-9 seconds (plus a random fraction) before each target
    random_wait_ms(random.randint(3, 9))
    
    print(f"Beginning to process {target['manufacturer']} {target['model']}\n")

    # Listings accumulated over all pages of this target
    cars = []
    # First page to fetch; replaced by the "next page" link on every iteration
    url = target['url']
    
    while True:
        print(f"Fetching {url}")
        driver.get(url)
        
        # Parse the HTML currently rendered in the browser
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Appends this page's listings to `cars` (the same list is returned)
        cars = map_soup_to_cars(soup, cars)
        
        # Check whether a "next page" link exists; stop paginating when it does not
        next_page_element = soup.find('a', class_='pageNavigation next_link')
        if(next_page_element is None):
            print('No more pages available')
            break
        else:
            # The link's href is used as-is as the next URL to fetch
            url = next_page_element['href']
            print('More pages available')
            
        # Pause a random 4-29 seconds (plus a random fraction) between page fetches
        random_wait_ms(random.randint(4, 29))

    
    # Column order must match the tuple order produced by map_soup_to_cars
    cars_df = pandas.DataFrame(cars, columns=['car_id', 'car_manufacturer', 'car_model', 'car_price', \
                                              'car_year', 'car_mileage', 'seller', 'short_description', \
                                              'nettiauto_url'])
    
    # Adds the timestamp column and removes duplicate car_ids
    cars_df = post_process_dataframe(cars_df)
    
    print(f"{len(cars_df)} cars found.")
           
    # One timestamped CSV per target, under the target's own folder
    write_dataframe_to_csv(cars_df, target['file_path'], target['file_name_prefix'])
    
    print(f"{target['manufacturer']} {target['model']} processed!\n\n")
    


Waiting 6.82 seconds...
Proceeding

Beginning to process Tesla Model Y

Fetching https://www.nettiauto.com/tesla/model-y?id_country[]=73&chargingPowerFrom=&chargingPowerTo=
32 cars found from page
More pages available

Waiting 28.39 seconds...
Proceeding

Fetching https://www.nettiauto.com/tesla/model-y?id_country[]=73&chargingPowerFrom=&chargingPowerTo=&page=2
30 cars found from page
More pages available

Waiting 24.51 seconds...
Proceeding

Fetching https://www.nettiauto.com/tesla/model-y?id_country[]=73&chargingPowerFrom=&chargingPowerTo=&page=3
30 cars found from page
No more pages available
90 cars found.
/users/markusk/git/nettiauto/data/tesla/modely folder already exists.
/users/markusk/git/nettiauto/data/tesla/modely/tesla_modely_20230223T190035.csv created!
Tesla Model Y processed!



Waiting 8.87 seconds...
Proceeding

Beginning to process Skoda EnyaQ

Fetching https://www.nettiauto.com/skoda/enyaq?id_country[]=73&chargingPowerFrom=&chargingPowerTo=
32 cars found from page
M

30 cars found from page
More pages available

Waiting 20.62 seconds...
Proceeding

Fetching https://www.nettiauto.com/tesla/model-3?id_country[]=73&chargingPowerFrom=&chargingPowerTo=&page=12
30 cars found from page
More pages available

Waiting 7.19 seconds...
Proceeding

Fetching https://www.nettiauto.com/tesla/model-3?id_country[]=73&chargingPowerFrom=&chargingPowerTo=&page=13
30 cars found from page
No more pages available
390 cars found.
/users/markusk/git/nettiauto/data/tesla/model3 folder already exists.
/users/markusk/git/nettiauto/data/tesla/model3/tesla_model3_20230223T190914.csv created!
Tesla Model 3 processed!



Waiting 8.81 seconds...
Proceeding

Beginning to process BMW iX3

Fetching https://www.nettiauto.com/bmw/ix3?id_vehicle_type=1&id_car_type=5&id_country[]=73&chargingPowerFrom=&chargingPowerTo=
32 cars found from page
More pages available

Waiting 4.43 seconds...
Proceeding

Fetching https://www.nettiauto.com/bmw/ix3?id_vehicle_type=1&id_car_type=5&id_country[]=73&

In [64]:
# Shut down the browser session that was started with uc.Chrome earlier in the notebook
driver.quit()