# Car price prediction - Web scraping
_Cong Tung Nguyen (October 2022)_ 
<br>
----

In [1]:
# Make Google Drive reachable from the Colab runtime's file system.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In this project, I wanted to collect my own dataset with web scraping techniques. The data is from 'https://www.hasznaltauto.hu/', which is a Hungarian car trading platform. I set the filtering based on my own preferences. The car prices range between ~500.000 and ~10M HUF.

The method for collecting the data was the following:
- Divide the data into 4 groups based on the price:
    - 500.000-2.000.000
    - 2.000.000-5.000.000
    - 5.000.000-7.000.000
    - 7.000.000-10.000.000
    
- For each group, scrape the following result pages:
    - 1,51,101,151,201,251

The code execution took about 35 minutes.

### 1. Importing the libraries

In [None]:
import os
import re
import time
import urllib
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait as wait
from webdriver_manager.chrome import ChromeDriverManager

### 2. Car data collector

In [None]:
# Every search URL shares the same prefix and ends with a '/page{}' placeholder
# that is later filled with the result-page number via str.format().
_LISTING_URL_TEMPLATE = 'https://www.hasznaltauto.hu/talalatilista/{}/page{{}}'

# One opaque search token per saved search (four in total).
# NOTE(review): presumably listed in the same order as the price groups
# described in the introduction -- confirm against the site.
_SEARCH_TOKENS = (
    'PCOG2VGRN3RDAEH4C57UAFICFGOPW6BHOSKU5KGSJF6Y2FVSAQJ5XC3ME4VFJ7PXLPDYCJQ4XSYJUXPPZ2HMOANZZDSKPF5TLSFAMAWW4QKGZMATF6GM3JHQC42KRT6QA3VKDBIUUCTYKG6GK5JGZKKTAFBUQQB43NQUADNOGTFMNM6PKIKFT7EJ7M6HYRDIFFCZ47IXCRJGB26KJGJYVT5NWKT2E6CUCC5VI4VRZTZ2O7DICR77HAPWAH6MLAQO4UIZY2P3CFJMZ6T7TZOIGA53KOQJK72QSBDSCQETPWIGGGS3OSUA6GHIIH5IOFCMTV6K6TLZDUZYQFDJFFL7CBV6QOFATCUXVXGCMGK2YMY4B4L2EKJWFT7LOLWWWRO7KIHVQUXJH4YCVM2JBI57JLIOHZJOHJPSULT3ROVH6HJ3X7OQU7UJL62NA372F26UB36YQO7YW7B5TEL5K4KQ43DSD26GW7JA6ND7T4HGZBIFAZC7VOKH53SPXW7W4BURWKVURSVPLU5KG3F5KY5KBO2ZZOAGXICUIOF2QKCVVD2Q3YMSUBHK2P5CR3WVRI5LYWYOGG4BGDK6KBUOORUVI5FPD2FFRGR5HM7IJRS43V5W2OSJEXRQ4KPG2XZGWAL3Q35NMF5UQYJ7DIYALHEE7FJOW73YNEVTCTTTDTSTHSH7CPGFRMMC6UWM672MLQM42VBVXI3LLW4CONWNJWMMLJOZIXKJJ3DCAPPUXNQ2HVHFAXUCKXKGQLWUITJFZ5OZBMOL6AAGAKGO7DMLZH5UXQ7WN7DPQ454J3NKXD6JEUZYUGU3CAW6CS6YNKMBZCRTOTFZ5NPKBYHGQTTR62SFX7ZEUMRS6DOZOM345MIS5BHXORB3WJ2CAR7VDBP2NDYQX7ETDSUWN62DFE2V4M3YFJ5BHWRS757ENLIGNRHTFOE4ZVX6IW55NIA5SH67UB476OW4G',
    'PCOG2VGRR3RDADH4S57MBKQUXJYNTRZ3UFBTVIMVJ3NNPSSQKNBJGGBFNG2YB5W7Z5UVVNXF4ACWX3GY4PE2IIAWVG4DSMOLIRJIHR4KVSCC3NWJDNRTHENYFNVFIF3IHR25AQREUCTIKW6GK4RNSUJLHV5B7ALQWZCY6CTMUGSQSZ4XELE5F3UXHQLMAZ27SCHQU4URMBRYWSMXKIWJMWPWSL2Y3QTP72WDMNBMPHDCVTK76LC325WWV5YAI5ZVUB6HCAVLTNXJUSDG3U7ZHKGAQLMUWUGSHUU4RARIUDEDZSJRRUOVUVCH6T2CJ7KDES6ALOKOU5RBRUYLCZUESW7SA2XILEQJEZXTXEKOGK2INE4H2NYCEFOJQHL6LWSNJHP3F52YCTWD6MBIWOGWEW3URXZC4UHDUWZLZY5YPKSPDU6ZIPPSP34U7NGRV75CNXSR5XMIHO4PPY6FSL4ZAJPWNQZB5PDPTQT72RZ275XETE4XMQ3GKTF7O2TT4NW3L2SHIECJXK2Y2Z5EVHLFE6NUO6NWTMBVCSZG6EIUUUJJLORDLTROLOLRE5KKCHLLGS4A2W56CHYFGXPO4BEJOWQGE6V35CFFHGYPXPNP6XQOJIPJEQKT3XFHTBL4GLYHZQC2U627KAFM5RAPARCC43RSLZXOODJJMYY44QU4YV4B5YTZRQLSCXWETDX6TC4DHGVYPJSC3NMP6SDMTKNTAC2LWOF2QLWYYUNL5F3MCQ5JZIF4ASV2QUC7VEHKJOPK2YERZPQGDI6IZ34NRPE3S26N62X4P6DLXBHDVSGPZESTHCQKNMPSZYIK3AUZQHAKGNKMHHRV6IA7NKGPMH3KIW7XESBSHLZVTFFTXTVRCLUM4D2ELMZHIJCL2UMF7RXPCG74SMOKS2XZLVETKXBTPAVHUG42AKYOMQVVA6WE6MVYVTGW7ZC7WRVDDGI736QH6TYLACI',
    'PCOG2VG3R3NDAEH5S56MAKTEBFIO6YZLKSSAVLKUNFP2HAIMYHCPMIG3RELNB7T3Y5Y4EJQULYMJ3OI5D4PQPRB4CU3SOZUZJBVPAWERSWYMKNTZMNWCMETXIWG6UAUNU4HJVCYE2S2HBS7YJISDW2VFI7XSGED2LP2KRQAWLKNNBOYUJGPHNP7EWEAD564C4WUCAFYJG23JQTBJXFXZLZRP7E6B27VTM62WCIGXF3ZGZ6NSZR56ER3QK4B4UFZHWC5OS5UIMTLP347KBIWJRPIEEXOZHARMJAAYVTETDQX57IKFOVCE6T6SH5CMFXGJOXVBJQ42L2TEAS62SJ43WFSKE2MLZ3KEHLE5AGSODZHEGRZKSIBR7ETLG4SX3C65MNI2N76AUDDTMSTM2E24VO2ARUHZLZI5Y7KSHDM7ZYPPUOLZU7OG7UXYC5WSX57II3OMDPI7F6LMZBZMWFTZH4PCPXRTZ2J72L4XOS42XQSLGKLDPLIXK7LMW26UR2JASJ2GWG33USU5MUTZWR3XG2NQGUKLEHYRCSSRFFN2ENOOFZNZOETVIYI4NM2HQDC3XYI7AU2553QERF22A372VPUIUU43BZ55V726BX3B5ASBLHO3U6MFPQ7PAXGALKX3LRIAVTWEZ4BEILTOGJPG5ZYNFFTDDTSCTTC7QNOEV4YZ5B3YDZR35JZPBZPFSPJUCXNMN6SHMLJ5ZBCZHA2LXCHKQJO2ZYP3SCGUFBYTSQH4CKRK2RL2UA3JV25HKAUHN6AAMD7DPPRWE4XOLKZ732ZRXYENOSIH7E43SZCGKFQ63AZFLQATWBMTU2YEGNPMLERR6YEP7IOHMT5GOVHTEOHSLKJ3TNHDDQVYD3X4IC2CKQ2HWJKI34LF7QFPDGY4UNWPSWSJHU4WV6CG6A3DIF3UYWAWYBKKR54H4RMR3HNL62G7NA2CHMS774AATGFOHE',
    'PCOG2VG3R3RDADH5S57MBKBNW7M4ZY5O2CHLIQRDVU2K7FNBU2CCMMKK2JVAB4PP5M2C3U5S6CBHL3GYY4TSOBJRJ7C5LCJMC5EQ2HRLWIJDNWBGN6GGKITRC7KKRTWQPDVKBOKIIBGQWN4MX6FGIS5N6TUH2BGC3ELD2KVQQWLCNHC5RFSJK5V74SYQBPUS3ASEWRYVBOIWAY4LJGNVEZGLIW7HUWJNTZKYIPVFZBT4WLD3JFP5HYMX67GA7YBOA2KC7DTAOXJ43YER3U7ZHK6AQLMUSUGSHUU4RAYWUDEDZSJRRUWVUVAH6T2CJ7KDETGJ2XBHJ4YYZ2LVBK2KILLZAPLUFSIESO322SBHDFNMHUODOE4JDCTEZ7VXF3L3JHPWV52YSTUT6MBIWOEZOYGRGXFLWQENS7FBOHOH25DRUP453X3X2FU5OK7UTY27WSW5ZII3OED7O4JYLMZJ7MWETZG45A3XRXZ2J72I447SY2PSSLGHWGOTZPXNJZ73UYLSSHKESJXG3YYZNGVLKVA65XOV2GTMBWC6YWYEIWUUJJPOBCLXQ6LK3QIVK2GHDIGSNQ4WN4CPYVEXHM4NFJMWYHAKUL3CNH3GY6XPNM6RQ6DIHJEUL27WFHTNL4EDAFZYN2W6C6KIEE7RXMALCECXDEX3P44GCRMYZ4YQ44YZ5BLYC3RXTCHHGHTHXOTRGBR7LUCUNAN6WH4JJVZRGZTBNF3BDVIFXNMMH54SRVBIOE4US5QBKFPKBPKYCNCXLU5YDQ2HICBYP4N5OEYTS55J3H7PKGC7Q3V2JPP4TJODFI5IWD3MCEVOQCWYFSNBGBBTL3CZEMPWBD72CRTE7OTVJ4ZDR4SSSM43PYY4FOA655CAWQSUGB5SKSG7CVP4BTYZWHFDNT4VUSJ5LE2XYI3YDKNDHOTCJ63AFJKHHRXVCZFW6YX5UNI2ZWEOZ3I7GLGK4ZA',
)

pages = [_LISTING_URL_TEMPLATE.format(token) for token in _SEARCH_TOKENS]

# Scraper settings, gathered here so they can be tuned in one place.
FIRST_PAGE = 1
LAST_PAGE = 251
PAGE_STEP = 50                      # visits result pages 1, 51, 101, 151, 201, 251
MAX_ATTEMPTS = 3                    # tries per results page before giving up on it
RETRY_DELAY_SECONDS = 10            # pause between two tries of the same page
REQUEST_TIMEOUT_SECONDS = 30        # requests.get() waits forever without a timeout
SCROLL_OFFSETS = (500, 1500, 2000)  # scroll positions tried when a link click fails


def _new_driver():
    """Start a maximised Chrome session with a driver binary from webdriver_manager."""
    options = webdriver.ChromeOptions()
    options.add_argument('--start-maximized')
    # Selenium 4 takes the driver path through a Service object; passing it
    # positionally is no longer accepted by current releases.
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)


def _accept_cookies(driver):
    """Press the cookie banner's accept button; do nothing if it cannot be clicked."""
    try:
        driver.find_element(By.ID, 'didomi-notice-agree-button').click()
    except WebDriverException:
        # Best effort: the banner may be absent or not clickable.
        pass


def _click_listing_link(driver, link_text):
    """Click the link whose text is ``link_text``, scrolling down between tries.

    The first try is made without scrolling; each further try first scrolls
    to the next position in SCROLL_OFFSETS. Raises the last
    WebDriverException when every try fails.

    NOTE(review): the lookup is by link text, so listings that share an
    identical title presumably all resolve to the first such link on the
    page -- verify against the site.
    """
    last_error = None
    for offset in (None,) + SCROLL_OFFSETS:
        try:
            if offset is not None:
                driver.execute_script("window.scrollTo(0,{})".format(offset))
            driver.find_element(By.LINK_TEXT, link_text).click()
            return
        except WebDriverException as exc:
            last_error = exc
    raise last_error


def _parse_car_details(html):
    """Return ``{label: value}`` read from the 'hirdetesadatok' table in ``html``.

    Labels are the 'bal pontos' cells and values the <strong> elements of the
    table, paired by position. Raises LookupError when the table is missing
    or has fewer labels than values, because the pairing cannot be trusted.
    """
    detail_page = BeautifulSoup(html, 'html.parser')
    table = detail_page.find('table', class_='hirdetesadatok')
    if table is None:
        raise LookupError('car data table not found')

    values = table.find_all('strong')
    labels = table.find_all('td', class_='bal pontos')
    if len(labels) < len(values):
        raise LookupError('fewer labels than values in car data table')

    return {label.text.strip(): value.text.strip()
            for label, value in zip(labels, values)}


def _scrape_car(driver, listing):
    """Open one listing from the results page and return its data as a dict.

    ``listing`` is the BeautifulSoup element of one result row. The returned
    dict holds the listing title under 'Marka' plus every attribute found on
    the detail page. Raises LookupError, WebDriverException or
    requests.RequestException when the listing cannot be read.
    """
    heading = listing.find('h3')
    link = heading.find('a') if heading is not None else None
    if link is None:
        raise LookupError('listing has no title link')
    car_title = link.text.strip()

    # Navigate in the browser to learn the detail page URL, then download
    # that URL separately to parse it.
    _click_listing_link(driver, car_title)
    response = requests.get(driver.current_url, timeout=REQUEST_TIMEOUT_SECONDS)

    car_data_dict = {'Marka': car_title}
    car_data_dict.update(_parse_car_details(response.content))
    return car_data_dict


def _scrape_results_page(home_page):
    """Scrape every listing on the results page ``home_page``.

    Returns a list with one dict per car that could be read. Listings that
    fail are skipped and counted. The browser is always shut down, also when
    an error propagates to the caller.
    """
    driver = _new_driver()
    try:
        driver.get(home_page)
        _accept_cookies(driver)

        # Highlighted ('kiemelt') and regular rows use different class strings.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = (soup.find_all('div', attrs={'class': 'row talalati-sor kiemelt'})
                    + soup.find_all('div', attrs={'class': 'row talalati-sor'}))

        page_cars = []
        skipped = 0
        for listing in listings:
            try:
                page_cars.append(_scrape_car(driver, listing))
            except (LookupError, WebDriverException, requests.RequestException):
                skipped += 1
            finally:
                # Always return to the results page, also after a failure;
                # otherwise the next listing's link cannot be found.
                driver.get(home_page)

        if skipped:
            print('Skipped {} of {} listings.'.format(skipped, len(listings)))
        return page_cars
    finally:
        # quit() ends the whole session including the chromedriver process;
        # close() would only close the window.
        driver.quit()


start_datetime = datetime.now()

cars = []

for group_no, p in enumerate(pages, start=1):
    for page in range(FIRST_PAGE, LAST_PAGE + 1, PAGE_STEP):
        home_page = p.format(page)

        # In case of a network issue try again, up to MAX_ATTEMPTS times.
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                page_cars = _scrape_results_page(home_page)
            except (WebDriverException, requests.RequestException) as exc:
                print('Group {}, page {}: attempt {}/{} failed ({}).'.format(
                    group_no, page, attempt, MAX_ATTEMPTS, type(exc).__name__))
                if attempt < MAX_ATTEMPTS:
                    time.sleep(RETRY_DELAY_SECONDS)
            else:
                # Only store a page's cars once the whole page succeeded, so
                # a retried page cannot add duplicates.
                cars.extend(page_cars)
                break
        else:
            print('Group {}, page {}: giving up.'.format(group_no, page))

end_datetime = datetime.now()
exec_seconds = (end_datetime-start_datetime).total_seconds()
print('The execution took approx {} minutes.'.format(round(exec_seconds/60)))

The execution took approx 36 minutes.


In [None]:
# Turn the list of per-car dictionaries into a table with one row per car.
cars_df = pd.DataFrame(data=cars)

In [None]:
# Saving dataframe as pickle.
# to_pickle() does not create missing folders, so make sure the target
# directory exists first.
CARS_PICKLE_PATH = './dataset/cars_df.pkl'
os.makedirs(os.path.dirname(CARS_PICKLE_PATH), exist_ok=True)
cars_df.to_pickle(CARS_PICKLE_PATH)

# To reload the saved dataframe later without scraping again:
#   cars_df = pd.read_pickle('./dataset/cars_df.pkl')