# Car price prediction - Web scraping
_Cong Tung Nguyen (May 2021)_ 
<br>
----

In this project, I wanted to collect my own dataset using web scraping techniques. The data comes from 'https://www.hasznaltauto.hu/', a Hungarian car trading platform. I set the search filters based on my own preferences: the cars range between model years 2011 and 2019, with a maximum price of 3,500,000 HUF. I ran the code below for the first 50 pages, which resulted in 350 cars.

### 1. Importing the libraries

In [2]:
from bs4 import BeautifulSoup
import urllib
import re
import time
import pandas as pd
import os
import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait

### 2. Car data collector

In [7]:
# WebDriverException is the common base class of Selenium's errors
# (NoSuchElementException, ElementClickInterceptedException, ...). Catching it
# instead of using a bare `except:` keeps KeyboardInterrupt and programming
# errors visible.
from selenium.common.exceptions import WebDriverException

# Search-result listing with the filters already encoded in the path;
# `{}` is replaced by the number of the result page.
SEARCH_URL_TEMPLATE = 'https://www.hasznaltauto.hu/talalatilista/PCOG2VGRR3RDADH4S57MBCIWLAQDZXQKDXJATHLU2K6VNBU2CKNMIKCJVMC4JP472MKGQWP2QI2XN3HRMQBIR4K4LSOUQM4RKTQLCJBLMGBU3MTEFQKYTO5AIZ2YNWSTBNGUIATKLC4GC7BOSIWTLUVD64IQQZY36SUMBZS2TI3SA7EFQYZZDDFHUPYLL3NHELA5VZW7J67CNQXJIIWBNM7UY5WDC6T4ROHOGAO4YWAPF6IRVSXNXVRCJHNV7HSYQIC3HE5AUR5VHEEFVVAZC6MTMMNFXNFIB3UOSTJ2JSLLMYESV2AYEOJEZOWRQDJSWSBKHBZDOR5Y2RFS46G3Q5S52BJ2KDQ3JL3QBQ7STMUJ4RKXFPX4E5HGTVDBLZ5PGR7DVO37BLE6F7BCRV75CNTSQ4XCMMQ6WCVZ2J75LM5P7R4SEYX4TLELTD7GRD75LNX3U7PDUCCURJC33F4ENGTSEWKUP65QQQDFWQJOXMLFZLFJICSR4CEXPB42VXIRKVUMOWW4NWDQ3CYLH3XO6BFLS6G5F5C3WYTUIJMD7VKLOJLLWFT4W6S75GPIMA7UP2LGTVLMOIYGOKA73VACSOARPMM4IVC4MW6MH5UWUVEGHHEFHCF7COGE4MME6QR4RHYTIWO6HAMLG6DIJK2LL5J5HJB7RTRHFTEBNIJKM4LWU6CSV4KTUHFAJSO2KU5IF5KALKS4OVWVKDOPKWBYP7TFOEYTS55J3C7FKH77BB3UQ67ZGW4FSRSRMHSYGD24BZNQLA2DMGDKVWMQI27MAHXVFHGB45DLT5KLBW5FEJZW56RZCYA532MFNBBYMDTEVHWOHM7Y47AU7N4PWKPSWJPVMDK7BRTZDKNDHOT4J63AJJL3HRXVCZHW6MT622UNUZD7N5Q7Z4NJKWI/page{}'

# Vertical scroll positions (in pixels) tried one after another when a result
# link cannot be clicked at the current scroll position.
SCROLL_RETRY_OFFSETS = (500, 1500, 2000)

# Upper bound, in seconds, for downloading a single advertisement page, so a
# stalled connection cannot hang the whole scrape.
REQUEST_TIMEOUT_SECONDS = 30


def _click_link(driver, link_text):
    """Click the first link on the current page whose text equals *link_text*.

    The click is first attempted at the current scroll position. If that
    fails, the window is scrolled to each offset in ``SCROLL_RETRY_OFFSETS``
    in turn and the click is retried.

    Raises:
        WebDriverException: if the attempt after the last scroll offset
            fails as well.
    """
    try:
        driver.find_element(By.LINK_TEXT, link_text).click()
        return
    except WebDriverException:
        pass  # fall through to the scroll-and-retry attempts below

    last_attempt = len(SCROLL_RETRY_OFFSETS) - 1
    for attempt, offset in enumerate(SCROLL_RETRY_OFFSETS):
        try:
            driver.execute_script("window.scrollTo(0,{})".format(offset))
            driver.find_element(By.LINK_TEXT, link_text).click()
            return
        except WebDriverException:
            if attempt == last_attempt:
                raise


def _parse_car_details(html):
    """Extract the label -> value pairs from one advertisement page.

    Labels are read from the ``td`` cells with class ``bal pontos`` and values
    from the ``strong`` elements of the ``hirdetesadatok`` table; the two
    sequences are paired by position.

    Returns:
        dict mapping each stripped label text to its stripped value text.
        Empty when the page has no ``hirdetesadatok`` table.
    """
    car_page = BeautifulSoup(html, 'html.parser')
    details_table = car_page.find('table', class_='hirdetesadatok')
    if details_table is None:
        return {}

    values = details_table.find_all('strong')
    labels = details_table.find_all('td', class_='bal pontos')

    # zip() stops at the shorter sequence, so a page with fewer labels than
    # values no longer raises IndexError.
    # NOTE(review): the pairing is purely positional; if some row carries a
    # label without a <strong> value (or several of them), later values shift
    # to the wrong label -- verify against the live markup.
    return {
        label.text.strip(): value.text.strip()
        for label, value in zip(labels, values)
    }


# One dictionary per scraped car: 'Marka' (the listing title) plus every
# label/value pair found on the car's advertisement page.
cars = []

# The browser options are identical for every page, so build them once.
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')

# Result pages 2 .. 49 are scraped (range() excludes its upper bound).
for page_number in range(2, 50):

    # Defining home page
    home_page = SEARCH_URL_TEMPLATE.format(page_number)

    # Initiate WebDriver (a fresh browser for each result page)
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(home_page)

        # Handle cookies --> press OK. The banner is not always shown, so a
        # failure here is expected and deliberately ignored.
        try:
            driver.find_element(By.LINK_TEXT, "OK").click()
        except WebDriverException:
            pass

        # Parse HTML of current page source
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # In HTML find each car listing row
        results = soup.find_all('div', attrs={'class': 'row talalati-sor kiemelt'})

        for result in results:
            # Get the car title; skip rows that lack the expected <h3><a> link
            heading = result.find('h3')
            link = heading.find('a') if heading is not None else None
            if link is None:
                continue
            title = link.text.strip()

            car_data_dict = {'Marka': title}

            # Click the title to open the car's advertisement page
            _click_link(driver, title)

            # Download the advertisement page and collect its data table
            response = requests.get(driver.current_url, timeout=REQUEST_TIMEOUT_SECONDS)
            details = _parse_car_details(response.content)
            if not details:
                print('No data table found for "{}" ({})'.format(title, driver.current_url))
            car_data_dict.update(details)

            # Append the collected data into the cars list
            cars.append(car_data_dict)

            # Get back to the result page, and continue with the next car
            driver.get(home_page)
    finally:
        # quit() ends the whole browser session (close() only closes the
        # current window), and `finally` guarantees it runs on errors too.
        driver.quit()

In [None]:
# Build a table from the list of per-car dictionaries collected in `cars`.
cars_df = pd.DataFrame(cars)

In [29]:
# Preview the first rows of the table (shown as the cell's output).
cars_df.head()

Unnamed: 0,Marka,Vételár:,Vételár EUR:,Évjárat:,Állapot:,Kivitel:,Finanszírozás:,Kilométeróra állása:,Szállítható szem. száma:,Ajtók száma:,...,Kárpit színe (2):,Tető:,Téli gumi méret:,Hátsó nyári gumi méret:,Hátsó téli gumi méret:,Kezdőrészlet:,Futamidő:,Bérlési lehetőség:,Egyéb költségek:,Fizetendő magyarországi forgalomba helyezés esetén:
0,RENAULT MEGANE Grandtour 1.5 dCi Dynamique FŰT...,1 488 000 Ft,€ 4 239,2011/11,Újszerű,Kombi,20%-tól elvihető,231 000 km,5 fő,5,...,,,,,,,,,,
1,MAZDA 2,,€ 4 245,2012/7,Normál,Ferdehátú,,188 000 km,5 fő,5,...,Világosszürke,Lemeztető,,,,,,,,
2,OPEL CORSA D 1.2 Enjoy KLÍMA/MEGKÍMÉLT!,1 499 000 Ft,€ 4 271,2014/2,Megkímélt,Ferdehátú,20%-tól elvihető,199 000 km,5 fő,5,...,,,,,,,,,,
3,FIAT PUNTO 1.4 Easy S&S,1 499 999 Ft,€ 4 274,2012/7,Normál,Ferdehátú,,108 260 km,5 fő,5,...,,Lemeztető,,,,,,,,
4,FIAT DOBLO Dobló Panorama 1.4 16V Dynamic Star...,1 515 000 Ft,€ 4 316,2011/3,Normál,Egyterű,,188 000 km,5 fő,4,...,,Lemeztető,195/60 R 16,,,,,,,


In [3]:
# Save the DataFrame as a pickle file.
# (Presumably left commented out so that re-running this cell does not
# overwrite the stored dataset -- confirm before re-enabling.)
# cars_df.to_pickle('./dataset/cars_df.pkl')

# Load the previously saved DataFrame from the pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# from a trusted source.
cars_df = pd.read_pickle('./dataset/cars_df.pkl')