# Web-scraping

Script gets data from online car marketplace and creates _CSV_ file.

#### Modules:

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import os
import sys

#### Settings:

In [2]:
# Num of displayed df columns
pd.set_option('display.max_columns', 60)

# Hard drive results location
DIR = '.'
EXT = '.csv'
RES_PATH = os.path.join(DIR, "Results")


def save_res(var, name, extension=EXT):
    """Saves generated DF into .csv file"""
    
    os.makedirs(RES_PATH, exist_ok=True)
    file_path = os.path.join(RES_PATH, name + extension)
    var.to_csv(file_path, index=False)

#### Functions:

In [3]:
def advert_links(start_yr, end_yr, basic_url, num_of_pages):
    """Gets links to all adverts from specified number of pages"""
    all_links = []
      
    for i in range(1, num_of_pages + 1):
        page_url = basic_url.format(start_yr, end_yr, i)
        response = requests.get(page_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find_all(class_=['offer-title__link'])
        
        for elem in content:
            link = str(elem).split('href=')[1].split(' ')[0]
            link = link[1:-1] # To get rid of quotes: "link"
            all_links.append(link)
    
    return all_links


def links_generator(all_links):
    """Generates next link from the list"""
    
    for link in all_links:
        yield link


def advert_features(advert):
    """Gets all features (car params) from single advert"""
    
    # Gets content of page of single advert
    response = requests.get(advert)
    soup = BeautifulSoup(response.text, 'html.parser')

    # FEATURES
    # dict for features and their values
    features = {}

    # Price located in different place than other features -> parsed separately
    price = soup.find("span", {"class": "offer-price__number"}).text.strip().replace(' ', '')
    features["Cena"] = price

    # Other features located in 'li' tags with class name 'offer-params__item', so below code
    # looks for all 'li' tags with class 'offer-params__item' in single advert
    content = soup.find_all('li', attrs={'class': 'offer-params__item'})

    # Iterates on every 'li' tag (every 'li' tag contains single feature and its value)
    for c in content:

        # Feature (dict key)
        label = c.find("span", {"class": "offer-params__label"}).string

        # Feature's value (values are stored in 'a' tags (links) and 'div' tags -
        # thats why 'if' statement needed)
        if c.find('a'):
            value = c.find('a').string.strip()
        else:
            value = c.find('div').string.strip()

        features[label] = value

    return features


def update_data(data, features):
    """Updates DF with new observation (car features from single advert)"""
    
    observation = pd.DataFrame(features, index=[0])
    data = pd.concat([data, observation], sort=False).fillna(np.nan)
    
    return data

#### Variables:

In [4]:
# Range of production years
start_yr = 2005
end_yr = 2015

# Universal version of url
basic_url = "https://www.otomoto.pl/osobowe/od-{}/?search%5Bfilter_float_year%3Ato%5D={}&search%5Border%5D=created_at%3Adesc&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}"

# Number of pages to scrape
num_of_pages = 157

### Main code

#### 1. Getting links to adverts:

In [5]:
# Timer
start = time.time()

# Gets list of all adverts' links from all pages
all_links = advert_links(start_yr, end_yr, basic_url, num_of_pages)

stop = time.time()
print("Time taken: ", round((stop-start)/60, 3), "mins")

# Only unique links
all_links = list(set(all_links))

print("Number of links: ", len(all_links)) # n=1 -> 32

Time taken:  3.772 mins
Number of links:  5024


#### 2. Creating DF with scraped data:

In [6]:
# Creates generator for links
gen = links_generator(all_links)

# Creates DF for scraped data
cars_data = pd.DataFrame()

# Timer
start = time.time()
print("Started at:", time.asctime(time.localtime(start)))

# Loop to move on each link
for i in range(len(all_links)):
    
    # Generates link to advert
    advert = next(gen)
    
    try:
        # Gets all features from single advert (car)
        features = advert_features(advert)

        # Updates DF with scraped data from single advert
        cars_data = update_data(cars_data, features)
        
    except:
        print("Error occured:", sys.exc_info()[0])
        time_of_error = time.asctime(time.localtime(time.time()))
        print("Time of error occurrence:", time_of_error)

        
stop = time.time()
print("Finished at:", time.asctime(time.localtime(stop)))
print("Time taken: ", round((stop-start)/60, 3), "mins")

Started at: Thu Aug 20 18:58:31 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 19:06:54 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 19:07:05 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 19:07:31 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 19:07:36 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 19:07:41 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 19:07:41 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 19:42:14 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 19:45:20 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 19:46:48 2020
Error occured: <class 'AttributeError'>
Time of error occurrence: Thu Aug 20 20:11:05 2020
Finished at: Thu Aug 20 20:15:58 2020
Time taken:  77

In [7]:
cars_data.reset_index(drop=True, inplace=True)

# DF check
print(cars_data.shape)
cars_data.head()

(5014, 50)


Unnamed: 0,Cena,Oferta od,Kategoria,Marka pojazdu,Model pojazdu,Rok produkcji,Przebieg,Pojemność skokowa,Rodzaj paliwa,Moc,Skrzynia biegów,Napęd,Typ,Liczba drzwi,Liczba miejsc,Kolor,Zarejestrowany w Polsce,Stan,Wersja,Generacja,Metalik,Pierwsza rejestracja,Bezwypadkowy,Kod Silnika,Możliwość finansowania,Faktura VAT,Leasing,Numer rejestracyjny pojazdu,Serwisowany w ASO,Kraj pochodzenia,Pierwszy właściciel,VAT marża,Miesięczna rata,lub do (przebieg km),Filtr cząstek stałych,Emisja CO2,Perłowy,Homologacja ciężarowa,Akryl (niemetalizowany),Gwarancja dealerska (w cenie),Uszkodzony,Tuning,VIN,Matowy,Okres gwarancji producenta,Kierownica po prawej (Anglik),Opłata początkowa,Liczba pozostałych rat,Wartość wykupu,Zarejestrowany jako zabytek
0,30800PLN,Osoby prywatnej,Osobowe,Kia,Pro_cee'd,2013,75 000 km,1 582 cm3,Diesel,111 KM,Manualna,Na przednie koła,Kompakt,3.0,5.0,Biały,Tak,Używane,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,10600PLN,Osoby prywatnej,Osobowe,Opel,Corsa,2007,63 000 km,1 229 cm3,Benzyna,80 KM,Manualna,Na przednie koła,Auta miejskie,3.0,5.0,Srebrny,Tak,Używane,1.2 16V Enjoy,D (2006-2014),Tak,29/12/2007,Tak,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,89790PLN,Osoby prywatnej,Osobowe,BMW,Seria 5,2015,151 000 km,1 995 cm3,Diesel,184 KM,Automatyczna,4x4 (dołączany automatycznie),Sedan,,,Biały,Tak,Używane,F10/F11 (2009-2017),F10/F11 (2009-2017),,,Tak,520.0,Tak,Tak,Tak,WB908AP,Tak,,,,,,,,,,,,,,,,,,,,,
3,58500PLN,Osoby prywatnej,Osobowe,Chrysler,Town & Country,2014,158 000 km,3 605 cm3,Benzyna+LPG,283 KM,Automatyczna,Na przednie koła,Minivan,5.0,7.0,Srebrny,Tak,Używane,,II (2001-),Tak,,,,,,,,,Stany Zjednoczone,Tak,,,,,,,,,,,,,,,,,,,
4,17900PLN,Osoby prywatnej,Osobowe,Ford,Galaxy,2006,224 000 km,1 997 cm3,Diesel,140 KM,Manualna,Na przednie koła,Minivan,5.0,7.0,Czarny,,Używane,2.0 TDCi Ghia,Mk2 (2006-2015),Tak,,,,,,,,,,,Tak,,,,,,,,,,,,,,,,,,


#### 3. Saving DF into _.csv_ file:

In [8]:
save_res(cars_data, 'raw_data_f')