# Web-scraping

This script scrapes car adverts from an online car marketplace (otomoto.pl) and saves the collected data to a _CSV_ file.

#### Modules:

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import os
import sys

#### Settings:

In [2]:
# Show up to 60 columns whenever a DataFrame is displayed
pd.options.display.max_columns = 60

# Location of saved results on disk: files go to <DIR>/Results and get
# the EXT suffix by default
DIR = '.'
EXT = '.csv'
RES_PATH = os.path.join(DIR, "Results")


def save_res(var, name, extension=EXT):
    """Write ``var`` to ``RES_PATH`` as ``<name><extension>`` via ``to_csv``.

    The results directory is created when it does not exist yet, and the
    index of ``var`` is left out of the file.
    """
    file_name = name + extension

    # exist_ok=True makes repeated saves into the same directory harmless
    os.makedirs(RES_PATH, exist_ok=True)

    var.to_csv(os.path.join(RES_PATH, file_name), index=False)

#### Functions:

In [3]:
def advert_links(start_yr, end_yr, basic_url, num_of_pages):
    """Collect advert links from the first ``num_of_pages`` listing pages.

    Args:
        start_yr: Value substituted into the first ``{}`` of ``basic_url``.
        end_yr: Value substituted into the second ``{}`` of ``basic_url``.
        basic_url: URL template with three ``{}`` placeholders, filled in
            the order: start year, end year, page number.
        num_of_pages: Number of listing pages to fetch, starting at page 1.
            Zero or a negative number fetches nothing.

    Returns:
        List with the ``href`` of every element of class
        ``offer-title__link``, in the order the pages were read.
        Duplicates are not removed.
    """
    all_links = []

    for page in range(1, num_of_pages + 1):
        page_url = basic_url.format(start_yr, end_yr, page)
        response = requests.get(page_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        for elem in soup.find_all(class_=['offer-title__link']):
            # Read the parsed attribute instead of slicing str(elem): the
            # serialized tag escapes '&' as '&amp;' and the slicing breaks
            # when 'href' is the last attribute of the tag.
            link = elem.get('href')

            # Elements without a usable href carry no advert link
            if link:
                all_links.append(link)

    return all_links


def links_generator(all_links):
    """Yield the items of ``all_links`` one at a time, in their given order."""
    yield from all_links


def advert_features(advert):
    """Get all features (car params) from a single advert page.

    Args:
        advert: URL of the advert page to download and parse.

    Returns:
        Dict mapping feature label to its text value. The price is stored
        under the key ``"Cena"`` with all spaces removed; every other entry
        comes from one ``li.offer-params__item`` element of the page.

    Raises:
        AttributeError: If the page has no ``span.offer-price__number``
            element (the advert has no parsable price).
    """
    # Gets content of page of single advert
    response = requests.get(advert)
    soup = BeautifulSoup(response.text, 'html.parser')

    # dict for features and their values
    features = {}

    # Price is located in a different place than the other features, so it
    # is parsed separately
    price = soup.find("span", {"class": "offer-price__number"}).text.strip().replace(' ', '')
    features["Cena"] = price

    # Every other feature sits in its own 'li' tag of class 'offer-params__item'
    for item in soup.find_all('li', attrs={'class': 'offer-params__item'}):

        # Feature name (dict key)
        label_tag = item.find("span", {"class": "offer-params__label"})

        # Feature value: stored in an 'a' tag (link) when there is one,
        # otherwise in a 'div' tag
        value_tag = item.find('a')
        if value_tag is None:
            value_tag = item.find('div')

        # A malformed item should cost one feature, not the whole advert
        if label_tag is None or value_tag is None:
            continue

        # get_text() also copes with nested markup, where '.string' is None
        label = label_tag.get_text(" ", strip=True)
        features[label] = value_tag.get_text(" ", strip=True)

    return features


def update_data(data, features):
    """Return ``data`` with one extra row built from ``features``.

    ``features`` becomes a single-row DataFrame (index 0) that is
    concatenated below ``data``; columns present on only one side are kept.
    """
    new_row = pd.DataFrame(features, index=[0])
    combined = pd.concat([data, new_row], sort=False)
    return combined.fillna(np.nan)

#### Variables:

In [4]:
# Range of production years
start_yr = 2005
end_yr = 2015

# Universal version of url
basic_url = "https://www.otomoto.pl/osobowe/od-{}/?search%5Bfilter_float_year%3Ato%5D={}&search%5Border%5D=created_at%3Adesc&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}"

# Number of pages to scrape
num_of_pages = 1

### Main code

#### 1. Getting links to adverts:

In [5]:
# Timer
start = time.time()

# Gets list of all adverts' links from all pages
all_links = advert_links(start_yr, end_yr, basic_url, num_of_pages)

stop = time.time()
print("Time taken: ", round((stop-start)/60, 3), "mins")

# Only unique links. dict.fromkeys() keeps the first-seen order, whereas
# set() returns strings in an order that changes between interpreter runs,
# which would make the row order of the results irreproducible.
all_links = list(dict.fromkeys(all_links))

print("Number of links: ", len(all_links)) # n=1 -> 32

Time taken:  0.028 mins
Number of links:  32


#### 2. Creating DF with scraped data:

In [6]:
# Creates generator for links
gen = links_generator(all_links)

# Creates DF for scraped data
cars_data = pd.DataFrame()

# Timer
start = time.time()
print("Started at:", time.asctime(time.localtime(start)))

# Visits every advert link; an advert that fails is reported and skipped
for advert in gen:

    try:
        # Gets all features from single advert (car)
        features = advert_features(advert)

        # Updates DF with scraped data from single advert
        cars_data = update_data(cars_data, features)

    except Exception as err:
        # 'Exception' rather than a bare 'except' so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the scrape
        print("Error occured:", type(err))
        print("Failed advert:", advert, "-", err)
        time_of_error = time.asctime(time.localtime(time.time()))
        print("Time of error occurrence:", time_of_error)


stop = time.time()
print("Finished at:", time.asctime(time.localtime(stop)))
print("Time taken: ", round((stop-start)/60, 3), "mins")

Started at: Thu Aug 20 18:15:52 2020
Finished at: Thu Aug 20 18:16:18 2020
Time taken:  0.442 mins


In [7]:
# Renumber the rows 0..n-1 and discard the old index
cars_data = cars_data.reset_index(drop=True)

# Quick look at the result: dimensions first, then the leading rows
print(cars_data.shape)
cars_data.head()

(32, 37)


Unnamed: 0,Cena,Oferta od,Kategoria,Marka pojazdu,Model pojazdu,Wersja,Generacja,Rok produkcji,Przebieg,Pojemność skokowa,Rodzaj paliwa,Moc,Skrzynia biegów,Napęd,Filtr cząstek stałych,Typ,Liczba drzwi,Liczba miejsc,Kolor,Metalik,Perłowy,Kraj pochodzenia,Pierwsza rejestracja,Numer rejestracyjny pojazdu,Zarejestrowany w Polsce,Pierwszy właściciel,Bezwypadkowy,Stan,Serwisowany w ASO,Możliwość finansowania,VAT marża,Akryl (niemetalizowany),Faktura VAT,Uszkodzony,Homologacja ciężarowa,Emisja CO2,lub do (przebieg km)
0,29900PLN,Osoby prywatnej,Osobowe,Opel,Insignia,2.0 CDTI Sport,A (2008-2017),2012,179 000 km,1 956 cm3,Diesel,160 KM,Automatyczna,Na przednie koła,Tak,Kombi,5,5.0,Czarny,Tak,Tak,Polska,26/09/2012,ZS8323X,Tak,Tak,Tak,Używane,,,,,,,,,
1,10500PLN,Osoby prywatnej,Osobowe,Opel,Meriva,1.6 Essentia,I (2002-2010),2008,207 000 km,1 598 cm3,Benzyna,105 KM,Manualna,,,Minivan,5,5.0,Biały,,,,,,,,,Używane,,,,,,,,,
2,65000PLN,Osoby prywatnej,Osobowe,Volkswagen,Touareg,3.0 V6 TDI 4XMot BMT,II (2010-),2011,213 981 km,2 967 cm3,Diesel,245 KM,Automatyczna,4x4 (stały),Tak,SUV,5,5.0,Czarny,,Tak,Polska,21/09/2011,gda44087,Tak,,Tak,Używane,Tak,,,,,,,,
3,17300PLN,Osoby prywatnej,Osobowe,Citroën,C5,2.2 HDi Exclusive,III (2008-),2008,348 381 km,2 179 cm3,Diesel,170 KM,Manualna,Na przednie koła,Tak,Kombi,5,5.0,Czarny,Tak,,,,,Tak,,,Używane,,Tak,,,,,,,
4,13900PLN,Osoby prywatnej,Osobowe,Volkswagen,Passat,2.0 TDI DPF Comfortline,B6 (2005-2010),2008,303 000 km,1 968 cm3,Diesel,140 KM,Manualna,,,Sedan,5,,Srebrny,,,,,,Tak,,,Używane,,,,,,,,,


#### 3. Saving DF into _.csv_ file:

In [None]:
# Writes the scraped table under the name 'raw_data' (default extension)
save_res(cars_data, name='raw_data')