# Web-scraping

Script gets data from online car marketplace and creates _CSV_ file.

#### Modules:

In [29]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import os
import sys

#### Settings:

In [14]:
# Num of displayed df columns
pd.set_option('display.max_columns', 60)

# Hard drive results location
DIR = '.'
EXT = '.csv'
RES_PATH = os.path.join(DIR, "Results")


def save_res(var, name, extension=EXT):
    """Saves generated DF into .csv file"""
    
    os.makedirs(RES_PATH, exist_ok=True)
    file_path = os.path.join(RES_PATH, name + extension)
    var.to_csv(file_path, index=False)

#### Functions:

In [15]:
def advert_links(start_yr, end_yr, basic_url, num_of_pages):
    """Gets links to all adverts from specified number of pages"""
    all_links = []
      
    for i in range(1, num_of_pages + 1):
        page_url = basic_url.format(start_yr, end_yr, i)
        response = requests.get(page_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find_all(class_=['offer-title__link'])
        
        for elem in content:
            link = str(elem).split('href=')[1].split(' ')[0]
            link = link[1:-1] # To get rid of quotes: "link"
            all_links.append(link)
    
    return all_links


def links_generator(all_links):
    """Generates next link from the list"""
    
    for link in all_links:
        yield link


def advert_features(advert):
    """Gets all features (car params) from single advert"""
    
    # Gets content of page of single advert
    response = requests.get(advert)
    soup = BeautifulSoup(response.text, 'html.parser')

    # FEATURES
    # dict for features and their values
    features = {}

    # Price located in different place than other features -> parsed separately
    price = soup.find("span", {"class": "offer-price__number"}).text.strip().replace(' ', '')
    features["Cena"] = price

    # Other features located in 'li' tags with class name 'offer-params__item', so below code
    # looks for all 'li' tags with class 'offer-params__item' in single advert
    content = soup.find_all('li', attrs={'class': 'offer-params__item'})

    # Iterates on every 'li' tag (every 'li' tag contains single feature and its value)
    for c in content:

        # Feature (dict key)
        label = c.find("span", {"class": "offer-params__label"}).string

        # Feature's value (values are stored in 'a' tags (links) and 'div' tags -
        # thats why 'if' statement needed)
        if c.find('a'):
            value = c.find('a').string.strip()
        else:
            value = c.find('div').string.strip()

        features[label] = value

    return features


def update_data(data, features):
    """Updates DF with new observation (car features from single advert)"""
    
    observation = pd.DataFrame(features, index=[0])
    data = pd.concat([data, observation], sort=False).fillna(np.nan)
    
    return data

#### Variables:

In [23]:
# Range of production years
start_yr = 2005
end_yr = 2015

# Universal version of url
basic_url = "https://www.otomoto.pl/osobowe/od-{}/?search%5Bfilter_float_year%3Ato%5D={}&search%5Border%5D=created_at%3Adesc&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}"

# Number of pages to scrape
num_of_pages = 1

### Main code

#### 1. Getting links to adverts:

In [24]:
# Timer
start = time.time()

# Gets list of all adverts' links from all pages
all_links = advert_links(start_yr, end_yr, basic_url, num_of_pages)

stop = time.time()
print("Time taken: ", round((stop-start)/60, 3), "mins")

# Only unique links
all_links = list(set(all_links))

print("Number of links: ", len(all_links)) # n=1 -> 32

Time taken:  0.024 mins
Number of links:  32


#### 2. Creating DF with scraped data:

In [32]:
# Creates generator for links
gen = links_generator(all_links)

# Creates DF for scraped data
cars_data = pd.DataFrame()

# Timer
start = time.time()
print("Started at:", time.asctime(time.localtime(start)))

# Loop to move on each link
for i in range(len(all_links)):
    
    # Generates link to advert
    advert = next(gen)
    
    try:
        # Gets all features from single advert (car)
        features = advert_features(advert)

        # Updates DF with scraped data from single advert
        cars_data = update_data(cars_data, features)
        
    except:
        print("Error occured:", sys.exc_info()[0])
        time_of_error = time.asctime(time.localtime(time.time()))
        print("Time of error occurrence:", time_of_error)

        
stop = time.time()
print("Finished at:", time.asctime(time.localtime(stop)))
print("Time taken: ", round((stop-start)/60, 3), "mins")

Started at: Thu Aug 20 18:12:24 2020
Error occured: <class 'KeyboardInterrupt'>
Time of error occurrence: Thu Aug 20 18:12:29 2020
Error occured: <class 'KeyboardInterrupt'>
Time of error occurrence: Thu Aug 20 18:12:33 2020
Finished at: Thu Aug 20 18:12:49 2020
Time taken:  0.405 mins


SPRAWDZ!!!

https://stackoverflow.com/questions/58295555/pandas-append-new-row-with-a-different-number-of-columns

https://stackoverflow.com/questions/3160699/python-progress-bar

In [31]:
cars_data.reset_index(drop=True, inplace=True)

# DF check
print(cars_data.shape)
cars_data.head()

(30, 35)


Unnamed: 0,Cena,Oferta od,Kategoria,Marka pojazdu,Model pojazdu,Generacja,Rok produkcji,Przebieg,Pojemność skokowa,Rodzaj paliwa,Moc,Skrzynia biegów,Napęd,Typ,Liczba drzwi,Liczba miejsc,Kolor,Metalik,VAT marża,Kraj pochodzenia,Pierwsza rejestracja,Numer rejestracyjny pojazdu,Zarejestrowany w Polsce,Pierwszy właściciel,Bezwypadkowy,Serwisowany w ASO,Stan,Wersja,Kod Silnika,Możliwość finansowania,Filtr cząstek stałych,Perłowy,Faktura VAT,lub do (przebieg km),Akryl (niemetalizowany)
0,29900PLN,Firmy,Osobowe,Volkswagen,Passat,B7 (2010-2014),2010,209 559 km,1 598 cm3,Diesel,105 KM,Manualna,Na przednie koła,Kombi,5,5.0,Czarny,Tak,Tak,Holandia,07/01/2011,SI80949,Tak,Tak,Tak,Tak,Używane,,,,,,,,
1,13900PLN,Osoby prywatnej,Osobowe,Volkswagen,Passat,B6 (2005-2010),2008,303 000 km,1 968 cm3,Diesel,140 KM,Manualna,,Sedan,5,,Srebrny,,,,,,Tak,,,,Używane,2.0 TDI DPF Comfortline,,,,,,,
2,28888PLN,Osoby prywatnej,Osobowe,Opel,Insignia,A (2008-2017),2009,169 456 km,2 000 cm3,Benzyna,220 KM,Manualna,4x4 (dołączany automatycznie),Sedan,4,5.0,Czarny,Tak,Tak,,,,,Tak,Tak,Tak,Używane,,,,,,,,
3,67000PLN,Osoby prywatnej,Osobowe,BMW,Seria 5,F10/F11 (2009-2017),2011,170 000 km,2 979 cm3,Benzyna,306 KM,Automatyczna,4x4 (dołączany automatycznie),Sedan,4,5.0,Szary,Tak,,Stany Zjednoczone,,,Tak,,Tak,Tak,Używane,F10/F11 (2009-2017),535.0,,,,,,
4,43990PLN,Osoby prywatnej,Osobowe,Citroën,C4 Grand Picasso,II (2013-),2014,102 069 km,1 598 cm3,Benzyna,156 KM,Manualna,Na przednie koła,Minivan,5,7.0,Szary,Tak,,,,,Tak,,Tak,Tak,Używane,Gr 1.6 THP Exclusive,,Tak,,,,,


#### 3. Saving DF into _.csv_ file:

In [20]:
save_res(cars_data, 'raw_data')