# Web Scraping - Cars

In [1]:
# Importing libraries

from urllib.request import urlopen, urlretrieve, Request
from urllib.error import URLError, HTTPError
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm_notebook
from math import ceil
from datetime import datetime
from pathlib import Path

In [2]:
def url_reader(url) -> object:

    """
    This function reads the html code of the url and returns a BeautifulSoup object.

    Parameters:
    url (str): The url to be read.

    Returns:
    soup_obj (object): The BeautifulSoup object of the page, or None when the
    request fails with an HTTPError/URLError (the error is printed, not raised).
    """

    # Need to set user agent to avoid 403 error
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}

    try:
        req = Request(url, headers=headers)
        # The context manager closes the connection even if read/decode fails
        with urlopen(req) as response:
            html = response.read().decode('utf-8')

    # HTTPError is a subclass of URLError, so it must be handled first
    except HTTPError as e:
        print(e.code, e.reason)
        return None

    except URLError as e:
        print(e.reason)
        return None

    return BeautifulSoup(html, 'html.parser')
        

In [3]:
def list_of_ads(soup) -> list:
    
    """
    This function returns a list with the links of all the ads in the page.

    Parameters:
    soup (object): The BeautifulSoup object of the page, or None when the
    page could not be downloaded.

    Returns:
    ad_links (list): The href of every ad anchor found in the page. Empty when
    soup is None or when no ad anchor is found.
    """

    # A failed download yields None instead of a soup; treat it as "no ads"
    if soup is None:
        return []

    ad_list = soup.find_all('a', class_='fnmrjs-0 fyjObc')

    # Anchors without an href are skipped so callers never receive None links
    hrefs = (ad.get('href') for ad in ad_list)
    return [href for href in hrefs if href]

In [4]:
def text_from_tag(tag):

    """
    This function extracts the text of a tag, tolerating missing tags.

    Parameters:
    tag (object): A tag exposing get_text(), or None when the tag was not found.

    Returns:
    text (str): The text of the tag, or the literal string 'None' when tag is None.
    """

    # Missing tags are reported with the string 'None' (not the None object)
    return 'None' if tag is None else tag.get_text()

In [5]:
def _count_pages(soup):

    """
    This function computes the number of result pages from the ads counter of the main page.
    """

    counter_tag = None
    if soup is not None:
        counter_tag = soup.find('span', class_='sc-1mi5vq6-0 eDXljX sc-ifAKCX fhJlIo')
    if counter_tag is None:
        raise RuntimeError('Could not read the ads counter from the main page.')

    # The third token is read as the ads per page and the second-to-last token
    # as the total number of ads ('.' is stripped before converting to int)
    aux = counter_tag.get_text().split()
    ad_per_page = int(aux[2])
    n_total_ads = int(aux[-2].replace('.', ''))

    return ceil(n_total_ads / ad_per_page)


def _parse_ad(soup, ad_link):

    """
    This function extracts all the properties of a single ad page into a dictionary.
    """

    # Title
    title = text_from_tag(soup.find('h1', class_='sc-1q2spfr-0 lcTcEs sc-ifAKCX cmFKIN'))

    # ID (last token of the tag text; 'None' when there is no token at all)
    id_parts = text_from_tag(soup.find('span', class_='sc-16iz3i7-0 qJvUT sc-ifAKCX fizSrB')).split()
    id_ad = id_parts[-1] if id_parts else 'None'

    # Date and hour (third and last tokens; 'None' when the text is too short)
    posted_parts = text_from_tag(soup.find('span', class_='sc-1oq8jzc-0 jvuXUB sc-ifAKCX fizSrB')).split()
    if len(posted_parts) > 2:
        date_ad, hour_ad = posted_parts[2], posted_parts[-1]
    else:
        date_ad, hour_ad = 'None', 'None'

    # Description
    description_tag = soup.find('span', class_='sc-1sj3nln-1 eOSweo sc-ifAKCX cmFKIN')
    description = text_from_tag(description_tag).replace('\n', ' ')

    # Price
    price = text_from_tag(soup.find('h2', class_='sc-ifAKCX eQLrcK')).replace('.', '')

    # Ad class
    class_ad = 'amateur'
    if soup.find('span', class_='sc-16bj9n5-0 IIBHN sc-ifAKCX fizSrB') is not None:
        class_ad = 'professional'

    # Vehicle data: the last matching span is discarded, as in the original code.
    # Slicing (instead of pop) is safe when nothing was found.
    label_tags = soup.find_all('span', class_='sc-ifAKCX dCObfG')
    labels = [tag.get_text() for tag in label_tags[:-1]]

    # Vehicle subdata: the value lives in a span or, when absent, in a link
    value_tags = (
        soup.find_all('div', class_='duvuxf-0 h3us20-0 hCwZcX')
        + soup.find_all('div', class_='duvuxf-0 h3us20-0 kjKryV')
    )
    values = []
    for tag in value_tags:
        value_tag = tag.find('span', class_='sc-ifAKCX cmFKIN')
        if value_tag is None:
            value_tag = tag.find('a')
        # text_from_tag keeps labels and values aligned even if both are missing
        values.append(text_from_tag(value_tag))

    # Optionals
    optionals = []
    optionals_parent = soup.find('div', class_='sc-bwzfXH h3us20-0 cNYGOs')
    if optionals_parent is not None:
        optionals = [
            tag.get_text()
            for tag in optionals_parent.find_all('div', class_="duvuxf-0 h3us20-0 jyICCp")
        ]

    # Location
    location_titles = ['CEP', 'City', 'Neighborhood']
    location = []
    location_parent = soup.find('div', class_='h3us20-6 govcZZ')
    if location_parent is not None:
        location_tags = location_parent.find_all('dd', class_='sc-1f2ug0x-1 ljYeKO sc-ifAKCX kaNiaQ')
        location = [tag.get_text() for tag in location_tags]

    # Create a dictionary with all the data (insertion order defines column order)
    ad_data = {
        'Title': title,
        'ID': id_ad,
        'Description': description,
        'Date': date_ad,
        'Hour': hour_ad,
        'Class': class_ad,
        'Price': price,
    }

    # zip stops at the shorter sequence, so mismatched lengths cannot raise
    for label, value in zip(labels, values):
        ad_data[label] = value

    for optional in optionals:
        ad_data[optional] = 'yes'

    for location_title, value in zip(location_titles, location):
        ad_data[location_title] = value

    ad_data['Link'] = ad_link

    return ad_data


def _download_images(soup, id_ad, img_dir='./output/img'):

    """
    This function saves the images of an ad as '<id_ad>_<index>.jpg' inside img_dir.
    """

    gallery = soup.find('div', class_='h3us20-6 fAprjt')

    # Checks if exists images
    if gallery is None:
        return

    # Creates the path directory
    target_dir = Path(img_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for i, img in enumerate(gallery.find_all('img', class_='image')):
        src = img.get('src')
        if not src:
            continue
        try:
            urlretrieve(src, str(target_dir / (id_ad + '_' + str(i) + '.jpg')))
        except (URLError, ValueError) as e:
            # One broken image must not abort the whole scraping run
            print('Could not download image {}: {}'.format(src, e))


def main(get_images=False, number_of_pages=None, main_page=None):

    """
    This function scrapes the olx website and returns the properties of all the ads found.

    Parameters:
    get_images (bool): If True, the images of the ads will be downloaded to ./output/img.
    number_of_pages (int): The number of pages to be scraped. If None, the number of
    pages is computed from the ads counter of the main page.
    main_page (str): The url of the listing to be scraped.

    Returns:
    all_ad_data (list): A list with one dictionary of properties per ad.

    Raises:
    ValueError: If main_page is not provided.
    RuntimeError: If number_of_pages is None and the ads counter cannot be read.
    """

    if not main_page:
        raise ValueError('main_page must be the url of the listing to be scraped.')

    # If number_of_pages is not specified, calculate the number of pages.
    # The main page is only downloaded when the counter is actually needed.
    if number_of_pages is not None:
        n_pages = number_of_pages
    else:
        n_pages = _count_pages(url_reader(main_page))

    # Initialize the list that collects one dictionary per ad
    all_ad_data = []

    # Loop over the pages
    for page_number in tqdm_notebook(range(n_pages), desc='Pages', colour='red'):
        page_soup = url_reader(main_page + '?o={}'.format(page_number + 1))

        # Skip pages that could not be downloaded
        if page_soup is None:
            continue

        # Loop through all the ads
        for ad in list_of_ads(page_soup):
            if not ad:
                continue

            ad_soup = url_reader(ad)
            if ad_soup is None:
                continue

            ad_data = _parse_ad(ad_soup, ad)

            # Get images
            if get_images:
                _download_images(ad_soup, ad_data['ID'])

            # Append the dictionary to the list
            all_ad_data.append(ad_data)

    return all_ad_data

In [6]:
def create_csv(data):

    """
    This function writes all the ads properties to a dated csv file.

    The file is saved as ./output/data/<YYYY-MM-DD>_dataset.csv, using ';' as
    separator, no index column and the utf-8-sig encoding.

    Parameters:
    data (list): A list with all the ads properties.

    Returns:
    None
    """

    # The current date (YYYY-MM-DD) prefixes the file name
    stamp = datetime.now().strftime('%Y-%m-%d')

    frame = pd.DataFrame(data)

    # Creates the path directory
    Path("./output/data").mkdir(parents=True, exist_ok=True)

    csv_path = './output/data/' + stamp + '_dataset.csv'
    return frame.to_csv(csv_path, sep=';', index=False, encoding='utf-8-sig')

In [7]:
# Run the scraper on the first results page only, downloading the ad images.
# If number_of_pages is not specified (None), main() scrapes all the ads from all the pages.
alldata = main(
    get_images = True, 
    number_of_pages=1, 
    main_page='https://al.olx.com.br/autos-e-pecas/carros-vans-e-utilitarios'
    )
# Save the scraped ads as a dated csv file under ./output/data
create_csv(alldata)

Pages:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Build a DataFrame from the scraped ads (one row per ad) and show the first 5 rows
dataset = pd.DataFrame(alldata)
dataset.head()

Unnamed: 0,Title,ID,Description,Date,Hour,Class,Price,Categoria,Modelo,Marca,...,Som,Câmera de ré,CEP,City,Neighborhood,Link,Tipo de veículo,Final de placa,Direção hidráulica,Sensor de ré
0,Hrv Exl Unico Dono - Pneus novos -Excelente Es...,952509540,PersonalCarMcz -Foco na Qualidade ESTADO DO CA...,07/12,20:26,professional,87900,"Carros, vans e utilitários",HONDA HR-V EXL 1.8 FLEXONE 16V 5P AUT.,HONDA,...,yes,yes,57035700,Maceió,Jatiúca,https://al.olx.com.br/alagoas/autos-e-pecas/ca...,,,,
1,GM MONTANA ANO 2010 CONQUEST /// VALOR 24.800,966304413,ACEITAMOS MOTO COMO BASE DE TROCA ACEITAMOS C...,07/12,20:24,amateur,24800,"Carros, vans e utilitários",GM - CHEVROLET MONTANA 1.8/ 1.8 CONQUEST FLEXP...,GM - CHEVROLET,...,,,57010780,Maceió,Trapiche da Barra,https://al.olx.com.br/alagoas/autos-e-pecas/ca...,Pick-up,0.0,yes,
2,"HB20 2020 C/ Apenas 3.000 km $59.000,",945875012,Hb20 2020 1.0 completo Retrovisores e Maçanet...,07/12,20:18,amateur,59000,"Carros, vans e utilitários",HYUNDAI HB20 SENSE 1.0 12V FLEX MEC.,HYUNDAI,...,yes,,57036540,Maceió,Jatiúca,https://al.olx.com.br/alagoas/autos-e-pecas/ca...,Hatch,2.0,,yes
3,"VENDO HB20 2013, PRATA",966301607,"VENDO HB20 confort, 1.0, 2013 Banco de couro ...",07/12,20:14,amateur,37900,"Carros, vans e utilitários",HYUNDAI HB20 COMF./C.PLUS/C.STYLE 1.0 FLEX 12V,HYUNDAI,...,yes,,57048710,Maceió,Antares,https://al.olx.com.br/alagoas/autos-e-pecas/ca...,Hatch,2.0,yes,
4,Audi a3 turbo 150 vc,966301393,Vendo audi a3 turbo 150 cv originais MANUAL ...,07/12,20:13,amateur,22000,"Carros, vans e utilitários",AUDI A3 1.8 TURBO 5P MEC.,AUDI,...,yes,,57086478,Maceió,Benedito Bentes,https://al.olx.com.br/alagoas/autos-e-pecas/ca...,Hatch,,yes,
