# Autovit Data Scraper

This notebook can scrape text data (such as the price of a car, or its model) from a fixed list of URLs.

In [1]:
import importlib
import logging
import os
import time
import urllib.request
from typing import Dict, List, Optional
from urllib.request import urlopen

import pandas as pd
import ratelimit
from bs4 import BeautifulSoup

Parameters.

In [2]:
# Directory holding both the input link list and the scraped output
# (resolved relative to the current working directory).
_DATA_DIR = os.path.join("..", "..", "data")

# Input: CSV file listing the URLs to scrape.
LINKS_FILE = os.path.join(_DATA_DIR, "all_cars.csv")
# Output: CSV file receiving the scraped data (doubles as the checkpoint file).
DATAFRAME_FILE = os.path.join(_DATA_DIR, "carsWithImages.csv")

# Inclusive range of positions, inside the URL list, that this run scrapes.
LINKS_START: int = 16_000
LINKS_STOP: int = 16_002

# The dataset is written to disk whenever its size reaches a multiple of
# `CHECKPOINT_RATE` entries.
CHECKPOINT_RATE: int = 32

# Rate limit: at most `LIMIT_CALLS` requests ...
LIMIT_CALLS: int = 1
# ... per window of `LIMIT_PERIOD` seconds.
LIMIT_PERIOD: int = 2


Functions for saving the scraped data to disk and reloading it from a previous run.

In [3]:
def save_checkpoint(df: pd.DataFrame) -> None:
    """Save the scraped data to disk.

    The data is first written to a temporary file next to `DATAFRAME_FILE`
    and then moved over it, so an interruption in the middle of a write
    (e.g. stopping the kernel during a long scrape) cannot leave a truncated
    checkpoint behind.

    Args:
        df: The dataset to persist; its index is written as the first column.
    """
    temp_file = f"{DATAFRAME_FILE}.tmp"
    df.to_csv(temp_file)
    # `os.replace` overwrites the destination in a single rename step.
    os.replace(temp_file, DATAFRAME_FILE)


def load_checkpoint() -> pd.DataFrame:
    """Read back the dataset stored in `DATAFRAME_FILE`.

    Returns:
        The stored data, with the first CSV column used as the row index.
    """
    checkpoint = pd.read_csv(DATAFRAME_FILE, index_col=0)
    return checkpoint


def checkpoint_exists() -> bool:
    """Tell whether `DATAFRAME_FILE` is already present on disk as a regular file."""
    is_present = os.path.isfile(DATAFRAME_FILE)
    return is_present


Functions for downloading data.

In [4]:
def download_webpage(url: str) -> str:
    """Download a text page from a given URL.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the underlying connection even when reading
    # or decoding fails, so long scraping runs do not leak sockets.
    with urlopen(url) as response:
        return response.read().decode('utf-8')


Requests are rate-limited to avoid overwhelming the server.

In [5]:
@ratelimit.sleep_and_retry
@ratelimit.limits(calls=LIMIT_CALLS, period=LIMIT_PERIOD)
def check_autovit_limit() -> None:
    """
    Rate-limiting guard for requests sent to Autovit.

    The function has no logic of its own: the decorators above are what
    enforce the `LIMIT_CALLS` / `LIMIT_PERIOD` budget. Call it right before
    issuing a request so that the request respects the rate limit.
    """


Functions for parsing data.

In [6]:
def parse_webpage(html: str, url: str) -> Dict[str, str]:
    """Parse the useful information from the HTML of an offer page.

    Args:
        html: Source of the offer page.
        url: Address the page was downloaded from; stored under the 'Url' key.

    Returns:
        A dictionary holding the 'Url', the 'Autovit Id', the price ('Pret',
        formatted as "<amount> <currency>") and one entry per parameter listed
        in the page's `#parameters` section, keyed by the parameter label.

    Raises:
        AttributeError: If one of the expected containers is missing from the
            page (a failed lookup yields `None`, and the next lookup on it fails).
        KeyError: If the price element has no `data-price` attribute.
    """
    soup = BeautifulSoup(html, features="html.parser")

    # Walk down the page layout to the two columns of the offer.
    main_container = soup.body.find("div", {"id":"siteWrap"}).find(
        "div", {"class":"flex-container-main"}
    )
    left_column = main_container.find("div", {"class":"flex-container-main__left"})
    right_column = main_container.find("div", {"class":"flex-container-main__right"})

    # The price lives in the summary box of the right-hand column.
    offer_price = (
        right_column.find("div", {"class":"offer-content__aside"})
        .find("div", {"class":"offer-summary"})
        .find("div", {"class":"price-wrapper"})
        .find("div", {"class":"offer-price"})
    )

    # The parameter lists live in the main content of the left-hand column.
    parameters = (
        left_column.find("div", {"class":"offer-content offer-content--secondary"})
        .find("div", {"class":"offer-content__row om-offer-main"})
        .find("div", {"class":"offer-content__main-column"})
        .find("div", {"class":"parametersArea"})
        .find("div", {"id": "parameters"})
    )

    price_sum = offer_price["data-price"]
    price_currency = offer_price.find("span", {"class": "offer-price__currency"}).text
    record = {
        'Url': url,
        'Autovit Id': soup.find("span", {"id": "ad_id"}).text,
        'Pret': f"{price_sum} {price_currency}",
    }

    # Only look at list items *inside* the parameters container. `find_all`
    # visits each descendant once, whereas `find_all_next` would keep walking
    # through the rest of the document and revisit the same items repeatedly.
    for item in parameters.find_all("li"):
        value = item.find("div", {"class":"offer-params__value"})
        label = item.find("span", {"class":"offer-params__label"})
        try:
            # Prefer the text of the link when the value is wrapped in one.
            link = value.a
            text = (link.string if link is not None else value.string).strip()
            record[label.string] = text
        except AttributeError:
            # Item without a label/value pair, or without plain text: skip it.
            continue

    return record


The actual scraping process.

In [7]:
def scrape_url(url: str) -> Optional[Dict[str, str]]:
    """Download and parse a given web page.

    Args:
        url: Address of the page to scrape.

    Returns:
        The parsed entry, or `None` if downloading or parsing failed. The
        cause of a failure is logged rather than discarded, so that network
        errors can be told apart from pages with an unexpected layout.
    """
    try:
        # Wait for the rate limiter before touching the network.
        check_autovit_limit()
        html = download_webpage(url)
        return parse_webpage(html, url)
    except Exception as exc:
        # Best-effort scraping: report the reason and let the caller move on.
        logging.warning("Failed to scrape %s: %r", url, exc)
        return None


def append_to_dataset(dictionary: Dict[str, str], df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the dataset with one extra row built from `dictionary`.

    The input frame is left untouched and the result is re-indexed from zero.
    """
    new_row = pd.DataFrame([dictionary])
    frames = [df, new_row]
    return pd.concat(frames, ignore_index=True)


def scrape_all(df: pd.DataFrame, urls: List[str]) -> pd.DataFrame:
    """Scrape all the given URLs, logging progress along the way.

    URLs already present in the 'Url' column of `df` are skipped. The dataset
    is saved to disk each time its size reaches a multiple of
    `CHECKPOINT_RATE`.

    Args:
        df: The entries scraped so far (possibly empty).
        urls: The addresses to scrape, in order.

    Returns:
        The dataset extended with one row per successfully scraped URL.
    """
    # Track known URLs in a set: membership tests stay constant-time instead
    # of rescanning the whole 'Url' column for every URL.
    seen_urls = set(df["Url"]) if "Url" in df.columns else set()

    for i, url in enumerate(urls):
        display_name = f"({i}) {url}"

        if url in seen_urls:
            logging.info(f"Skipping already parsed {display_name}")
        elif dictionary := scrape_url(url):
            df = append_to_dataset(dictionary, df)
            # Keep the set in sync so duplicates within `urls` are skipped too.
            seen_urls.add(url)
            logging.info(f"Done parsing {display_name}")
            if len(df) % CHECKPOINT_RATE == 0:
                save_checkpoint(df)
        else:
            logging.warning(f"Giving up, failed to parse {display_name}")

    return df


In [8]:
# Configure the root logger. `force=True` removes any handlers installed
# earlier (for instance by a previous run of this cell), so the settings below
# always take effect without having to reload the `logging` module.
logging.basicConfig(
    format="[%(asctime)s] %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.DEBUG,
    force=True,
)


In [9]:
# Load all the URLs we want to parse (`LINKS_STOP` is inclusive).
all_urls = list(pd.read_csv(LINKS_FILE)["link-href"])
urls = all_urls[LINKS_START:LINKS_STOP + 1]
logging.info(f"Read {len(urls)} URLs to scrape")
if not urls:
    # An empty selection would otherwise make the whole run a silent no-op.
    logging.warning(
        f"No URLs selected: range {LINKS_START}-{LINKS_STOP} matches none of "
        f"the {len(all_urls)} links in {LINKS_FILE}"
    )

# Reload the previous scraped results if they exist.
df = load_checkpoint() if checkpoint_exists() else pd.DataFrame()
logging.info(f"Reloaded {len(df)} entries from disk")

# Add all the new entries to the dataset.
df = scrape_all(df, urls)
logging.info(f"Have {len(df)} entries in total")

# Save the final results.
save_checkpoint(df)
logging.info("Done saving everything")


[2023-01-21 12:44:22] INFO     Read 0 URLs to scrape
[2023-01-21 12:44:22] INFO     Reloaded 12229 entries from disk
[2023-01-21 12:44:22] INFO     Have 12229 entries in total
[2023-01-21 12:44:22] INFO     Done saving everything
