# Autovit Image Scraper

This notebook can scrape car images from a fixed list of URLs.

Each URL should point to the "offer web page" of a car.

In [10]:
import importlib
import logging
import os
import re
import time
import urllib.request
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from urllib.request import urlopen

import pandas as pd
import ratelimit
from bs4 import BeautifulSoup

Parameters.

In [11]:
# Folder shared by the scraper's input and output, relative to this notebook.
_DATA_DIR = os.path.join("..", "..", "data")

# CSV file listing the offer pages to scrape.
LINKS_FILE = os.path.join(_DATA_DIR, "carsWithImages.csv")
# Folder that receives the downloaded images.
OUTPUT_DIR = os.path.join(_DATA_DIR, "autovit_images")

# Positions (within the URL list) of the first and the last URL to scrape.
# Both bounds are inclusive.
LINKS_START = 0
LINKS_STOP = 2

# Rate limit: at most `LIMIT_CALLS` requests ...
LIMIT_CALLS: int = 1
# ... within every window of `LIMIT_PERIOD` seconds.
LIMIT_PERIOD: int = 1


Functions for downloading data.

In [12]:
def download_webpage(url: str) -> str:
    """Download a text page from a given URL.

    The body of the response is decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded body of the response.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Use the response as a context manager so the underlying connection is
    # closed right away instead of whenever the object gets garbage collected.
    with urlopen(url) as resp:
        return resp.read().decode('utf-8')


def download_image(url: str) -> bytes:
    """Download the raw bytes of an image from a given URL.

    Args:
        url: Address of the image to fetch.

    Returns:
        The body of the response, exactly as received.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # Use the response as a context manager so the underlying connection is
    # closed right away instead of whenever the object gets garbage collected.
    with urlopen(url) as resp:
        return resp.read()


Requests are rate-limited to avoid overwhelming the server.

In [13]:
@ratelimit.sleep_and_retry
@ratelimit.limits(calls=LIMIT_CALLS, period=LIMIT_PERIOD)
def check_autovit_limit() -> None:
    """
    Rate-limit guard for requests sent to Autovit.

    The function does no work of its own; the decorators enforce the
    `LIMIT_CALLS` per `LIMIT_PERIOD` budget. Call it right before each request
    so that the request is counted against that budget.
    """
    return None


Functions for parsing the pages.

In [14]:
@dataclass
class CarInfo:
    """Metadata describing a single car offer.

    Attributes:
        id: Identifier of the offer; also used to name its files on disk.
        img_url: Address of the car's image, or `None` if there is none.
        name: Title of the offer.
    """
    id: str
    img_url: Optional[str]
    name: str


@dataclass
class Image:
    """A downloaded image bundled with the metadata of its car.

    Attributes:
        image: Raw bytes of the image, as downloaded.
        info: Metadata of the car shown in the image.
    """
    image: bytes
    info: CarInfo


def parse_webpage(html: str) -> CarInfo:
    """Parse car metadata from an HTML page.

    Args:
        html: Source of an offer page.

    Returns:
        The ad ID, the offer title and the URL of the image found in the first
        `photo-item` element. `img_url` is `None` when the page has no such
        image.

    Raises:
        ValueError: If the ad ID or the offer title cannot be found.
    """
    soup = BeautifulSoup(html, "html.parser")

    # `find` returns `None` when nothing matches, so check before using it.
    id_tag = soup.find("span", {"id": "ad_id"})
    if id_tag is None:
        raise ValueError("Could not find the ad ID in the page")
    # The ID ends up in file paths, so drop any surrounding whitespace.
    autovit_id = id_tag.text.strip()

    name_tag = soup.find("span", {"class": "offer-title"})
    if name_tag is None:
        raise ValueError("Could not find the offer title in the page")
    name = name_tag.text.strip()

    # Image URLs seem to contain the resolution of the image as a suffix of the
    # URL. If we remove this suffix, we should have access to the full image.
    img_size_pattern = r";s=\d+x\d+$"
    photo_item = soup.find("div", {"class": "photo-item"})
    img_tag = photo_item.find("img") if photo_item is not None else None
    lazy_url = img_tag.get("data-lazy") if img_tag is not None else None
    img_url = re.sub(img_size_pattern, "", lazy_url) if lazy_url else None

    return CarInfo(id=autovit_id, img_url=img_url, name=name)


Functions for storing data.

In [15]:
def save_dir(autovit_id: str) -> str:
    """Build the path of the folder which holds an article's saved data."""
    article_dir = os.path.join(OUTPUT_DIR, autovit_id)
    return article_dir


def done_path(autovit_id: str) -> str:
    """Build the path of the marker file which flags an article as processed."""
    marker_name = ".done"
    article_dir = save_dir(autovit_id)
    return os.path.join(article_dir, marker_name)


def already_scraped(autovit_id: str) -> bool:
    """Tell whether the "processed" marker of an article exists on disk."""
    marker = done_path(autovit_id)
    return os.path.isfile(marker)


def save_to_disk(image: Image) -> None:
    """Save all information related to an image to disk.

    The image bytes are written first and the "done" marker last, so the marker
    only exists once the image has been written.

    Args:
        image: The downloaded image together with the metadata of its car.

    Raises:
        OSError: If the directory or one of the files cannot be written.
    """
    save_path = save_dir(image.info.id)

    # Make sure the directory exists.
    os.makedirs(save_path, exist_ok=True)

    # Save the image to disk.
    # NOTE(review): the bytes are stored as received under a `.webp` name; the
    # actual image format is not checked.
    img_path = os.path.join(save_path, f"{image.info.id}.webp")
    with open(img_path, "wb") as fout:
        fout.write(image.image)

    # Remember that we processed this article. The encoding is explicit
    # because offer titles may contain characters which the platform's default
    # encoding cannot represent.
    with open(done_path(image.info.id), "w", encoding="utf-8") as fout:
        fout.write(image.info.name)


The actual scraping process.

In [16]:
def obtain_image(webpage_url: str) -> Optional[Image]:
    """Try to download the image and metadata which corresponds to a car.

    Both requests (the offer page and the image) go through the rate-limit
    guard first.

    Args:
        webpage_url: Address of the car's offer page.

    Returns:
        The image and its metadata, or `None` if any step failed. The reason
        for a failure is logged as a warning.
    """
    try:
        check_autovit_limit()
        html = download_webpage(webpage_url)
        info = parse_webpage(html)

        # `CarInfo.img_url` is optional; without it there is nothing to fetch.
        if not info.img_url:
            logging.warning(f"No image URL found on {webpage_url}")
            return None

        check_autovit_limit()
        img = download_image(info.img_url)

        return Image(image=img, info=info)
    except Exception as e:
        # Scraping is best-effort: one broken page must not stop the whole run.
        # Still record why it failed, otherwise the error is lost for good.
        logging.warning(f"Failed to obtain image from {webpage_url}: {e!r}")
        return None


def scrape_all(df: pd.DataFrame) -> None:
    """Walk over every entry of the collection, scraping and logging each one.

    Entries which were already scraped are skipped. Entries whose image cannot
    be obtained are reported with a warning and left out.
    """
    entries = (row for _, row in df.iterrows())
    for i, entry in enumerate(entries):
        autovit_id = str(entry["Autovit Id"])
        webpage_url = entry["Url"]
        display_name = f"({i}) {autovit_id} {webpage_url}"

        # Nothing to do for articles which are already on disk.
        if already_scraped(autovit_id):
            logging.info(f"Skipping already scraped {display_name}")
            continue

        image = obtain_image(webpage_url)
        if not image:
            logging.warning(f"Giving up, failed to parse {display_name}")
            continue

        save_to_disk(image)
        logging.info(f"Done processing {display_name}")


In [17]:
# `force=True` removes any handlers already attached to the root logger before
# applying this configuration, so the cell can be re-run safely. Reloading the
# `logging` module instead would replace the root logger and detach loggers
# which other libraries have already created.
logging.basicConfig(
    format="[%(asctime)s] %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.DEBUG,
    force=True,
)


In [18]:
# Keep only the rows between `LINKS_START` and `LINKS_STOP`, both inclusive.
selected_rows = slice(LINKS_START, LINKS_STOP + 1)
df = pd.read_csv(LINKS_FILE).iloc[selected_rows]
logging.info(f"Read {len(df)} entries to scrape")

# Run the scraper over the selected entries.
scrape_all(df)
logging.info("Done processing everything")


[2023-01-21 15:42:00] INFO     Read 3 entries to scrape
[2023-01-21 15:42:00] INFO     Skipping already scraped (0) 7049990250 https://www.autovit.ro/anunt/suzuki-vitara-1-6-ID7H72va.html
[2023-01-21 15:42:00] INFO     Skipping already scraped (1) 7049960669 https://www.autovit.ro/anunt/toyota-auris-1-8-vvt-i-hybrid-automatik-touring-sports-ID7H6UP3.html
[2023-01-21 15:42:00] INFO     Skipping already scraped (2) 7049895868 https://www.autovit.ro/anunt/skoda-octavia-1-6-tdi-ID7H6DXS.html
[2023-01-21 15:42:00] INFO     Done processing everything
