### Imports and Config

In [None]:
# Основные библиотеки для работ с веб
# Main web works libraries
import requests
from bs4 import BeautifulSoup

from time import sleep

In [None]:
# Endpoints used by the scraper:
#   URL         - listing endpoint; get_cars() POSTs the JSON request parameters to it
#   URL_CATALOG - HTML catalog page; make_models_list() reads mark/model links from it
URL = 'https://auto.ru/-/ajax/desktop/listing/'
URL_CATALOG = 'https://auto.ru/htmlsitemap/mark_model_catalog.html'

# HTTP headers sent with every listing request (see get_cars).
# NOTE(review): the Cookie, x-csrf-token, x-client-date and x-page-request-id
# values look like they were captured from one browser session; presumably they
# expire and have to be refreshed by hand - confirm before relying on them.
HEADERS = {
    'Accept': '*/*',
    # NOTE(review): advertising 'br' presumably requires a brotli decoder to be
    # installed for the response to be readable - confirm.
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
    # NOTE(review): static value from the captured request; the JSON body built
    # in get_cars varies per mark/model/page, so this number is not the real
    # body length. requests is expected to recompute it for json= bodies - confirm.
    'Content-Length': '77',
    'content-type': 'application/json',
    'Cookie': '_csrf_token=23608122d7607cf614fa3696f170304f2c942825086d7100; autoru_sid=a%3Ag5e54cb202kbrp94k5stjgeuok5qc0lr.a4bf836f5de7a860384d4bb97a55d0e0%7C1582615328026.604800.Q6xoprzzbkFU9J433uQN0A.34nVWWhPYMf2gQUyYdxm-hAPmkz-UZuRfIKcHxDth7w; autoruuid=g5e54cb202kbrp94k5stjgeuok5qc0lr.a4bf836f5de7a860384d4bb97a55d0e0; suid=f70cb3a665b30dabcc2aefb60beba688.092164db5081dbf54d3645dd1160a738; yandexuid=4412548211582615329; _ym_wasSynced=%7B%22time%22%3A1582615332141%2C%22params%22%3A%7B%22eu%22%3A0%7D%2C%22bkParams%22%3A%7B%7D%7D; gdpr=0; _ym_uid=1582615332878294126; _ym_isad=2; gids=; from=direct; X-Vertis-DC=myt; _ym_visorc_22753222=b; cycada=1Ra6P74hpA1DJTHrQBDvqC2RWMZ0zfqbWbqR95lvoFc=; from_lifetime=1582636077325; _ym_d=1582636077',
    'Host': 'auto.ru',
    'Origin': 'https://auto.ru',
    'Referer': 'https://auto.ru/cars/used/?sort=fresh_relevance_1-desc&page=1',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'same-origin',
    'Sec-Fetch-Site': 'same-origin',
    # Adjacent literals are concatenated at compile time. A backslash
    # continuation *inside* the quotes would embed the next line's leading
    # indentation into the header value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    ),
    'x-client-app-version': '202002.20.164944',
    'x-client-date': '1582636185640',
    'x-csrf-token': '23608122d7607cf614fa3696f170304f2c942825086d7100',
    'x-page-request-id': '4e70327218c7ba488485edc641875179',
    'x-requested-with': 'fetch'
}

# Base request parameters, sent as the JSON body of each listing request.
# get_cars() adds 'catalog_filter' and 'page' per request, and
# PerformScraping() sets 'top_days' from its this_day flag before scraping.
# NOTE(review): geo_id / geo_radius presumably select a region and a search
# radius around it - confirm against the site API.
PARAMS = {
    'section': 'used',
    'category': 'cars',
    'sort': 'fresh_relevance_1-desc',
    'geo_radius': 200,
    'geo_id': 213,
    "top_days": 0
}

# Pattern of vehicle characteristics to read.
# Keys are the output column names. Each value is a Python expression suffix
# (subscripts and method calls) that get_cars() appends to the offer variable
# name and evaluates with eval(); when that evaluation raises, the column
# is stored as None for the offer.
CAR_PATTERN = {
    "bodyType"            : "['vehicle_info']['configuration']['human_name']",
    "brand"               : "['vehicle_info']['mark_info']['code']",
    "color"               : "['color_hex']",
    "fuelType"            : "['lk_summary'].split()[-1]",
    "modelDate"           : "['vehicle_info']['super_gen']['year_from']",
    "name"                : "['vehicle_info']['tech_param']['human_name']",
    "numberOfDoors"       : "['vehicle_info']['configuration']['doors_count']",
    "productionDate"      : "['documents']['year']",
    "vehicleConfiguration": "['vehicle_info']['tech_param']['transmission']",
    "engineDisplacement"  : "['vehicle_info']['tech_param']['human_name'].split()[0]",
    "enginePower"         : "['vehicle_info']['tech_param']['power']",
    "description"         : "['description']",
    "mileage"             : "['state']['mileage']",
    "Комплектация"        : "['vehicle_info']['complectation']",
    "Привод"              : "['lk_summary'].split(', ')[-2]",
    "Руль"                : "['vehicle_info']['steering_wheel']",
    "Владельцы"           : "['documents']['owners_number']",
    "ПТС"                 : "['documents']['pts']",
    "Таможня"             : "['documents']['custom_cleared']",
    "Владение"            : "['documents']['purchase_date']",
    "Price"               : "['price_info']['RUR']"
}

# Delay (in seconds) before get_cars() retries a listing request that raised
SLEEP_TIME = 20

# Car brands to look for; make_models_list() keeps only catalog entries whose
# upper-cased mark is in this list
MARKS = ['MERCEDES', 'INFINITI', 'NISSAN', 'BMW', 'VOLKSWAGEN', 'MITSUBISHI', 
         'TOYOTA', 'VOLVO', 'SKODA', 'LEXUS', 'AUDI', 'HONDA', 'SUZUKI']

# Global container for the obtained results: get_cars() stores one dict of
# characteristics per offer under consecutive integer ids
results = {}

### Helper Functions

In [None]:
# Builds the list of all required marks and models for the scraper from the
# catalog page published on the site
def make_models_list():
    '''Creates models list needed for the scraper from auto.ru catalog webpage.

    Returns:
        list of (mark, model) tuples, both upper-cased, restricted to the
        brands listed in MARKS, in the order the links appear on the page.

    Raises:
        requests.exceptions.RequestException: the catalog page could not be
            fetched, timed out, or was answered with an HTTP error status.
        ValueError: the page does not contain the expected sitemap container.
    '''
    # Retrieve cars catalog page contents. A timeout keeps a stalled
    # connection from hanging the whole run.
    response = requests.get(URL_CATALOG, timeout=60)
    response.raise_for_status()  # do not try to parse an error page
    response.encoding = 'utf-8'

    # Parse with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    sitemap = soup.find('div', class_='sitemap')
    if sitemap is None:
        raise ValueError(
            f"No <div class='sitemap'> found at {URL_CATALOG}; "
            "the catalog page layout may have changed"
        )

    # Obtain models list from parsing result
    wanted_marks = set(MARKS)  # constant-time membership test per link
    models_list = []
    for link in sitemap.find_all('a'):
        # The mark and model are taken from the 4th and 5th '/'-separated
        # pieces of the link target; links without an href or with fewer
        # pieces are skipped instead of raising.
        parts = (link.get('href') or '').split('/')
        if len(parts) < 5:
            continue
        mark, model = parts[3].upper(), parts[4].upper()
        if mark in wanted_marks:  # take only car brands we're looking for
            models_list.append((mark, model))

    return models_list

In [None]:
import threading

# Serializes id allocation and insertion into the shared 'results' dict so
# that concurrent get_cars() workers cannot pick the same id.
_RESULTS_LOCK = threading.Lock()


# Processes the listing pages for one mark/model pair and fills the results dict
def get_cars(m):
    '''
    Receives mark-model pair ('m') from the models list.
    Processes the site pages and replenishes the results dictionary.

    Params:
        m (tuple): (mark, model) pair; m[0] is the mark, m[1] is the model.

    Returns:
        None. Every offer found is stored in the global 'results' dict under
        the next free integer id as a dict keyed by the CAR_PATTERN names.
    '''
    # Iterate through pages 1..599, stopping at the first page without offers
    for page in range(1, 600):
        # Build a per-request copy of the parameters. Updating the shared
        # PARAMS dict in place would let concurrent workers overwrite each
        # other's 'catalog_filter' / 'page' between the update and the request.
        params = dict(PARAMS)
        params.update({
            'catalog_filter': [{'mark': m[0], 'model': m[1]}],
            'page': page
        })

        # Retry the request until it goes through. Only request-level errors
        # are retried, so Ctrl+C (KeyboardInterrupt) still stops the scraper.
        while True:
            try:
                r = requests.post(URL, json=params, headers=HEADERS, timeout=60)
                break
            except requests.exceptions.RequestException:
                print(f'ERROR! Waiting {SLEEP_TIME} secs...')
                sleep(SLEEP_TIME)

        offers = r.json()['offers']  # obtain offers

        if not offers:
            break  # no offers on this page: stop paging for this model

        for i in offers:
            # Try to get vehicle characteristics
            record = {}
            for k, pattern in CAR_PATTERN.items():
                # NOTE: eval() is applied only to the hard-coded CAR_PATTERN
                # expressions, with the offer exposed as 'i'; never feed it
                # patterns that come from outside this file.
                try:
                    v = eval('i' + pattern, {}, {'i': i})
                except Exception:
                    # Deliberate best effort: a field that is missing or has an
                    # unexpected shape in this offer is recorded as None.
                    v = None
                record[k] = v

            # Allocate the id and insert the finished record atomically
            with _RESULTS_LOCK:
                id_ = len(results)
                results[id_] = record

        print(f'id: [{id_}] page: [{page}] :: {m[0]} {m[1]}')

### Perform Scraping

In [None]:
from multiprocessing.dummy import Pool as ThreadPool

# Performs the scraping, optionally in multithreaded mode and optionally
# fetching only the data for the last day
def PerformScraping(this_day=False, multithreaded=False):
    '''
    Performs the scraping process with the ability to work in multithreaded mode 
    and optionally receive data for the last day only.
    
    Params:
        this_day (bool): whether to get contents for the last day only.
        multithreaded (bool): if True, use multithreading.

    Returns:
        None; empties and then replenishes the global 'results' dictionary.
    '''
    # Reset the shared dict in place. A plain 'results = {}' here would only
    # bind a function-local name and leave the global dict untouched, so
    # entries from a previous run would be mixed into the new one.
    results.clear()
    models_list = make_models_list()
    PARAMS.update({'top_days': int(this_day)})

    if multithreaded:
        pool = ThreadPool()
        try:
            pool.map(get_cars, models_list)
        finally:
            # Release the worker threads even if one of the workers raised
            pool.close()
            pool.join()
    else:
        for mm in models_list:
            get_cars(mm)

In [None]:
'''
::: INSTRUCTIONS FOR USE :::

PerformScraping() function implements the entire web scraping 
process filling the 'results' dictionary with obtained data.
First, it generates the cars models list via make_models_list().
Then goes to scraping using get_cars().

The available param arguments are as follows:
    * this_day (default: False) 
        - defines whether the data will be searched for current day only;
          (this is used in order to make daily updates of the dataset);
    * multithreaded (default: False)
        - toggles the use of multithreading for faster scraping; if not set,
          the scraper performs standard single-threaded sequential processing
          (a fallback in case multithreading goes wrong).

Just uncomment the desired run mode:
'''

# Run-mode selector: leave exactly one of the calls below uncommented.
# Currently active: last-day data only, multithreaded.
# PerformScraping(this_day=False, multithreaded=True)
PerformScraping(this_day=True, multithreaded=True)
# PerformScraping(this_day=False, multithreaded=False)
# PerformScraping(this_day=True, multithreaded=False)

### Save

In [None]:
# Save the obtained data to a timestamped CSV file

from datetime import datetime
from pandas import DataFrame

# File name has the form '<prefix>-[dd.mm.YYYY]-[HH-MM].csv'
prefix = 'auto_ru_Scraped'
now = datetime.now().strftime('[%d.%m.%Y]-[%H-%M]')
filename = f'{prefix}-{now}.csv'

# 'results' maps offer id -> characteristics, so the frame is transposed to
# get one row per offer; the ids themselves are not written to the file
DataFrame(results).T.to_csv(filename, index=False)
print('Written file:', filename)