# Сбор данных

__Заметка:__ Так как парсер был упрощён и перезапущен, собранные им данные изменились. В дальнейшей работе мы будем пользоваться данными, актуальными на 03.05.2023. Старый вариант парсера можно найти в папке «Прочее» репозитория на GitHub.

### 1. Загрузим нужные библиотеки и напишем функции

In [1]:
from time import sleep
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm


def get_soup(url: str, timeout: float = 30.0):
    """Download *url* and return its body parsed with BeautifulSoup.

    Args:
        url: Absolute URL to request with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        A BeautifulSoup tree built from the raw response bytes with the
        built-in 'html.parser'.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    # The HTTP status is intentionally not checked: callers treat a page
    # without the expected nodes as "nothing to collect" and move on.
    res = requests.get(url, timeout=timeout)
    return BeautifulSoup(res.content, 'html.parser')


def get_cars_hrefs(tree):
    """Collect the title and link of every vehicle card in a results page.

    Args:
        tree: Parsed results page; only its ``find_all`` method is used.

    Returns:
        A list of ``{'model': <h2 text>, 'href': <href attribute>}`` dicts,
        one per ``<a class="vehicle-card-link">`` element, in page order.
        Cards that have no ``<h2>`` heading or no ``href`` attribute are
        skipped.
    """
    listings = []
    for card in tree.find_all('a', {'class': 'vehicle-card-link'}):
        heading = card.h2
        link = card.get('href')
        # A card without a heading has no model name, and one without a link
        # cannot be followed later, so neither is worth keeping.
        if heading is None or link is None:
            continue
        listings.append({'model': heading.text, 'href': link})
    return listings


def columns_maker(table):
    """Return the text of every ``<dt>`` element found inside *table*.

    Args:
        table: Any object whose ``str()`` form is HTML markup.

    Returns:
        A list with the unmodified text of each ``<dt>``, in document order.
    """
    # The fragment is stringified and parsed again, so the search runs on a
    # standalone tree regardless of what kind of object was passed in.
    fragment = BeautifulSoup(str(table), 'html.parser')
    return [term.text for term in fragment.find_all('dt')]


def table_maker(table):
    """Return the stripped text of every ``<dd>`` element found inside *table*.

    Args:
        table: Any object whose ``str()`` form is HTML markup.

    Returns:
        A list with the text of each ``<dd>``, in document order, with
        leading and trailing whitespace removed.
    """
    # The fragment is stringified and parsed again, so the search runs on a
    # standalone tree regardless of what kind of object was passed in.
    fragment = BeautifulSoup(str(table), 'html.parser')
    return [value.text.strip() for value in fragment.find_all('dd')]


# Site root that every request URL in this notebook is built from.
MAIN_URL = 'https://www.cars.com/'

### 2. Соберем нужное количество данных

__Заметка:__ Пропишем ниже более компактный вариант реализации первого шага парсинга, но при этом оставим прошлый код, так как те данные, с которыми мы будем работать, были собраны с его помощью.

In [2]:
def parsing_first_step(p):
    """Scrape search-results page *p* and append its listings to ``data``.

    Args:
        p: Page number substituted into the ``page=`` query parameter.

    Side effects:
        Extends the module-level list ``data`` with the dicts returned by
        ``get_cars_hrefs`` and then sleeps for 0.02 s before returning.
    """
    query = f'shopping/results/?list_price_max=&makes[]=&maximum_distance=all&models[]=&page={p}&stock_type=used&zip='
    page_tree = get_soup(MAIN_URL + query)
    data.extend(get_cars_hrefs(page_tree))
    # Short pause between consecutive requests.
    sleep(0.02)

In [3]:
# Accumulator that parsing_first_step() extends with one dict per listing.
data = []
# Walk result pages 50 through 250 inclusive (201 requests).
for p in tqdm(range(50, 251)):
    parsing_first_step(p)

  0%|          | 0/201 [00:00<?, ?it/s]

In [4]:
# One row per collected listing; the columns come from the dict keys
# ('model' and 'href').
cars = pd.DataFrame(data)
cars.head()

Unnamed: 0,model,href
0,2022 Mazda CX-9 Carbon Edition,/vehicledetail/2e233c09-13a7-40fa-bc55-c62ad7e...
1,2020 Hyundai Tucson Sport,/vehicledetail/c7c4481a-9509-46ca-86da-0d047cf...
2,2021 Ford F-150 Raptor,/vehicledetail/bf528522-adbc-479a-874a-6074213...
3,2016 Acura RDX Base,/vehicledetail/7d967865-4de8-48ff-a366-5016658...
4,2019 Ford Mustang GT Premium,/vehicledetail/407db0d2-337c-4e08-8590-6fd413c...


---

In [5]:
# Exploratory fetch of a single detail page (row label 4) to inspect its markup.
url = MAIN_URL + cars['href'][4]
tree = get_soup(url)

# The description list and the vehicle-history section: the same two nodes
# that parsing_second_step() looks up on every detail page.
basics = tree.find('dl', {'class': 'fancy-description-list'})
history = tree.find('section', {'class': 'sds-page-section vehicle-history-section'})

In [6]:
# Column order of the final table. It must match the row assembled in
# parsing_second_step(): model, href, the <dd> values of the basics list,
# the <dd> values of the history list, and finally the price (16 in total).
final_columns = [
    'Model',
    'href',
    'Exterior color',
    'Interior color',
    'Drivetrain',
    'MPG',
    'Fuel type',
    'Transmission',
    'Engine',
    'VIN',
    'Stock #',
    'Mileage',
    'Accidents or damage',
    '1-owner vehicle',
    'Personal use only',
    'Price'
]

In [7]:
# Empty frame with the final schema; parsing_second_step() fills it row by row.
cars_df = pd.DataFrame(columns=final_columns)

In [8]:
# Show the frame (only the header exists at this point).
cars_df

Unnamed: 0,Model,href,Exterior color,Interior color,Drivetrain,MPG,Fuel type,Transmission,Engine,VIN,Stock #,Mileage,Accidents or damage,1-owner vehicle,Personal use only,Price


---

In [9]:
def _build_car_row(car, tree):
    """Assemble one output row for a listing, or return None to skip it.

    Args:
        car: Row of ``cars``; its 'model' and 'href' values are read.
        tree: Parsed detail page of that listing.

    Returns:
        A 16-item list (model, href, basics values, history values, price),
        or None when the page lacks the basics list, the history section or
        the price tag, or when the number of scraped values is unexpected.
    """
    basics = tree.find('dl', {'class': 'fancy-description-list'})
    history = tree.find('section', {'class': 'sds-page-section vehicle-history-section'})
    price_tag = tree.find('span', {'class': 'primary-price'})
    # A missing price tag used to raise AttributeError and abort the whole
    # batch; it is now handled like any other incomplete page.
    if basics is None or history is None or price_tag is None:
        return None

    row = [car['model'], car['href']]
    row.extend(table_maker(basics))
    row.extend(table_maker(history.dl))

    # Some pages yield one value more than the schema holds; the trailing one
    # is dropped. NOTE(review): presumably an extra history entry with no
    # column in final_columns - confirm against a live page.
    if len(row) == 16:
        row = row[:-1]
    # Any other length cannot be aligned with the columns, so skip the page.
    if len(row) != 15:
        return None

    row.append(price_tag.text)
    return row


def parsing_second_step(first_idx, last_idx):
    """Scrape the detail pages of ``cars`` rows first_idx..last_idx inclusive.

    Args:
        first_idx: Position in ``cars`` of the first listing to fetch.
        last_idx: Position in ``cars`` of the last listing to fetch
            (inclusive).

    Side effects:
        For every listing whose page could be parsed into a full row, writes
        that row into the module-level ``cars_df`` under label ``i``.
        Incomplete pages are skipped. Sleeps 0.02 s after each request.
    """
    for i in tqdm(range(first_idx, last_idx + 1)):
        car = cars.iloc[i, :]
        # urljoin avoids the double slash that plain concatenation produced,
        # since MAIN_URL ends with '/' and the hrefs start with '/'.
        tree = get_soup(urljoin(MAIN_URL, car['href']))

        row = _build_car_row(car, tree)
        if row is not None:
            cars_df.loc[i] = row

        sleep(0.02)

In [10]:
# Inclusive (first, last) positional ranges into `cars`, one batch per call.
idx_pairs = [(0, 1000), (1001, 2000), (2001, 3000), (3001, 4214)]

# Only the first pair is run here as a demonstration, since the data has already been collected
for i in range(1):
    first_idx, last_idx = idx_pairs[i]
    parsing_second_step(first_idx, last_idx)

  0%|          | 0/1001 [00:00<?, ?it/s]

In [11]:
# Remove rows that are identical in every column, modifying cars_df in place.
cars_df.drop_duplicates(inplace=True)

In [12]:
# Show the collected rows after de-duplication.
cars_df

Unnamed: 0,Model,href,Exterior color,Interior color,Drivetrain,MPG,Fuel type,Transmission,Engine,VIN,Stock #,Mileage,Accidents or damage,1-owner vehicle,Personal use only,Price
0,2022 Mazda CX-9 Carbon Edition,/vehicledetail/2e233c09-13a7-40fa-bc55-c62ad7e...,Polymetal Gray Metallic,Red,All-wheel Drive,–,Gasoline,6-Speed Automatic,2.5L I4 16V GDI DOHC Turbo,JM3TCBDY6N0628837,M919SL,"3,804 mi.",None reported,Yes,Yes,"$40,374"
1,2020 Hyundai Tucson Sport,/vehicledetail/c7c4481a-9509-46ca-86da-0d047cf...,Stellar Silver,Gray,All-wheel Drive,21–26\n\n\n\nBased on EPA mileage ratings. Use...,Gasoline,6-Speed Automatic,2.4L I4 16V GDI DOHC,KM8J3CAL2LU204183,U22279,"21,063 mi.",None reported,Yes,Yes,"$24,376"
2,2021 Ford F-150 Raptor,/vehicledetail/bf528522-adbc-479a-874a-6074213...,Agate Black Metallic,Black,Four-wheel Drive,–,Gasoline,10-Speed Automatic,3.5L V6 24V PDI DOHC Twin Turbo,1FTFW1RG8MFC40141,WPMFC40141B,"13,565 mi.",None reported,Yes,Yes,"$79,692"
3,2016 Acura RDX Base,/vehicledetail/7d967865-4de8-48ff-a366-5016658...,Gray,Graystone,All-wheel Drive,19–28\n\n\n\nBased on EPA mileage ratings. Use...,Gasoline,6-Speed Automatic,3.5L V6 24V MPFI SOHC,5J8TB4H59GL001057,001057,"76,414 mi.",None reported,No,No,"$21,487"
4,2019 Ford Mustang GT Premium,/vehicledetail/407db0d2-337c-4e08-8590-6fd413c...,Oxford White,Ebony,Rear-wheel Drive,15–24\n\n\n\nBased on EPA mileage ratings. Use...,Gasoline,10-Speed Automatic,"5L V-8 port/direct injection, DOHC, Ti-VCT var...",1FATP8FF5K5138115,P30263,"1,248 mi.",None reported,No,Yes,"$93,500"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,2020 Porsche Cayenne Base,/vehicledetail/29766710-84ec-4c93-bc9d-ac00252...,Quartzite Grey Metallic,Black,All-wheel Drive,19–23\n\n\n\nBased on EPA mileage ratings. Use...,Gasoline,8-Speed Automatic,3.0L V6 24V GDI DOHC Turbo,WP1AA2AY9LDA04075,323028-1,"52,908 mi.",None reported,Yes,Yes,"$56,777"
992,2020 Porsche Panamera 4S,/vehicledetail/6547454b-d5e9-4f02-b03e-d866a5e...,Black,Black,All-wheel Drive,18–26\n\n\n\nBased on EPA mileage ratings. Use...,Gasoline,Automatic,"2.9L V-6 gasoline direct injection, DOHC, vari...",WP0AB2A75LL140652,40025T,"6,265 mi.",None reported,Yes,Yes,"$92,998"
993,2003 Mercedes-Benz C-Class 4dr Sdn 2.6L,/vehicledetail/c8831633-3eb1-4e21-94de-50647ff...,Leather Upholstery,Leather Upholstery,Rear-wheel Drive,19–26\n\n\n\nBased on EPA mileage ratings. Use...,Gasoline,Automatic,2.6L V-6 engine with 168HP,WDBRF61J13F354688,H212030A,"206,080 mi.",None reported,No,Yes,"$6,250"
994,2023 Porsche Macan Base (PDK),/vehicledetail/077befc1-560c-450a-b04d-38aa7f8...,White,Agate Grey,All-wheel Drive,–,Gasoline,Automatic,2.0L I4 16V GDI DOHC Turbo,WP1AA2A53PLB01192,L00214,"6,171 mi.",None reported,Yes,No,"$64,500"


### 3. Сохраним таблицу в файл

In [13]:
#cars.to_csv('cars_raw_data.csv', index=False)

In [14]:
#cars_df.to_csv('cars_df.csv', index=False)