In [2]:
import pandas as pd
import requests
from lxml import etree, html as lhtml
from tqdm import tqdm

from bs4 import BeautifulSoup
import re 

In [4]:
# Download the catalogue index page and parse it so its links can be walked.
req = requests.post("http://www.autonet.ru/auto/ttx")
tree = lhtml.fromstring(req.text)

# Each index link under /auto/ttx/ is fetched in turn; every link on that
# page which lies below the fetched path is stored as an absolute URL.
links = []
for _, _, link, _ in tqdm(list(tree.iterlinks())):
    if not link.startswith('/auto/ttx/'):
        continue
    car = lhtml.fromstring(requests.post(f'http://www.autonet.ru{link}').text)
    prefix = link + '/'
    links.extend(
        f'http://www.autonet.ru{car_link}'
        for _, _, car_link, _ in car.iterlinks()
        if car_link.startswith(prefix)
    )

# One row per collected URL, ordered by the URL string.
cars_urls = pd.DataFrame({'url': links}).sort_values(by='url')

100%|█████████████████████████████████████████| 239/239 [01:35<00:00,  2.51it/s]


In [75]:
def get_info(modification):
    """Collect the fields of one modification row into a flat dict.

    Parameters
    ----------
    modification
        Parsed row element offering ``find`` / ``find_all``. It must contain
        ``td`` cells with the classes ``mod``, ``edition``, ``carcass``,
        ``volume`` and ``power``, and the ``mod`` cell must hold a link.

    Returns
    -------
    dict
        Keys, in insertion order: ``name``, ``release_year`` (int),
        ``carcass``, ``doors`` (int), ``volume``, ``power`` and ``url_auto``.
        ``volume`` and ``power`` are ints, or the string ``'None'`` when the
        cell text contains no digits.
    """
    def cell_text(css_class):
        # Whitespace-trimmed text of the first <td> carrying ``css_class``.
        return modification.find('td', {'class': css_class}).text.strip()

    def leading_number(text):
        # First run of digits in ``text`` as an int; the string 'None' if absent.
        runs = re.findall('[0-9]+', text)
        return int(runs[0]) if runs else 'None'

    info = {'name': cell_text('mod')}

    # Take the first whitespace-separated token of the edition cell and keep
    # only what follows its last dot.
    first_token = cell_text('edition').split()[0]
    info['release_year'] = int(first_token.rsplit('.', 1)[-1])

    # First word of the carcass cell is the body label; all digits found in
    # the cell, concatenated, give the door count.
    body = cell_text('carcass')
    info['carcass'] = body.split()[0]
    info['doors'] = int(''.join(ch for ch in body if ch.isdigit()))

    info['volume'] = leading_number(cell_text('volume'))
    info['power'] = leading_number(cell_text('power'))

    # Absolute link built from every ``mod`` cell; the first one is kept.
    hrefs = [
        'http://www.autonet.ru' + cell.a['href']
        for cell in modification.find_all('td', {'class': 'mod'})
    ]
    info['url_auto'] = hrefs[0]

    return info

# Columns produced by get_info(); also used as the sort key below.
info_columns = ['name', 'release_year', 'carcass', 'doors', 'volume', 'power', 'url_auto']

for link in tqdm(cars_urls['url'].unique()):
    if str(link) == 'None':
        continue

    req = requests.post(link)
    soup = BeautifulSoup(req.text, 'html')

    # All "mod-list" blocks except the last one, plus the blocks whose class
    # attribute is exactly "mod-list bt-null".
    mod_tables = (soup.find_all("div", {'class': 'mod-list'})[:-1]
                  + soup.find_all("div", {'class': 'mod-list bt-null'}))

    # The first <tr> of every block is skipped; the rest go through get_info().
    scrapped_info = [
        get_info(modification)
        for mark in mod_tables
        for modification in mark.find_all('tr')[1:]
    ]
    if not scrapped_info:
        # Nothing scraped for this page: its rows keep missing values.
        continue

    # Build and sort once per page rather than once per matching row.
    df = pd.DataFrame(scrapped_info).sort_values(by=info_columns)

    # Positional first row of the *sorted* frame. `.loc[0]` would select the
    # row labelled 0, i.e. the first scraped row, making the sort a no-op.
    first = df.iloc[0]

    for idx in cars_urls[cars_urls['url'] == link].index:
        for k, v in first.items():
            cars_urls.loc[idx, k] = v

# Explicit copy so that later in-place edits of `sub` act on an independent
# frame and do not trigger pandas' SettingWithCopyWarning.
sub = cars_urls[['url'] + info_columns].copy()

100%|███████████████████████████████████████| 1820/1820 [33:16<00:00,  1.10s/it]


In [77]:
def _leading_name_words(row):
    """Return the first N words of ``row['name']`` joined by single spaces.

    N is the number of underscore-separated parts in the second-to-last
    slash-separated segment of ``row['url']``.
    """
    segment = row['url'].split('/')[-2]
    word_count = len(segment.split('_'))
    return ' '.join(row['name'].split()[:word_count])


# Discard rows with any missing value, then derive the `model` column row by row.
sub.dropna(inplace=True)
sub['model'] = sub.apply(_leading_name_words, axis=1)

In [78]:
# Renumber the rows 0..n-1 and discard the old index labels (drop=True),
# modifying `sub` in place.
sub.reset_index(drop=True, inplace=True)

In [18]:
# Placeholder string this notebook uses for a value that could not be scraped.
missing = 'None'

full_characteristics = []
image_auto = []
description_auto = []

for link in tqdm(sub['url_auto'].values):
    # Start from placeholders so that exactly one value per list is appended
    # for every row of `sub`, even when the page cannot be fetched. Skipping
    # a row would shift all later values onto the wrong rows.
    characteristics_url = image_url = description = missing

    try:
        soup = BeautifulSoup(requests.post(link).text, 'html')
    except (requests.exceptions.ConnectionError, ConnectionError):
        # requests raises its own ConnectionError class, which is not a
        # subclass of the builtin one, so both are listed.
        soup = None

    if soup is not None:
        characteristics_url = 'http://www.autonet.ru' + \
            soup.find('div', {'class': 'mod-characteristics-over'}).img['src']

        # The slider block or its <img> may be absent; keep the placeholder then.
        slider = soup.find('div', {'id': 'slider'})
        slider_img = slider.img if slider is not None else None
        if slider_img is not None:
            image_url = slider_img['src']

        # str(<p>...</p>)[3:-4] strips the enclosing "<p>" and "</p>".
        description = str(soup.find('div', {'class': 'mod-description'}).p)[3:-4]

    full_characteristics.append(characteristics_url)
    image_auto.append(image_url)
    description_auto.append(description)

# The lists have one entry per row of `sub`, in row order, so they are
# assigned positionally instead of being aligned on index labels.
sub['full_characteristics_url'] = full_characteristics
sub['image_auto_url'] = image_auto
sub['desription_auto'] = description_auto

100%|███████████████████████████████████████| 1799/1799 [35:21<00:00,  1.18s/it]


In [23]:
# Persist the scraped table. The frame built in this notebook is `sub`; the
# previously referenced name `subdata` is not defined anywhere in it and
# fails with NameError on a fresh kernel.
# NOTE(review): if `subdata` was meant to be a filtered variant of `sub`,
# restore the cell that created it instead.
sub.to_csv('database.csv', index=False)