# Web scraper de información de vehículos usados

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm import tqdm, trange
import time

## Mercado Libre

In [None]:
def _parse_mercadolibre_listing(soup):
    """Extract the price and spec-table fields from a parsed Mercado Libre listing.

    Raises AttributeError, TypeError, KeyError or ValueError when the page
    does not contain the expected markup or the numeric fields cannot be
    converted.
    """
    price_text = (
        soup.find('div', class_='ui-pdp-price__second-line')
        .find('span', class_='andes-visually-hidden')
        .text.split(' ')[0]
    )
    car_info = {'Precio': int(price_text)}

    # Every row of the spec table becomes one key/value pair.
    table = soup.find('table', class_='andes-table')
    for tr in table.find_all('tr'):
        car_info[tr.find('th').text] = tr.find('td').text

    # Strip the unit and any '.' thousands separator (the other scrapers in
    # this notebook do the same) before converting to int.
    car_info['Kilómetros'] = int(car_info['Kilómetros'].replace('km', '').replace('.', ''))
    car_info['Año'] = int(car_info['Año'])
    return car_info


def get_car_info(url, max_retries=5, retry_delay=1.0, timeout=30):
    """Download a Mercado Libre listing and return its data as a dict.

    The page is fetched and parsed up to ``max_retries + 1`` times, because a
    response may occasionally lack the expected markup. The previous version
    retried through unbounded recursion behind a bare ``except``, which
    hammered the site and eventually died with a RecursionError on listings
    that never parse.

    Args:
        url: Address of the listing page.
        max_retries: Extra attempts made after the first failure.
        retry_delay: Seconds to wait between attempts.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        dict with 'Precio', 'Kilómetros' and 'Año' as ints plus every other
        spec-table row as strings.

    Raises:
        RuntimeError: if every attempt failed; the last error is chained.
    """
    last_error = None
    for attempt in range(max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(response.text, 'lxml')
            return _parse_mercadolibre_listing(soup)
        except (requests.RequestException, AttributeError, TypeError, KeyError, ValueError) as error:
            last_error = error
            if attempt < max_retries:
                time.sleep(retry_delay)
    raise RuntimeError(f'Could not get car info from url: {url}') from last_error

In [None]:
# Download and parse the Mercado Libre cars landing page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# be parsed as if it were the listing.
response = requests.get('https://carros.mercadolibre.com.co/', timeout=30)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'lxml')

In [None]:
# The last space-separated token of the pagination label is taken as the
# total number of result pages.
page_count_label = soup.find('li', class_='andes-pagination__page-count').text
num_pages = int(page_count_label.split(' ')[-1])
num_pages

In [None]:
# Scrape every Mercado Libre results page into `df`.
#
# Result pages are addressed by the 1-based index of their first item
# (`_Desde_49` is the second page, using the 48-item step of the original
# offset arithmetic). The previous loop started at offset 49, so it skipped
# the first page and requested one page past the end.
ITEMS_PER_PAGE = 48
BASE_URL = 'https://carros.mercadolibre.com.co/'

rows = []
try:
    for page in trange(num_pages):
        # The first page is the plain base URL; later pages carry the offset.
        page_url = BASE_URL if page == 0 else f'{BASE_URL}_Desde_{page * ITEMS_PER_PAGE + 1}'
        page_soup = BeautifulSoup(requests.get(page_url, timeout=30).text, 'lxml')
        cars = page_soup.find_all('div', class_='ui-search-result__wrapper shops__result-wrapper')
        for car in cars:
            link = car.find('a', class_='ui-search-link')
            if link is None or not link.get('href'):
                continue  # result card without a usable link
            car_url = link['href']
            try:
                car_info = get_car_info(car_url)
            except Exception as e:
                # One unreadable listing should not abort the whole scrape.
                print('Error trying to get car info from url: ', car_url)
                print(e)
                continue
            if car_info:
                rows.append(car_info)
finally:
    # Build the frame once (appending with pd.concat per row is quadratic) and
    # do it even when the scrape is interrupted, so partial results are kept.
    df = pd.DataFrame(rows)
    df.reset_index(drop=True, inplace=True)
    df.rename(columns={'Tipo de combustible':'Combustible', 'Kilómetros':'Kilometraje'}, inplace=True)

In [None]:
# Display the DataFrame built from the Mercado Libre listings.
df

## Vende tu Nave

In [None]:
from selenium import webdriver 
from selenium.webdriver import Chrome 
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By
import numpy as np

In [390]:
def start_chrome_driver():
    """Create and return a headless Chrome WebDriver.

    The browser is configured with the 'none' page-load strategy, a set of
    content-setting preferences and a 5-second implicit wait.
    """
    # NOTE(review): the value 2 presumably means "block" for each content
    # type listed below -- confirm against Chrome's preference reference.
    content_prefs = {
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.stylesheets": 2,
        "profile.managed_default_content_settings.javascript": 2,
        "profile.managed_default_content_settings.cookies": 2,
        "profile.managed_default_content_settings.geolocation": 2,
        "profile.default_content_setting_values.notifications": 2,
    }

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.page_load_strategy = 'none'
    chrome_options.experimental_options["prefs"] = content_prefs

    browser = Chrome(options=chrome_options)
    browser.implicitly_wait(5)
    return browser


In [299]:
def get_car_info(url, timeout=30):
    """Download a Vende tu Nave listing and return its data as a dict.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        dict with 'Precio', 'Kilometraje' and 'Año' as ints, 'Motor' (the
        'Cilindraje' value without 'cc') and the remaining description fields
        as strings; or None when the page cannot be fetched or parsed (the
        error is printed).
    """
    # Description fields that are not wanted in the final dataset.
    dropped_fields = ('Estado', 'Tipo precio', 'Último dígito de placa', 'Blindaje', 'Peritaje')
    try:
        # The request lives inside the try so a network failure is reported
        # like any other bad listing instead of aborting the whole scrape.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')

        car_info = {}
        price_text = soup.find('h2', class_='ui header').text
        car_info['Precio'] = int(price_text.split(' ')[1].replace('.', ''))

        # Each row pairs <h3> labels with <p> values.
        rows = soup.find('div', id='desc-desk').find_all('div', class_='two column row')
        for row in rows:
            for key, value in zip(row.find_all('h3'), row.find_all('p')):
                car_info[key.text.replace(':', '')] = value.text

        car_info['Kilometraje'] = int(car_info['Kilometraje'].replace('km', '').replace('.', ''))
        car_info['Motor'] = car_info.pop('Cilindraje').replace('cc', '')
        car_info['Año'] = int(car_info['Año'])

        # pop() with a default: a listing that lacks one of these optional
        # fields is still kept instead of being discarded by a KeyError.
        for field in dropped_fields:
            car_info.pop(field, None)

        return car_info
    except Exception as e:
        print('Error trying to get car info from url: ', url)
        print(e)
        return None
        

In [257]:
# Start a browser session and open the used-car results at page 999.
# NOTE(review): page=999 presumably lands past the last page so that the
# pagination widget exposes the real page count -- confirm.
driver = start_chrome_driver()
url = "https://www.vendetunave.co/vehiculos/carrosycamionetas?&estado=Usado&page=999" 
driver.get(url) 
# time.sleep(0.5)

In [255]:
# The second-to-last link of the pagination menu carries the page count in
# its 'value' attribute.
pagination_menu = driver.find_element(By.CLASS_NAME, 'ui.pagination.pointing.secondary.menu')
page_links = pagination_menu.find_elements(By.TAG_NAME, 'a')
num_pages = int(page_links[-2].get_attribute('value'))
num_pages

113

In [None]:
# Visit every Vende tu Nave results page and append its listings to the
# existing `df`.
new_rows = []
try:
    for page in trange(1, num_pages + 1):
        page_url = f'https://www.vendetunave.co/vehiculos/carrosycamionetas?&estado=Usado&page={page}'
        driver.get(page_url)
        time.sleep(1)
        for card in driver.find_elements(By.CLASS_NAME, 'ui.card'):
            car_url = card.get_attribute('href')
            if not car_url:
                continue  # card without a link: nothing to download
            car_info = get_car_info(car_url)
            if car_info:
                new_rows.append(car_info)
finally:
    # Concatenate once (per-row pd.concat is quadratic) and also when the loop
    # is interrupted, so listings scraped so far are not lost.
    # ignore_index gives the combined frame a clean 0..n-1 index.
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

df

## CarroYa

In [367]:
# Substrings that mark a title whose brand spans two words.
_TWO_WORD_BRAND_HINTS = ('Mercedes', 'Land Rover', 'Mini', 'Alfa', 'Aston')


def _split_brand_model(title):
    """Split a listing title into (brand, model).

    The brand is the first word, or the first two words when the title
    contains one of the two-word brand hints. The model is returned without
    surrounding whitespace. A title too short to split is returned whole as
    the brand with an empty model.
    """
    brand_words = 2 if any(hint in title for hint in _TWO_WORD_BRAND_HINTS) else 1
    split_at = -1
    for _ in range(brand_words):
        split_at = title.find(' ', split_at + 1)
        if split_at == -1:
            # Not enough words: avoid slicing with -1, which would cut off
            # the last character of the title.
            return title, ''
    return title[:split_at], title[split_at:].strip()


def get_car_info(url, driver, wait_seconds=3):
    """Open a CarroYa listing in `driver` and return its data as a dict.

    Args:
        url: Address of the listing page.
        driver: Selenium WebDriver used to load the page; the caller owns it
            and is responsible for closing it.
        wait_seconds: Pause after navigation before the page is queried.

    Returns:
        dict with 'Precio', 'Año' and 'Kilómetros' as ints, 'Marca', 'Modelo'
        and 'Versión' as strings, and 'Color', 'Combustible', 'Puertas',
        'Transmisión' and 'Motor' taken from the feature list (np.nan when
        absent); or None when the page cannot be loaded or parsed (the error
        is printed).
    """
    try:
        # Navigation is inside the try so a failed page load is reported like
        # any other bad listing instead of aborting the scrape.
        driver.get(url)
        time.sleep(wait_seconds)

        # Collect the label/value feature pairs shown on the page.
        features = {}
        for feature in driver.find_elements(By.CSS_SELECTOR, "div[class='feature']"):
            label = feature.find_element(By.TAG_NAME, 'h5').text
            features[label] = feature.find_element(By.TAG_NAME, 'h4').text

        car_info = {}
        price_text = driver.find_element(By.ID, 'priceInfo').text
        car_info['Precio'] = int(price_text.replace('$', '').replace('.', ''))

        title = driver.find_element(By.CSS_SELECTOR, "h1[class='title text']").text
        car_info['Marca'], car_info['Modelo'] = _split_brand_model(title)

        car_info['Año'] = int(driver.find_element(By.CSS_SELECTOR, "h3[class='h3P year']").text)
        car_info['Versión'] = driver.find_element(By.CSS_SELECTOR, "h3[class='h3P text']").text
        car_info['Color'] = features.get('COLOR', np.nan)
        car_info['Combustible'] = features.get('COMBUSTIBLE', np.nan)
        car_info['Puertas'] = features.get('PUERTAS', np.nan)
        car_info['Transmisión'] = features.get('TIPO DE CAJA', np.nan)
        car_info['Motor'] = features.get('CILINDRAJE', np.nan)

        km_text = driver.find_element(By.CSS_SELECTOR, "h3[class='h3P kilometers']").text
        car_info['Kilómetros'] = int(km_text.replace('km', '').replace('.', ''))

        return car_info

    except Exception as e:
        print('Error trying to get car info from url: ', url)
        print(e)
        return None

In [385]:
# Start a new browser session and open the first CarroYa results page.
# NOTE(review): the one-second pause presumably gives the page time to render
# before the next cell queries it -- confirm it is long enough.
driver = start_chrome_driver()
url = "https://www.carroya.com/automoviles-y-camionetas/usado?page=1" 
driver.get(url)
time.sleep(1)

In [378]:
# The second-to-last pagination item holds the last page number; the scrape is
# capped at 200 pages.
pagination = driver.find_element(By.CSS_SELECTOR, "ul[class='ant-pagination mini hidePrevArrow']")
pagination_items = pagination.find_elements(By.TAG_NAME, 'li')
last_page_number = int(pagination_items[-2].text)
num_pages = min(last_page_number, 200)
num_pages

200

In [None]:
# Scrape every CarroYa results page into a fresh `df`.
#
# As before, a new browser is started for each results page and for each
# listing, but every browser is now quit as soon as it is no longer needed.
# Previously none of them was ever closed, leaking one Chrome process per page
# and per car.
rows = []
try:
    for page in trange(1, num_pages + 1):
        page_driver = start_chrome_driver()
        try:
            page_driver.get(f'https://www.carroya.com/automoviles-y-camionetas/usado?page={page}')
            time.sleep(1)
            cards = page_driver.find_elements(By.CLASS_NAME, 'contentCurrentCard')
            # Read the links while the page's browser is still open.
            car_urls = [card.find_element(By.TAG_NAME, 'a').get_attribute('href') for card in cards]
        finally:
            page_driver.quit()

        for car_url in car_urls:
            if not car_url:
                continue  # card without a link: nothing to open
            detail_driver = start_chrome_driver()
            try:
                car_info = get_car_info(car_url, detail_driver)
            finally:
                detail_driver.quit()
            if car_info:
                rows.append(car_info)
finally:
    # Build the frame once (per-row pd.concat is quadratic) and also when the
    # scrape is interrupted, so listings gathered so far are kept.
    df = pd.DataFrame(rows)

In [394]:
# Renumber the rows 0..n-1, discarding the old index, then display the result.
df = df.reset_index(drop=True)
df

Unnamed: 0,Precio,Marca,Modelo,Año,Versión,Color,Combustible,Puertas,Transmisión,Motor,Kilómetros
0,34500000,Chevrolet,Spark,2015,GT Mecanico Full Equipo,Negro,Gasolina,,Mecánica,1.200,56900
1,18200000,Nissan,Sentra,2007,1.6 16V,Gris,Gasolina,,Mecánica,1.600,125000
2,63500000,Volvo,XC 60,2012,2.0 Turbo,Negro,Gasolina,,Automática,2.000,99500
3,105000000,Toyota,Land Cruiser,2001,100 V8 Imperial Automatica,Gris,Gasolina,,Automática,4.700,193500
4,92000000,Nissan,Kicks,2019,Exclusive Aut,Plateado,Gasolina,5,Automática,1.600,35000
...,...,...,...,...,...,...,...,...,...,...,...
276,32000000,Chevrolet,Sail,2017,1.4 LS Mecanico Aire Acondicionado,Gris,Gasolina,4,Mecánica,1.400,77000
277,42000000,Volkswagen,Gol Trendline,2017,,Negro,Gasolina,,Mecánica,1.600,35000
278,30900000,Chevrolet,Spark,2012,1.2 GT LT Mecanico,Plateado,Gasolina,,Mecánica,1.206,90167
279,190900000,Land Rover,Discovery,2019,Sport S 2.0 Suv Automatico,Plateado,Gasolina,,Automática,2.000,49328
