# Web scraper de información de vehículos usados

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm import tqdm, trange
import time

# Mercado Libre

In [None]:
def get_car_info(url, max_retries=3, retry_delay=1.0, timeout=30):
    """Scrape the price and the spec table of one Mercado Libre car listing.

    Args:
        url: Address of the listing page.
        max_retries: Maximum number of download/parse attempts before
            giving up on the listing.
        retry_delay: Seconds to wait between two consecutive attempts.
        timeout: Seconds to wait for the HTTP response on each attempt.

    Returns:
        A dict with an integer ``'Precio'``, an integer ``'Kilómetros'`` and
        one string entry per row of the listing's spec table, or ``None``
        when every attempt failed.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(response.text, 'lxml')

            car_info = {}
            price_text = (
                soup.find('div', class_='ui-pdp-price__second-line')
                .find('span', class_='andes-visually-hidden')
                .text
            )
            # The first space-separated token of the hidden label is the amount.
            car_info['Precio'] = int(price_text.split(' ')[0])

            # Each spec-table row holds the attribute name (th) and value (td).
            table = soup.find('table', class_='andes-table')
            for tr in table.find_all('tr'):
                car_info[tr.find('th').text] = tr.find('td').text

            # Drop the unit and any '.' thousands separators before converting.
            car_info['Kilómetros'] = int(
                car_info['Kilómetros'].replace('km', '').replace('.', '')
            )
            return car_info
        except (requests.RequestException, AttributeError, KeyError,
                IndexError, ValueError) as e:
            # A missing element surfaces as AttributeError (None.find/.text),
            # a missing spec row as KeyError, a malformed number as ValueError.
            last_error = e
            if attempt < max_retries:
                time.sleep(retry_delay)

    # Bounded retries: report and give up instead of recursing without limit.
    print('Error trying to get car info from url: ', url)
    print(last_error)
    return None

In [None]:
# Download the Mercado Libre cars landing page. The timeout keeps the cell
# from hanging on a stalled connection, and raise_for_status() surfaces an
# HTTP error here instead of letting an error page be parsed silently.
response = requests.get('https://carros.mercadolibre.com.co/', timeout=30)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'lxml')

In [None]:
# The page-count item's text ends with the total number of result pages;
# take the last space-separated token and convert it to an integer.
page_count_item = soup.find('li', class_='andes-pagination__page-count')
page_count_tokens = page_count_item.text.split(' ')
num_pages = int(page_count_tokens[-1])
num_pages

In [None]:
# Crawl every results page and collect one record (dict) per listing.
# Records are accumulated in a list and turned into a DataFrame once at the
# end, which avoids re-copying the whole frame for every listing.
items_per_page = 48
records = []
for i in trange(1, num_pages + 1):
    # Page i starts at item (i - 1) * 48 + 1, so page 1 maps to _Desde_1 and
    # the last requested offset still falls inside the result set.
    offset = (i - 1) * items_per_page + 1
    try:
        html_text = requests.get(
            f'https://carros.mercadolibre.com.co/_Desde_{offset}', timeout=30
        ).text
    except requests.RequestException as e:
        # Best effort: report the failed page and carry on with the next one.
        print('Error trying to get results page with offset: ', offset)
        print(e)
        continue
    soup = BeautifulSoup(html_text, 'lxml')
    cars = soup.find_all('div', class_='ui-search-result__wrapper shops__result-wrapper')
    for car in cars:
        link = car.find('a', class_='ui-search-link')
        if link is None or not link.get('href'):
            # Result card without a usable listing link: nothing to scrape.
            continue
        url = link['href']
        car_info = get_car_info(url)
        if car_info:
            records.append(car_info)


# Building the frame from the list already yields a clean 0..n-1 index.
df = pd.DataFrame(records)
df.rename(columns={'Tipo de combustible':'Combustible', 'Kilómetros':'Kilometraje'}, inplace=True)

In [None]:
# Bare expression at the end of a notebook cell: displays the DataFrame
# holding the listings collected so far.
df

# Vende tu Nave

In [None]:
from selenium import webdriver 
from selenium.webdriver import Chrome 
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By 
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def get_car_info(url, timeout=30):
    """Scrape the price and the description attributes of one Vende tu Nave listing.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with an integer ``'Precio'``, an integer ``'Kilometraje'``, a
        ``'Motor'`` string (the listing's ``'Cilindraje'`` without the ``cc``
        unit) and the remaining description attributes as strings, or
        ``None`` when the page could not be downloaded or parsed.
    """
    try:
        # The download lives inside the try block so that a network failure or
        # an invalid url is reported and skipped like any other bad listing.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')

        car_info = {}
        # The second space-separated token of the header is the amount;
        # '.' is used as the thousands separator.
        price_token = soup.find('h2', class_='ui header').text.split(' ')[1]
        car_info['Precio'] = int(price_token.replace('.', ''))

        # Each description row pairs <h3> labels with <p> values, in order.
        rows = soup.find('div', id='desc-desk').find_all('div', class_='two column row')
        for row in rows:
            keys = row.find_all('h3')
            values = row.find_all('p')
            for key, value in zip(keys, values):
                car_info[key.text.replace(':','')] = value.text

        car_info['Kilometraje'] = int(car_info['Kilometraje'].replace('km', '').replace('.',''))
        car_info['Motor'] = car_info.pop('Cilindraje').replace('cc', '')

        # Discard attributes that are not kept in the dataset. pop() with a
        # default tolerates listings that omit some of them, so one missing
        # optional field no longer throws the whole listing away.
        for unwanted in ('Estado', 'Tipo precio', 'Último dígito de placa', 'Blindaje', 'Peritaje'):
            car_info.pop(unwanted, None)

        return car_info
    except Exception as e:
        # Deliberate best-effort boundary: log the failure and let the caller
        # skip this listing.
        print('Error trying to get car info from url: ', url)
        print(e)
        return None
        

In [None]:
# Resolve the chromedriver binary first; install() returns the path of the
# driver it downloaded, which is then wrapped in a Service object.
chrome_path = ChromeDriverManager().install()
chrome_service = Service(chrome_path)

# Browser configuration: run Chrome without a visible window and do not
# block on page loads. Selenium normally waits for all resources to finish
# downloading; that is not needed here because the page is also populated by
# its own running JavaScript.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.page_load_strategy = 'none'

# Start the browser with the service and options defined above, and set a
# 5-second implicit wait on the driver.
driver = Chrome(service=chrome_service, options=options)
driver.implicitly_wait(5)

In [None]:
# Open the used-cars results with a very high page number (999).
# NOTE(review): presumably this makes the pagination menu expose the number
# of the last page, which the next cell reads -- confirm against the site.
url = "https://www.vendetunave.co/vehiculos/carrosycamionetas?&estado=Usado&page=999" 
driver.get(url) 
# Optional pause after navigation, currently disabled:
# time.sleep(0.5)

In [None]:
# Read the total page count from the pagination menu: the `value` attribute
# of its second-to-last link is converted to an integer.
# NOTE(review): presumably the last link is a "next" control and the one
# before it is the final numbered page -- confirm against the site markup.
pagination_menu = driver.find_element(By.CLASS_NAME, 'ui.pagination.pointing.secondary.menu')
page_links = pagination_menu.find_elements(By.TAG_NAME, 'a')
num_pages = int(page_links[-2].get_attribute('value'))
num_pages

In [None]:
# Crawl every Vende tu Nave results page and append its listings to `df`.
# Records are gathered in a list and concatenated once at the end, instead of
# rebuilding the whole DataFrame for every single listing.
vtn_records = []
for i in trange(1,num_pages+1):
    url = f'https://www.vendetunave.co/vehiculos/carrosycamionetas?&estado=Usado&page={i}'
    driver.get(url)
    # time.sleep(0.5)
    # NOTE(review): the driver's page_load_strategy is 'none', so get() may
    # return before the new page has replaced the previous one -- confirm the
    # cards found here always belong to page i.
    cars = driver.find_elements(By.CLASS_NAME, 'ui.card')
    for car in cars:
        car_url = car.get_attribute('href')
        if not car_url:
            # Card without a link: there is no listing page to scrape.
            continue
        car_info = get_car_info(car_url)
        if car_info:
            vtn_records.append(car_info)

if vtn_records:
    # ignore_index gives the combined frame a clean 0..n-1 index instead of
    # repeating index 0 for every appended row.
    df = pd.concat([df, pd.DataFrame(vtn_records)], ignore_index=True)

df