In [22]:
import re
import json
import requests
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

In [23]:
def _get_listing_id(soup):
    script_tag = soup.find('script', string=re.compile(r'var infoVeiculo'))
    match = re.search(r'var infoVeiculo = ({.*?});', script_tag.string)
    return json.loads(match.group(1))['id']

def _get_general_features(soup):
    text = soup.find("div", class_="d-flex flex-wrap w-100 mt-3").p.get_text(strip=True)
    return_list = [feature.strip() for feature in text.split("|")]
    return_list = (return_list + [np.nan] * 5)[:5]
    
    return return_list

def _get_header_info(soup):
    h1 = soup.find("h1", class_="text-uppercase desktop")

    maker = h1.contents[0].strip()
    model = h1.find("span").get_text(strip=True)
    other_info = h1.find("span", class_="gray").get_text(strip=True)
    other_info = other_info.replace("\xa0", " ")

    other_info = [feature.strip() for feature in other_info.split(' ')]
    return_list = [maker, model]
    return_list = return_list + other_info

    return_list = (return_list + [np.nan] * 5)[:5]

    return return_list

def _get_li_features(soup):
    features = soup.find_all("li", class_="list-style-none mb-3")
    return [feature.get_text(strip=True) for feature in features]

def _get_seller_description(soup):
    text = soup.find("p", itemprop="description")
    return text.get_text(separator="\n", strip=True).replace("\n", " ") if text else np.nan
    
def _get_image(soup):
    text = soup.find('meta', {'property': 'og:image'})
    return text.get('content') if text else np.nan
    
def _get_city(soup):
    text = soup.find("p", class_="mb-0 mt-3 d-flex align-items-baseline")
    return text.get_text(strip=True) if text else np.nan

def _get_price(soup):
    text = soup.find('p', class_='mb-0 pr-2 text-color-1')
    return text.get_text(strip=True) if text else np.nan

def _get_year(soup):
    text = soup.find('p', class_='mb-0 px-2 meio')
    return text.get_text(strip=True) if text else np.nan

def get_car_features(url):
    response = requests.get(url) 
    soup = BeautifulSoup(response.text, 'html.parser')

    general_features = _get_general_features(soup)
    header_info = _get_header_info(soup)
    li_features = _get_li_features(soup)
    seller_description = _get_seller_description(soup)
    image = _get_image(soup)
    year = _get_year(soup)
    city = _get_city(soup)
    price = _get_price(soup)

    features_dict = {
        'id': _get_listing_id(soup),
        'title': soup.title.string,
        'seller_description': seller_description,
        'link': soup.find('meta', {'property': 'og:url'}).get('content'), 
        'image': image,
        'maker': header_info[0],
        'model': header_info[1],
        'year': year,
        'engine': header_info[2],
        'valves': header_info[3],
        'transmission': general_features[0],
        'fuel_type': general_features[1],
        'body_type': general_features[2],
        'doors': header_info[4],
        'color': general_features[3],
        'mileage': general_features[4],
        'other_fatures': li_features,
        'city': city,
        'price': price,
    }

    return pd.DataFrame([features_dict])


In [24]:
# url = 'https://carrosp.com.br/comprar/sedan/hyundai/hb-20-sedan/1.0-12v-4p-flex-vision/2022/7241455/'
# get_car_features(url)

# url = 'https://carrosp.com.br/comprar/sedan/toyota/yaris-sedan/1.5-16v-4p-flex-xl/2023/7241914/'
# get_car_features(url)['image'].iloc[0]

In [25]:
# url = 'https://carrosp.com.br/carros/todos/?revendedor=0&revendedor=S&particular=0&particular=S&tipo_id=1&marca_id=&ano1=&ano2=&zero=0&zero=S&usado=0&usado=S&kmIni=&kmFim=&precoIni=3000&precoFim=150000&idForm=formBuscaVeiculo&id=&cor_id=&combustivel_id=&distancia=100&cidadeNome=&cidade_id=&nocidade=1&ordem=preco2&'
# response = requests.get(url) 
# pd.DataFrame({'html': [response.text]}).to_clipboard()

In [26]:
url = 'https://carrosp.com.br/carros/'
response = requests.get(url) 
soup = BeautifulSoup(response.text, 'html.parser')

car_listings = []
for a_tag in soup.find_all("a", class_="titulo novajanela mb-1"):
    href = a_tag.get("href")
    if href:
        car_listings.append(href)

car_listings = list(set(car_listings))

In [None]:
df = pd.DataFrame()

for listing in car_listings:
   aux = get_car_features(listing) 
   df = pd.concat([df, aux], axis=0)

In [70]:
def _extract_id(url):
    # Extract numeric ID at the end of the URL using regex
    match = re.search(r'/(\d+)/$', url)
    return int(match.group(1)) if match else None

def _get_unique_items(input_list):
    unique_items = []
    for item in input_list:
        if item not in unique_items:
            unique_items.append(item)
    return unique_items

In [71]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time

driver = webdriver.Chrome()
#url = 'https://carrosp.com.br/carros/'

price_range = [0, 10000, 20000, 30000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 
               80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 300000]

#for i in range(len(price_range) - 1):
for i in range(2):
    print(f"Price Range: R$ {price_range[i]} - R$ {price_range[i+1]}")

    # Fix URL an open it
    url = f"https://carrosp.com.br/carros/todos/?revendedor=0&revendedor=S&particular=0&particular=S&tipo_id=1&marca_id=&ano1=&ano2=&zero=0&zero=S&usado=0&usado=S&kmIni=&kmFim=&precoIni={price_range[i]}&precoFim={price_range[i+1]}&idForm=formBuscaVeiculo&id=&cor_id=&combustivel_id=&distancia=100&cidadeNome=&cidade_id=&nocidade=1&"
    driver.get(url)
    car_listings = set()
    car_ids = set()
    has_new_listings = True

    while has_new_listings:
        # Scroll down to load listings
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.execute_script("window.scrollBy(0, -1100);")

        # WebDriverWait(driver, 10).until(
        #              EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.titulo.novajanela.mb-1"))
        #                                )  # Wait for dynamic content to load
        time.sleep(3)

        # Get listings URL
        new_listings = np.nan
        new_listings = []
        print(f"new_listings: {new_listings}")
        elements = driver.find_elements(By.CSS_SELECTOR, "a.titulo.novajanela.mb-1")
        
        for element in elements:
            href = element.get_attribute("href")
            if href:
                new_listings.append(href)

        new_listings = _get_unique_items(new_listings)[-21:]
        new_listings = set(new_listings)
        new_listings_ids = set([_extract_id(listing) for listing in new_listings])

        # Check if there are new listings
        if any(item not in car_ids for item in new_listings_ids):
            # Append only new elements to the bigger list
            car_listings.update(new_listings)
            car_ids.update(new_listings_ids)
        else:
            # Break the loop if no new listings are found
            has_new_listings = False

    # Writes listings url
    with open("listings.txt", "a") as file:
        for url in car_listings:
            file.write(url + "\n")

driver.quit()   

Price Range: R$ 0 - R$10000
new_listings: []
new_listings: []
Price Range: R$ 10000 - R$20000
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []
new_listings: []


In [65]:
len(car_ids)

59

In [66]:
len(new_listings_ids)

21

In [40]:
any(item not in car_ids for item in new_listings_ids)

False

In [67]:
a = [1, 1, 3, 5, 6, 2, 2, 2, 8, 8, 8, 8, 8]
set(a)

{1, 2, 3, 5, 6, 8}