<h1>Scraping and analyzing a page with public real estate data from Uberaba, Minas Gerais, Brazil</h1>
<p><b>Public legal data - for academic purposes</b></p>

In [11]:
#Using builtwith to identificate technologies the website is built with
!pip install builtwith
import builtwith

Collecting builtwith
  Downloading builtwith-1.3.4.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: builtwith
  Building wheel for builtwith (setup.py) ... [?25l[?25hdone
  Created wheel for builtwith: filename=builtwith-1.3.4-py3-none-any.whl size=36077 sha256=2dd9b1c9c3b73a9e29f9dc4953b4ff483f216a82238cc1c6668c2bd17b5a0e66
  Stored in directory: /root/.cache/pip/wheels/7f/2d/b2/606e3df914d4aeeab99c4a4e3e9a61673d2293c2e346db00c8
Successfully built builtwith
Installing collected packages: builtwith
Successfully installed builtwith-1.3.4


In [None]:
# Run builtwith against the target website - `.parse()` returns the technologies used by the site
url_alvo = 'https://imobiliariachavedeouro.com.br/imoveis/'
builtwith.parse(url_alvo)

In [None]:
#whois library - to identificate the website's owner(s)
!pip install python-whois
import whois

In [None]:
# Look up the WHOIS record - it returns rich information about the website we want to scrape
registro_whois = whois.whois('https://imobiliariachavedeouro.com.br/imoveis/')
print(registro_whois)

In [None]:
# Inspect the raw HTML of the main listings page.
# `requests` and `headers` are set up inside this cell so that it also runs on a
# fresh kernel (they are otherwise only defined by cells further down the notebook).
import requests

# Minimal browser-like headers for this exploratory request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8",
}

# A timeout keeps the cell from hanging forever if the server never answers
response = requests.get('https://imobiliariachavedeouro.com.br/imoveis/', headers=headers, timeout=15)
print(response.status_code)
html_bruto = response.text
print(html_bruto)

# Analysis of the website we want to scrape, based on the HTML source of the main page

When we look at the HTML returned by the main page, we notice something important.

There is a small piece of JavaScript inside which tells us that the website fetches its content with AJAX.

This already tells us a lot.

The main page itself does not really contain the data.  
It only has the layout, the structure, and a button.

The real data is not here.

This page is only a door.  
Behind this door, the website uses AJAX to fetch the real content from another place.

That was very, very tough to find.

In [None]:
# Another exploratory step: checking the structure of a single listing page picked from the list.
# `requests` and `headers` are set up inside this cell so that it also runs on a
# fresh kernel (they are otherwise only defined by cells further down the notebook).
import requests

# Minimal browser-like headers for this exploratory request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8",
}

# A timeout keeps the cell from hanging forever if the server never answers
resposta = requests.get('https://imobiliariachavedeouro.com.br/imovel/casa-a-venda-no-condominio-mario-franco-em-uberaba-mg/6618/', headers=headers, timeout=15)
print(resposta.status_code)
html_bruto = resposta.text
print(html_bruto)

In [1]:
# Pool of browser-like header sets; when a request comes back with 403 the scraper
# retries with a header set drawn from this pool. The values were collected from the internet.
# Every entry shares the same Referer / X-Requested-With pair and only differs in
# User-Agent, Accept and Accept-Language, so the pool is built from one row per browser.
HEADERS_POOL = [
    {
        "User-Agent": user_agent,
        "Accept": accept,
        "Accept-Language": accept_language,
        "Referer": "https://imobiliariachavedeouro.com.br/imoveis/",
        "X-Requested-With": "XMLHttpRequest",
    }
    for user_agent, accept, accept_language in (
        # chrome windows
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36",
            "application/json, text/javascript, */*; q=0.01",
            "pt-BR,pt;q=0.9,en-US;q=0.8",
        ),
        # chrome linux
        (
            "Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36",
            "*/*",
            "pt-BR,pt;q=0.9,en-US;q=0.8",
        ),
        # firefox windows
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
            "Gecko/20100101 Firefox/121.0",
            "application/json, text/javascript, */*; q=0.01",
            "pt-BR,pt;q=0.9,en-US;q=0.8",
        ),
        # firefox linux
        (
            "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) "
            "Gecko/20100101 Firefox/120.0",
            "application/json, text/javascript, */*; q=0.01",
            "pt-BR,pt;q=0.9",
        ),
        # edge windows
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36 "
            "Edg/122.0.0.0",
            "*/*",
            "pt-BR,pt;q=0.9,en-US;q=0.8",
        ),
        # chrome macOS
        (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/121.0.0.0 Safari/537.36",
            "application/json, text/javascript, */*; q=0.01",
            "pt-BR,pt;q=0.9,en-US;q=0.8",
        ),
    )
]


In [2]:
from bs4 import BeautifulSoup
import requests
import random
import time

MAX_RETRIES = 5  # maximum request attempts per URL; each attempt uses a header set drawn from HEADERS_POOL

def _parse_br_number(texto):
    """Convert a Brazilian-formatted number such as "1.880,50 m²" into a float.

    The "m²" unit is removed first. When the text contains both "." and ",",
    the right-most one is taken as the decimal mark and the other one is dropped
    as a thousands separator. A lone "," is treated as the decimal mark and a
    lone "." is left untouched.

    Raises:
        ValueError: if the cleaned text is still not a valid number.
    """
    limpo = texto.replace("m²", "").strip()
    ultimo_ponto = limpo.rfind(".")
    ultima_virgula = limpo.rfind(",")
    if ultimo_ponto != -1 and ultima_virgula != -1:
        if ultima_virgula > ultimo_ponto:
            limpo = limpo.replace(".", "")  # "1.880,50" -> "1880,50"
        else:
            limpo = limpo.replace(",", "")  # "1,880.50" -> "1880.50"
    return float(limpo.replace(",", "."))


def scrape_imovel(link, id_seq):
    """Download one property page and extract its main attributes.

    The page is requested up to ``MAX_RETRIES`` times. Each attempt uses a
    different header set from ``HEADERS_POOL`` (the pool is shuffled once and
    walked in order, wrapping around only if there are more attempts than
    header sets). Network errors and HTTP 403 trigger a new attempt after a
    random pause; any other non-200 status aborts immediately.

    Args:
        link: URL of the property page.
        id_seq: sequential identifier stored in the record and used in log messages.

    Returns:
        A dict with the keys ``id``, ``tipo``, ``bairro``, ``area_lote``,
        ``area_interna``, ``salas``, ``suites``, ``quartos``, ``vagas`` and
        ``preco`` (fields that could not be found or parsed stay ``None``),
        or ``None`` when the page could not be downloaded.
    """
    response = None

    # Shuffle once so that consecutive attempts never reuse the header set that just failed
    candidatos = random.sample(HEADERS_POOL, k=len(HEADERS_POOL))

    # if a header fails, it tries another one
    for attempt in range(MAX_RETRIES):
        headers = candidatos[attempt % len(candidatos)]

        try:
            r = requests.get(link, headers=headers, timeout=15)
        except requests.RequestException as e:
            print(f"request error {id_seq}: {e}")
            time.sleep(random.uniform(1.5, 3.5))
            continue

        if r.status_code == 200:
            response = r
            break
        elif r.status_code == 403:
            print(f"403 error {id_seq} — changing header ({attempt+1}/{MAX_RETRIES})") #changing header
            time.sleep(random.uniform(2.5, 5.0))
            continue
        else:
            print(f"HTTP {r.status_code} at {id_seq}")
            return None  # any other status aborts this link

    if response is None:
        print(f"Warning - persistent failings - aborting {id_seq}")
        return None

    soup = BeautifulSoup(response.text, "html.parser") #creating soup

    dados = {
        "id": id_seq,
        "tipo": None, #type
        "bairro": None, #suburb
        "area_lote": None, #lot area
        "area_interna": None, #internal area
        "salas": None, #rooms
        "suites": None, #suites
        "quartos": None, #bedrooms
        "vagas": None, #parking spaces
        "preco": None #price
    } #defining the columns of the dataset as a dictionary

    #real estate type: first word of the page title
    if soup.title:
        palavras_titulo = soup.title.get_text().split()
        if palavras_titulo:  # an empty <title> must not crash the scrape
            dados["tipo"] = palavras_titulo[0].lower()

    #main features
    for item in soup.select(".left-content--items .item"):
        titulo = item.select_one(".item-title")
        valor = item.select_one(".item-number")
        if not titulo or not valor:
            continue

        chave = titulo.get_text(strip=True).lower()
        val = valor.get_text(strip=True)

        # ValueError was very common in earlier runs, so unparsable values are
        # skipped and the field simply keeps its previous value
        if "bairro" in chave:
            dados["bairro"] = val
        elif "lote" in chave or "terreno" in chave:
            try:
                dados["area_lote"] = _parse_br_number(val)
            except ValueError:
                pass
        elif "área interna" in chave:
            try:
                dados["area_interna"] = _parse_br_number(val)
            except ValueError:
                pass
        elif "sala" in chave:
            try:
                dados["salas"] = int(val)
            except ValueError:
                pass
        elif "suíte" in chave:
            try:
                dados["suites"] = int(val)
            except ValueError:
                pass
        elif "quarto" in chave:
            try:
                dados["quartos"] = int(val)
            except ValueError:
                pass

    #the parking spaces are located in a different area of the HTML file, so they are collected separately
    for tag in soup.select(".info-title"):
        texto = tag.get_text(strip=True).lower()
        if "vaga" in texto:
            try:
                dados["vagas"] = int(texto.replace("vagas", "").replace("vaga", "").strip())
            except ValueError:
                pass
            break  # only the first element mentioning "vaga" is considered

    # price is also located in a specific area of the HTML
    price_div = soup.select_one(".form-price")
    if price_div:
        parte_inteira = price_div.select_one("span")
        centavos = price_div.select_one("strong")
        if parte_inteira and centavos:
            preco_str = parte_inteira.text + centavos.text
            try:
                # dots are dropped as thousands separators, the comma becomes the decimal point
                dados["preco"] = float(preco_str.replace(".", "").replace(",", "."))
            except ValueError:
                print(f"[AVISO] Preço inválido no imóvel {id_seq}: '{preco_str}'")
                dados["preco"] = None

    return dados


In [4]:
# This is the AJAX fetch found in the main page. For now no HTML page is scraped:
# the listing links are requested straight from the server through this endpoint,
# since the design of the page itself is complicated.
# `{page}` is filled in with the page number via `url_template.format(page=...)`.
url_template = "https://imobiliariachavedeouro.com.br/novo/wp-admin/admin-ajax.php?" + "&".join([
    "action=imoveis_listing_content",
    "page={page}",
    "finalidade=2",
    "codigo=false",
    "tipo=false",
    "bairros=false",
    "quartos=false",
    "ordenacao=false",
    "banheiros=false",
    "suites=false",
    "vagas=false",
    "valor_min=0",
    "valor_max=10000000",
    "area_min=false",
    "area_max=false",
])


In [6]:
# Collect the link of every listing by walking the paginated AJAX endpoint.
# The loop stops when a page cannot be fetched, is not a JSON object, has no
# cards, or adds no link that was not already collected (which guards against
# a server that keeps answering with the same cards forever).
all_links = []
links_vistos = set()  # links already collected, used to skip duplicates

page = 1
while True:
    print(f"Processing page {page}...")
    url = url_template.format(page=page)
    response = None

    # accessing the page with the header pool; the pool is shuffled once per page so
    # that consecutive attempts never reuse the header set that just failed
    candidatos = random.sample(HEADERS_POOL, k=len(HEADERS_POOL))
    for attempt in range(MAX_RETRIES):
        headers = candidatos[attempt % len(candidatos)]
        try:
            r = requests.get(url, headers=headers, timeout=15)
        except requests.RequestException as e:
            print(f"[AVISO] request error at page {page}: {e}")
            time.sleep(random.uniform(1.5, 3.5))
            continue

        if r.status_code == 200:
            response = r
            break
        elif r.status_code == 403:
            print(f"[WARNING] 403 at page {page} — changing header ({attempt+1}/{MAX_RETRIES})")
            time.sleep(random.uniform(2.5, 5.0))
            continue
        else:
            print(f"[WARNING] HTTP {r.status_code} at page {page}")
            response = None
            break

    if response is None:
        print(f"persistent fail at page {page}, finishing scraping.")
        break

    # processing the cards (with the links) from the main page;
    # always read from the validated `response`, never from the loop variable `r`
    try:
        data = response.json()
    except ValueError:
        print(f"non JSON reponse at the page {page}, finishing.")
        break

    if not isinstance(data, dict):
        print(f"unexpected JSON payload at page {page}, finishing.")
        break

    cards = data.get("cards", [])
    if not cards:
        print("no more cards - finishing")
        break

    novos_links = 0
    for card_html in cards:
        soup = BeautifulSoup(card_html, "html.parser")
        link_tag = soup.find("a", class_="card-link")
        href = link_tag.get("href") if link_tag else None
        if href and href not in links_vistos:
            links_vistos.add(href)
            all_links.append(href)
            novos_links += 1

    if novos_links == 0:
        print(f"page {page} added no new links - finishing")
        break

    page += 1

print(f"Total of collected links: {len(all_links)}")


Processando página 1...
Processando página 2...
Processando página 3...
Processando página 4...
Processando página 5...
Processando página 6...
Processando página 7...
Processando página 8...
Processando página 9...
Processando página 10...
Processando página 11...
Processando página 12...
Processando página 13...
Processando página 14...
Processando página 15...
Processando página 16...
Processando página 17...
Processando página 18...
Processando página 19...
Processando página 20...
Processando página 21...
Processando página 22...
Processando página 23...
Processando página 24...
Processando página 25...
Processando página 26...
Processando página 27...
Processando página 28...
Processando página 29...
Processando página 30...
Processando página 31...
Processando página 32...
Processando página 33...
Processando página 34...
Processando página 35...
Processando página 36...
Processando página 37...
Processando página 38...
Processando página 39...
Processando página 40...
Processan

In [7]:
#now, we scrape all the links we have inserted in the list
# Scrape every link collected in the list, one property page at a time.
# The 1-based position of the link is used as the record id; failed pages are stored as None.
lista_links = all_links
base_dados = []

for posicao, url_imovel in enumerate(lista_links, 1):
    print(f"scraping real state {posicao}")
    base_dados.append(scrape_imovel(url_imovel, posicao))


scraping real state 1
scraping real state 2
scraping real state 3
scraping real state 4
scraping real state 5
scraping real state 6
scraping real state 7
scraping real state 8
scraping real state 9
scraping real state 10
scraping real state 11
scraping real state 12
scraping real state 13
scraping real state 14
scraping real state 15
scraping real state 16
scraping real state 17
scraping real state 18
scraping real state 19
scraping real state 20
scraping real state 21
scraping real state 22
scraping real state 23
scraping real state 24
scraping real state 25
scraping real state 26
scraping real state 27
scraping real state 28
scraping real state 29
scraping real state 30
scraping real state 31
scraping real state 32
scraping real state 33
scraping real state 34
scraping real state 35
scraping real state 36
scraping real state 37
scraping real state 38
scraping real state 39
scraping real state 40
scraping real state 41
scraping real state 42
scraping real state 43
scraping real state 

In [8]:
# Small exploratory step - checks that the links have been successfully collected
for url_coletada in lista_links:
    print(url_coletada)

https://imobiliariachavedeouro.com.br/imovel/casa-a-venda-no-condominio-damha-3-em-uberaba/7137?preloader=false
https://imobiliariachavedeouro.com.br/imovel/casa-a-venda-no-condominio-damha-3-em-uberaba/6949?preloader=false
https://imobiliariachavedeouro.com.br/imovel/casa-a-venda-no-condominio-mario-franco-em-uberaba-mg/6219?preloader=false
https://imobiliariachavedeouro.com.br/imovel/casa-em-condominio-a-venda-4-quartos-4-suites-4-vagas-damha-residencial-uberaba-ii-uberaba-mg/6887?preloader=false
https://imobiliariachavedeouro.com.br/imovel/casa-em-condominio-a-venda-3-quartos-1-suite-3-vagas-condominio-terra-nova-uberaba-mg/6803?preloader=false
https://imobiliariachavedeouro.com.br/imovel/casa-em-condominio-a-venda-4-quartos-3-suites-4-vagas-damha-residencial-uberaba-iii-uberaba-mg/7254?preloader=false
https://imobiliariachavedeouro.com.br/imovel/apartamento-em-frente-ao-parque-das-acacias-127-m/7511?preloader=false
https://imobiliariachavedeouro.com.br/imovel/casa-terrea-com-3-suit

In [9]:
# Check that the scrape worked: show every record, then the number of records
for registro in base_dados:
    print(registro)

print(len(base_dados))

{'id': 1, 'tipo': 'casa', 'bairro': 'Damha Residencial Uberaba III', 'area_lote': 360.0, 'area_interna': 284.74, 'salas': 1, 'suites': 3, 'quartos': 3, 'vagas': 4, 'preco': 2950000.0}
{'id': 2, 'tipo': 'casa', 'bairro': 'Damha Residencial Uberaba III', 'area_lote': 362.91, 'area_interna': 281.81, 'salas': 1, 'suites': 3, 'quartos': 3, 'vagas': 4, 'preco': 2700000.0}
{'id': 3, 'tipo': 'casa', 'bairro': 'Residencial Mário de Almeida Franco', 'area_lote': 1880.0, 'area_interna': 778.0, 'salas': 2, 'suites': 5, 'quartos': 7, 'vagas': 10, 'preco': 5400000.0}
{'id': 4, 'tipo': 'casa', 'bairro': 'Damha Residencial Uberaba II', 'area_lote': 461.0, 'area_interna': 340.0, 'salas': 2, 'suites': 4, 'quartos': 4, 'vagas': 4, 'preco': 6350000.0}
{'id': 5, 'tipo': 'casa', 'bairro': 'Condomínio Terra Nova', 'area_lote': 143.0, 'area_interna': 140.0, 'salas': 1, 'suites': 1, 'quartos': 3, 'vagas': 3, 'preco': 580000.0}
{'id': 6, 'tipo': 'casa', 'bairro': 'Damha Residencial Uberaba III', 'area_lote': 40

In [10]:
# Export the scraped records to a CSV file
import csv

nome_arquivo = "imoveis_uberaba.csv"

# Pages that could not be scraped were stored as None; keep only the real records
base_dados_filtered = list(filter(lambda registro: registro is not None, base_dados))

if base_dados_filtered:
    # The column names are the keys of the first record
    colunas = base_dados_filtered[0].keys()
    with open(nome_arquivo, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=colunas)
        writer.writeheader()
        writer.writerows(base_dados_filtered)

    print("well succeeded CSV generation!")
else:
    # Nothing to write: every scrape failed or no link was collected
    print("There are no data compatible with CSV.")

CSV gerado com sucesso!


In [None]:
# Run this cell if you want to download the .csv file (requires the google.colab package)
from google.colab import files as colab_files
colab_files.download("imoveis_uberaba.csv")