<h1>Scraping and analyzing a page with public real estate data from Uberaba, Minas Gerais, Brazil</h1>
<p><b>Public legal data - for academic purposes</b></p>

In [26]:
#Importing libs
#Beautiful Soup for web scraping (HTML parsing)
from bs4 import BeautifulSoup

#Using builtwith to identify the technologies the website is built with
!pip install builtwith
import builtwith

#whois library - to identify the website's owner(s)
!pip install python-whois
import whois

#requests - HTTP client used to fetch the pages
import requests



In [27]:
#applying builtwith to the target website - the '.parse()' method returns the technologies detected on it
#(the cell output below is that return value, grouped by category)
builtwith.parse('https://imobiliariachavedeouro.com.br/imoveis/')

{'cdn': ['CloudFlare'],
 'font-scripts': ['Google Font API'],
 'cms': ['WordPress'],
 'programming-languages': ['PHP'],
 'blogs': ['PHP']}

In [7]:
#applying whois - it returns rich registration information about the website we want to scrape
#(printed so the whole record is shown as text)
print(whois.whois('https://imobiliariachavedeouro.com.br/imoveis/'))

{
  "domain_name": "imobiliariachavedeouro.com.br",
  "registrant_name": "Chave de Ouro Imoveis",
  "registrant_id": "17.894.102/0001-09",
  "country": "BR",
  "owner_c": "VBCRF1",
  "admin_c": null,
  "tech_c": "JNS66",
  "billing_c": null,
  "name_servers": [
    "clyde.ns.cloudflare.com",
    "dara.ns.cloudflare.com"
  ],
  "nsstat": "20251228 AA",
  "nslastaa": "20251228",
  "saci": "yes",
  "creation_date": [
    "2018-08-29 00:00:00+00:00",
    "2003-02-17 00:00:00+00:00"
  ],
  "updated_date": [
    "2025-08-25 00:00:00+00:00",
    "2018-08-29 00:00:00+00:00",
    "2025-05-27 00:00:00+00:00"
  ],
  "expiration_date": "2028-08-29 00:00:00+00:00",
  "status": "published",
  "nic_hdl_br": [
    "VBCRF1",
    "JNS66"
  ],
  "person": [
    "Vinícius Barbosa Cabral R S de Freitas",
    "Joao Nivaldo Lombardi Sales"
  ],
  "email": [
    "vinicius.corretagem@gmail.com",
    "joao.nivaldo@jnwebstudio.com.br"
  ]
}


# Analysis of the website we want to scrape accessing the main page HTML source

When we look at the HTML returned by the main page, we notice something important.

There is a small piece of JavaScript inside which tells us that the website fetches its content with AJAX.

This already tells us a lot.

The main page itself does not really contain the data.  
It only has the layout, the structure, and a button.

The real data is not here.

This page is only a door.  
Behind this door, the website uses AJAX to fetch the real content from another place.

That was very, very tough to find.

In [16]:
# Pool of browser-like header sets. When a request is answered with 403 the scraper
# picks another entry from this list and tries again. The values were collected from the internet.

# Values that repeat across the entries below.
_LISTING_REFERER = "https://imobiliariachavedeouro.com.br/imoveis/"
_ACCEPT_JSON = "application/json, text/javascript, */*; q=0.01"
_ACCEPT_ANY = "*/*"
_LANG_PT_EN = "pt-BR,pt;q=0.9,en-US;q=0.8"
_LANG_PT = "pt-BR,pt;q=0.9"


def _browser_headers(user_agent, accept, accept_language):
    """Build one header set; every entry shares the same Referer and X-Requested-With."""
    return {
        "User-Agent": user_agent,
        "Accept": accept,
        "Accept-Language": accept_language,
        "Referer": _LISTING_REFERER,
        "X-Requested-With": "XMLHttpRequest",
    }


HEADERS_POOL = [
    # chrome windows
    _browser_headers(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36",
        _ACCEPT_JSON,
        _LANG_PT_EN,
    ),
    # chrome linux
    _browser_headers(
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36",
        _ACCEPT_ANY,
        _LANG_PT_EN,
    ),
    # firefox windows
    _browser_headers(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
        "Gecko/20100101 Firefox/121.0",
        _ACCEPT_JSON,
        _LANG_PT_EN,
    ),
    # firefox linux
    _browser_headers(
        "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) "
        "Gecko/20100101 Firefox/120.0",
        _ACCEPT_JSON,
        _LANG_PT,
    ),
    # edge windows
    _browser_headers(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36 "
        "Edg/122.0.0.0",
        _ACCEPT_ANY,
        _LANG_PT_EN,
    ),
    # chrome macOS
    _browser_headers(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36",
        _ACCEPT_JSON,
        _LANG_PT_EN,
    ),
]


In [32]:
#analyzing the raw HTML returned by the main listing page
#timeout set so a stalled connection raises instead of hanging the cell forever
response = requests.get(
    'https://imobiliariachavedeouro.com.br/imoveis/',
    headers=HEADERS_POOL[0],
    timeout=15,
)
print(response.status_code)

#parsing and pretty-printing the HTML so its structure can be read
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify())

200
<!DOCTYPE html>
<html lang="pt-br">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="#000000" name="theme-color"/>
  <link href="https://imobiliariachavedeouro.com.br/novo/wp-content/themes/chave-de-ouro/assets/images/globals/favicon.png" rel="shortcut icon"/>
  <link href="https://imobiliariachavedeouro.com.br/novo/wp-content/themes/chave-de-ouro/build/css/main.min.css?version=1.5" rel="stylesheet"/>
  <link href="https://imobiliariachavedeouro.com.br/novo/wp-content/themes/chave-de-ouro/build/css/buildings.min.css?version=1.5" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com" rel="preconnect"/>
  <link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
  <link href="https://fonts.googleapis.com/css2?family=Montserrat:ital,wght@0,100..900;1,100..900&amp;display=swap" rel="stylesheet"/>
  <title>
   Chave de Ouro Imóveis
  </title>
  <meta content="A Imobiliária Chave de Ouro tem 

In [25]:
# another exploratory step: checking the structure of one listing page taken from the list
# timeout set so a stalled connection raises instead of hanging the cell forever
resposta = requests.get(
    'https://imobiliariachavedeouro.com.br/imovel/casa-a-venda-no-condominio-mario-franco-em-uberaba-mg/6618/',
    headers=HEADERS_POOL[0],
    timeout=15,
)
print(resposta.status_code)

#making the HTML file readable
soup = BeautifulSoup(resposta.text, 'html.parser')
print(soup.prettify())

200
<!DOCTYPE html>
<html lang="pt-br">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="#000000" name="theme-color"/>
  <link href="https://imobiliariachavedeouro.com.br/novo/wp-content/themes/chave-de-ouro/assets/images/globals/favicon.png" rel="shortcut icon"/>
  <link href="https://imobiliariachavedeouro.com.br/novo/wp-content/themes/chave-de-ouro/build/css/main.min.css?version=1.5" rel="stylesheet"/>
  <link href="https://imobiliariachavedeouro.com.br/novo/wp-content/themes/chave-de-ouro/build/css/building.min.css?version=1.5" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com" rel="preconnect"/>
  <link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
  <link href="https://fonts.googleapis.com/css2?family=Montserrat:ital,wght@0,100..900;1,100..900&amp;display=swap" rel="stylesheet"/>
  <title>
   Casa à venda no condomínio Mario Franco, em Uberaba-MG - Cód.: 6618 | Chave de Ouro

In [28]:
from bs4 import BeautifulSoup
import requests
import random
import time

MAX_RETRIES = 5  # maximum attempts per URL; each attempt picks a random header set from HEADERS_POOL

def _parse_br_number(texto):
    """Convert a number written in Brazilian notation to ``float``.

    The price parsing in ``scrape_imovel`` already treats "." as the thousands
    separator and "," as the decimal mark; this helper applies the same
    convention to areas so that "1.200" is read as 1200.0 and "1.200,50 m²"
    as 1200.5. A value with a dot but no comma is only treated as grouped
    thousands when it has that exact shape (1-3 digits followed by groups of
    three digits, no leading zero); otherwise the dot stays a decimal point,
    so "250.5" is still 250.5.

    Args:
        texto: raw text of the value, optionally carrying the "m²" unit.

    Returns:
        The parsed value as float.

    Raises:
        ValueError: if the cleaned text is not a number.
    """
    limpo = texto.replace("m²", "").strip()
    if "," in limpo:
        # a comma is present, so every dot can only be a thousands separator
        limpo = limpo.replace(".", "").replace(",", ".")
    else:
        grupos = limpo.split(".")
        parece_milhar = (
            len(grupos) > 1
            and all(g.isdecimal() for g in grupos)
            and 1 <= len(grupos[0]) <= 3
            and not grupos[0].startswith("0")
            and all(len(g) == 3 for g in grupos[1:])
        )
        if parece_milhar:
            limpo = "".join(grupos)
    return float(limpo)


def _fetch_imovel_page(link, id_seq):
    """GET ``link`` with up to MAX_RETRIES attempts, each with a random header set.

    Returns the response on HTTP 200. Returns None when every attempt ended in
    a request error or a 403, or as soon as any other status code is received.
    """
    for attempt in range(MAX_RETRIES):
        headers = random.choice(HEADERS_POOL)

        try:
            r = requests.get(link, headers=headers, timeout=15)
        except requests.RequestException as e:
            print(f"request error {id_seq}: {e}")
            time.sleep(random.uniform(1.5, 3.5))
            continue

        if r.status_code == 200:
            return r
        if r.status_code == 403:
            # blocked: wait a little and try again with another header set
            print(f"403 error {id_seq} — changing header ({attempt+1}/{MAX_RETRIES})") #changing header
            time.sleep(random.uniform(2.5, 5.0))
            continue

        print(f"HTTP {r.status_code} at {id_seq}")
        return None  # any other status aborts this link

    print(f"Warning - persistent failings - aborting {id_seq}")
    return None


def scrape_imovel(link, id_seq):
    """Download one listing page and extract its main attributes.

    Args:
        link: URL of the listing detail page.
        id_seq: sequential identifier stored in the record and used in log messages.

    Returns:
        A dict with the keys ``id``, ``tipo``, ``bairro``, ``area_lote``,
        ``area_interna``, ``salas``, ``suites``, ``quartos``, ``vagas`` and
        ``preco``. Fields that are missing from the page or cannot be parsed
        stay ``None``. Returns ``None`` when the page could not be downloaded.
    """
    response = _fetch_imovel_page(link, id_seq)
    if response is None:
        return None

    soup = BeautifulSoup(response.text, "html.parser") #creating soup

    dados = {
        "id": id_seq,
        "tipo": None, #type
        "bairro": None, #suburb
        "area_lote": None, #lot area
        "area_interna": None, #internal area
        "salas": None, #rooms
        "suites": None, #suites
        "quartos": None, #bedrooms
        "vagas": None, #parking spaces
        "preco": None #price
    } #defining columns from the dataset as a dictionary

    #real estate type: first word of the page title
    if soup.title:
        palavras = soup.title.get_text().split()
        if palavras:  # an empty <title> would otherwise raise IndexError
            dados["tipo"] = palavras[0].lower()

    #main features
    for item in soup.select(".left-content--items .item"):
        titulo = item.select_one(".item-title")
        valor = item.select_one(".item-number")
        if not titulo or not valor:
            continue

        chave = titulo.get_text(strip=True).lower()
        val = valor.get_text(strip=True)

        # the order of the checks matters: the first matching label wins
        if "bairro" in chave:
            dados["bairro"] = val
            continue
        elif "lote" in chave or "terreno" in chave:
            campo, converter = "area_lote", _parse_br_number
        elif "área interna" in chave:
            campo, converter = "area_interna", _parse_br_number
        elif "sala" in chave:
            campo, converter = "salas", int
        elif "suíte" in chave:
            campo, converter = "suites", int
        elif "quarto" in chave:
            campo, converter = "quartos", int
        else:
            continue

        # ValueError was very common in earlier runs; best effort - keep the field as it was
        try:
            dados[campo] = converter(val)
        except ValueError:
            pass

    #the parking spaces are located in a different area of the HTML file, so they are collected separately
    for tag in soup.select(".info-title"):
        texto = tag.get_text(strip=True).lower()
        if "vaga" in texto:
            try:
                dados["vagas"] = int(texto.replace("vagas", "").replace("vaga", "").strip())
            except ValueError:
                pass
            break  # only the first element mentioning "vaga" is considered

    # price is also located in a specific area of the HTML
    price_div = soup.select_one(".form-price")
    if price_div:
        parte_inteira = price_div.select_one("span")
        centavos = price_div.select_one("strong")
        if parte_inteira and centavos:
            preco_str = parte_inteira.text + centavos.text
            try:
                # "." is the thousands separator and "," the decimal mark
                dados["preco"] = float(preco_str.replace(".", "").replace(",", "."))
            except ValueError:
                print(f"[AVISO] Preço inválido no imóvel {id_seq}: '{preco_str}'")

    return dados


In [29]:
# This is the AJAX fetch found in the main page's JavaScript.
# For now we do not scrape any HTML file: we talk to the server directly and ask it
# for the listing links, since the design of the page is complicated.
_AJAX_ENDPOINT = "https://imobiliariachavedeouro.com.br/novo/wp-admin/admin-ajax.php"

# Query-string parameters, in the exact order the original URL carries them.
# "{page}" is the only placeholder; it is filled in with str.format(page=...).
_AJAX_QUERY = [
    "action=imoveis_listing_content",
    "page={page}",
    "finalidade=2",
    "codigo=false",
    "tipo=false",
    "bairros=false",
    "quartos=false",
    "ordenacao=false",
    "banheiros=false",
    "suites=false",
    "vagas=false",
    "valor_min=0",
    "valor_max=10000000",
    "area_min=false",
    "area_max=false",
]

url_template = _AJAX_ENDPOINT + "?" + "&".join(_AJAX_QUERY)


In [30]:
# Walk the paginated AJAX endpoint and collect the detail-page link of every listing card.
all_links = []
seen_links = set()  # links already stored, so repeated cards are not collected twice

page = 1
while True:
    print(f"Processing page {page}...")
    url = url_template.format(page=page)
    response = None

    #accessing the page with the header pool
    for attempt in range(MAX_RETRIES):
        headers = random.choice(HEADERS_POOL)
        try:
            r = requests.get(url, headers=headers, timeout=15)
        except requests.RequestException as e:
            print(f"[AVISO] request error at page {page}: {e}")
            time.sleep(random.uniform(1.5, 3.5))
            continue

        if r.status_code == 200:
            response = r
            break
        elif r.status_code == 403:
            # blocked: wait a little and try again with another header set
            print(f"[WARNING] 403 at page {page} — changing header ({attempt+1}/{MAX_RETRIES})")
            time.sleep(random.uniform(2.5, 5.0))
            continue
        else:
            print(f"[WARNING] HTTP {r.status_code} at page {page}")
            response = None
            break

    if response is None:
        print(f"persistent fail at page {page}, finishing scraping.")
        break

    # processing the cards (with the links) from the main page
    try:
        data = response.json()
    except ValueError:
        print(f"non JSON reponse at the page {page}, finishing.")
        break

    # a payload that is not a JSON object has no "cards" key; treat it as an empty page
    cards = data.get("cards", []) if isinstance(data, dict) else []
    if not cards:
        print("no more cards - finishing")
        break

    new_links = 0
    for card_html in cards:
        soup = BeautifulSoup(card_html, "html.parser")
        link_tag = soup.find("a", class_="card-link")
        href = link_tag.get("href") if link_tag else None
        if href and href not in seen_links:
            seen_links.add(href)
            all_links.append(href)
            new_links += 1

    # Without this guard the loop only ends on an empty "cards" list; an endpoint that
    # keeps answering with already-seen cards past the last page would loop forever.
    if new_links == 0:
        print(f"page {page} brought no new links - finishing")
        break

    page += 1

print(f"Total of collected links: {len(all_links)}")


Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...


KeyboardInterrupt: 

In [31]:
#now we visit every link collected in the list and keep one record per link, in order
#(scrape_imovel returns None for a link it could not download, and that None is stored too)
lista_links = all_links
base_dados = []

for posicao, url_imovel in enumerate(lista_links, 1):
    print(f"scraping real state {posicao}")
    base_dados.append(scrape_imovel(url_imovel, posicao))


scraping real state 1
scraping real state 2


KeyboardInterrupt: 

In [None]:
#small exploratory step - checking that the links were successfully collected (one per line)
if lista_links:
    print(*lista_links, sep="\n")

In [None]:
#checking whether the scrape worked: one record per line, then the total number of records
if base_dados:
    print(*base_dados, sep="\n")

print(len(base_dados))

In [None]:
#exporting the scraped records to a CSV file
import csv

nome_arquivo = "imoveis_uberaba.csv"

# Links that could not be scraped were stored as None - leave them out of the export
base_dados_filtered = list(filter(lambda registro: registro is not None, base_dados))

if base_dados_filtered:
    # the column names come from the keys of the first record
    colunas = base_dados_filtered[0].keys()
    with open(nome_arquivo, mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=colunas)
        csv_writer.writeheader()
        csv_writer.writerows(base_dados_filtered)

    print("well succeeded CSV generation!")
else:
    # nothing usable was scraped, so no file is written
    print("There are no data compatible with CSV.")

In [None]:
#run this if you want to download the .csv file
#NOTE(review): google.colab is presumably only importable inside Google Colab - confirm before running elsewhere
#the file name below must match the one written by the CSV export cell
from google.colab import files
files.download("imoveis_uberaba.csv")