In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import zapimoveis_scraper as zap

In [2]:
def scrape_page(link, verify=False, timeout=30):
    """Scrape one VivaReal results page into a DataFrame.

    Downloads ``link``, parses the HTML and builds one row for every element
    carrying the ``js-card-selector`` class. Any field whose element (or
    attribute) is absent from a card is recorded as ``'N/A'`` instead of
    aborting the whole page.

    Args:
        link: URL of the results page to download.
        verify: Forwarded to ``requests.get``. Defaults to ``False`` (TLS
            certificate verification disabled), which is what this function
            has always done; pass ``True`` to verify certificates.
        timeout: Seconds ``requests`` waits on the server before giving up,
            so a stalled connection cannot block the notebook indefinitely.

    Returns:
        A DataFrame with the columns Link, Address, Anunciante, Area, Tipo,
        Room, Bath, Park and Price (zero rows when no card is found), or
        ``None`` when the response status is not 200 (the status code is
        printed in that case).

    Raises:
        requests.RequestException: propagated unchanged from ``requests.get``
            (connection failures, timeouts).
    """
    missing = 'N/A'
    columns = ['Link', 'Address', 'Anunciante', 'Area', 'Tipo',
               'Room', 'Bath', 'Park', 'Price']

    def text_of(parent, css_class):
        """Return the stripped text of the first match, or 'N/A' if absent."""
        element = parent.find(class_=css_class)
        return element.text.strip() if element is not None else missing

    response = requests.get(link, verify=verify, timeout=timeout)
    if response.status_code != 200:
        print(f'Error: {response.status_code}')
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    records = []

    for card in soup.find_all(class_='js-card-selector'):
        # The listing link lives on the first <a> inside the main-info block.
        main_info = card.find(class_='property-card__main-info')
        anchor = main_info.a if main_info is not None else None
        full_link = anchor.get('href', missing) if anchor is not None else missing

        # The advertiser name is read from the logo image's alt text.
        account = card.find(class_='property-card__account-link js-property-card-account-link')
        logo = account.img if account is not None else None
        full_anunciante = logo.get('alt', missing) if logo is not None else missing

        # The property type is the first word of the card title.
        title = card.find(class_='property-card__title js-cardLink js-card-title')
        title_words = title.text.split() if title is not None else []
        full_tipo = title_words[0] if title_words else missing

        records.append({
            'Link': full_link,
            'Address': text_of(card, 'property-card__address').replace('\n', ''),
            'Anunciante': full_anunciante,
            'Area': text_of(card, 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area'),
            'Tipo': full_tipo,
            'Room': text_of(card, 'property-card__detail-item property-card__detail-room js-property-detail-rooms'),
            'Bath': text_of(card, 'property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom'),
            'Park': text_of(card, 'property-card__detail-item property-card__detail-garage js-property-detail-garages'),
            'Price': text_of(card, 'property-card__price js-property-card-prices js-property-card__price-small'),
        })

    # Passing `columns` keeps the header stable even when no card was found.
    return pd.DataFrame(records, columns=columns)

# Range of VivaReal result pages to scrape (both ends inclusive).
start_page = 1
end_page = 3  # You can adjust this to the desired end page

# URL template with a placeholder for the page number.
# NOTE(review): everything after '#' is a URL fragment, which HTTP clients
# normally do not send to the server, so only `pagina` likely influences the
# response - confirm.
url_template = 'https://www.vivareal.com.br/venda/espirito-santo/vitoria/?pagina={}#onde=Brasil,Esp%C3%ADrito%20Santo,Vit%C3%B3ria,,,,,,BR%3EEspirito%20Santo%3ENULL%3EVitoria,,,'
dfs = []

# Collect one DataFrame per page; scrape_page returns None for a failed page.
for page in range(start_page, end_page + 1):
    page_url = url_template.format(page)
    df = scrape_page(page_url)
    if df is not None:
        dfs.append(df)

# pd.concat raises ValueError on an empty list, so only combine and save when
# at least one page was scraped.
if dfs:
    result_df = pd.concat(dfs, ignore_index=True)

    # Save the combined DataFrame to a CSV file
    result_df.to_csv('vivareal_combined.csv', index=False, encoding='utf-8')

    print('Scraping and data extraction completed successfully.')
else:
    print('No data to concatenate.')



Scraping and data extraction completed successfully.


In [11]:
def scrape_olx_page(link, verify=False, timeout=30):
    """Scrape one OLX results page into a DataFrame.

    Downloads ``link`` with a browser-like User-Agent header, parses the HTML
    and builds one row for every element whose class attribute is
    ``fnmrjs-0 jVYLMk``. Any field whose element (or attribute) is absent
    from an item is recorded as ``'N/A'`` instead of aborting the whole page.

    Args:
        link: URL of the results page to download.
        verify: Forwarded to ``requests.get``. Defaults to ``False`` (TLS
            certificate verification disabled), which is what this function
            has always done; pass ``True`` to verify certificates.
        timeout: Seconds ``requests`` waits on the server before giving up,
            so a stalled connection cannot block the notebook indefinitely.

    Returns:
        A DataFrame with the columns Link, Address, Area, Tipo, Room, Bath,
        Park and Price (zero rows when no item is found), or ``None`` when
        the response status is not 200 (the status code is printed in that
        case).

    Raises:
        requests.RequestException: propagated unchanged from ``requests.get``
            (connection failures, timeouts).
    """
    missing = 'N/A'
    columns = ['Link', 'Address', 'Area', 'Tipo', 'Room', 'Bath', 'Park', 'Price']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def text_of(parent, css_class):
        """Return the stripped text of the first match, or 'N/A' if absent."""
        element = parent.find(class_=css_class)
        return element.text.strip() if element is not None else missing

    response = requests.get(link, headers=headers, verify=verify, timeout=timeout)
    if response.status_code != 200:
        print(f'Error: {response.status_code}')
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    records = []

    # NOTE(review): these class names look auto-generated and may change
    # between site deployments - confirm they still match the live markup.
    for item in soup.find_all(class_='fnmrjs-0 jVYLMk'):
        # The href is read from the matched element itself.
        link_elem = item.find(class_='fnmrjs-4 hXRcVt')
        full_link = link_elem.get('href', missing) if link_elem is not None else missing

        # The property type is the first word of the title element.
        title = item.find(class_='fnmrjs-2 kXuJIz')
        title_words = title.text.split() if title is not None else []
        full_tipo = title_words[0] if title_words else missing

        records.append({
            'Link': full_link,
            'Address': text_of(item, 'fnmrjs-1 ivyGow').replace('\n', ''),
            'Area': text_of(item, 'fnmrjs-6 dQSMGN'),
            'Tipo': full_tipo,
            'Room': text_of(item, 'fnmrjs-6 kPSaQi'),
            'Bath': text_of(item, 'fnmrjs-6 jtdKe'),
            'Park': text_of(item, 'fnmrjs-6 eImvvy'),
            'Price': text_of(item, 'fnmrjs-3 bXKLTz'),
        })

    # Passing `columns` keeps the header stable even when no item was found.
    return pd.DataFrame(records, columns=columns)

# OLX driver: scrape a range of result pages and combine them into one CSV.
start_page_olx = 1  # first page to fetch
end_page_olx = 5  # last page to fetch (inclusive); adjust as needed

# The page number is substituted into the `o` query parameter.
url_template_olx = 'https://www.olx.com.br/imoveis/venda/estado-es?o={}'
dfs_olx = []

for page_olx in range(start_page_olx, end_page_olx + 1):
    page_url_olx = url_template_olx.format(page_olx)
    df_olx = scrape_olx_page(page_url_olx)
    if df_olx is None:
        # The scraper signals a failed page with None; skip it.
        continue
    dfs_olx.append(df_olx)

if not dfs_olx:
    print('No data to concatenate.')
else:
    # Merge the per-page frames and write them out in a single file.
    result_df_olx = pd.concat(dfs_olx, ignore_index=True)
    result_df_olx.to_csv('olx_combined.csv', index=False, encoding='utf-8')

    print('Scraping and data extraction from OLX completed successfully.')



Scraping and data extraction from OLX completed successfully.


In [33]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_imovelweb(start_page, end_page, output_path='imovelweb_data.csv',
                     timeout=30, verify=False):
    """Scrape ImovelWeb apartment listings for Vitoria-ES and save them as CSV.

    Requests every results page from ``start_page`` to ``end_page``
    (inclusive), sleeping one second before each request, and collects one
    row per posting that has price, description, features and location
    elements. A page that cannot be fetched is reported and skipped, so rows
    gathered from other pages are still written.

    Args:
        start_page: First page number to request.
        end_page: Last page number to request (inclusive).
        output_path: Destination of the CSV file. Defaults to the file name
            this function has always written, ``'imovelweb_data.csv'``.
        timeout: Seconds ``requests`` waits on the server before giving up.
        verify: Forwarded to ``requests.get``. Defaults to ``False`` (TLS
            certificate verification disabled), matching prior behaviour.

    Returns:
        None. The result is written to ``output_path`` and progress is
        printed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    columns = ['Location', 'Description', 'Price', 'Area',
               'Quartos', 'Banheiros', 'Vagas', 'Type']

    # Token position of each value inside the whitespace-split features text.
    # NOTE(review): this assumes a fixed token layout in the features block -
    # confirm against the live markup.
    feature_positions = (('Area', 0), ('Quartos', 4), ('Banheiros', 6), ('Vagas', 8))

    # Rows collected across all pages.
    data_list = []

    for page_number in range(start_page, end_page + 1):
        url = f'https://www.imovelweb.com.br/apartamentos-venda-vitoria-es-pagina-{page_number}.html'

        # Introduce a delay to avoid triggering anti-scraping measures
        time.sleep(1)

        try:
            response = requests.get(url, headers=headers, verify=verify, timeout=timeout)
        except requests.RequestException as exc:
            # Keep going so rows from the other pages are not lost.
            print(f"Failed to retrieve page {page_number}. Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve page {page_number}. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        posting_elements = soup.find_all('div', class_='sc-i1odl-0', attrs={'data-qa': 'posting PROPERTY'})

        for posting in posting_elements:
            price_element = posting.find('div', {'data-qa': 'POSTING_CARD_PRICE'})
            description_element = posting.find('div', {'data-qa': 'POSTING_CARD_DESCRIPTION'})
            features_element = posting.find('div', {'data-qa': 'POSTING_CARD_FEATURES'})
            location_element = posting.find('div', {'data-qa': 'POSTING_CARD_LOCATION'})

            # Postings missing any of the four elements are skipped.
            if None in (price_element, description_element, features_element, location_element):
                continue

            features = features_element.text.strip().split()

            data_dict = {
                'Location': location_element.text.strip(),
                'Description': description_element.text.strip(),
                'Price': price_element.text.strip(),
                'Area': None,
                'Quartos': None,
                'Banheiros': None,
                'Vagas': None,
                'Type': 'apartamento',
            }

            # Fill only the values whose position exists in this posting.
            for column, position in feature_positions:
                if len(features) > position:
                    data_dict[column] = features[position]

            data_list.append(data_dict)

        print(f"Page {page_number} scraped successfully.")

    # Passing `columns` keeps the CSV header even when nothing was scraped.
    df = pd.DataFrame(data_list, columns=columns)

    # Write the DataFrame to a CSV file
    df.to_csv(output_path, index=False)

    print("CSV file created successfully.")

# Run the ImovelWeb scraper over result pages 1 through 2 (inclusive).
start_page, end_page = 1, 2
scrape_imovelweb(start_page=start_page, end_page=end_page)



Page 1 scraped successfully.




Page 2 scraped successfully.
CSV file created successfully.
