In [11]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time 
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from tqdm.notebook import tqdm
# Show full cell contents (e.g. long listing URLs) instead of truncating them.
pd.options.display.max_colwidth = None

In [2]:
# Search-results URL; scrape_page appends the page number right after 'pagina-'.
base_url = 'https://www.argenprop.com/casas-o-departamentos/venta/buenos-aires?pagina-'

# Fallback used whenever the page count cannot be read from the pagination
# widget, so `max_pages` is always defined after this cell runs.
max_pages = 1

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(base_url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    pagination = soup.find('ul', class_='pagination')
    # find() returns None when the page has no pagination list.
    page_items = pagination.find_all('li') if pagination is not None else []
    # The second-to-last <li> is read as the highest page number
    # (presumably the last one is a "next" control - confirm against the site).
    if len(page_items) >= 2:
        last_page_element = page_items[-2]
        last_page_text = last_page_element.text.strip()
        # Only accept purely numeric text so int() cannot raise ValueError.
        if last_page_text.isdigit():
            max_pages = int(last_page_text)

In [12]:
# Shared accumulator: scrape_page appends every listing URL it finds here.
residencias = list()
# Hard cap on how many result pages are crawled; this replaces whatever value
# `max_pages` held before this cell ran.
max_pages = 20

def scrape_page(page):
    """Collect the listing links found on one search-results page.

    Downloads ``f'{base_url}{page}'`` and, for every element with CSS class
    ``card`` that carries an ``href`` attribute, appends the absolute listing
    URL to the module-level ``residencias`` list.

    Args:
        page: Page number appended to ``base_url``.

    Returns:
        True if the page answered with status 200 and contained at least one
        ``card`` element; False otherwise, including when the request itself
        fails or times out.
    """
    url = f'{base_url}{page}'
    try:
        # Without a timeout a stalled connection would block this worker
        # thread indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Treat a failed request like a non-200 response so that one bad page
        # does not abort the whole crawl.
        return False

    if response.status_code != 200:
        return False

    soup = BeautifulSoup(response.text, 'lxml')
    results = soup.find_all(class_='card')
    if not results:
        return False

    for result in results:
        link = result.get('href')
        # Cards without an href attribute are skipped.
        if link is not None:
            residencias.append('https://www.argenprop.com' + link)
    return True

# Crawl result pages 1..max_pages on a pool of 8 worker threads.
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(scrape_page, page) for page in range(1, max_pages + 1)]

    # Every task is already queued at this point, so sleeping between results
    # would not slow the request rate - it would only delay the cell.
    # result() is still called so that an exception raised inside a worker
    # surfaces here instead of being silently dropped.
    for future in futures:
        future.result()


  0%|          | 0/20 [00:00<?, ?it/s]

In [4]:
# Accumulates one dict per successfully scraped listing.
data = list()

def process_residencia(residencia):
    """Scrape the feature list, address and price of one listing page.

    Args:
        residencia: Absolute URL of the listing to download.

    Returns:
        A dict that always holds 'Link' (the URL) plus one entry per feature
        ``<li>`` (keyed by the item's ``title`` attribute) and, when the page
        provides them, 'Calle', 'Ubicacion' and 'Precio'.
        None when the request fails, the status is not 200, or the page has
        no ``property-main-features`` list.
    """
    try:
        # Without a timeout a stalled connection would block this worker
        # thread indefinitely.
        response = requests.get(residencia, timeout=30)
    except requests.RequestException:
        # Same outcome as a non-200 response: this listing is skipped.
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'lxml')
    main_features = soup.find('ul', class_='property-main-features')
    if main_features is None:
        return None

    residence_data = {'Link': residencia}

    for feature in main_features.find_all('li'):
        title = feature.get('title')
        value_element = feature.find('p', class_='strong')
        # Skip items lacking a title or a value element; dereferencing a
        # missing element would raise and abort the whole scraping run.
        if title is None or value_element is None:
            continue
        residence_data[title] = value_element.text.strip()

    location_container = soup.find('div', class_='location-container')
    if location_container is not None:
        location_title = location_container.find('h2')
        location_info = location_container.find('p')
        if location_title is not None:
            residence_data['Calle'] = location_title.text.strip()
        if location_info is not None:
            residence_data['Ubicacion'] = location_info.text.strip()

    price_element = soup.find('p', class_='titlebar__price')
    if price_element is not None:
        residence_data['Precio'] = price_element.get_text(strip=True)

    return residence_data

# Download every listing on a pool of 8 worker threads. executor.map yields
# results in the same order as `residencias`; falsy results (process_residencia
# returns None when it has nothing to report) are left out.
with ThreadPoolExecutor(max_workers=8) as executor:
    results = executor.map(process_residencia, residencias)
    data.extend(result for result in results if result)


# One row per listing; dict keys become columns, so a feature that a listing
# lacks shows up as NaN in that row.
df = pd.DataFrame(data)

In [8]:
# Size of the scraped table as (rows, columns).
df.shape

(398, 18)

In [6]:
# Number of missing (NaN) values in each column.
df.isna().sum()

Link                  0
Sup. cubierta         3
Dormitorios          18
Antigüedad          196
Baños                 6
Ambientes            11
Cocheras            155
Sup. terreno        212
Calle                 3
Ubicacion             3
Precio                0
Estado              315
Orientación         290
Toilettes           378
Antiguedad          259
Disposición         312
Apto profesional    376
Permite mascota     393
dtype: int64

In [2]:
# Feature columns to discard. Not every name listed here is guaranteed to be a
# column of `df`: the set of scraped features depends on the listings fetched.
columns_to_drop = [
    'Dependencia',
    'Cantidad de baños',
    'Cantidad de dormitorios',
    'Toilletes',
    'Permite mascota',
    'Toilettes',
    'Superficie cubierta',
    'Apto profesional',
    'Orientación',
    'Disposición',
]
# errors='ignore' skips names that are absent instead of raising KeyError,
# which also makes this cell safe to run more than once.
df = df.drop(columns=columns_to_drop, errors='ignore')


In [None]:
 # Write the table to argenprop.csv without the row index.
 df.to_csv(path_or_buf='argenprop.csv', index=False)