# Retrieve listings from `Zonaprop` and post-process
- https://www.machinelearnear.com/
- https://www.youtube.com/@machinelearnear

## First download pages as `.html` files

In [1]:
# !pip install cloudscraper

In [2]:
import cloudscraper
import re
from bs4 import BeautifulSoup
import os
from tqdm import tqdm

In [3]:
def download_zonaprop_pages(search_url, folder_path, overwrite=False):
    """Download every result page of a Zonaprop search as local ``.html`` files.

    A first request reads the ``"totalPages"`` value embedded in one of the
    page's ``<script>`` tags. Each page is then fetched and written to
    ``{folder_path}/listings-NNN.html`` (1-based index, zero-padded to 3 digits).

    Args:
        search_url: URL of the Zonaprop search results.
        folder_path: Directory where pages are stored; created if missing.
        overwrite: When True, every regular file already present in
            ``folder_path`` is deleted before downloading. When False, pages
            whose target file already exists are skipped.

    Returns:
        None. A non-200 response on the first request, or a missing page
        count, is reported with ``print`` and the function returns early.
    """
    # cloudscraper instance created with a delay, intended to avoid 403 responses
    scraper = cloudscraper.create_scraper(delay=30)

    # make sure the target directory exists (done once; it is needed both for
    # the optional clean-up below and for writing the pages)
    os.makedirs(folder_path, exist_ok=True)

    # when overwriting, remove the regular files already in folder_path
    if overwrite:
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if os.path.isfile(file_path):
                os.unlink(file_path)
        print("Archivos existentes eliminados.")

    # single initial request, used only to discover the total number of pages
    res = scraper.get(search_url)
    if res.status_code != 200:
        print(f'Error: {res.status_code}')
        return

    soup = BeautifulSoup(res.text, 'html.parser')

    # look for "totalPages":<n> inside the inline scripts
    total_pages = None
    for script in soup.find_all('script'):
        if 'totalPages' in script.text:
            match = re.search(r'"totalPages":(\d+)', script.text)
            if match:
                total_pages = int(match.group(1))
                break

    if not total_pages:
        print('No se encontró el total de páginas.')
        return

    # NOTE(review): search_url is used verbatim, so a URL ending in ".html"
    # yields "....html-pagina-N.html" — confirm the site resolves that form.
    urls = [f"{search_url}-pagina-{i}.html" for i in range(1, total_pages + 1)]

    # download each page and store it, unless its file is already on disk
    for index, url in enumerate(tqdm(urls, desc="Descargando páginas .."), start=1):
        filename = f'{folder_path}/listings-{index:03}.html'
        if os.path.exists(filename):
            print(f'Archivo {filename} ya existe, se omite su descarga.')
            continue

        res = scraper.get(url)
        if res.status_code != 200:
            print(f'Error al obtener la página {index}: {res.status_code}')
            continue

        # the response is parsed and re-serialised before being written
        soup = BeautifulSoup(res.text, 'html.parser')
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(str(soup))

### Download files from `Zonaprop`

### Example:

SEARCH_URL = `"https://www.zonaprop.com.ar/inmuebles-venta-palermo-colegiales-villa-crespo-chacarita-con-balcon-y-disposicion-frente-mas-30-m2-cubiertos-50000-130000-dolar.html"`

In [4]:
# Zonaprop search URL used by the download and save cells below.
SEARCH_URL = "https://www.zonaprop.com.ar/inmuebles-venta-barrio-norte-palermo-colegiales-villa-crespo-publicado-hace-menos-de-45-dias-50000-130000-dolar-orden-visitas-descendente.html"

In [5]:
# Download all result pages into raw_data/zonaprop.
# overwrite=True deletes every file already in that folder before downloading.
download_zonaprop_pages(SEARCH_URL, "raw_data/zonaprop", overwrite=True)

Archivos existentes eliminados.


Descargando páginas ..: 100%|██████████| 75/75 [01:25<00:00,  1.14s/it]


## Extract info from `.html` files into `Pandas` dataframe

### Define imports and functions

In [6]:
import os
import string
import json
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

In [7]:
# translation table that strips ASCII punctuation from feature labels
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# matches up to (not including) the "{" that opens the preloaded-state JSON
_PRELOADED_STATE_RE = re.compile(r'window\.__PRELOADED_STATE__\s*=\s*(?=\{)')

# message returned when a page carries no usable listing data
_NO_DATA_MESSAGE = "No se encontraron los datos de las propiedades."


def _normalize_features(features):
    """Map each feature's normalised label to its value.

    Labels are lower-cased, spaces become underscores and ASCII punctuation
    (underscore included) is then removed, e.g. "Superficie total" ->
    "superficietotal". A missing or ``None`` container yields an empty dict.
    """
    return {
        details['label'].lower().replace(' ', '_').translate(_PUNCTUATION_TABLE):
            details.get('value', 'No disponible')
        for details in (features or {}).values()
    }


def extract_property_data(soup):
    """Extract the listings embedded in one Zonaprop results page.

    The page stores its data as a JSON object assigned to
    ``window.__PRELOADED_STATE__`` inside ``<script id="preloadedData">``.

    Args:
        soup: Parsed page; only ``soup.find('script', id='preloadedData')``
            and the ``.string`` of the returned tag are used.

    Returns:
        A list with one dict per entry of ``data['listStore']['listPostings']``,
        or the message string ``"No se encontraron los datos de las
        propiedades."`` when the script tag or the assignment is missing
        (callers distinguish the two with ``isinstance(result, list)``).

    Raises:
        json.JSONDecodeError: If the assigned value is not valid JSON.
    """
    # find the script tag by its id
    script = soup.find('script', id='preloadedData')
    if not script:
        return _NO_DATA_MESSAGE

    # locate the assignment; a tag without it is treated as "no data" instead
    # of crashing on a failed regex match
    script_text = script.string or ''
    assignment = _PRELOADED_STATE_RE.search(script_text)
    if not assignment:
        return _NO_DATA_MESSAGE

    # decode exactly one JSON value starting at "{". Unlike a non-greedy
    # regex ending in "};", this is not fooled by "};" inside string values.
    data, _ = json.JSONDecoder().raw_decode(script_text, assignment.end())

    base_url = 'https://www.zonaprop.com.ar'

    extracted_data = []
    for prop in data['listStore']['listPostings']:
        # price and expenses
        amount_in_usd = prop['priceOperationTypes'][0]['prices'][0].get('amount', None)
        expenses_in_ars = prop['expenses']['amount'] if prop['expenses'] else None

        # main and general features, keyed by normalised label
        main_features = _normalize_features(prop.get('mainFeatures'))
        general_features = _normalize_features(
            (prop.get('generalFeatures') or {}).get('Características generales')
        )

        # geolocation (the fallback strings are kept for compatibility)
        geolocation = prop['postingLocation']['postingGeolocation']['geolocation'] if prop['postingLocation']['postingGeolocation'] else {}
        latitude = geolocation.get('latitude', 'No se encontró la latitud')
        longitude = geolocation.get('longitude', 'No se encontró la longitud')

        # photo urls (1200x1200 variant only)
        visible_pictures = prop.get('visiblePictures')
        photos_urls = []
        if visible_pictures and isinstance(visible_pictures, dict):
            pictures = visible_pictures.get('pictures', [])
            if isinstance(pictures, list):
                photos_urls = [photo.get('url1200x1200') for photo in pictures if 'url1200x1200' in photo]

        # google maps link built from the coordinates
        google_maps = f"https://maps.google.com/?q={latitude},{longitude}"

        # whatsapp number, if any
        whatsapp = prop.get('whatsApp', '')

        # the listing's modified date is what gets reported as "published_on"
        modified_date = datetime.strptime(prop['modified_date'], "%Y-%m-%dT%H:%M:%S%z")

        # feature columns are merged last, so they win on a key collision
        extracted_data.append({
            "listing_url": base_url + prop.get('url', ''),
            "asking_price_in_usd": amount_in_usd,
            "expensas_in_ars": expenses_in_ars,
            "latitude": latitude,
            "longitude": longitude,
            "google_maps": google_maps,
            "photos": photos_urls,
            "whatsapp": re.sub(r'\s+', '', whatsapp) if whatsapp else '',
            "published_on": modified_date.strftime("%d-%m-%Y"),
            **main_features,
            **general_features,
        })

    return extracted_data

In [8]:
def create_dataframe_from_html(folder_path):
    """Build one DataFrame from every ``.html`` file in ``folder_path``.

    Files are processed in sorted name order so the row order is the same on
    every run (``os.listdir`` returns entries in arbitrary order). Only
    ``.html`` files are counted by the progress bar.

    Args:
        folder_path: Directory containing the downloaded result pages.

    Returns:
        A ``pandas.DataFrame`` with one row per listing. Pages for which
        ``extract_property_data`` does not return a list are skipped.
    """
    html_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.html')
    )

    # accumulates the listings of every page
    all_properties = []

    for filename in tqdm(html_files):
        file_path = os.path.join(folder_path, filename)

        # read and parse the stored page
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'html.parser')

        properties_data = extract_property_data(soup)

        # a non-list result is the "no data" message; skip that page
        if isinstance(properties_data, list):
            all_properties.extend(properties_data)

    return pd.DataFrame(all_properties)

### Run process on downloaded `.html` files

In [9]:
# path to the folder holding the downloaded html files
folder_path = 'raw_data/zonaprop'

In [10]:
# build the dataframe from the downloaded pages and preview the first rows
zonaprop_listings = create_dataframe_from_html(folder_path)
zonaprop_listings.head()

100%|██████████| 76/76 [00:09<00:00,  8.05it/s]


Unnamed: 0,listing_url,asking_price_in_usd,expensas_in_ars,latitude,longitude,google_maps,photos,whatsapp,published_on,disposición,...,cocheras,toilettes,frentedelterrenomts,largodelterrenomts,cantidaddedepartamentos,saladejuegos,promotiondescription,tipoderubro,cantidaddeascensores,cantidaddelocalescomerciales
0,https://www.zonaprop.com.ar/propiedades/clasif...,98000,30000.0,-34.583418,-58.404087,"https://maps.google.com/?q=-34.5834184,-58.404087",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491165992023.0,24-01-2024,Frente,...,,,,,,,,,,
1,https://www.zonaprop.com.ar/propiedades/clasif...,114000,1.0,-34.591798,-58.423422,"https://maps.google.com/?q=-34.5917978,-58.423...",[https://imgar.zonapropcdn.com/avisos/resize/1...,,18-01-2024,Frente,...,,,,,,,,,,
2,https://www.zonaprop.com.ar/propiedades/clasif...,99000,19500.0,-34.578852,-58.413605,"https://maps.google.com/?q=-34.57885220000001,...",[https://imgar.zonapropcdn.com/avisos/1/00/53/...,,05-02-2024,,...,,,,,,,,,,
3,https://www.zonaprop.com.ar/propiedades/clasif...,51000,,-34.598794,-58.386697,"https://maps.google.com/?q=-34.5987945,-58.386...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491145506247.0,09-01-2024,Lateral,...,,,,,,,,,,
4,https://www.zonaprop.com.ar/propiedades/clasif...,75000,,-34.592027,-58.433416,"https://maps.google.com/?q=-34.5920267,-58.433...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,,02-01-2024,,...,,,,,,,,,,


In [11]:
# report how many listings ended up in the dataframe
print(f'hay unas {len(zonaprop_listings)} propiedades en el dataframe')

hay unas 1487 propiedades en el dataframe


### Save a copy locally just in case

In [12]:
import os
import shutil

def crear_directorio(SEARCH_URL, delete=False):
    """Create (or reuse) the output directory ``processed/<slug>`` for a search.

    ``<slug>`` is the last path segment of ``SEARCH_URL`` with ``".html"``
    removed. Trailing slashes are ignored so that ``".../foo.html/"`` maps to
    the same directory as ``".../foo.html"``.

    Args:
        SEARCH_URL: Search URL the directory name is derived from.
        delete: When True and the directory already exists, it is removed
            with all its contents and recreated empty. When False an
            existing directory is left untouched.

    Returns:
        The directory path, ``'processed/<slug>'``.

    Raises:
        ValueError: If no usable slug can be derived (empty, ``"."`` or
            ``".."``). Without this check ``folder_path`` would point at
            ``processed/`` itself (or above it) and ``delete=True`` would
            wipe unrelated results.
    """
    SAVE_DIR = SEARCH_URL.rstrip("/").split("/")[-1].replace(".html", "")
    if SAVE_DIR in ("", ".", ".."):
        raise ValueError(
            f'No se pudo derivar un nombre de directorio a partir de la URL: {SEARCH_URL!r}'
        )

    folder_path = f'processed/{SAVE_DIR}'

    # check whether the directory already exists
    if os.path.exists(folder_path):
        if delete:
            shutil.rmtree(folder_path)
            print(f'El directorio "{folder_path}" ha sido eliminado.')
            os.makedirs(folder_path)
            print(f'Nuevo directorio "{folder_path}" creado.')
        else:
            print(f'Se mantiene el directorio existente "{folder_path}".')
    else:
        os.makedirs(folder_path)
        print(f'Directorio "{folder_path}" creado.')
    return folder_path

In [13]:
# NOTE: this rebinds folder_path (previously the raw html folder) to the
# processed output directory; delete=True wipes that directory if it exists.
folder_path = crear_directorio(SEARCH_URL, delete=True)

El directorio "processed/inmuebles-venta-barrio-norte-palermo-colegiales-villa-crespo-publicado-hace-menos-de-45-dias-50000-130000-dolar-orden-visitas-descendente" ha sido eliminado.
Nuevo directorio "processed/inmuebles-venta-barrio-norte-palermo-colegiales-villa-crespo-publicado-hace-menos-de-45-dias-50000-130000-dolar-orden-visitas-descendente" creado.


In [14]:
# save the parsed listings to the processed directory (index column omitted)
zonaprop_listings.to_csv(f'{folder_path}/zonaprop_listings.csv', index=False)

In [15]:
zonaprop_listings.head()

Unnamed: 0,listing_url,asking_price_in_usd,expensas_in_ars,latitude,longitude,google_maps,photos,whatsapp,published_on,disposición,...,cocheras,toilettes,frentedelterrenomts,largodelterrenomts,cantidaddedepartamentos,saladejuegos,promotiondescription,tipoderubro,cantidaddeascensores,cantidaddelocalescomerciales
0,https://www.zonaprop.com.ar/propiedades/clasif...,98000,30000.0,-34.583418,-58.404087,"https://maps.google.com/?q=-34.5834184,-58.404087",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491165992023.0,24-01-2024,Frente,...,,,,,,,,,,
1,https://www.zonaprop.com.ar/propiedades/clasif...,114000,1.0,-34.591798,-58.423422,"https://maps.google.com/?q=-34.5917978,-58.423...",[https://imgar.zonapropcdn.com/avisos/resize/1...,,18-01-2024,Frente,...,,,,,,,,,,
2,https://www.zonaprop.com.ar/propiedades/clasif...,99000,19500.0,-34.578852,-58.413605,"https://maps.google.com/?q=-34.57885220000001,...",[https://imgar.zonapropcdn.com/avisos/1/00/53/...,,05-02-2024,,...,,,,,,,,,,
3,https://www.zonaprop.com.ar/propiedades/clasif...,51000,,-34.598794,-58.386697,"https://maps.google.com/?q=-34.5987945,-58.386...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491145506247.0,09-01-2024,Lateral,...,,,,,,,,,,
4,https://www.zonaprop.com.ar/propiedades/clasif...,75000,,-34.592027,-58.433416,"https://maps.google.com/?q=-34.5920267,-58.433...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,,02-01-2024,,...,,,,,,,,,,


## We also want to figure out how popular a listing is by the number of user views

In [16]:
def extraer_vistas_de_usuarios(df):
    """Fetch every listing page and read its view counter and age in days.

    Args:
        df: DataFrame with a ``listing_url`` column; one HTTP request is
            issued per value.

    Returns:
        A DataFrame with columns ``listing_url``, ``user_views``, ``days``
        and ``views_per_day`` (truncated to int). Pages that do not answer
        with status 200 are reported with ``print`` and produce no row.
    """
    # cloudscraper instance created with a delay
    scraper = cloudscraper.create_scraper(delay=30)

    # selects the inline script that carries the counters
    patron_script = re.compile(r'usersViews\s*=\s*\d+|antiquity\s*=\s*\'')

    filas = []
    for url in tqdm(df['listing_url'], desc="Descargando páginas .."):
        res = scraper.get(url)
        if res.status_code != 200:
            print(f'Error al cargar la página: {res.status_code}')
            continue

        soup = BeautifulSoup(res.text, 'html.parser')
        script = soup.find('script', string=patron_script)

        # both values default to 0 when the script or a field is missing
        users_views = 0
        antiquity = 0
        if script:
            contenido = script.string

            # number of views
            vistas = re.search(r'usersViews\s*=\s*(\d+)', contenido)
            if vistas:
                users_views = int(vistas.group(1))

            # age: explicit day count first, then "today" (0), then "yesterday" (1)
            dias = re.search(r"antiquity\s*=\s*'Publicado hace (\d+) días'", contenido)
            if dias:
                antiquity = int(dias.group(1))
            elif 'Publicado hoy' not in contenido and 'Publicado desde ayer' in contenido:
                antiquity = 1

        # views per day; 0 when the age is unknown or zero
        views_per_day = users_views / antiquity if antiquity != 0 else 0

        filas.append({
            'listing_url': url,
            'user_views': users_views,
            'days': antiquity,
            'views_per_day': int(views_per_day)
        })

    # build the dataframe from the collected rows
    return pd.DataFrame(filas)

In [18]:
# assumes `zonaprop_listings` is the listings dataframe built above;
# this issues one HTTP request per listing URL
listings_user_views = extraer_vistas_de_usuarios(zonaprop_listings)

Descargando páginas ..: 100%|██████████| 1487/1487 [10:20<00:00,  2.40it/s]


In [19]:
# share of listings that produced a views row (pages answering with a
# non-200 status are skipped by extraer_vistas_de_usuarios)
print(f'che, was able to get info from {len(listings_user_views)/len(zonaprop_listings)*100:.1f} % of the listings')

che, was able to get info from 100.0 % of the listings


In [20]:
listings_user_views.head()

Unnamed: 0,listing_url,user_views,days,views_per_day
0,https://www.zonaprop.com.ar/propiedades/clasif...,169,15,11
1,https://www.zonaprop.com.ar/propiedades/clasif...,170,27,6
2,https://www.zonaprop.com.ar/propiedades/clasif...,170,8,21
3,https://www.zonaprop.com.ar/propiedades/clasif...,170,42,4
4,https://www.zonaprop.com.ar/propiedades/clasif...,171,37,4


### Merge both `dataframes` and do some post-processing

In [21]:
def _to_positive_float(value):
    """Return ``value`` as a float when it is a finite number > 0, else None."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # NaN fails both comparisons, so it is rejected here as well
    return number if 0 < number < float('inf') else None


def fusionar_y_recalcular(df_zonaprop, df_views):
    """Left-merge listings with their view stats and derive per-row metrics.

    Adds ``usd_per_m2`` and recomputes ``views_per_day`` over at most the
    last 30 days. When a listing has views but ``days == 0``, ``days`` is
    replaced by the number of days between ``published_on`` and now.

    Args:
        df_zonaprop: Listings; uses ``listing_url``, ``asking_price_in_usd``,
            ``superficietotal`` and ``published_on`` (``dd-mm-YYYY``).
        df_views: View stats; uses ``listing_url``, ``user_views``, ``days``
            and ``views_per_day``.

    Returns:
        The merged DataFrame. ``usd_per_m2`` is 0 when price or total area
        is missing or not a positive number. Listings without a match in
        ``df_views`` keep NaN in the view columns instead of raising.
    """
    # merge on 'listing_url'; unmatched listings get NaN view columns
    df_fusionado = pd.merge(df_zonaprop, df_views, on='listing_url', how='left')

    # float column, so it exists even when there are no rows
    df_fusionado['usd_per_m2'] = 0.0

    # reference date for the day difference
    hoy = datetime.now()

    for index, row in df_fusionado.iterrows():
        # price per square metre, only when both operands are usable numbers
        superficietotal = _to_positive_float(row.get('superficietotal'))
        precio = _to_positive_float(row.get('asking_price_in_usd'))
        if superficietotal and precio:
            df_fusionado.at[index, 'usd_per_m2'] = int(precio / superficietotal)

        # no view data for this listing: leave its NaN values untouched
        user_views = row['user_views']
        if pd.isna(user_views):
            continue

        if user_views > 0 and row['days'] == 0:
            # views recorded but no age: fall back to the publication date
            fecha_publicacion = datetime.strptime(row['published_on'], '%d-%m-%Y')
            diferencia_dias = (hoy - fecha_publicacion).days
            df_fusionado.at[index, 'days'] = diferencia_dias
        else:
            diferencia_dias = row['days']

        # views per day over min(age, 30) days; raw view count when age is 0
        if diferencia_dias > 0:
            views_per_day = int(user_views / min(diferencia_dias, 30))
        else:
            views_per_day = int(user_views)
        df_fusionado.at[index, 'views_per_day'] = views_per_day

    return df_fusionado

In [22]:
# assumes zonaprop_listings and listings_user_views are the dataframes built above
zonaprop_with_userviews = fusionar_y_recalcular(zonaprop_listings, listings_user_views)

In [27]:
# save the merged listings + view stats to the processed directory
zonaprop_with_userviews.to_csv(f'{folder_path}/zonaprop_with_userviews.csv', index=False)

In [28]:
zonaprop_with_userviews.head()

Unnamed: 0,listing_url,asking_price_in_usd,expensas_in_ars,latitude,longitude,google_maps,photos,whatsapp,published_on,disposición,...,cantidaddedepartamentos,saladejuegos,promotiondescription,tipoderubro,cantidaddeascensores,cantidaddelocalescomerciales,user_views,days,views_per_day,usd_per_m2
0,https://www.zonaprop.com.ar/propiedades/clasif...,98000,30000.0,-34.583418,-58.404087,"https://maps.google.com/?q=-34.5834184,-58.404087",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491165992023.0,24-01-2024,Frente,...,,,,,,,169,15,11,2450.0
1,https://www.zonaprop.com.ar/propiedades/clasif...,114000,1.0,-34.591798,-58.423422,"https://maps.google.com/?q=-34.5917978,-58.423...",[https://imgar.zonapropcdn.com/avisos/resize/1...,,18-01-2024,Frente,...,,,,,,,170,27,6,2111.0
2,https://www.zonaprop.com.ar/propiedades/clasif...,99000,19500.0,-34.578852,-58.413605,"https://maps.google.com/?q=-34.57885220000001,...",[https://imgar.zonapropcdn.com/avisos/1/00/53/...,,05-02-2024,,...,,,,,,,170,8,21,3300.0
3,https://www.zonaprop.com.ar/propiedades/clasif...,51000,,-34.598794,-58.386697,"https://maps.google.com/?q=-34.5987945,-58.386...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491145506247.0,09-01-2024,Lateral,...,,,,,,,170,42,5,1758.0
4,https://www.zonaprop.com.ar/propiedades/clasif...,75000,,-34.592027,-58.433416,"https://maps.google.com/?q=-34.5920267,-58.433...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,,02-01-2024,,...,,,,,,,171,37,5,2205.0
