# Retrieve listings from `Zonaprop`
- https://www.machinelearnear.com/
- https://www.youtube.com/@machinelearnear

## First download pages as `.html` files

In [1]:
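# Uncomment to install the scraping dependency into the kernel's environment
# (%pip targets the running kernel, unlike !pip):
# %pip install cloudscraper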

In [2]:
import cloudscraper
import re
from bs4 import BeautifulSoup
import os
from tqdm import tqdm

In [3]:
def _build_page_urls(search_url, total_pages):
    """Return the URL of every result page, from page 1 up to ``total_pages``."""
    # the search URL normally ends in ".html"; the "-pagina-N" suffix has to go
    # before the extension, otherwise we request "….html-pagina-N.html"
    extension = '.html'
    base_url = search_url[:-len(extension)] if search_url.endswith(extension) else search_url
    return [f"{base_url}-pagina-{i}.html" for i in range(1, total_pages + 1)]


def download_zonaprop_pages(search_url, folder_path):
    """Download every result page of a Zonaprop search into ``folder_path``.

    The first request reads the total number of pages from the ``"totalPages"``
    value embedded in one of the page's ``<script>`` tags. Each page is then
    saved as ``listings-NNN.html``; pages whose file already exists are skipped,
    so the function can be re-run to resume an interrupted download.

    Parameters
    ----------
    search_url : str
        URL of the first results page, with or without the ``.html`` extension.
    folder_path : str
        Directory where the pages are written; created if missing.

    Returns
    -------
    None
        Problems (non-200 responses, page count not found) are printed and the
        function returns early or moves on to the next page.
    """
    # cloudscraper session; the delay is intended to avoid 403 responses
    scraper = cloudscraper.create_scraper(delay=30)

    # first request, only used to find out how many pages there are
    res = scraper.get(search_url)
    if res.status_code != 200:
        print(f'Error: {res.status_code}')
        return

    soup = BeautifulSoup(res.text, 'html.parser')

    # look for the total page count in the inline scripts
    total_pages = None
    for script in soup.find_all('script'):
        match = re.search(r'"totalPages":(\d+)', script.text)
        if match:
            total_pages = int(match.group(1))
            break

    if not total_pages:
        print('No se encontró el total de páginas.')
        return

    urls = _build_page_urls(search_url, total_pages)

    # make sure the output directory exists
    os.makedirs(folder_path, exist_ok=True)

    # download each page and store it, unless it was already downloaded
    for index, url in enumerate(tqdm(urls, desc="Descargando páginas .."), start=1):
        filename = f'{folder_path}/listings-{index:03}.html'
        if os.path.exists(filename):
            print(f'Archivo {filename} ya existe, se omite su descarga.')
            continue

        res = scraper.get(url)
        if res.status_code != 200:
            print(f'Error al obtener la página {index}: {res.status_code}')
            continue

        soup = BeautifulSoup(res.text, 'html.parser')
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(str(soup))

In [None]:
# Zonaprop search whose result pages will be downloaded (first results page).
SEARCH_URL = "https://www.zonaprop.com.ar/inmuebles-venta-recoleta-barrio-norte-palermo-colegiales-monserrat-san-nicolas-retiro-con-balcon-y-disposicion-frente-mas-30-m2-cubiertos-publicado-hace-menos-de-1-mes-50000-120000-dolar-orden-publicado-descendente.html"
# Save every result page under raw_data/zonaprop (already-downloaded pages are skipped).
download_zonaprop_pages(SEARCH_URL, "raw_data/zonaprop")

## Extract info from `.html` files into `Pandas` dataframe

### Define imports and functions

In [234]:
import os
import string
import json
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

In [334]:
def _normalize_feature_label(label, punctuation_table):
    """Turn a feature label into the key used for its dataframe column.

    The label is lower-cased, spaces are replaced by underscores and every
    character listed in ``string.punctuation`` is then removed.
    """
    # NOTE(review): '_' is itself part of string.punctuation, so the underscores
    # inserted for spaces are stripped again ("Uso comercial" -> "usocomercial").
    # Left unchanged because previously exported data already uses these names.
    return label.lower().replace(' ', '_').translate(punctuation_table)


def _collect_features(features, punctuation_table):
    """Map normalized feature labels to their values; a missing/None mapping yields ``{}``."""
    return {
        _normalize_feature_label(details['label'], punctuation_table): details.get('value', 'No disponible')
        for details in (features or {}).values()
    }


def _first_price_amount(prop):
    """Return the amount of the first price of the first operation type, or ``None``."""
    operations = prop.get('priceOperationTypes') or []
    prices = (operations[0].get('prices') or []) if operations else []
    return prices[0].get('amount') if prices else None


def extract_property_data(soup):
    """Extract one record per listing from a parsed Zonaprop results page.

    The listings are read from the JSON object assigned to
    ``window.__PRELOADED_STATE__`` inside the ``<script id="preloadedData">`` tag.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed results page (any object with a compatible ``find`` works).

    Returns
    -------
    list of dict or str
        A list with one dict per listing (URL, price, expenses, coordinates,
        Google Maps link, photo URLs, WhatsApp number, date and one key per
        main/general feature). If the script tag or the state assignment
        cannot be found, the message
        ``"No se encontraron los datos de las propiedades."`` is returned
        instead, so callers must check the type of the result.

    Raises
    ------
    json.JSONDecodeError
        If the preloaded state is not valid JSON.
    ValueError
        If a listing's ``modified_date`` does not match ``%Y-%m-%dT%H:%M:%S%z``.
    """
    not_found_message = "No se encontraron los datos de las propiedades."

    # locate the script tag by its id
    script = soup.find('script', id='preloadedData')
    if script is None or not script.string:
        return not_found_message

    # find where the JSON object starts; the lookahead makes sure it is an object
    text = str(script.string)
    assignment = re.search(r'window\.__PRELOADED_STATE__\s*=\s*(?=\{)', text)
    if not assignment:
        return not_found_message

    # raw_decode parses exactly one JSON value and ignores what follows, so a
    # "};" inside a string value cannot cut the document short
    data, _ = json.JSONDecoder().raw_decode(text, assignment.end())

    base_url = 'https://www.zonaprop.com.ar'
    # built once per call instead of once per feature
    punctuation_table = str.maketrans('', '', string.punctuation)

    extracted_data = []
    for prop in (data.get('listStore') or {}).get('listPostings') or []:
        # price and expenses
        # NOTE(review): the currency of the first price is not checked here —
        # confirm it is always USD for the searches this is used with.
        amount_in_usd = _first_price_amount(prop)
        expenses_in_ars = (prop.get('expenses') or {}).get('amount')

        # main and general features, keyed by their normalized label
        main_features = _collect_features(prop.get('mainFeatures'), punctuation_table)
        general_features = _collect_features(
            (prop.get('generalFeatures') or {}).get('Características generales'),
            punctuation_table,
        )

        # geolocation
        # NOTE(review): when coordinates are missing the sentinel strings below
        # end up in the latitude/longitude columns and in the maps URL; kept
        # for compatibility with existing consumers.
        posting_geolocation = (prop.get('postingLocation') or {}).get('postingGeolocation') or {}
        geolocation = posting_geolocation.get('geolocation') or {}
        latitude = geolocation.get('latitude', 'No se encontró la latitud')
        longitude = geolocation.get('longitude', 'No se encontró la longitud')

        # photos: only pictures that expose the 1200x1200 variant
        photos = (prop.get('visiblePictures') or {}).get('pictures') or []
        photos_urls = [photo.get('url1200x1200') for photo in photos if 'url1200x1200' in photo]

        # google maps link
        google_maps = f"https://maps.google.com/?q={latitude},{longitude}"

        # whatsapp number, with all whitespace removed
        whatsapp = prop.get('whatsApp', '')

        # the listing's modification date, reformatted as dd-mm-YYYY
        raw_modified_date = prop.get('modified_date')
        if raw_modified_date:
            published_on = datetime.strptime(raw_modified_date, "%Y-%m-%dT%H:%M:%S%z").strftime("%d-%m-%Y")
        else:
            published_on = None

        # put everything together
        extracted_data.append({
            "listing_url": base_url + prop.get('url', ''),
            "asking_price_in_usd": amount_in_usd,
            "expensas_in_ars": expenses_in_ars,
            "latitude": latitude,
            "longitude": longitude,
            "google_maps": google_maps,
            "photos": photos_urls,
            "whatsapp": re.sub(r'\s+', '', whatsapp) if whatsapp else '',
            "published_on": published_on,
            **main_features,
            **general_features,
        })

    return extracted_data

In [335]:
def create_dataframe_from_html(folder_path):
    """Build a dataframe with the listings found in every ``.html`` file of a folder.

    Parameters
    ----------
    folder_path : str
        Directory containing the downloaded result pages.

    Returns
    -------
    pandas.DataFrame
        One row per listing, in file-name order. Files from which no listing
        data could be extracted are reported and skipped.
    """
    # accumulates the records of every page
    all_properties = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and machines
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.html'):
            continue

        file_path = os.path.join(folder_path, filename)

        # read and parse the html file
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'html.parser')

        properties_data = extract_property_data(soup)

        # extract_property_data returns a message instead of a list when the
        # page holds no listing data; report it instead of dropping it silently
        if isinstance(properties_data, list):
            all_properties.extend(properties_data)
        else:
            print(f'{file_path}: {properties_data}')

    return pd.DataFrame(all_properties)

### Run on downloaded `.html` files

In [336]:
# path to the folder that holds the downloaded html files
folder_path = 'raw_data/zonaprop'

In [337]:
# build the dataframe from the downloaded pages and preview its first rows
zonaprop_listings = create_dataframe_from_html(folder_path)
zonaprop_listings.head()

Unnamed: 0,listing_url,asking_price_in_usd,expensas_in_ars,latitude,longitude,google_maps,photos,whatsapp,published_on,disposición,...,toilette,departamentosporpiso,cantidadpisosenedificio,cocheras,hidromasaje,accesoparapersonasconmovilidadreducida,promotiondescription,coberturacochera,cochera,usocomercial
0,https://www.zonaprop.com.ar/propiedades/depart...,118000,34500.0,-34.576309,-58.432937,"https://maps.google.com/?q=-34.5763092,-58.432...",[https://imgar.zonapropcdn.com/avisos/resize/1...,5491150525300.0,03-01-2024,Frente,...,,,,,,,,,,
1,https://www.zonaprop.com.ar/propiedades/depart...,98500,35000.0,-34.582701,-58.448237,"https://maps.google.com/?q=-34.5827007,-58.448...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491131797700.0,03-01-2024,Frente,...,,,,,,,,,,
2,https://www.zonaprop.com.ar/propiedades/depart...,82000,16000.0,-34.613535,-58.384005,"https://maps.google.com/?q=-34.6135355,-58.384...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,,17-01-2024,Frente,...,,,,,,,,,,
3,https://www.zonaprop.com.ar/propiedades/anchor...,99000,23000.0,-34.599022,-58.408783,"https://maps.google.com/?q=-34.5990219,-58.408...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491151239846.0,17-01-2024,Frente,...,,,,,,,,,,
4,https://www.zonaprop.com.ar/propiedades/depart...,109000,,-34.574454,-58.443086,"https://maps.google.com/?q=-34.5744539,-58.443...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,,17-01-2024,Frente,...,,,,,,,,,,


In [338]:
# number of listings extracted from the downloaded pages
print(len(zonaprop_listings))

182


#### And then save to disk just in case

In [339]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (otherwise a fresh checkout fails here)
os.makedirs('processed', exist_ok=True)
zonaprop_listings.to_csv('processed/zonaprop_listings.csv', index=False)

In [340]:
# preview the listings that were just written to disk
zonaprop_listings.head()

Unnamed: 0,listing_url,asking_price_in_usd,expensas_in_ars,latitude,longitude,google_maps,photos,whatsapp,published_on,disposición,...,toilette,departamentosporpiso,cantidadpisosenedificio,cocheras,hidromasaje,accesoparapersonasconmovilidadreducida,promotiondescription,coberturacochera,cochera,usocomercial
0,https://www.zonaprop.com.ar/propiedades/depart...,118000,34500.0,-34.576309,-58.432937,"https://maps.google.com/?q=-34.5763092,-58.432...",[https://imgar.zonapropcdn.com/avisos/resize/1...,5491150525300.0,03-01-2024,Frente,...,,,,,,,,,,
1,https://www.zonaprop.com.ar/propiedades/depart...,98500,35000.0,-34.582701,-58.448237,"https://maps.google.com/?q=-34.5827007,-58.448...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491131797700.0,03-01-2024,Frente,...,,,,,,,,,,
2,https://www.zonaprop.com.ar/propiedades/depart...,82000,16000.0,-34.613535,-58.384005,"https://maps.google.com/?q=-34.6135355,-58.384...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,,17-01-2024,Frente,...,,,,,,,,,,
3,https://www.zonaprop.com.ar/propiedades/anchor...,99000,23000.0,-34.599022,-58.408783,"https://maps.google.com/?q=-34.5990219,-58.408...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491151239846.0,17-01-2024,Frente,...,,,,,,,,,,
4,https://www.zonaprop.com.ar/propiedades/depart...,109000,,-34.574454,-58.443086,"https://maps.google.com/?q=-34.5744539,-58.443...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,,17-01-2024,Frente,...,,,,,,,,,,


## As an extra, we want to figure out how popular a listing is by the number of page views

In [343]:
def extraer_vistas_de_usuarios(df):
    """Fetch each listing page and collect its view count and age in days.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``listing_url`` column with the pages to fetch.

    Returns
    -------
    pandas.DataFrame
        Columns ``listing_url``, ``user_views``, ``days`` and ``views_per_day``
        (truncated to an integer). Only pages answered with status 200 produce
        a row; for any other status the code is printed and the URL skipped.
    """
    # cloudscraper session created with a delay setting
    scraper = cloudscraper.create_scraper(delay=30)

    filas = []

    for url in tqdm(df['listing_url'], desc="Descargando páginas .."):
        respuesta = scraper.get(url)
        if respuesta.status_code != 200:
            print(f'Error al cargar la página: {respuesta.status_code}')
            continue

        pagina = BeautifulSoup(respuesta.text, 'html.parser')
        script = pagina.find('script', string=re.compile(r'usersViews\s*=\s*\d+|antiquity\s*=\s*\''))

        # defaults used when the script tag or a value cannot be found
        users_views = 0
        antiquity = 0

        if script:
            contenido = script.string

            # view counter
            vistas = re.search(r'usersViews\s*=\s*(\d+)', contenido)
            if vistas:
                users_views = int(vistas.group(1))

            # age of the listing: an explicit day count wins, then "today" (0),
            # then "since yesterday" (1); anything else stays at 0
            dias = re.search(r"antiquity\s*=\s*'Publicado hace (\d+) días'", contenido)
            if dias:
                antiquity = int(dias.group(1))
            elif 'Publicado hoy' not in contenido and 'Publicado desde ayer' in contenido:
                antiquity = 1

        # views per day; 0 when the age is unknown or zero
        views_per_day = users_views / antiquity if antiquity != 0 else 0

        filas.append({
            'listing_url': url,
            'user_views': users_views,
            'days': antiquity,
            'views_per_day': int(views_per_day)
        })

    return pd.DataFrame(filas)

In [344]:
# fetch view counts for every listing in the `zonaprop_listings` dataframe
listings_user_views = extraer_vistas_de_usuarios(zonaprop_listings)

Descargando páginas ..:  12%|█▏        | 22/182 [00:10<03:02,  1.14s/it]

Error al cargar la página: 502


Descargando páginas ..: 100%|██████████| 182/182 [00:55<00:00,  3.27it/s]

Error al cargar la página: 404





In [353]:
# share of listings for which a view-count row was obtained
print(f'was able to get info from {len(listings_user_views)/len(zonaprop_listings)*100:.1f} % of the listings')

was able to get info from 98.9 % of the listings


In [354]:
# preview the collected view counts
listings_user_views.head()

Unnamed: 0,listing_url,user_views,days,views_per_day
0,https://www.zonaprop.com.ar/propiedades/depart...,78,16,4
1,https://www.zonaprop.com.ar/propiedades/depart...,17,16,1
2,https://www.zonaprop.com.ar/propiedades/depart...,55,17,3
3,https://www.zonaprop.com.ar/propiedades/anchor...,206,17,12
4,https://www.zonaprop.com.ar/propiedades/depart...,246,17,14


### Merge both `dataframes` and do some post-processing

In [361]:
def fusionar_y_recalcular(df_zonaprop, df_views, hoy=None):
    """Merge listings with their view counts and fill in missing listing ages.

    The two dataframes are left-merged on ``listing_url``. For rows that have
    views but an age of 0 days, the age is recomputed from ``published_on``
    (``dd-mm-YYYY``) and ``views_per_day`` is recalculated from it.

    Parameters
    ----------
    df_zonaprop : pandas.DataFrame
        Listings; needs ``listing_url`` and ``published_on`` columns.
    df_views : pandas.DataFrame
        View counts; needs ``listing_url``, ``user_views``, ``days`` and
        ``views_per_day`` columns.
    hoy : datetime, optional
        Reference "today" used to compute ages. Defaults to ``datetime.now()``;
        pass a fixed naive datetime to make the result reproducible.

    Returns
    -------
    pandas.DataFrame
        The merged dataframe. Listings without a match in ``df_views`` keep
        missing values in the view-count columns.
    """
    # merge the dataframes on 'listing_url'
    df_fusionado = pd.merge(df_zonaprop, df_views, on='listing_url', how='left')

    if hoy is None:
        hoy = datetime.now()

    # rows with views but no age; comparisons against NaN are False, so
    # listings without view data are left untouched
    pendientes = (df_fusionado['user_views'] > 0) & (df_fusionado['days'] == 0)

    for index in df_fusionado.index[pendientes]:
        published_on = df_fusionado.at[index, 'published_on']
        # a missing date (None/NaN) cannot be parsed; keep the row as it is
        if not isinstance(published_on, str):
            continue

        fecha_publicacion = datetime.strptime(published_on, '%d-%m-%Y')
        diferencia_dias = (hoy - fecha_publicacion).days
        user_views = df_fusionado.at[index, 'user_views']

        df_fusionado.at[index, 'days'] = diferencia_dias
        # truncate to whole views per day, matching how views_per_day is
        # computed when the view counts are scraped
        if diferencia_dias > 0:
            df_fusionado.at[index, 'views_per_day'] = int(user_views / diferencia_dias)
        else:
            df_fusionado.at[index, 'views_per_day'] = int(user_views)

    return df_fusionado

In [365]:
# merge the `zonaprop_listings` and `listings_user_views` dataframes
zonaprop_with_viewcount = fusionar_y_recalcular(zonaprop_listings, listings_user_views)

In [366]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (otherwise a fresh checkout fails here)
os.makedirs('processed', exist_ok=True)
zonaprop_with_viewcount.to_csv('processed/zonaprop_with_viewcount.csv', index=False)

In [367]:
# preview the merged listings with their view counts
zonaprop_with_viewcount.head()

Unnamed: 0,listing_url,asking_price_in_usd,expensas_in_ars,latitude,longitude,google_maps,photos,whatsapp,published_on,disposición,...,cocheras,hidromasaje,accesoparapersonasconmovilidadreducida,promotiondescription,coberturacochera,cochera,usocomercial,user_views,days,views_per_day
0,https://www.zonaprop.com.ar/propiedades/depart...,118000,34500.0,-34.576309,-58.432937,"https://maps.google.com/?q=-34.5763092,-58.432...",[https://imgar.zonapropcdn.com/avisos/resize/1...,5491150525300.0,03-01-2024,Frente,...,,,,,,,,78.0,16.0,4.0
1,https://www.zonaprop.com.ar/propiedades/depart...,98500,35000.0,-34.582701,-58.448237,"https://maps.google.com/?q=-34.5827007,-58.448...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491131797700.0,03-01-2024,Frente,...,,,,,,,,17.0,16.0,1.0
2,https://www.zonaprop.com.ar/propiedades/depart...,82000,16000.0,-34.613535,-58.384005,"https://maps.google.com/?q=-34.6135355,-58.384...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,,17-01-2024,Frente,...,,,,,,,,55.0,17.0,3.0
3,https://www.zonaprop.com.ar/propiedades/anchor...,99000,23000.0,-34.599022,-58.408783,"https://maps.google.com/?q=-34.5990219,-58.408...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,5491151239846.0,17-01-2024,Frente,...,,,,,,,,206.0,17.0,12.0
4,https://www.zonaprop.com.ar/propiedades/depart...,109000,,-34.574454,-58.443086,"https://maps.google.com/?q=-34.5744539,-58.443...",[https://imgar.zonapropcdn.com/avisos/1/00/52/...,,17-01-2024,Frente,...,,,,,,,,246.0,17.0,14.0


In [198]:
# this works:
# https://www.booking.com/searchresults.en-gb.html?latitude=-34.5745;longitude=-58.451499999999996;&dest_type=district&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=b72a6a79f6b5033f&ac_meta=GhBiNzJhNmE3OWY2YjUwMzNmIAAoATICZW46CmNvbGVnaWFsZXNAAEoAUAA%3D&checkin=2024-02-01&checkout=2024-02-29&group_adults=2&no_rooms=1&group_children=0