# Using LLMs to analyse `Zonaprop` listings
- https://www.machinelearnear.com/
- https://www.youtube.com/@machinelearnear
- https://www.youtube.com/watch?v=DuujwXarVbY&ab_channel=SantiagoMagnin

In [5]:
import os
import pandas as pd

def abrir_zonaprop_csv(fname, processed_folder='./processed'):
    """Interactively pick a subfolder of ``processed_folder`` and load a CSV from it.

    The immediate subfolders are printed as a numbered menu, the user is
    asked (via ``input``) to choose one, and ``fname`` is read from the
    chosen subfolder.

    Args:
        fname: Name of the CSV file expected inside the chosen subfolder.
        processed_folder: Directory whose immediate subfolders are offered.

    Returns:
        A ``pandas.DataFrame`` with the CSV contents, or ``None`` (after
        printing a message) when the base folder is missing, it has no
        subfolders, the selection is invalid, or the file does not exist.
    """
    # A missing base folder is reported like the other "nothing to load"
    # cases instead of surfacing a raw FileNotFoundError from scandir.
    if not os.path.isdir(processed_folder):
        print(f"No se encontró la carpeta '{processed_folder}'.")
        return None

    # Close the scandir iterator deterministically, and sort so the menu
    # numbering is stable between runs (scandir yields in arbitrary order).
    with os.scandir(processed_folder) as entries:
        subfolders = sorted(entry.path for entry in entries if entry.is_dir())

    if not subfolders:
        print(f"No hay subcarpetas en {processed_folder}.")
        return None

    # Show the menu, numbered from 1.
    print("Subcarpetas disponibles:")
    for idx, folder in enumerate(subfolders, start=1):
        print(f"{idx}. {folder}")

    raw_choice = input("\nSeleccioná el número de la subcarpeta: ")
    try:
        choice = int(raw_choice)
    except ValueError:
        print(f"Selección inválida: '{raw_choice.strip()}' no es un número.")
        return None

    # Reject 0 and negatives explicitly: `subfolders[choice - 1]` would
    # otherwise wrap around and silently pick a folder from the end.
    if not 1 <= choice <= len(subfolders):
        print(f"Selección inválida: elegí un número entre 1 y {len(subfolders)}.")
        return None
    selected_folder = subfolders[choice - 1]

    csv_file = os.path.join(selected_folder, fname)
    if not os.path.isfile(csv_file):
        print(f"No se encontró el archivo '{csv_file}'.")
        return None

    return pd.read_csv(csv_file)

In [6]:
# load data from local
# NOTE: abrir_zonaprop_csv asks for a subfolder interactively and returns
# None (after printing a message) when there are no subfolders or the file
# is not found; in that case the .head() call below fails.
zonaprop_listings = abrir_zonaprop_csv('zonaprop_with_userviews.csv')
zonaprop_listings.head()

Subcarpetas disponibles:
1. ./processed/inmuebles-venta-palermo-colegiales-villa-crespo-chacarita-con-balcon-y-disposicion-frente-mas-30-m2-cubiertos-50000-130000-dolar



Seleccioná el número de la subcarpeta:  1


Unnamed: 0,listing_url,asking_price_in_usd,expensas_in_ars,latitude,longitude,google_maps,photos,whatsapp,published_on,disposición,...,categoríadeledificio,estadodeledificio,etapa,tipoedificio,permitemenoresde12,cantidaddormitorios,user_views,days,views_per_day,usd_per_m2
0,https://www.zonaprop.com.ar/propiedades/clasif...,118000,53000.0,-34.5822742,-58.4094659,"https://maps.google.com/?q=-34.5822742,-58.409...",['https://imgar.zonapropcdn.com/avisos/1/00/52...,5491141000000.0,29-01-2024,Frente,...,,,,,,,383,122,12,2360.0
1,https://www.zonaprop.com.ar/propiedades/clasif...,95000,28000.0,-34.5952278,-58.4455391,"https://maps.google.com/?q=-34.5952278,-58.445...",['https://imgar.zonapropcdn.com/avisos/1/00/52...,,29-01-2024,Frente,...,,,,,,,214,119,7,1338.0
2,https://www.zonaprop.com.ar/propiedades/clasif...,119500,60000.0,-34.5841554959184,-58.4247836714286,"https://maps.google.com/?q=-34.5841554959184,-...",['https://imgar.zonapropcdn.com/avisos/1/00/51...,5491163000000.0,08-01-2024,Frente,...,,,,,,,232,282,7,1991.0
3,https://www.zonaprop.com.ar/propiedades/empren...,94000,,-34.5682215,-58.4371057,"https://maps.google.com/?q=-34.5682215,-58.437...",['https://imgar.zonapropcdn.com/avisos/1/00/52...,5491165000000.0,15-01-2024,,...,,,,,,,1744,14,124,0.0
4,https://www.zonaprop.com.ar/propiedades/clasif...,109000,16300.0,-34.5966816877551,-58.4405421469388,"https://maps.google.com/?q=-34.5966816877551,-...",['https://imgar.zonapropcdn.com/avisos/1/00/51...,5491163000000.0,25-01-2024,Frente,...,,,,,,,443,280,14,1252.0


## ¿Cómo traemos la información de `Airbnb`?

In [10]:
from sklearn.neighbors import BallTree

def find_within_radius(reference_df, target_df, radius_km=1):
    """Collect the rows of ``target_df`` within ``radius_km`` of any reference point.

    Args:
        reference_df: DataFrame with ``latitude``/``longitude`` columns in
            degrees; each row is used as a query point.
        target_df: DataFrame with ``latitude``/``longitude`` columns in
            degrees; rows of this frame are returned.
        radius_km: Search radius in kilometres.

    Returns:
        The matching ``target_df`` rows, grouped in reference-row order. A
        target row appears once for every reference point it is close to.
        An empty frame with ``target_df``'s columns is returned when there
        is nothing to match.
    """
    coord_cols = ['latitude', 'longitude']

    # BallTree rejects NaN coordinates, so only rows with both values take part.
    reference_coords = reference_df[coord_cols].dropna()
    valid_targets = target_df.dropna(subset=coord_cols)
    if reference_coords.empty or valid_targets.empty:
        return target_df.iloc[0:0]

    # Convert radius from kilometers to radians for BallTree
    radius_rad = radius_km / 6371  # Earth's radius in km

    # Creating BallTree with target data
    tree = BallTree(np.deg2rad(valid_targets[coord_cols].values), metric='haversine')

    # One batched query returns one array of positions per reference point;
    # concatenating once avoids growing a DataFrame inside a loop.
    neighbours = tree.query_radius(np.deg2rad(reference_coords.values), r=radius_rad)
    positions = np.concatenate(list(neighbours))
    return valid_targets.iloc[positions]

In [11]:
# Load the Airbnb listings; the radius-search helpers read its
# `latitude` / `longitude` columns.
airbnb_listings = pd.read_csv('processed/airbnb_listings.csv')
# Loading the reviews file is currently disabled.
# airbnb_reviews = pd.read_csv('processed/airbnb_reviews.csv')

### Encontrar los listings más cercanos

In [12]:
def find_within_radius(row, airbnb_listings, radius_km):
    """Return the Airbnb listings located within ``radius_km`` of one property.

    Args:
        row: Mapping/Series with ``latitude`` and ``longitude`` in degrees.
        airbnb_listings: DataFrame with ``latitude``/``longitude`` columns
            in degrees; rows missing either value are ignored.
        radius_km: Search radius in kilometres.

    Returns:
        The rows of ``airbnb_listings`` inside the radius, or an empty
        ``DataFrame`` when the row has no coordinates or there is no Airbnb
        listing with coordinates to search.
    """
    # Check the cheap conditions first so no tree is built when there is
    # nothing to query.
    if pd.isnull(row['latitude']) or pd.isnull(row['longitude']):
        return pd.DataFrame()

    valid_listings = airbnb_listings.dropna(subset=['latitude', 'longitude'])
    if valid_listings.empty:
        # BallTree cannot be built from zero points.
        return pd.DataFrame()

    # Kilometres -> radians (6371 is the Earth radius in km used throughout).
    radius_rad = radius_km / 6371
    tree = BallTree(np.deg2rad(valid_listings[['latitude', 'longitude']].values), metric='haversine')
    indices = tree.query_radius(np.deg2rad([[row['latitude'], row['longitude']]]), r=radius_rad)
    return valid_listings.iloc[indices[0]]

In [22]:
from sklearn.neighbors import BallTree
import numpy as np
import pandas as pd

def agregar_info_airbnb(listings, airbnb_listings, radius_km):
    """Annotate each property with statistics of the Airbnb listings around it.

    Args:
        listings: DataFrame of properties with ``latitude``/``longitude``
            columns (coerced to numbers; unparseable values become NaN).
        airbnb_listings: DataFrame of Airbnb listings with ``latitude``,
            ``longitude`` and the columns listed in ``required_cols`` below.
        radius_km: Search radius around each property, in kilometres.

    Returns:
        A copy of ``listings`` (the input is not modified) with six
        ``airbnb_*`` columns added. Rows without coordinates, or without any
        Airbnb listing in range, keep NaN in those columns.
    """
    coord_cols = ['latitude', 'longitude']
    required_cols = ['estimated_price_per_night_in_USD', 'review_scores_rating',
                     'review_scores_location', 'review_scores_value', 'room_type',
                     'estimated_nights_booked_l30d']
    numeric_outputs = ['airbnb_avg_price_entire_home', 'airbnb_avg_price_private_room',
                       'airbnb_avg_review_score_rating', 'airbnb_avg_review_score_location',
                       'airbnb_avg_review_score_value']

    listings = listings.copy()
    for col in coord_cols:
        listings[col] = pd.to_numeric(listings[col], errors='coerce')

    # Create the output columns up front so the result has the same schema
    # even when no property has a neighbour (object dtype for the label).
    listings['airbnb_probabilidad_alquiler'] = pd.Series(np.nan, index=listings.index, dtype=object)
    for col in numeric_outputs:
        listings[col] = np.nan

    valid_airbnb = airbnb_listings.dropna(subset=coord_cols)
    has_coords = listings[coord_cols].notna().all(axis=1).to_numpy()
    if valid_airbnb.empty or not has_coords.any():
        return listings

    # Build the tree once: it does not depend on the property being queried.
    radius_rad = radius_km / 6371  # km -> radians (Earth radius in km)
    tree = BallTree(np.deg2rad(valid_airbnb[coord_cols].values), metric='haversine')
    query_points = np.deg2rad(listings.loc[has_coords, coord_cols].values)
    neighbours = tree.query_radius(query_points, r=radius_rad)

    for index, positions in zip(listings.index[has_coords], neighbours):
        if len(positions) == 0:
            continue
        filtered = valid_airbnb.iloc[positions].dropna(subset=required_cols)

        # Rental likelihood from the counts of the booking categories.
        # NOTE(review): "high" only has to outnumber "low" OR "medium" —
        # confirm that requiring both was not intended.
        booking_counts = filtered['estimated_nights_booked_l30d'].value_counts()
        high = booking_counts.get('high', 0)
        mas_probable = high > booking_counts.get('low', 0) or high > booking_counts.get('medium', 0)
        listings.at[index, 'airbnb_probabilidad_alquiler'] = 'más probable' if mas_probable else 'menos probable'

        # Average nightly price per room type and average review scores.
        precio = filtered['estimated_price_per_night_in_USD']
        listings.at[index, 'airbnb_avg_price_entire_home'] = precio[filtered['room_type'] == 'Entire home/apt'].mean()
        listings.at[index, 'airbnb_avg_price_private_room'] = precio[filtered['room_type'] == 'Private room'].mean()
        listings.at[index, 'airbnb_avg_review_score_rating'] = filtered['review_scores_rating'].mean()
        listings.at[index, 'airbnb_avg_review_score_location'] = filtered['review_scores_location'].mean()
        listings.at[index, 'airbnb_avg_review_score_value'] = filtered['review_scores_value'].mean()

    return listings

#### Ejemplo

In [23]:
# Keep only listings with more than 60 views per day, renumbering rows from 0.
high_relevance_listings = zonaprop_listings[zonaprop_listings.views_per_day > 60].reset_index(drop=True)
# Drop the columns that are entirely NaN for this subset.
high_relevance_listings = high_relevance_listings.dropna(axis=1, how='all')

In [24]:
# Assumes `high_relevance_listings` and `airbnb_listings` are your DataFrames.
# The last argument is radius_km: Airbnb listings within 0.3 km of each property.
a = agregar_info_airbnb(high_relevance_listings, airbnb_listings, 0.3)

In [27]:
a.head()

Unnamed: 0,listing_url,asking_price_in_usd,expensas_in_ars,latitude,longitude,google_maps,photos,whatsapp,published_on,disposición,...,user_views,days,views_per_day,usd_per_m2,airbnb_probabilidad_alquiler,airbnb_avg_price_entire_home,airbnb_avg_price_private_room,airbnb_avg_review_score_rating,airbnb_avg_review_score_location,airbnb_avg_review_score_value
0,https://www.zonaprop.com.ar/propiedades/empren...,94000,,-34.568221,-58.437106,"https://maps.google.com/?q=-34.5682215,-58.437...",['https://imgar.zonapropcdn.com/avisos/1/00/52...,5491165000000.0,15-01-2024,,...,1744,14,124,0.0,menos probable,64.757353,53.0,4.871844,4.948156,4.831489
1,https://www.zonaprop.com.ar/propiedades/empren...,77000,,-34.575123,-58.438449,"https://maps.google.com/?q=-34.5751231,-58.438...",['https://imgar.zonapropcdn.com/avisos/1/00/52...,5491127000000.0,24-01-2024,,...,1543,5,308,0.0,más probable,65.304348,82.857143,4.811051,4.866733,4.74375
2,https://www.zonaprop.com.ar/propiedades/clasif...,120000,27000.0,-34.576361,-58.468243,"https://maps.google.com/?q=-34.5763612,-58.468...",['https://imgar.zonapropcdn.com/avisos/1/00/50...,5491164000000.0,29-01-2024,Frente,...,124,0,124,1935.0,menos probable,30.571429,10.0,4.43875,4.52625,4.37125
3,https://www.zonaprop.com.ar/propiedades/clasif...,92000,0.0,-34.58647,-58.440049,"https://maps.google.com/?q=-34.5864703,-58.440...",['https://imgar.zonapropcdn.com/avisos/1/00/47...,5491156000000.0,29-01-2024,Frente,...,82,0,82,2000.0,más probable,62.440111,19.0,4.751162,4.760865,4.694865
4,https://www.zonaprop.com.ar/propiedades/clasif...,110000,88515.0,-34.582389,-58.422672,"https://maps.google.com/?q=-34.5823885663,-58....",['https://imgar.zonapropcdn.com/avisos/1/00/49...,5491144000000.0,29-01-2024,Frente,...,106,0,106,2820.0,más probable,91.587242,45.652174,4.788961,4.924731,4.72638


## ¿Qué criterio vamos a usar para filtrar las propiedades?
Nos vamos a basar en esto que dice el groso de [Santiago Magnin](https://twitter.com/santivende) en este video [*"GUÍA 2024 para COMPRADORES 💣 Webinar bomba 💣 ¿Cómo comprar con Javier Milei Presidente?"*](https://www.youtube.com/watch?v=DuujwXarVbY&ab_channel=SantiagoMagnin).

En resumen, para `Zonaprop`:
- (1) ~60 visitas/día, se reservan en la primera semana, negociación promedio del 2%
- (2) ~30 visitas/día, se reservan entre 8 y 30 días, negociación promedio del 5%
- (3) ~20 visitas/día, se reservan entre 31 y 60 días, negociación promedio del 7%
- (4) ~10 visitas/día, se reservan en más de 60 días, negociación promedio del 8%

In [28]:
import IPython.display as display
from IPython.display import Markdown, HTML

def mostrar_resumen_propiedad(row):
    """Render a property summary (Markdown) and up to six photos (HTML grid).

    Args:
        row: A listing row (``pandas.Series``) with at least ``listing_url``
            and ``photos``; ``row.name`` is shown as the property number.
            Columns whose name contains ``airbnb_`` are listed under a
            separate "Airbnb" heading. ``photos`` may be the string repr of
            a list of URLs, an actual list/tuple, or missing (NaN/empty).

    Returns:
        None. The output is displayed in the notebook.
    """
    import ast
    from html import escape
    # Imported locally so the function keeps working even if another cell
    # rebinds the global name `display` (module vs. function).
    from IPython.display import HTML, Markdown, display as mostrar

    # Build the Markdown summary.
    caracteristicas = '\n'.join(
        f"- {col}: {row[col]}"
        for col in row.index
        if col not in ['photos', 'listing_url'] and 'airbnb_' not in col
    )
    airbnb_info = '\n'.join(f"- {col}: {row[col]}" for col in row.index if 'airbnb_' in col)

    markdown_resumen = f"""
------
**PROPIEDAD {row.name}: [Link]({row['listing_url']})**
**Characteristics:**
{caracteristicas}

**Airbnb:**
{airbnb_info}
"""

    mostrar(Markdown(markdown_resumen))

    # Normalise `photos` to a list of URLs. A missing value is a float NaN,
    # which is truthy, so test the type instead of the truthiness.
    photos = row['photos']
    if isinstance(photos, (list, tuple)):
        fotos = [str(url) for url in photos]
    elif isinstance(photos, str) and photos.strip():
        try:
            parsed = ast.literal_eval(photos)
        except (ValueError, SyntaxError):
            # Not a valid Python literal: fall back to naive splitting.
            fotos = photos[1:-1].replace("'", "").split(', ')
        else:
            fotos = [str(url) for url in parsed] if isinstance(parsed, (list, tuple)) else [str(parsed)]
    else:
        fotos = []
    fotos = [url.strip() for url in fotos if url.strip()]

    # Photo grid: three columns, first six photos. URLs come from scraped
    # data, so they are escaped before being placed in an attribute.
    if fotos:
        fotos_html = '<div style="display: grid; grid-template-columns: repeat(3, 1fr); grid-gap: 5px;">'
        fotos_html += ''.join(
            f'<div style="width: 100%;"><img src="{escape(url)}" style="width: 100%; max-height: 200px; object-fit: cover;" /></div>'
            for url in fotos[:6]
        )
        fotos_html += '</div>'
        mostrar(HTML(fotos_html))

In [41]:
# Show the summary for the row labelled 10 (`.loc` selects by index label,
# so this is not the first row).
mostrar_resumen_propiedad(a.loc[10])


------
**PROPERTY 10: [Link](https://www.zonaprop.com.ar/propiedades/clasificado/veclapin-monoambiente-en-palermo-hollywood-6-piso-al-frente-50571531.html)**
**Characteristics:**
- asking_price_in_usd: 108000
- expensas_in_ars: 90000.0
- latitude: -34.5805555
- longitude: -58.4417977
- google_maps: https://maps.google.com/?q=-34.5805555,-58.4417977
- whatsapp: nan
- published_on: 29-01-2024
- disposición: Frente
- luminoso: Muy luminoso
- orientación: NE
- superficietotal: 40.0
- superficiecubierta: 35.0
- ambientes: nan
- dormitorio: nan
- baño: 1.0
- antigüedad: 12
- cantidadplantas: 1
- superficiesemicubiertam²: 5.0
- dormitorios: nan
- baños: nan
- aptocrédito: Apto crédito
- toilette: nan
- ambiente: 1.0
- departamentosporpiso: nan
- cantidadpisosenedificio: nan
- cochera: nan
- coberturacochera: nan
- cantidaddedepartamentos: nan
- user_views: 126
- days: 0
- views_per_day: 126
- usd_per_m2: 2700.0

**Airbnb:**
- airbnb_probabilidad_alquiler: más probable
- airbnb_avg_price_entire_home: 85.49846153846154
- airbnb_avg_price_private_room: 22.333333333333332
- airbnb_avg_review_score_rating: 4.836310975609756
- airbnb_avg_review_score_location: 4.901737804878048
- airbnb_avg_review_score_value: 4.7594817073170725


In [43]:
# Outside the function: render the summary for the row labelled 0.
# `display` stays bound to the IPython.display *module*, because
# mostrar_resumen_propiedad calls `display.display(...)`; importing the
# `display` function under that name would break it. The call is not
# unpacked because the function returns None.
import IPython.display as display
from IPython.display import HTML, Markdown

mostrar_resumen_propiedad(a.loc[0])

In [14]:
assert

SyntaxError: invalid syntax (2389114725.py, line 1)

In [40]:
# Build the Markdown summary for a single listing.
# NOTE(review): `row` is not defined in this cell; presumably it is a listing
# row such as `a.loc[0]` — confirm before running.
caracteristicas = '\n'.join(f"- {col}: {row[col]}" for col in row.index if col not in ['photos', 'listing_url'] and 'airbnb_' not in col)
airbnb_info = '\n'.join(f"- {col}: {row[col]}" for col in row.index if 'airbnb_' in col)

markdown_resumen = f"""
------
**PROPERTY {row.name}: [Link]({row['listing_url']})**
**Characteristics:**
{caracteristicas}

**Airbnb:**
{airbnb_info}
"""

In [None]:
# Take the first listing with more than 30 views per day.
# drop=True keeps the old index out of the row, so it is not serialised
# below as a spurious "index: N" field.
item = zonaprop_listings[zonaprop_listings.views_per_day > 30].reset_index(drop=True).loc[0]

# One "column: value" entry per non-null field of the listing.
pares_columna_valor = [
    f'{columna}: {valor}'
    for columna, valor in item.items()
    if pd.notnull(valor)
]

# Join all the pairs into a single string.
string_resultante = ', '.join(pares_columna_valor)

print(string_resultante)

## Usamos `Mixtral` a través de `Together.ai` 

In [None]:
from openai import OpenAI
import os

In [None]:
# Read the Together.ai key from the environment so it is never saved inside
# the notebook; falls back to the previous empty default when unset.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY", "")
client = OpenAI(api_key=TOGETHER_API_KEY, base_url='https://api.together.xyz')

In [None]:
# System prompt: fixes the reply language and tone.
system_instructions = "You are an AI assistant that always responds in Argentinian Spanish and you are concise and professional"
# User prompt: embeds `string_resultante`, the "column: value" description of
# one listing.
user_query = f"""
Using the following information from a property listing in Buenos Aires, write a summary, paying more attention to
`asking_price` and `usd_per_m2`. Skip any URL.
{string_resultante}
"""

In [None]:
# Chat history sent to the model: the system instructions followed by the
# user query.
messages=[
    {"role": "system", "content": system_instructions},
    {"role": "user", "content": user_query}
]

In [None]:
# Request a completion from Mixtral through the OpenAI-compatible client
# configured for Together.ai.
chat_completion = client.chat.completions.create(
    messages=messages,
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    max_tokens=1024
)

In [None]:
print(chat_completion.choices[0].message.content)