In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import altair as alt
import polars as pl

In [None]:
# Matplotlib / seaborn defaults: white grid background and a 12x8 figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# max_rows=None lifts Altair's row limit so charts accept the full dataset.
alt.data_transformers.enable('default', max_rows=None)

print("Librerías importadas y configuración lista.")

In [None]:
# Inside Airbnb listings file for Mexico City (snapshot path dated 2025-03-19).
url = "https://data.insideairbnb.com/mexico/df/mexico-city/2025-03-19/data/listings.csv.gz"

print(f"Cargando datos desde: {url}")

try:
    df = pl.read_csv(url)
except Exception as e:
    # Every later cell reads columns from `df` without a guard, so carrying on
    # with an empty frame only moves the failure downstream and hides the real
    # cause behind a column-not-found error. Report it and stop here instead.
    print(f"Error al cargar los datos: {e}")
    raise

print("Datos cargados exitosamente con Polars.")
print(f"El dataset tiene {df.shape[0]} filas y {df.shape[1]} columnas.")

if not df.is_empty():
    # `price` is cleaned as text: strip the "$" sign and the "," separators,
    # then cast to float. strict=False turns unparseable values into nulls,
    # which are dropped together with non-positive prices.
    df = (
        df.with_columns(
            pl.col("price")
            .str.replace_all(r"\$", "")
            .str.replace_all(r",", "")
            .cast(pl.Float64, strict=False)
            .alias("price")
        )
        .drop_nulls(subset=["price"])
        .filter(pl.col("price") > 0)
    )
    print("Columna 'price' limpiada y convertida a tipo numérico usando Polars.")

In [None]:
# Descriptive summary of the cleaned listings. The aggregated frames
# (room_types, top_neighbourhoods, top_hosts) are reused by the bar-chart cells.
print("\n--- 1. ¿Cuál es el precio promedio por noche? ---")
avg_price = df.select(pl.col('price').mean()).item()
print(f"El precio promedio de un Airbnb en CDMX es: ${avg_price:,.2f} MXN")

print("\n--- 2. ¿Cuáles son los tipos de alojamiento más comunes? ---")
room_types = (
    df.group_by('room_type')
    .agg(pl.len().alias('count'))
    .sort('count', descending=True)
)
print(room_types)

print("\n--- 3. ¿Cuáles son las 10 delegaciones con más alojamientos? ---")
top_neighbourhoods = (
    df.group_by('neighbourhood_cleansed')
    .agg(pl.len().alias('count'))
    .sort('count', descending=True)
    .head(10)
)
print(top_neighbourhoods)

print("\n--- 4. ¿Quiénes son los 10 anfitriones con más propiedades? ---")
# host_name is a display name, not an identifier: grouping on it alone merges
# every host who happens to share the same name into one inflated row.
# Grouping on host_id as well keeps hosts distinct while still exposing the
# `host_name` and `count` columns.
# NOTE(review): relies on the `host_id` column of the Inside Airbnb listings
# schema — confirm it is present in this snapshot.
top_hosts = (
    df.group_by(['host_id', 'host_name'])
    .agg(pl.len().alias('count'))
    .sort('count', descending=True)
    .head(10)
)
print(top_hosts)

In [None]:
# Histogram of nightly prices, keeping only listings strictly below the 95th
# percentile so the long right tail does not flatten the distribution.
q95 = df.select(pl.col('price').quantile(0.95)).item()
price_to_plot_df = df.filter(pl.col('price') < q95).to_pandas()

# One binning definition shared by the x axis and the tooltip. With a bare
# `bin=True` the tooltip uses the default bin parameters rather than the
# axis's 50-bin setting, so the range it reports would not describe the
# hovered bar.
price_bins = alt.Bin(maxbins=50)

# Only `price` is encoded, so only that column is handed to Altair; the row
# limit is disabled in this notebook, and the data passed to the chart is
# embedded in it.
chart_hist = alt.Chart(price_to_plot_df[['price']]).mark_bar().encode(
    alt.X('price:Q', bin=price_bins, title='Precio (MXN por noche)'),
    alt.Y('count()', title='Frecuencia'),
    tooltip=[
        alt.Tooltip('count()', title='Frecuencia'),
        alt.Tooltip('price:Q', bin=price_bins, title='Rango de Precio'),
    ]
).properties(
    title='Distribución de Precios (para el 95% de los alojamientos)',
    width=700,
    height=400
)
chart_hist.show()

In [None]:
# Bar chart: number of listings for each room type, one colour per type.
room_types_pd = room_types.to_pandas()

room_type_labels = {'room_type': 'Tipo de Alojamiento', 'count': 'Cantidad'}
fig_room_types = px.bar(
    data_frame=room_types_pd,
    x='room_type',
    y='count',
    color='room_type',
    labels=room_type_labels,
    title='Número de Alojamientos por Tipo',
)
fig_room_types.show()

In [None]:
# Horizontal bar chart of the ten neighbourhoods with the most listings.
top_neighbourhoods_pd = top_neighbourhoods.to_pandas()

hood_labels = {'neighbourhood_cleansed': 'Delegación', 'count': 'Cantidad de Alojamientos'}
fig_hoods = px.bar(
    data_frame=top_neighbourhoods_pd,
    x='count',
    y='neighbourhood_cleansed',
    orientation='h',
    color='count',
    color_continuous_scale=px.colors.sequential.Viridis,
    labels=hood_labels,
    title='Top 10 Delegaciones por Cantidad de Alojamientos',
)
# Order the categories by bar total so the largest bar ends up at the top.
fig_hoods.update_yaxes(categoryorder="total ascending")
fig_hoods.show()

In [None]:
# Horizontal bar chart of the ten hosts with the most listings.
top_hosts_pd = top_hosts.to_pandas()

host_bar_options = dict(
    x='count',
    y='host_name',
    orientation='h',
    color='count',
    color_continuous_scale=px.colors.sequential.Plasma,
    labels={'host_name': 'Nombre del Anfitrión', 'count': 'Cantidad de Propiedades'},
    title='Top 10 Anfitriones por Número de Propiedades',
)
fig_hosts = px.bar(top_hosts_pd, **host_bar_options)
# Order the categories by bar total so the largest bar ends up at the top.
fig_hosts.update_yaxes(categoryorder="total ascending")
fig_hosts.show()

In [None]:
# Interactive price map. `pl` and `px` come from the notebook's first cell, so
# they are not re-imported here.

# Recomputed locally so this cell does not depend on the histogram cell.
q95 = df.select(pl.col("price").quantile(0.95)).item()

# Keep listings that have a price and coordinates, and whose price is
# strictly below the 95th percentile.
df_filtered = df.filter(
    pl.col("price").is_not_null()
    & pl.col("latitude").is_not_null()
    & pl.col("longitude").is_not_null()
    & (pl.col("price") < q95)
)

# Plot at most 5000 points; the fixed seed keeps the sample reproducible.
n_sample = min(5000, df_filtered.height)
df_sample_for_plot = (
    df_filtered
    .sample(n=n_sample, seed=42)
    .select(["latitude", "longitude", "price", "name", "neighbourhood_cleansed"])
    .to_pandas()
)

# Centre the initial view on the mean coordinates of the sampled listings.
center_lat = float(df_sample_for_plot["latitude"].mean())
center_lon = float(df_sample_for_plot["longitude"].mean())

fig_map = px.scatter_mapbox(
    df_sample_for_plot,
    lat="latitude",
    lon="longitude",
    color="price",
    size="price",
    color_continuous_scale=px.colors.sequential.Viridis_r,
    size_max=15,
    zoom=10,
    center={"lat": center_lat, "lon": center_lon},
    mapbox_style="open-street-map",
    hover_name="name",
    hover_data={"neighbourhood_cleansed": True, "price": ":$.2f"},
    title="Mapa Interactivo de Precios (≤ p95)"
)

# `price` is a continuous colour dimension, so the figure shows a colorbar
# rather than a discrete legend; the title has to be set on the colorbar to
# be visible (legend_title_text had no effect here).
fig_map.update_layout(coloraxis_colorbar_title_text="Precio (MXN)")
fig_map.show()