In [0]:
!pip install reverse_geocoder 

In [0]:
from pathlib import Path
import sys

# Make project-local packages importable: Path().resolve() is the current working
# directory and parents[0] is its parent — presumably the repo root that holds
# `data/` and `config`; TODO confirm the notebook's location relative to the root.
project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

from data.airbnb_data_loader import load_airbnb_data, load_raw_airbnb_data, select_relevant_columns, cast_and_clean_types, analyze_missing_values, analyze_distributions, parse_pricing_details, parse_category_ratings, apply_listing_feature_extraction
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType

from pyspark.sql import functions as F
import pandas as pd

from config import continents
from data.travel_cities_data_loader import load_travel_cities

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Initial Airbnb Data

In [0]:
# Step-by-step preparation of the Airbnb frame. Every helper comes from
# data.airbnb_data_loader and receives the previous step's output; each stage keeps
# its own name so it can be inspected individually.
# NOTE(review): what each helper does is inferred from its name only — see the loader module.
raw_airbnb = load_raw_airbnb_data(spark)
sel_airbnb = select_relevant_columns(raw_airbnb)
cast_airbnb = cast_and_clean_types(sel_airbnb)
price_airbnb = parse_pricing_details(cast_airbnb)
cat_airbnb = parse_category_ratings(price_airbnb)
# init_airbnb is the frame every "Initial Airbnb Data" cell below works on.
init_airbnb = apply_listing_feature_extraction(cat_airbnb)

In [0]:
# Column names and types of the prepared Airbnb frame.
init_airbnb.printSchema()

In [0]:
# Total number of rows (a Spark action: this executes the whole pipeline above).
init_airbnb.count()

In [0]:
# Preview of the first 10 rows.
display(init_airbnb.limit(10))

In [0]:
# Project helper from data.airbnb_data_loader; presumably reports null counts per column.
analyze_missing_values(init_airbnb)

In [0]:
# Project helper from data.airbnb_data_loader; presumably summarises value distributions.
analyze_distributions(init_airbnb)

In [0]:
import math
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType

def plot_numeric_distributions_fast(df, max_rows=10000, cols_per_row=3, show_kde=False):
    """
    Plot one histogram per numeric column from a bounded random sample of `df`.

    Numeric means Double/Integer/Float/Long. Columns whose name contains "id"
    (case-insensitive) or starts with "is_" are skipped as identifiers/flags.

    Args:
        df: Spark DataFrame to profile.
        max_rows: Upper bound on the number of rows collected to the driver.
        cols_per_row: Number of subplot columns in the grid.
        show_kde: Overlay a kernel density estimate on each histogram. Off by
            default because it is noticeably slower.

    Returns:
        None. The figure is shown with ``plt.show()``. When there are no eligible
        columns, or the frame has no rows, a message is printed and nothing is drawn.
    """
    # 1. Identify numeric columns.
    # NOTE: the "id" test is a substring match, so names such as "width" are skipped too.
    numeric_types = (DoubleType, IntegerType, FloatType, LongType)
    numeric_cols = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, numeric_types)
        and "id" not in field.name.lower()
        and not field.name.startswith("is_")
    ]

    # Bail out before any Spark job is launched if there is nothing to plot.
    n_cols = len(numeric_cols)
    if n_cols == 0:
        print("No numeric columns found.")
        return

    print(f"Plotting {n_cols} columns using max {max_rows} rows...")

    # 2. Sampling: pick a fraction that yields roughly max_rows, then cap with limit().
    total = df.count()
    if total == 0:
        # Guards the division below.
        print("DataFrame is empty; nothing to plot.")
        return
    fraction = min(1.0, max_rows / total)

    pdf = (
        df
        # Collect only the columns that get plotted; backticks keep names with dots intact.
        .select(*[f"`{name}`" for name in numeric_cols])
        .sample(withReplacement=False, fraction=fraction, seed=42)
        .limit(max_rows)
        .toPandas()
    )

    # 3. Grid. squeeze=False always yields a 2-D array, so flatten() also works for 1x1.
    n_rows = math.ceil(n_cols / cols_per_row)
    fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(15, 3 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col_name in zip(axes, numeric_cols):
        # Drop NaNs for this column only, so other columns keep their rows.
        data = pdf[col_name].dropna()

        # A fixed bins=30 keeps granularity comparable across panels.
        sns.histplot(data, ax=ax, kde=show_kde, bins=30, color="#00A699", edgecolor='none')
        ax.set_title(col_name, fontsize=10)
        ax.set_ylabel("")  # Save space
        ax.set_xlabel("")

    # Hide the unused cells of the last grid row.
    for ax in axes[n_cols:]:
        ax.axis('off')

    fig.tight_layout()
    # Report the number of rows actually sampled, which can be below max_rows.
    fig.suptitle(f"Distributions ({len(pdf)} rows)", y=1.02, fontsize=14)
    plt.show()

In [0]:
# KDE is switched on explicitly here, overriding the function's faster default (show_kde=False).
plot_numeric_distributions_fast(init_airbnb, max_rows=10000, show_kde=True)

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

from data.airbnb_data_loader import derive_price_per_night, normalize_currency_to_usd

# Build the price frame (still lazy at this point).
# NOTE(review): per the helper names this derives a per-night price and converts it to USD.
per_night_airbnb = derive_price_per_night(init_airbnb)
df_prices = normalize_currency_to_usd(per_night_airbnb)

# Collect a seeded ~10% sample of the non-null nightly prices to the driver.
sampled_prices = df_prices.select("price_per_night").dropna().sample(
    withReplacement=False, fraction=0.1, seed=42
)
price_data = sampled_prices.toPandas()

# Histogram on a log-scaled x axis, with the outlier cap marked as a dashed line.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(price_data['price_per_night'], bins=100, color="#FF5A5F", log_scale=True, ax=ax)
ax.axvline(1300, color='black', linestyle='--', linewidth=2, label='Outlier Cap ($1300)')
ax.set_title(f"Distribution of Nightly Prices (Sampled {len(price_data):,} listings)")
ax.set_xlabel("Price (USD)")
ax.set_ylabel("Frequency")
ax.legend()
fig.tight_layout()
plt.show()

# Initial OSM Data

In [0]:
# Reads the whole osm_pois directory; the per-continent parquet files listed later in
# OSM_PATHS live under this same path, so this presumably loads all of them at once.
init_osm = spark.read.parquet("dbfs:/vibebnb/data/osm_pois")

In [0]:
# Column names and types of the raw OSM POI frame.
init_osm.printSchema()

In [0]:
# Total number of POI rows (Spark action).
init_osm.count()

In [0]:
# Preview of the first 10 rows.
display(init_osm.limit(10))

In [0]:
# Same project helper as used for Airbnb above.
analyze_missing_values(init_osm)

In [0]:
# Same project helper as used for Airbnb above.
analyze_distributions(init_osm)

In [0]:
# Histograms of the numeric OSM columns on a sample of at most 10,000 rows, KDE on.
plot_numeric_distributions_fast(init_osm, max_rows=10000, show_kde=True)

In [0]:
# Rows per POI group, aggregated in Spark and sorted largest-first on the driver.
poi_group_counts = init_osm.groupBy("poi_group").count()
init_osm_poi = poi_group_counts.toPandas().sort_values("count", ascending=False)

# Horizontal bars on a log x axis so small groups stay visible next to large ones.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=init_osm_poi, x="count", y="poi_group", palette="viridis", ax=ax)
ax.set_xscale('log')
ax.set_title("POI Counts by Category (Log Scale)")
ax.set_xlabel("Total Count (Log Scale)")
ax.set_ylabel("Category")

fig.tight_layout()
plt.show()

# Initial Travel City Data

In [0]:
# Location of the travel-cities parquet; reused by load_travel_cities() further down.
CITIES_PATH = "dbfs:/vibebnb/data/travel_cities.parquet"
# Raw read, without the project loader, to inspect the data as stored.
init_cities = spark.read.parquet(CITIES_PATH)

In [0]:
# Column names and types of the raw cities frame.
init_cities.printSchema()

In [0]:
# Total number of city rows (Spark action).
init_cities.count()

In [0]:
# Preview of the first 10 rows.
display(init_cities.limit(10))

In [0]:
# Same project helper as used for Airbnb above.
analyze_missing_values(init_cities)

In [0]:
# Same project helper as used for Airbnb above.
analyze_distributions(init_cities)

In [0]:
# Histograms of the numeric city columns on a sample of at most 10,000 rows, KDE on.
plot_numeric_distributions_fast(init_cities, max_rows=10000, show_kde=True)

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# The whole cities frame is collected to pandas for plotting with seaborn.
pdf_cities = init_cities.toPandas()

# Bars are shown in a fixed cheapest-to-priciest order rather than by frequency.
budget_order = ['Budget', 'Mid-range', 'Luxury']

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=pdf_cities, x='budget_level', order=budget_order, palette="viridis", ax=ax)
ax.set_title("Distribution of Economic Classifications")
ax.set_xlabel("Budget Level")
ax.set_ylabel("Number of Cities")
fig.tight_layout()
plt.show()

In [0]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Collect the cities frame to pandas: the monthly-temperature column is parsed below
# row by row with a plain Python helper.
pdf_cities = init_cities.toPandas()

# Define your helper function
def extract_jan_jul(json_str):
    """
    Extract the January and July average temperatures from a monthly-climate JSON string.

    Args:
        json_str: JSON text for an object that is expected to hold the month keys
            "1" and "7", each mapping to an object with an "avg" field,
            e.g. '{"1": {"avg": 3.1}, "7": {"avg": 21.4}}'.

    Returns:
        pd.Series of length 2: [January avg, July avg]. An entry is missing when
        that month or its "avg" is absent; both are missing when the input is
        empty, is not valid JSON, or does not have the expected shape.
    """
    try:
        if not json_str:
            return pd.Series([None, None])
        data = json.loads(json_str)
        # Extract Avg for Month 1 (Jan) and Month 7 (Jul)
        return pd.Series([data.get('1', {}).get('avg'), data.get('7', {}).get('avg')])
    except (ValueError, TypeError, AttributeError):
        # Only the expected data problems are treated as "no value":
        #   ValueError     - malformed JSON (json.JSONDecodeError is a subclass)
        #   TypeError      - input is not str/bytes (e.g. a float NaN coming from pandas)
        #   AttributeError - decoded value is not an object of objects
        # Anything else (KeyboardInterrupt, SystemExit, ...) propagates.
        return pd.Series([None, None])

# Run the helper on every row; its two-element result becomes the two new columns.
jan_jul_temps = pdf_cities['avg_temp_monthly'].apply(extract_jan_jul)
pdf_cities[['temp_jan', 'temp_jul']] = jan_jul_temps

# January vs July temperature per city, coloured by budget level.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=pdf_cities,
    x='temp_jan',
    y='temp_jul',
    hue='budget_level',
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
# Diagonal reference: points on this line have the same January and July average.
diagonal = [-10, 35]
ax.plot(diagonal, diagonal, 'r--', label="No Seasonality Line")
ax.set_title("Climate seasonality: Winter (Jan) vs Summer (Jul) Temperatures")
ax.set_xlabel("Average January Temp (°C)")
ax.set_ylabel("Average July Temp (°C)")
ax.legend(title="Budget")
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Final Datasets

In [0]:
# Final cities frame from the project loader (as opposed to the raw parquet read above).
# Later cells rely on it having an `addr_cc` column.
cities_df = load_travel_cities(spark, CITIES_PATH)

In [0]:
# Final Airbnb frame from the project loader; later cells rely on its `addr_cc` column.
airbnb_df = load_airbnb_data(spark)

In [0]:
OSM_BASE_DIR = "dbfs:/vibebnb/data/osm_pois"

# Continent key -> file-name stem. Every key matches its stem except "australia",
# whose file is named after "australia_oceania".
_OSM_FILE_STEMS = {
    "europe": "europe",
    "asia": "asia",
    "north_america": "north_america",
    "south_america": "south_america",
    "africa": "africa",
    "antarctica": "antarctica",
    "central_america": "central_america",
    "australia": "australia_oceania",
}

# Continent key -> full path of that continent's enriched POI parquet.
OSM_PATHS = {
    continent: f"{OSM_BASE_DIR}/{stem}_pois_enriched.parquet"
    for continent, stem in _OSM_FILE_STEMS.items()
}


def counts_from_cc(df, cc_col="addr_cc"):
    """
    Count rows per continent by mapping a country-code column through `continents`.

    Args:
        df: Spark DataFrame holding a country-code column.
        cc_col: Name of that column in `df`. It is aliased to "addr_cc" internally,
            so any column name works.

    Returns:
        pandas DataFrame with columns `continent` and `n_rows`. Rows with a null
        country code are ignored; codes found in no continent list are counted
        under "unknown". A code listed under several continents is counted only
        for the first one in `continents` iteration order.
    """
    # The mapping is built against the *aliased* name, because the select() below
    # leaves "addr_cc" as the only column — referring to cc_col there would fail
    # whenever cc_col is not literally "addr_cc".
    code = F.col("addr_cc")

    # map country code -> continent using a Spark "when" chain
    cont_expr = None
    for cont, cc_list in continents.items():
        cond = code.isin(cc_list)
        cont_expr = F.when(cond, F.lit(cont)) if cont_expr is None else cont_expr.when(cond, F.lit(cont))

    # With an empty `continents` there is no chain to close; everything is "unknown".
    if cont_expr is None:
        cont_expr = F.lit("unknown")
    else:
        cont_expr = cont_expr.otherwise(F.lit("unknown"))

    out = (
        df.select(F.col(cc_col).alias("addr_cc"))
          .where(code.isNotNull())
          .withColumn("continent", cont_expr)
          .groupBy("continent")
          .agg(F.count("*").alias("n_rows"))
          .toPandas()
    )
    return out

def counts_from_osm_paths(osm_paths: dict, cc_col="addr_cc"):
    """
    Count OSM rows per continent, relying on OSM being stored as one file per continent.

    Args:
        osm_paths: Mapping of continent key -> parquet path.
        cc_col: Country-code column in the OSM files.

    Returns:
        pandas DataFrame with columns `continent` and `n_rows`, one row per
        continent that could be read. The two columns are always present, even
        when `osm_paths` is empty or every path was skipped.

    Notes:
        When the continent key exists in `continents`, rows are restricted to that
        continent's country codes; otherwise every row of the file is counted.
        A path that cannot be read or counted is reported and skipped (best effort).
    """
    rows = []
    for cont, path in osm_paths.items():
        try:
            df = spark.read.parquet(path).select(cc_col)

            if cont in continents:
                df = df.where(F.col(cc_col).isin(continents[cont]))
            rows.append({"continent": cont, "n_rows": int(df.count())})
        except Exception as e:
            # Deliberate best effort: one missing continent file must not abort the rest.
            print(f"[OSM] Skipping {cont} ({path}) -> {e}")
    # Explicit columns keep the schema stable when `rows` is empty, so callers can
    # index ["continent"] / ["n_rows"] unconditionally.
    return pd.DataFrame(rows, columns=["continent", "n_rows"])


# cities_df and airbnb_df are the frames loaded in the "Final Datasets" cells above;
# they are reused here rather than loaded a second time.

# -------------------------
# Compute counts
# -------------------------
airbnb_counts = counts_from_cc(airbnb_df, cc_col="addr_cc")
cities_counts = counts_from_cc(cities_df, cc_col="addr_cc")
osm_counts    = counts_from_osm_paths(OSM_PATHS, cc_col="addr_cc")

display(airbnb_counts)
display(cities_counts)
display(osm_counts)

In [0]:
%pip install geopandas shapely pyproj fiona

In [0]:
import geopandas as gpd
import matplotlib.pyplot as plt


# -----------------------------
# 1) Load world map + build continents geometry
# -----------------------------
# Natural Earth 110m countries layer, fetched over the network on every run.

world = gpd.read_file(
    "https://naturalearth.s3.amazonaws.com/110m_cultural/ne_110m_admin_0_countries.zip"
)

# Merge country polygons into one geometry per value of the CONTINENT attribute.
world_cont = (
    world[["CONTINENT", "geometry"]]
    .dissolve(by="CONTINENT")
    .reset_index()
)

# Drop the irrelevant "Seven seas" polygon if present
world_cont = world_cont[world_cont["CONTINENT"] != "Seven seas (open ocean)"].copy()

# -----------------------------
# 2) Canonical continent mapping
# -----------------------------
# Keys are continent names as they appear in this project after _norm_raw()
# (lowercase snake_case); values are the canonical bucket used to join against
# the map. Several spellings collapse onto one bucket.

CONTINENT_MAP = {
    "africa": "africa",
    "antarctica": "antarctica",
    "asia": "asia",
    "europe": "europe",
    "north_america": "north_america",
    "south_america": "south_america",
    "oceania": "oceania",
    "central_america": "north_america",      # folded into north_america
    "australia": "oceania",
    "australia_oceania": "oceania",
    "australia__oceania": "oceania",  # double underscore: what _norm_raw yields for "australia- oceania"-style names
}

def _norm_raw(x):
    """Normalize raw strings to snake_case lowercase."""
    if x is None:
        return None
    return (
        str(x)
        .lower()
        .strip()
        .replace("-", "_")
        .replace(" ", "_")
    )

def to_canonical_continent(x):
    """
    Translate a continent name from any dataset into its canonical bucket.

    The value is normalised with `_norm_raw` and looked up in `CONTINENT_MAP`.
    Known names return the mapped bucket (africa, antarctica, asia, europe,
    north_america, south_america, oceania). Unknown names are returned in their
    normalised form so unexpected values stay visible downstream; `None` stays `None`.
    """
    key = _norm_raw(x)
    if key in CONTINENT_MAP:
        return CONTINENT_MAP[key]
    # Not a known spelling: hand back the normalised input unchanged.
    return key

# Normalize the map side too, so both sides of the later merge share the same keys.
# to_canonical_continent() already applies _norm_raw(), so the first .apply(_norm_raw)
# is redundant but harmless (normalising an already-normalised string is a no-op).
world_cont["continent_norm"] = world_cont["CONTINENT"].apply(_norm_raw).apply(to_canonical_continent)

# -----------------------------
# 3) Plot function (choropleth + labels)
# -----------------------------
def plot_continent_map(title, counts_df):
    """
    Draw a world choropleth of row counts per continent, with continent labels.

    Args:
        title: Figure title.
        counts_df: pandas DataFrame with columns
            - continent   (continent name in any spelling CONTINENT_MAP knows)
            - n_rows      (count)

    Returns:
        None. The figure is shown with ``plt.show()``.

    Notes:
        Several input rows may map to the same canonical continent (for example
        "north_america" and "central_america"). Their counts are summed first, so
        each map polygon is drawn exactly once with the combined total. Rows whose
        continent matches no polygon (e.g. "unknown") are not shown; polygons
        without any count are drawn with 0.
    """
    # Canonicalise and aggregate BEFORE merging: merging un-aggregated rows would
    # duplicate the polygon once per matching row and draw them on top of each other.
    per_continent = (
        counts_df
        .assign(continent_norm=counts_df["continent"].apply(to_canonical_continent))
        .groupby("continent_norm", as_index=False)["n_rows"]
        .sum()
    )

    merged = world_cont.merge(per_continent, on="continent_norm", how="left")
    merged["n_rows"] = merged["n_rows"].fillna(0)

    fig, ax = plt.subplots(figsize=(12, 6))

    merged.plot(
        column="n_rows",
        ax=ax,
        legend=True,
        legend_kwds={"label": "Rows"}
    )

    # Add continent name labels near each polygon. representative_point() is
    # guaranteed to lie inside the geometry, unlike a centroid.
    reps = merged.geometry.representative_point()
    for (name, x, y) in zip(merged["CONTINENT"], reps.x, reps.y):
        ax.text(
            x, y, name,
            fontsize=9,
            ha="center", va="center",
            bbox=dict(facecolor="white", alpha=0.6, edgecolor="none", pad=1.5)
        )

    ax.set_title(title)
    ax.axis("off")
    plt.show()

plot_continent_map("Airbnb — rows per continent", airbnb_counts)
plot_continent_map("Travel Cities — rows per continent", cities_counts)
plot_continent_map("OSM — rows per continent", osm_counts)

In [0]:
# NOTE(review): mid-notebook install. The scipy<1.11 pin is presumably a compatibility
# workaround for reverse_geocoder — confirm and record the reason. Per the Databricks
# documentation, a %pip command that modifies the environment resets the notebook's
# Python state, so installs are safest in the very first cell.
%pip install "scipy<1.11" reverse_geocoder
print(airbnb_df.count())

In [0]:
# Distribution summary (project helper) for the final cities frame.
analyze_distributions(cities_df)

In [0]:
# Distribution summary (project helper) for the final Airbnb frame.
analyze_distributions(airbnb_df)

In [0]:
# Filter Datasets to Europe
# One shared predicate so all three datasets cover exactly the same country set.
europe_cc = continents['europe']
in_europe = F.col("addr_cc").isin(europe_cc)

# Airbnb
df_ab_eur = airbnb_df.filter(in_europe)
# Cities (Travel Hubs)
df_ci_eur = cities_df.filter(in_europe)
# OSM: read from the 'europe' file and additionally restricted to europe_cc, the same
# way counts_from_osm_paths() does, so POIs from countries outside the configured
# list cannot skew comparisons against the Airbnb and cities frames.
df_osm_eur = spark.read.parquet(OSM_PATHS["europe"]).filter(in_europe)

In [0]:
# ISO 3166-1 alpha-2 codes that the Europe maps below keep even when the map layer's
# CONTINENT attribute is not "Europe" (used as `world["ISO_A2"].isin(EXTRA_COUNTRIES)`).
# Author's rationale: physically complex (islands/transcontinental) countries that need
# explicit inclusion.
# GE=Georgia, TR=Turkey, SI=Slovenia, MT=Malta, IT=Italy (islands), SE=Sweden (Gotland), PT=Portugal (Azores/Madeira), ES=Spain (Canaries)
EXTRA_COUNTRIES = ["GE", "TR", "SI", "MT", "IT", "SE", "PT", "ES"]

In [0]:
# Row counts of the three European frames, evaluated in this order (each is a Spark action).
n_ab, n_ci, n_osm = (frame.count() for frame in (df_ab_eur, df_ci_eur, df_osm_eur))
# Number of distinct country codes that actually occur among the European listings.
n_countries = df_ab_eur.select("addr_cc").distinct().count()

report_lines = [
    "--- FINAL EUROPEAN DATASET STATISTICS ---",
    f"Total Listings:   {n_ab:,}",
    f"Total POIs:       {n_osm:,}",
    f"Travel Cities:    {n_ci} (Major Tourist Hubs)",
    f"Countries:        {n_countries}",
]
print("\n".join(report_lines))

In [0]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# 1. POI counts for every (country, category) pair, aggregated in Spark.
pdf_osm_composition = df_osm_eur.groupBy("addr_cc", "poi_group").count().toPandas()

# 2. Keep only the 10 countries with the most Airbnb listings (keeps the plot readable).
top_listing_counts = (
    df_ab_eur.groupBy("addr_cc").count().orderBy(F.col("count").desc()).limit(10)
)
top_countries = top_listing_counts.toPandas()["addr_cc"].tolist()
is_top_country = pdf_osm_composition["addr_cc"].isin(top_countries)
pdf_plot = pdf_osm_composition[is_top_country]

# 3. Wide table: one row per country, one column per category, 0 where a pair is absent.
pdf_pivot = pdf_plot.pivot(index="addr_cc", columns="poi_group", values="count").fillna(0)

# 4. Convert each row to percentages of that country's total.
country_totals = pdf_pivot.sum(axis=1)
pdf_normalized = pdf_pivot.div(country_totals, axis=0) * 100

# 5. 100% stacked bars, one bar per country.
ax = pdf_normalized.plot.bar(
    stacked=True,
    figsize=(12, 6),
    colormap="tab20",  # High contrast palette
    width=0.8,
)
ax.set_title("Environmental 'Vibe' Composition by Country")
ax.set_xlabel("Country")
ax.set_ylabel("Percentage of National POIs")
# Legend goes outside the axes so it does not cover the bars.
ax.legend(title="Category", bbox_to_anchor=(1.05, 1), loc='upper left')
ax.figure.tight_layout()
plt.show()

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql import functions as F

# 1. "Universe" of countries: one record per (continent, country code) from the config,
#    so countries without a single city still show up in the statistics.
all_countries_data = [
    {"continent": continent_name, "addr_cc": country_code}
    for continent_name, country_codes in continents.items()
    for country_code in country_codes
]
pdf_all_countries = pd.DataFrame(all_countries_data)

# 2. Observed number of cities per country code.
city_counts_spark = cities_df.groupBy("addr_cc").count().withColumnRenamed("count", "n_cities")
pdf_city_counts = city_counts_spark.toPandas()

# 3. Left join onto the universe: countries without cities come back as NaN -> 0.
pdf_sparsity = pdf_all_countries.merge(pdf_city_counts, on="addr_cc", how="left")
pdf_sparsity["n_cities"] = pdf_sparsity["n_cities"].fillna(0)


# 4. Per-continent totals, mean/std of cities per country, and countries with any city.
def _count_positive(values):
    """Number of entries strictly greater than zero."""
    return (values > 0).sum()


sparsity_metrics = pdf_sparsity.groupby("continent").agg(
    Total_Cities=pd.NamedAgg(column="n_cities", aggfunc="sum"),
    Total_Countries=pd.NamedAgg(column="addr_cc", aggfunc="count"),
    Avg_Cities_per_Country=pd.NamedAgg(column="n_cities", aggfunc="mean"),
    Std_Dev=pd.NamedAgg(column="n_cities", aggfunc="std"),
    Countries_with_Data=pd.NamedAgg(column="n_cities", aggfunc=_count_positive),
)

# 5. Share of a continent's countries that have at least one city, in percent.
coverage_ratio = sparsity_metrics["Countries_with_Data"] / sparsity_metrics["Total_Countries"]
sparsity_metrics["Coverage_Pct"] = (coverage_ratio * 100).round(1)

# Densest continents first; two decimals for display.
sparsity_metrics = sparsity_metrics.sort_values("Avg_Cities_per_Country", ascending=False).round(2)

print("--- City Data Sparsity Analysis by Continent ---")
display(sparsity_metrics)

# One-line textual summary per continent, for pasting into the report.
print("\nQuick Report Summary:")
for continent_label, stats in sparsity_metrics.iterrows():
    print(f"{continent_label}: {int(stats['Total_Cities'])} cities across {int(stats['Total_Countries'])} countries "
          f"(Avg: {stats['Avg_Cities_per_Country']} ± {stats['Std_Dev']}). Coverage: {stats['Coverage_Pct']}%")

In [0]:
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Aggregation for Plotting (Listings per Country): the 15 countries with most listings.
pdf_ab_counts = df_ab_eur.groupBy("addr_cc").count() \
    .withColumnRenamed("count", "listings_count") \
    .toPandas().sort_values("listings_count", ascending=False).head(15)

# Aggregation for Cities (Tourist Hubs per Country)
pdf_ci_counts = df_ci_eur.groupBy("addr_cc").count() \
    .withColumnRenamed("count", "cities_count") \
    .toPandas()

# Merge for side-by-side comparison. Left join: a top-15 country without any travel
# city gets cities_count = NaN and simply has no marker in the scatter layer.
pdf_merged = pd.merge(pdf_ab_counts, pdf_ci_counts, on="addr_cc", how="left")

# --- PLOT OF EUROPEAN MARKET DISTRIBUTION ---
# We use a dual-axis plot to show Listings (Bars) and City Hubs (Line/Points)
fig, ax1 = plt.subplots(figsize=(12, 6))

# 1. Bar Plot (Listings)
sns.barplot(
    data=pdf_merged, 
    x="addr_cc", 
    y="listings_count", 
    color="#00A699", 
    ax=ax1, 
    alpha=0.8
    # No 'label=' here, avoiding auto-legend confusion
)
ax1.set_ylabel("Number of Listings")
ax1.set_xlabel("Country")
ax1.set_title("Top 15 European Markets: Listing Density vs. Tourist Hubs")

# 2. Scatter Plot (Cities) on a second y axis sharing the same x axis
ax2 = ax1.twinx()
sns.scatterplot(
    data=pdf_merged, 
    x="addr_cc", 
    y="cities_count", 
    color="#FF5A5F", 
    s=100, 
    ax=ax2, 
    marker="D"
    # No 'label=' here either
)
ax2.set_ylabel("Major Tourist Cities")

# Series.max() skips NaN. The built-in max() does not: with NaN in the data its
# result depends on row order and can be NaN, which set_ylim() rejects.
max_cities = pdf_merged["cities_count"].max()
if pd.isna(max_cities) or max_cities <= 0:
    max_cities = 1  # no city data at all: fall back to a small valid range
ax2.set_ylim(0, max_cities * 1.2)

# Manually define what the legend should look like
legend_elements = [
    Patch(facecolor='#00A699', edgecolor='none', label='Airbnb Listings'),
    Line2D([0], [0], marker='D', color='w', markerfacecolor='#FF5A5F', markersize=10, label='Major Tourist Cities')
]

# Place the legend on ax1
ax1.legend(handles=legend_elements, loc="upper right")

plt.grid(False)
plt.show()

In [0]:
!pip install adjustText

In [0]:
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import pandas as pd
from pyspark.sql import functions as F
from adjustText import adjust_text

def count_by_country(df, cc_col="addr_cc"):
    """
    Count rows per country code.

    Args:
        df: Spark DataFrame with a country-code column.
        cc_col: Name of that column.

    Returns:
        pandas DataFrame with the columns `cc_col` and `n_rows`; rows whose
        country code is null are excluded.
    """
    with_code = df.where(F.col(cc_col).isNotNull())
    per_country = with_code.groupBy(cc_col).count().withColumnRenamed("count", "n_rows")
    return per_country.toPandas()


# Per-country counts for the three European frames (Airbnb, cities, OSM — in that order).
pdf_airbnb_cc, pdf_cities_cc, pdf_osm_cc = (
    count_by_country(frame) for frame in (df_ab_eur, df_ci_eur, df_osm_eur)
)


_NE_50M_COUNTRIES_URL = "https://naturalearth.s3.amazonaws.com/50m_cultural/ne_50m_admin_0_countries.zip"
_world_50m_cache = {}


def _load_world_50m():
    """Return a fresh copy of the Natural Earth 50m countries layer; downloaded only on first use."""
    if "world" not in _world_50m_cache:
        _world_50m_cache["world"] = gpd.read_file(_NE_50M_COUNTRIES_URL)
    # Copy, because callers patch ISO codes in place.
    return _world_50m_cache["world"].copy()


def plot_europe_choropleth(title, counts_df, cc_col="addr_cc"):
    """
    Draw a choropleth of per-country counts over Europe, labelled with ISO codes.

    Args:
        title: Figure title.
        counts_df: pandas DataFrame with a country-code column and an `n_rows` column.
        cc_col: Name of the country-code column in `counts_df` (ISO alpha-2 codes).

    Returns:
        None. The figure is shown with ``plt.show()``.

    Notes:
        The map keeps every country whose CONTINENT is "Europe" plus those in
        EXTRA_COUNTRIES. Countries without data are drawn with 0 and not labelled.
    """
    # Map (50m), cached after the first download instead of fetched per call.
    world = _load_world_50m()

    # FIX MAP CODES: Patch '-99' codes so they match the group-by data (FR, NO, etc.)
    world.loc[world['NAME'] == 'Norway', 'ISO_A2'] = 'NO'
    world.loc[world['NAME'] == 'France', 'ISO_A2'] = 'FR'
    world.loc[world['NAME'] == 'Kosovo', 'ISO_A2'] = 'XK'

    # Filter Map
    europe_map = world[
        (world["CONTINENT"] == "Europe") | 
        (world["ISO_A2"].isin(EXTRA_COUNTRIES))
    ].copy()

    # Merge Data
    europe_map = europe_map.merge(
        counts_df,
        left_on="ISO_A2",
        right_on=cc_col,
        how="left"
    )
    europe_map["n_rows"] = europe_map["n_rows"].fillna(0)

    # Plot
    fig, ax = plt.subplots(figsize=(14, 12))
    
    europe_map.plot(
        column="n_rows",
        cmap="Blues",       
        linewidth=0.5,
        ax=ax,
        edgecolor="0.6",
        legend=True,
        legend_kwds={'label': "Count", 'shrink': 0.6},
        missing_kwds={'color': '#f0f0f0'}
    )

    # View window (lon/lat degrees); wide enough for the Azores and the Canary Islands.
    # IMPORTANT: set view window BEFORE labeling/adjusting
    x_min, x_max = -32, 50
    y_min, y_max = 27, 72
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

    # LABELING LOOP
    texts = []
    for idx, row in europe_map.iterrows():
        iso = row['ISO_A2']
        count_val = row['n_rows']
        
        # Only label if we actually have data for this country
        if count_val == 0: continue

        # Standard Placement: Use the map's representative point
        rep_point = row.geometry.representative_point()
        x, y = rep_point.x, rep_point.y

        # Draw Label (Only if inside zoom view)
        if x_min < x < x_max and y_min < y < y_max:
            texts.append(ax.text(
                x, y, iso,
                fontsize=9,
                ha='center', va='center',
                color="white",
                fontweight='bold',
                path_effects=[pe.withStroke(linewidth=2, foreground="black")]
            ))

    # Keep labels on/near countries: small nudges, no arrows.
    # Skipped when nothing was labelled (e.g. an empty counts_df).
    if texts:
        adjust_text(
            texts,
            ax=ax,
            only_move={'text': 'xy'},
            max_move=(10, 10),          # small move in points
            force_text=(0.2, 0.2),
            expand_text=(1.05, 1.1),
            expand_points=(1.05, 1.1),
            ensure_inside_axes=True,
            lim=200
        )

    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.axis("off")
    plt.tight_layout()
    plt.show()

plot_europe_choropleth("Airbnb Listings Density (Europe)", pdf_airbnb_cc)
plot_europe_choropleth("Travel Cities Coverage (Europe)", pdf_cities_cc)
plot_europe_choropleth("OSM POI Density (Europe)", pdf_osm_cc)


In [0]:
# import geopandas as gpd

# # 1. Prepare Data
# # Ensure your cities data is a GeoDataFrame
# gdf_cities = gpd.GeoDataFrame(
#     pdf_cities_coords, 
#     geometry=gpd.points_from_xy(pdf_cities_coords.longitude, pdf_cities_coords.latitude),
#     crs="EPSG:4326"
# )

# # 2. Perform the Spatial Join
# # This attaches polygon info to the points. 
# # Points OUTSIDE the map will have 'index_right' as NaN.
# joined = gpd.sjoin(gdf_cities, europe_map, how="left", predicate="within")

# # 3. Filter for the "Floaters"
# outliers = joined[joined["index_right"].isna()]

# # 4. Show who they are
# print(f"Found {len(outliers)} points outside the map polygons.")
# display(outliers[["addr_cc", "latitude", "longitude"]].head(10)) 
# # Note: If you have a 'city_name' column, add it to the display list above!

In [0]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import functions as F
from matplotlib.lines import Line2D

# ---------------------------------------------------------
# 1. SETUP: CONTINENT MAPPING & GEOMETRY
# ---------------------------------------------------------
# NOTE(review): this section re-creates `world`, `world_cont`, `CONTINENT_MAP`,
# `_norm_raw` and `to_canonical_continent`, all of which an earlier cell already
# defines. The definitions here replace the earlier ones for the rest of the session,
# including for plot_continent_map(), which looks them up at call time.

# Load World Map (Low res is fine for continents)
world = gpd.read_file("https://naturalearth.s3.amazonaws.com/110m_cultural/ne_110m_admin_0_countries.zip")

# Dissolve countries into Continents
world_cont = world[["CONTINENT", "geometry"]].dissolve(by="CONTINENT").reset_index()
world_cont = world_cont[world_cont["CONTINENT"] != "Seven seas (open ocean)"].copy()

# Canonical mapping: same keys as the earlier cell's CONTINENT_MAP, but the values
# here are Title Case display names rather than snake_case.
CONTINENT_MAP = {
    "africa": "Africa",
    "antarctica": "Antarctica",
    "asia": "Asia",
    "europe": "Europe",
    "north_america": "North America",
    "central_america": "North America",
    "south_america": "South America",
    "oceania": "Oceania",
    "australia": "Oceania",
    "australia_oceania": "Oceania",
    "australia__oceania": "Oceania",
}

def _norm_raw(x):
    """Lowercase, strip, and turn hyphens/spaces into underscores; None passes through."""
    if x is None: return None
    return str(x).lower().strip().replace("-", "_").replace(" ", "_")

def to_canonical_continent(x):
    """Map a continent name to its CONTINENT_MAP value; unknown names come back normalised."""
    k = _norm_raw(x)
    return CONTINENT_MAP.get(k, k)

# Normalize geometry map side (the first .apply(_norm_raw) is redundant:
# to_canonical_continent normalises its input itself).
world_cont["continent_norm"] = world_cont["CONTINENT"].apply(_norm_raw).apply(to_canonical_continent)

# ---------------------------------------------------------
# 2. PREPARE DATA LAYERS
# ---------------------------------------------------------

def _sum_by_canonical_continent(counts_df, value_name):
    """Sum `n_rows` per canonical continent and return it as [continent_norm, value_name]."""
    out = counts_df.copy()
    out["continent_norm"] = out["continent"].apply(to_canonical_continent)
    # Several raw names can share a canonical one (e.g. north_america + central_america).
    out = out.groupby("continent_norm")["n_rows"].sum().reset_index()
    out.columns = ["continent_norm", value_name]
    return out


# LAYER 1: Airbnb Counts (Global), from 'airbnb_counts' computed earlier
pdf_ab_global = _sum_by_canonical_continent(airbnb_counts, "listings_count")

# LAYER 2: OSM Counts (Global), from 'osm_counts' computed earlier
pdf_osm_global = _sum_by_canonical_continent(osm_counts, "osm_count")

# LAYER 3: City markers (Global)
# A seeded 50% sample keeps plotting fast.
pdf_cities_global = cities_df.select("longitude", "latitude").sample(fraction=0.5, seed=42).toPandas()

# ---------------------------------------------------------
# 3. MERGING DATA
# ---------------------------------------------------------

# Merge Airbnb & OSM into the Geometry
world_cont = world_cont.merge(pdf_ab_global, on="continent_norm", how="left")
world_cont = world_cont.merge(pdf_osm_global, on="continent_norm", how="left")

# Continents without data get 0. Only the two count columns are filled: a blanket
# fillna(0) on the whole GeoDataFrame would also target the geometry column.
count_cols = ["listings_count", "osm_count"]
world_cont[count_cols] = world_cont[count_cols].fillna(0)

# Calculate Centroids for Bubbles (using temporary equal-area projection)
world_cont["centroid"] = world_cont.to_crs('+proj=cea').centroid.to_crs(world_cont.crs)

# ---------------------------------------------------------
# 4. PLOTTING
# ---------------------------------------------------------
# Three layers on one world map: continent fill = Airbnb listings, bubble size =
# OSM POI count, triangles = sampled travel cities.
fig, ax = plt.subplots(figsize=(18, 10))

# CHOROPLETH (Airbnb Density)
world_cont.plot(
    column="listings_count",
    ax=ax,
    cmap="Blues",
    edgecolor="#999999",
    linewidth=0.5,
    legend=True,
    legend_kwds={'label': "Total Listings", 'shrink': 0.5},
    missing_kwds={'color': '#f0f0f0'}
)

# BUBBLES (OSM Density)
# The largest continent gets a marker area of 4000; the rest scale linearly.
# Guarded so that "no OSM data at all" draws no bubbles instead of dividing by zero.
max_osm = world_cont["osm_count"].max()
scale_factor = 4000 / max_osm if max_osm > 0 else 0
ax.scatter(
    world_cont["centroid"].x, 
    world_cont["centroid"].y, 
    s=world_cont["osm_count"] * scale_factor, 
    color="#76c893", 
    alpha=0.6, 
    edgecolor="#2d6a4f", 
    linewidth=2,
    zorder=2
)

# CITY MARKERS (downward triangles, drawn above the bubbles)
CITY_MARKER = 'v'
CITY_COLOR = "#ff7034"
ax.scatter(
    pdf_cities_global["longitude"], 
    pdf_cities_global["latitude"], 
    color=CITY_COLOR, 
    s=60,               # Size of the pin
    marker=CITY_MARKER, # 'v' points down
    edgecolor="white",  # Adds a white border so it stands out on the map
    linewidth=0.5,
    alpha=1.0,
    zorder=3
)

# LABELS
# Use representative point for labeling continent names
reps = world_cont.geometry.representative_point()
for (name, x, y, osm_c) in zip(world_cont["CONTINENT"], reps.x, reps.y, world_cont["osm_count"]):
    # Only continents that have OSM data are labelled.
    if osm_c > 0:
        ax.text(
            x, y, 
            name,
            fontsize=10,
            ha="center", va="center",
            bbox=dict(facecolor="white", alpha=0.6, edgecolor="none", pad=1.5),
            zorder=3
        )

# STYLING
ax.set_title("Global Data Coverage", fontsize=20, fontweight='bold')
ax.axis("off")
ax.set_ylim(-60, 85) # Trim Antarctica slightly

# Custom Legend. The city entry uses the same marker and colour as the plotted
# city layer so the legend matches what is drawn.
legend_elements = [
    Line2D([0], [0], marker='s', color='w', markerfacecolor='#2171b5', markersize=15, label='High Airbnb Supply'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='#76c893', markersize=15, alpha=0.6, label='High OSM POI Count'),
    Line2D([0], [0], marker=CITY_MARKER, color='w', markerfacecolor=CITY_COLOR, markersize=8, label='City Metadata Available')
]
ax.legend(handles=legend_elements, loc='lower left', fontsize=12, frameon=True)

plt.tight_layout()
plt.savefig("global_data_coverage.png", dpi=300)
plt.show()

In [0]:
# Airbnb Counts (Choropleth Data)
pdf_ab_counts = count_by_country(df_ab_eur).rename(columns={"n_rows": "listings_count"})

# OSM Counts (Bubble Size Data)
pdf_osm_counts = count_by_country(df_osm_eur).rename(columns={"n_rows": "osm_count"})

# City Coordinates (city marker data)
pdf_cities_coords = cities_df.filter(F.col("addr_cc").isin(europe_cc)) \
    .select("addr_cc", "longitude", "latitude").toPandas()

# LOAD & FILTER MAP
# Use 50m resolution (better for Slovenia coastline, Malta, and islands)
world = gpd.read_file("https://naturalearth.s3.amazonaws.com/50m_cultural/ne_50m_admin_0_countries.zip")

# Patch the '-99' ISO_A2 placeholders, exactly as plot_europe_choropleth() does.
# Without this, France, Norway and Kosovo never match a country code in the merges
# below and are shown with zero listings/POIs.
world.loc[world['NAME'] == 'Norway', 'ISO_A2'] = 'NO'
world.loc[world['NAME'] == 'France', 'ISO_A2'] = 'FR'
world.loc[world['NAME'] == 'Kosovo', 'ISO_A2'] = 'XK'

# Filter: Keep "Europe" OR the specific countries in our constant list
europe_map = world[
    (world["CONTINENT"] == "Europe") | 
    (world["ISO_A2"].isin(EXTRA_COUNTRIES))
].copy()

# MERGE AGGREGATED STATS
# Both right-hand frames carry an "addr_cc" column, so after the second merge the
# map holds addr_cc_x / addr_cc_y; neither is used afterwards.
# Merge Airbnb
europe_map = europe_map.merge(pdf_ab_counts, left_on="ISO_A2", right_on="addr_cc", how="left")
# Merge OSM
europe_map = europe_map.merge(pdf_osm_counts, left_on="ISO_A2", right_on="addr_cc", how="left")

# Fill NaNs (countries without data count as 0)
europe_map["listings_count"] = europe_map["listings_count"].fillna(0)
europe_map["osm_count"] = europe_map["osm_count"].fillna(0)

# Calculate Centroids for the OSM Bubbles via a temporary equal-area projection,
# as the global coverage map does; centroids taken directly in lon/lat are
# inaccurate and make geopandas warn.
europe_map["centroid"] = europe_map.to_crs('+proj=cea').centroid.to_crs(europe_map.crs)

# PLOT
# Three layers: country fill = Airbnb listings, bubble size = OSM POI count,
# triangles = travel cities.
fig, ax = plt.subplots(figsize=(16, 12))

# LAYER 1: BASE MAP (Airbnb Choropleth)
europe_map.plot(
    column="listings_count",
    ax=ax,
    cmap="Blues",
    edgecolor="#d0d0d0",
    linewidth=0.5,
    legend=True,
    missing_kwds={'color': '#f9f9f9'}
)

# LAYER 2: OSM DENSITY (Green Bubbles)
# The country with most POIs gets a marker area of 2000; the rest scale linearly.
# Guarded so that "no OSM data at all" draws no bubbles instead of dividing by zero.
max_osm = europe_map["osm_count"].max()
scale_factor = 2000 / max_osm if max_osm > 0 else 0
ax.scatter(
    europe_map["centroid"].x, 
    europe_map["centroid"].y, 
    s=europe_map["osm_count"] * scale_factor, 
    color="#76c893", 
    alpha=0.6,
    edgecolor="#2d6a4f",
    linewidth=1,
    zorder=2
)

# LAYER 3: TRAVEL CITIES (downward triangles, drawn on top)
CITY_MARKER = 'v'
CITY_COLOR = "#ff7034"
ax.scatter(
    pdf_cities_coords["longitude"], 
    pdf_cities_coords["latitude"], 
    color=CITY_COLOR, 
    s=60,               # Size of the pin
    marker=CITY_MARKER, # 'v' points down
    edgecolor="white",  # Adds a white border so it stands out on the map
    linewidth=0.5,
    alpha=1.0,
    zorder=3
)

# FORMATTING
ax.set_title("Europe Data Coverage", fontsize=16, fontweight='bold')

# Expanded limits to include Azores (left) and Canary Islands (bottom)
ax.set_xlim(-32, 50)
ax.set_ylim(27, 72)
ax.axis("off")

# Custom Legend. The city entry uses the same marker and colour as the plotted
# city layer so the legend matches what is drawn.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='#4292c6', markersize=15, label='High Airbnb Supply'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='#76c893', markersize=15, alpha=0.6, label='High OSM/Vibe Density'),
    Line2D([0], [0], marker=CITY_MARKER, color='w', markerfacecolor=CITY_COLOR, markersize=8, label='Rich City Metadata')
]
ax.legend(handles=legend_elements, loc='upper left', fontsize=11, frameon=True)

plt.tight_layout()
plt.savefig("europe_data_coverage.png", dpi=300)
plt.show()