In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Listing

In [None]:
# Columns needed for the listing-level analysis; every other CSV column is skipped on load.
cols = [
    # identifiers and location
    'id',
    'host_id',
    'neighbourhood_cleansed',
    'latitude',
    'longitude',
    # listing characteristics
    'room_type',
    'accommodates',
    'price',
    'minimum_nights',
    # review activity
    'number_of_reviews',
    'review_scores_rating',
    'last_review',
    'reviews_per_month',
    # host portfolio
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'host_listings_count',
    'host_is_superhost',
    # everything else
    'amenities',
    'last_scraped',
    'license',
    'availability_365',
]

# `df` stays exactly as loaded; `listings` is the working copy that later cells modify.
listings_csv = './airbnb_data/listings_truncated/2024-12-07_listings.csv'
df = pd.read_csv(listings_csv, low_memory=False, usecols=cols)
listings = df.copy()

In [None]:
# Preview the first rows of the working copy.
listings.head()

In [None]:
# Number of missing values in each column (isna performs the same check as isnull).
listings.isna().sum()

In [None]:
# Clean price: strip the currency symbol and thousands separators, then parse as float.
# The pattern is a raw string so `\$` reaches the regex engine unchanged and Python does
# not warn about an invalid escape sequence.
listings['price'] = listings['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Clean superhost flag: a missing flag is treated as "not a superhost" ('f')
listings['host_is_superhost'] = listings['host_is_superhost'].fillna('f')

# Fill missing reviews info: a missing value is treated as zero reviews
listings['reviews_per_month'] = listings['reviews_per_month'].fillna(0)
listings['number_of_reviews'] = listings['number_of_reviews'].fillna(0)

# Replace NaN in rating with the median of the ratings that are present
listings['review_scores_rating'] = listings['review_scores_rating'].fillna(listings['review_scores_rating'].median())

In [None]:
# Histogram of minimum nights over all listings (missing values are dropped first)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(listings['minimum_nights'].dropna(), bins=30, color='blue', edgecolor='black')
ax.set_title('Distribution of Minimum Nights')
ax.set_xlabel('Minimum Nights')
ax.set_ylabel('Number of Listings')
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# Keep only listings at or below a fixed minimum-nights cap (a fixed value, not a percentile)
min_nights_threshold = 30 # common Airbnb minimum nights for short-term rentals
min_nights_covered = listings.loc[listings['minimum_nights'] <= min_nights_threshold]

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(min_nights_covered['minimum_nights'].dropna(), bins=30, color='blue', edgecolor='black')
ax.set_title('Distribution of Minimum Nights (After Removing Outliers)')
ax.set_xlabel('Minimum Nights')
ax.set_ylabel('Number of Listings')
ax.set_xlim(0, min_nights_threshold)  # x-axis spans only the retained range of minimum nights
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# Histogram of listing prices over all listings (missing prices are dropped first)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(listings['price'].dropna(), bins=100, color='blue', edgecolor='black')
ax.set_title('Distribution of Listing Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Number of Listings')
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# Cap prices at the 99th percentile and look at the distribution below that cap
price_threshold = listings['price'].quantile(0.99)
price_covered = listings.loc[listings['price'] <= price_threshold]

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(price_covered['price'].dropna(), bins=100, color='blue', edgecolor='black')
ax.set_title('Distribution of Listing Prices (After Removing Outliers)')
ax.set_xlabel('Price')
ax.set_ylabel('Number of Listings')
ax.set_xlim(0, price_threshold)  # x-axis spans only the retained price range
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# Apply both outlier filters to the working frame used by all further analysis.
# Comparisons against NaN evaluate to False, so rows with a missing price or
# missing minimum_nights are dropped here as well.
within_limits = (
    (listings['minimum_nights'] <= min_nights_threshold)
    & (listings['price'] <= price_threshold)
)
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
listings = listings[within_limits].copy()

In [None]:
# Row counts before and after filtering (`df` is the frame as loaded, `listings` the filtered one)
n_before, n_after = len(df), len(listings)
print(f"Total listings before outlier removal: {n_before}")
print(f"Total listings after outlier removal: {n_after}")
print(f"Total listings removed: {n_before - n_after}")

In [None]:
# Summary statistics of the numeric columns after outlier removal.
listings.describe()

In [None]:
# Some feature engineering can be done here.
# assign() returns a new frame, so this cell can be re-run safely and does not write
# into a filtered slice (which would raise SettingWithCopyWarning).
listings = listings.assign(
    # 1 when the host has more than one listing, else 0
    multi_host_flag=(listings['calculated_host_listings_count'] > 1).astype(int),
    # a capacity of 0 is treated as missing so the ratio becomes NaN instead of inf
    price_per_accommodate=listings['price'] / listings['accommodates'].replace(0, np.nan),
)

In [None]:
# One row per neighbourhood. All pct_* columns are percentages on a 0-100 scale.
agg = (
    listings.groupby('neighbourhood_cleansed')
    .agg(
        n_listings=('id', 'count'),
        avg_price=('price', 'mean'),
        median_price=('price', 'median'),
        avg_price_per_person=('price_per_accommodate', 'mean'),
        pct_entire_home=('room_type', lambda x: (x=='Entire home/apt').mean()*100),
        pct_superhost=('host_is_superhost', lambda x: (x=='t').mean()*100),
        avg_availability=('availability_365', 'mean'),
        avg_rating=('review_scores_rating', 'mean'),
        avg_reviews=('number_of_reviews', 'mean'),
        avg_min_nights=('minimum_nights', 'mean'),
        # scaled by 100 so it matches the other pct_* columns (was a 0-1 fraction)
        pct_multi_host=('multi_host_flag', lambda x: x.mean()*100),
    )
    .reset_index()
)
# Label the rows with the scrape date of the file loaded above (2024-12-07_listings.csv)
agg['snapshot_date'] = '2024-12-07'

In [None]:
# Show the per-neighbourhood summary table.
agg

### Calendar

In [None]:
# Load the calendar snapshot. NOTE: this rebinds `df`, which held the raw listings above.
calendar_csv = './airbnb_data/calendar_truncated/2024-12-07_calendar.csv'
df = pd.read_csv(calendar_csv, low_memory=False)

df.head()

In [None]:
# Missing values per column; a calendar without gaps can be aggregated without imputation
df.isna().sum()

### Geo

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load geojson file for neighbourhoods
geo = gpd.read_file('./airbnb_data/neighbourhoods.geojson')

# Project to Web Mercator (metres); this stays the working CRS of the geometries
geo = geo.to_crs(epsg=3857)

# Dissolve so that each neighbourhood is represented by a single geometry
geo_dissolved = geo.dissolve(by='neighbourhood', as_index=False)

# Calculate area in square kilometers.
# Web Mercator inflates areas by roughly 1/cos^2(latitude), i.e. about 2x at Venice's
# latitude (~45.4 N), so the area is measured in UTM zone 33N (EPSG:32633) instead,
# where the distortion around Venice is negligible.
geo_dissolved['area_km2'] = geo_dissolved.geometry.to_crs(epsg=32633).area / 10**6

In [None]:
# Per-neighbourhood, per-snapshot summary table (rebinds `df`)
df = pd.read_csv('./airbnb_data/airbnb_neighbourhood_summary.csv')

# Right join: every summary row is kept, with geometry and area attached where the name matches
merged = geo_dissolved.merge(
    df,
    how='right',
    left_on='neighbourhood',
    right_on='neighbourhood_cleansed',
)

# Listings per square kilometre, plus log(1 + density) to reduce skewness
listing_density = merged["n_listings"] / merged["area_km2"]
merged["density_listings_per_km2"] = listing_density
merged["log_density"] = np.log1p(listing_density)

In [None]:
# Choropleth of the listing count for the first snapshot (in order of appearance)
snapshot_date = merged['snapshot_date'].unique()[0]
merged_snapshot = merged.loc[merged['snapshot_date'] == snapshot_date]

fig, ax = plt.subplots(figsize=(8, 8))
merged_snapshot.plot(
    ax=ax,
    column='n_listings',
    cmap='OrRd',
    edgecolor='0.8',
    linewidth=0.8,
    legend=True,
)
ax.set_title(f"Total listings per neighborhood - Venice - {snapshot_date}", fontsize=14)
ax.set_axis_off()
plt.show()

In [None]:
# Choropleth of listing density for the first snapshot (in order of appearance)
fig, axes = plt.subplots(figsize=(8,8))
snapshot_date = merged['snapshot_date'].unique()[0]
merged_snapshot = merged[merged['snapshot_date'] == snapshot_date]
merged_snapshot.plot(
    column='density_listings_per_km2',
    cmap='OrRd',
    linewidth=0.8,
    edgecolor='0.8',
    legend=True,
    ax=axes
)
# The plotted quantity is listings per square kilometre, so the title states km2 (was "per km")
axes.set_title(f"Listing Density per km2 - Venice - {snapshot_date}", fontsize=14)
axes.axis('off')
plt.show()

In [None]:
# Choropleth of log(1 + listings per km2) for the first snapshot (in order of appearance)
fig, axes = plt.subplots(figsize=(8,8))
snapshot_date = merged['snapshot_date'].unique()[0]
merged_snapshot = merged[merged['snapshot_date'] == snapshot_date]
merged_snapshot.plot(
    column='log_density',
    cmap='OrRd',
    linewidth=0.8,
    edgecolor='0.8',
    legend=True,
    ax=axes
)
# The title names the log-transformed column; it previously repeated the raw-density title
axes.set_title(f"Log Listing Density (log1p of listings per km2) - Venice - {snapshot_date}", fontsize=14)
axes.axis('off')
plt.show()

In [None]:
# One average-price map per snapshot, side by side.
# The panel count follows the data instead of assuming exactly four snapshots;
# with four snapshots the figure size is (30, 8) as before.
snapshot_dates = merged['snapshot_date'].unique()
n_panels = len(snapshot_dates)
# squeeze=False keeps `axes` two-dimensional even when there is a single snapshot
fig, axes = plt.subplots(figsize=(7.5 * n_panels, 8), ncols=n_panels, nrows=1, squeeze=False)
for ax, snapshot_date in zip(axes[0], snapshot_dates):
    merged_snapshot = merged[merged['snapshot_date'] == snapshot_date]
    merged_snapshot.plot(
        column='avg_price',
        cmap='OrRd',
        linewidth=0.8,
        edgecolor='0.8',
        legend=True,
        ax=ax
    )
    ax.set_title(f"Average Price by Neighborhood - Venice - {snapshot_date}", fontsize=14)
    ax.axis('off')
plt.show()

The average price at snapshot date 2025-09-11 looks a little off: only one small neighborhood has a very high average price, while the other neighborhoods have relatively low average prices. Let's remove that outlier and see whether the overall pattern becomes clearer.

In [None]:
# Take the last snapshot (in order of appearance) as an independent copy
snapshot_date = merged['snapshot_date'].unique()[-1]
merged_snapshot = merged.loc[merged['snapshot_date'] == snapshot_date].copy()

# Keep only neighbourhoods whose average price is strictly below the 95th percentile
price_cap = merged_snapshot['avg_price'].quantile(0.95)
merged_snapshot = merged_snapshot.loc[merged_snapshot['avg_price'] < price_cap]

fig, ax = plt.subplots(figsize=(8, 8))
merged_snapshot.plot(
    ax=ax,
    column='avg_price',
    cmap='OrRd',
    edgecolor='0.8',
    linewidth=0.8,
    legend=True,
)
ax.set_title(f"Average Price - Venice - {snapshot_date}", fontsize=14)
ax.set_axis_off()
plt.show()

Yes. After dropping the neighborhoods at or above the 95th percentile of average price, the average prices across the remaining neighborhoods appear more balanced and consistent with expected trends. The map now shows a clearer distribution of average prices without the distortion caused by the outlier.

In [None]:
# One average-availability map per snapshot, side by side.
# The panel count follows the data instead of assuming exactly four snapshots;
# with four snapshots the figure size is (30, 8) as before.
snapshot_dates = merged['snapshot_date'].unique()
n_panels = len(snapshot_dates)
# squeeze=False keeps `axes` two-dimensional even when there is a single snapshot
fig, axes = plt.subplots(figsize=(7.5 * n_panels, 8), ncols=n_panels, nrows=1, squeeze=False)
for ax, snapshot_date in zip(axes[0], snapshot_dates):
    merged_snapshot = merged[merged['snapshot_date'] == snapshot_date]
    merged_snapshot.plot(
        column='avg_availability',
        cmap='OrRd',
        linewidth=0.8,
        edgecolor='0.8',
        vmax=90,  # colour scale tops out at 90 in every panel
        legend=True,
        ax=ax
    )
    ax.set_title(f"Average availability by Neighborhood - Venice - {snapshot_date}", fontsize=14)
    ax.axis('off')
plt.show()

In [None]:
# Choropleth of the average minimum nights for the first snapshot (in order of appearance)
snapshot_date = merged['snapshot_date'].unique()[0]
merged_snapshot = merged.loc[merged['snapshot_date'] == snapshot_date]

fig, ax = plt.subplots(figsize=(8, 8))
merged_snapshot.plot(
    ax=ax,
    column='avg_min_nights',
    cmap='OrRd',
    edgecolor='0.8',
    linewidth=0.8,
    legend=True,
)
ax.set_title(f"Average Minimum Nights - Venice - {snapshot_date}", fontsize=14)
ax.set_axis_off()
plt.show()

In [None]:
# Independent copy of the merged frame (all snapshots) for the interactive map below.
summary = merged.copy()

In [None]:
import json
import plotly.express as px

In [None]:
# 1) Ensure GeoDataFrame uses WGS84 (lat/lon)
summary = summary.to_crs(epsg=4326)

# 2) Make sure the join key is a string and normalized
summary['neighbourhood'] = summary['neighbourhood'].astype(str).str.strip()

# 3) Export the shapes to GeoJSON (as dict), one feature per neighbourhood.
#    `summary` has one row per neighbourhood and snapshot, so exporting it whole
#    would repeat every feature id once per snapshot.
neighbourhood_shapes = summary[['neighbourhood', 'geometry']].drop_duplicates(subset='neighbourhood')
geojson = json.loads(neighbourhood_shapes.to_json())

# 4) Plot — use featureidkey that points to property name in geojson ("properties.neighbourhood")
fig = px.choropleth_map(
    summary,                          # DataFrame
    geojson=geojson,                 # GeoJSON dict
    locations='neighbourhood',       # column in DataFrame that matches feature property
    color='density_listings_per_km2',
    featureidkey='properties.neighbourhood',
    hover_name='neighbourhood',
    hover_data=['n_listings', 'avg_price', 'area_km2', 'avg_min_nights'],
    center={"lat": 45.44, "lon": 12.33},
    animation_frame="snapshot_date",  # one animation frame per snapshot
    zoom=11,
    color_continuous_scale="YlOrRd",
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

### Comparison

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load geojson file for neighbourhoods
geo = gpd.read_file('./airbnb_data/neighbourhoods.geojson')

# Project to Web Mercator (metres); this stays the working CRS of the geometries
geo = geo.to_crs(epsg=3857)

# Dissolve so that each neighbourhood is represented by a single geometry
geo_dissolved = geo.dissolve(by='neighbourhood', as_index=False)

# Calculate area in square kilometers.
# Web Mercator inflates areas by roughly 1/cos^2(latitude), i.e. about 2x at Venice's
# latitude (~45.4 N), so the area is measured in UTM zone 33N (EPSG:32633) instead,
# where the distortion around Venice is negligible.
geo_dissolved['area_km2'] = geo_dissolved.geometry.to_crs(epsg=32633).area / 10**6

In [None]:
# Per-neighbourhood, per-snapshot summary table (rebinds `df`)
df = pd.read_csv('./airbnb_data/airbnb_neighbourhood_summary.csv')

# Right join: every summary row is kept, with geometry and area attached where the name matches
merged = geo_dissolved.merge(
    df,
    how='right',
    left_on='neighbourhood',
    right_on='neighbourhood_cleansed',
)

# Listings per square kilometre, plus log(1 + density) to reduce skewness
listing_density = merged["n_listings"] / merged["area_km2"]
merged["density_listings_per_km2"] = listing_density
merged["log_density"] = np.log1p(listing_density)

# The 2024-12-07 snapshot is the basis for the comparisons below
snapshot_date = '2024-12-07'
summary = merged.loc[merged['snapshot_date'] == snapshot_date].copy()

In [None]:
# Let's take a look at the data
# Preview the first rows of the selected snapshot.
summary.head()

In [None]:
# Bar plot of the 20 neighbourhoods with the highest average price
compared = summary.sort_values(by='avg_price', ascending=False)
top_rows = compared.head(20)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(top_rows['neighbourhood_cleansed'], top_rows['avg_price'], color='skyblue')
plt.xticks(rotation=90)
ax.set_title('Average Price per Neighborhood in Venice')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Average Price')
fig.tight_layout()
plt.show()

In [None]:
# Bar plot of the 20 neighbourhoods with the highest listing density per km2
compared = summary.sort_values(by='density_listings_per_km2', ascending=False)
top_rows = compared.head(20)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(top_rows['neighbourhood_cleansed'], top_rows['density_listings_per_km2'], color='skyblue')
plt.xticks(rotation=90)
ax.set_title('Density of Listings per km2 by Neighborhood in Venice')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Density (Listings per km2)')
fig.tight_layout()
plt.show()

In [None]:
# Bar plot of the 20 neighbourhoods with the highest average availability
compared = summary.sort_values(by='avg_availability', ascending=False)
top_rows = compared.head(20)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(top_rows['neighbourhood_cleansed'], top_rows['avg_availability'], color='skyblue')
plt.xticks(rotation=90)
ax.set_title('Average Availability by Neighborhood in Venice')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Average Availability')
fig.tight_layout()
plt.show()

The charts above look a little off, so let's do some comparisons to verify.

In [None]:
# Step 1: the 20 densest neighbourhoods
by_density = summary.sort_values(by='density_listings_per_km2', ascending=False)
top20_density = by_density.head(20)

# Step 2: among those, the 10 with the highest average price
top10_price = top20_density.sort_values(by='avg_price', ascending=False).head(10)

# Step 3: bar chart with a dashed reference line at an average price of 150
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top10_price['neighbourhood_cleansed'], top10_price['avg_price'], color='skyblue')
ax.axhline(y=150, color='r', linestyle='--')
plt.xticks(rotation=90)
ax.set_title('Top 10 Neighborhoods by Average Price among Top 20 by Listing Density in Venice')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Average Price')
fig.tight_layout()
plt.show()

In [None]:
# Listing density of the same 10 neighbourhoods, densest first,
# with a dashed reference line at 200 listings per km2
top10_price_sorted = top10_price.sort_values(by='density_listings_per_km2', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    top10_price_sorted['neighbourhood_cleansed'],
    top10_price_sorted['density_listings_per_km2'],
    color='skyblue',
)
ax.axhline(y=200, color='r', linestyle='--')
plt.xticks(rotation=90)
ax.set_title('Listing Density among Top 10 Neighborhoods by Average Price in Venice')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Density (Listings per km2)')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the key neighbourhood metrics, drawn as an annotated heatmap
key_metrics = ['avg_price', 'avg_availability', 'avg_min_nights', 'density_listings_per_km2']
correlation_matrix = summary[key_metrics].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Key Variables in Venice')
plt.show()

Wow! The average price has a strong positive correlation with both the density of listings per km2 and the average minimum nights. This means that neighborhoods with a higher density of listings, or with longer average minimum stays, tend to have higher average prices.

In [None]:
# Scatter of average price against listing density, one point per neighbourhood
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=summary, x='density_listings_per_km2', y='avg_price', color='blue', ax=ax)
ax.set_title('Average Price vs Density Listings per km2 in Venice')
ax.set_xlabel('Density Listings per km2')
ax.set_ylabel('Average Price')
plt.show()

In [None]:
# Scatter of average price against average minimum nights, one point per neighbourhood
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=summary, x='avg_min_nights', y='avg_price', color='green', ax=ax)
ax.set_title('Average Price vs Average Minimum Nights in Venice')
ax.set_xlabel('Average Minimum Nights')
ax.set_ylabel('Average Price')
plt.show()

### Predictive

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
# Merged calendar summary, with the `date` column parsed as datetimes (rebinds `df`)
df = pd.read_csv(
    './airbnb_data/merged_calendar_summary.csv',
    parse_dates=['date'],
)
df.head()

#### San Marco

In [None]:
# Rows belonging to the San Marco neighbourhood
is_san_marco = df['neighbourhood_cleansed'] == 'San Marco'
small_df = df[is_san_marco]
small_df.head()

In [None]:
# Calendar average price over time for the selected neighbourhood
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(small_df['date'], small_df['avg_price_calendar'], marker='o', linestyle='-')
ax.set_title('Average Price Trend Over Time - San Marco')
ax.set_xlabel('Date')
ax.set_ylabel('Average Price')
ax.grid()
plt.show()

In [None]:
# Occupancy rate over time for the selected neighbourhood
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(small_df['date'], small_df['occupancy_rate'], marker='o', linestyle='-', color='orange')
ax.set_title('Average Occupancy Trend Over Time - San Marco')
ax.set_xlabel('Date')
ax.set_ylabel('Average Occupancy Rate')
ax.grid()
plt.show()

#### San Polo

In [None]:
# Rows belonging to the San Polo neighbourhood
is_san_polo = df['neighbourhood_cleansed'] == 'San Polo'
small_df = df[is_san_polo]
small_df.head()

In [None]:
# Calendar average price over time for the selected neighbourhood
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(small_df['date'], small_df['avg_price_calendar'], marker='o', linestyle='-')
ax.set_title('Average Price Trend Over Time - San Polo')
ax.set_xlabel('Date')
ax.set_ylabel('Average Price')
ax.grid()
plt.show()

In [None]:
# Occupancy rate over time for the selected neighbourhood
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(small_df['date'], small_df['occupancy_rate'], marker='o', linestyle='-', color='orange')
ax.set_title('Average Occupancy Trend Over Time - San Polo')
ax.set_xlabel('Date')
ax.set_ylabel('Average Occupancy Rate')
ax.grid()
plt.show()

### External

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import seaborn as sns

In [None]:
# Immobiliare price trends, with the `timestamp` column parsed as datetimes
df_all = pd.read_csv(
    './airbnb_data/immobiliare_venice_price_trends.csv',
    parse_dates=['timestamp'],
)
df_all.head()

In [None]:
# Rows of the combined San Polo / Santa Croce zone
in_zone = df_all['neighborhood'] == 'san-polo-santa-croce'
df_all_small = df_all[in_zone]
df_all_small.head()

In [None]:
# Property price trend from the 81st row of the zone onward.
# NOTE(review): the positional offset 80 is presumably where mid-2024 starts;
# confirm against the data, since it depends on row order.
recent = df_all_small.iloc[80:]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(recent['timestamp'], recent['values'], marker='o', linestyle='-')
ax.set_title('Average Property Price Trend Over Time - San Polo Santa Croce')
ax.set_xlabel('Date')
ax.set_ylabel('Average Property Price')
ax.grid()
plt.show()

In [None]:
# Merged calendar summary, with the `date` column parsed as datetimes (rebinds `df`)
df = pd.read_csv(
    './airbnb_data/merged_calendar_summary.csv',
    parse_dates=['date'],
)
df.head()

In [None]:
# Rows of the two Airbnb neighbourhoods that make up the combined Immobiliare zone
zone_neighbourhoods = ['San Polo', 'Santa Croce']
small_df = df.loc[df['neighbourhood_cleansed'].isin(zone_neighbourhoods)]
small_df

In [None]:
# Add the calendar month of each row as a monthly Period.
# assign() returns a new frame, so nothing is written into the filtered slice of `df`
# (which would raise SettingWithCopyWarning) and the cell can be re-run safely.
small_df = small_df.assign(month_year=small_df['date'].dt.to_period("M"))

In [None]:
# Monthly mean of price and occupancy across the selected neighbourhoods
monthly = small_df.groupby("month_year", as_index=False).agg(
    avg_price_calendar=("avg_price_calendar", "mean"),
    occupancy_rate=("occupancy_rate", "mean"),
)

In [None]:
# Monthly average Airbnb price; periods are converted to strings for the x-axis
month_labels = monthly['month_year'].astype(str)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(month_labels, monthly['avg_price_calendar'], marker='o', linestyle='-')
ax.set_title('Average Price Trend Over Time - San Polo & Santa Croce')
ax.set_xlabel('Month-Year')
ax.set_ylabel('Average Price')
ax.grid()
plt.show()

In [None]:
# Rows of the combined San Polo / Santa Croce zone, with the month of each timestamp.
# assign() builds a new frame, so nothing is written into a filtered slice of `df_all`
# (which would raise SettingWithCopyWarning).
small_df_all = (
    df_all[df_all['neighborhood'] == 'san-polo-santa-croce']
    .assign(month_year=lambda zone: zone['timestamp'].dt.to_period('M'))
)

In [None]:
# Keep only the months present in both sources (rebinds `merged`)
merged = pd.merge(monthly, small_df_all, how="inner", on="month_year")

In [None]:
# Show the joined monthly Airbnb / Immobiliare table.
merged

In [None]:
# Correlation between the Immobiliare property price (`values`) and the Airbnb average price
price_columns = ['values', 'avg_price_calendar']
correlation_matrix = merged[price_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Airbnb and Immobiliare Average Prices of San Polo & Santa Croce')
plt.show()