# Airbnb Price Predictor: Exploratory Data Analysis (Milestone 2)

This notebook performs exploratory data analysis (EDA) on the New York City Airbnb listings dataset from Inside Airbnb. The dataset was scraped on 17 June 2025 and is licensed under Creative Commons CC0 1.0 (public domain). Feel free to run the cells to reproduce the plots used in the white paper.

**Note:** This notebook assumes the gzip-compressed CSV file `listings.csv.gz` is located in the same directory as the notebook itself.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style globally. `set_theme` is the preferred
# spelling; `sns.set` is only an alias for it that may be removed.
sns.set_theme(style='whitegrid')


In [None]:
# Load the gzip-compressed CSV file (pandas infers the compression from the
# `.gz` suffix).
listings = pd.read_csv('listings.csv.gz')

# Convert the price strings to floats by stripping dollar signs and commas.
# A single raw-string character class is used so the pattern contains no
# invalid escape sequences (the non-raw '\,' warns on modern Python).
listings['price_num'] = (
    listings['price']
    .replace(r'[$,]', '', regex=True)
    .astype(float)
)

# Show the first rows and the price summary with rich rendering; a bare
# tuple as the last expression would fall back to a plain-text repr.
display(listings.head(), listings['price_num'].describe())


In [None]:
# Distribution of listing prices on a log10 scale.
# Only strictly positive prices are kept, since log10 is undefined otherwise.
subset = listings['price_num'].dropna()
log_prices = np.log10(subset.loc[subset > 0])

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(log_prices, bins=30, color='skyblue', edgecolor='k')
ax.set_xlabel('Log10 of price (USD)')
ax.set_ylabel('Count')
ax.set_title('Distribution of Airbnb Listing Prices (Log10 scale)')
plt.show()


In [None]:
# Price spread per room type on a logarithmic price axis (outliers hidden).
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(
    data=listings,
    x='room_type',
    y='price_num',
    showfliers=False,
    ax=ax,
)
ax.set_yscale('log')
ax.set_xlabel('Room type')
ax.set_ylabel('Price (USD) [log scale]')
ax.set_title('Price distribution by room type')
plt.show()


In [None]:
# Price spread per borough (column `neighbourhood_group_cleansed`).
# Rows missing either the borough or the numeric price are discarded, and
# only boroughs with more than 500 remaining listings are plotted.
subset2 = listings.dropna(subset=['neighbourhood_group_cleansed', 'price_num'])
counts = subset2['neighbourhood_group_cleansed'].value_counts()
selected = counts.loc[counts.gt(500)].index
in_selected = subset2['neighbourhood_group_cleansed'].isin(selected)
subset2 = subset2.loc[in_selected]

fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(
    data=subset2,
    x='neighbourhood_group_cleansed',
    y='price_num',
    showfliers=False,
    ax=ax,
)
ax.set_yscale('log')
ax.set_xlabel('Borough')
ax.set_ylabel('Price (USD) [log scale]')
ax.set_title('Price distribution by borough')
plt.show()


In [None]:
# Average price by number of bedrooms.
# The frame is built in one chain with `.assign` instead of assigning a
# column onto the result of `dropna`, which triggers pandas'
# SettingWithCopyWarning.
subset3 = (
    listings
    .dropna(subset=['bedrooms'])
    .assign(bedrooms=lambda df: pd.to_numeric(df['bedrooms'], errors='coerce'))
)

# Mean price per bedroom count; groups whose mean is NaN are dropped and only
# the ten smallest bedroom counts are kept for plotting.
avg_price_by_bedroom = (
    subset3
    .groupby('bedrooms')['price_num']
    .mean()
    .dropna()
    .sort_index()
    .head(10)
)

plt.figure(figsize=(8,4))
sns.barplot(x=avg_price_by_bedroom.index.astype(int), y=avg_price_by_bedroom.values, color='lightgreen')
plt.ylabel('Average price (USD)')
plt.xlabel('Number of bedrooms')
plt.title('Average price by number of bedrooms')
plt.show()


In [None]:
# Price against review score rating, drawn from a reproducible random sample
# of at most 2000 listings that have both values present.
subset4 = listings.dropna(subset=['review_scores_rating', 'price_num'])
n_points = min(2000, len(subset4))
sample = subset4.sample(n=n_points, random_state=42)

fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(
    sample['review_scores_rating'],
    sample['price_num'],
    alpha=0.3,
    s=10,
    color='purple',
)
ax.set_yscale('log')
ax.set_xlabel('Review score rating')
ax.set_ylabel('Price (USD) [log scale]')
ax.set_title('Price vs review score rating')
plt.show()
