# Airbnb NYC Rental Prices Analysis

**Author:** David Graham

**Dataset:** Airbnb NYC 2019 Listings

## Project Description

This project performs an exploratory data analysis on the Airbnb NYC 2019 dataset to uncover patterns and relationships between listing features and rental prices. Through data visualization and statistical analysis, we aim to identify key factors that influence pricing and discover meaningful insights about the short-term rental market in New York City.

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load Dataset

In [None]:
# Load the listings CSV into a DataFrame. The path is relative, so the file is
# looked up in the directory the notebook is run from.
df = pd.read_csv("AB_NYC_2019.csv")
# Bare trailing expression: the notebook renders the first rows as cell output.
df.head()

## Data Cleaning Functions

In [None]:
def handle_missing_values(dataframe, numeric_strategy='median', text_strategy='Unknown'):
    """
    Return a copy of a DataFrame with missing values filled in.

    Columns without missing values are left untouched. Numeric columns of
    any width or flavour (float32, int64, nullable Int64, ...) are filled
    according to ``numeric_strategy``; every other column is filled with
    ``text_strategy``.

    Parameters:
    -----------
    dataframe : pd.DataFrame
        The input DataFrame with missing values. It is not modified.
    numeric_strategy : str, default='median'
        Strategy for filling numeric columns: 'median' or 'mean' use that
        statistic of the column's non-missing values; any other value
        (conventionally 'zero') fills with 0.
    text_strategy : str, default='Unknown'
        Value used to fill missing entries in non-numeric columns

    Returns:
    --------
    pd.DataFrame
        New DataFrame with missing values handled
    """
    df_clean = dataframe.copy()

    for column in df_clean.columns:
        series = df_clean[column]
        if not series.isna().any():
            continue

        # Ask pandas whether the dtype is numeric instead of matching dtype
        # names, so float32 / nullable integer columns are not mistaken for
        # text. Booleans are excluded: a median or mean is meaningless there.
        is_numeric = (pd.api.types.is_numeric_dtype(series)
                      and not pd.api.types.is_bool_dtype(series))

        if not is_numeric:
            fill_value = text_strategy
        elif numeric_strategy == 'median':
            fill_value = series.median()
        elif numeric_strategy == 'mean':
            fill_value = series.mean()
        else:
            # Any other strategy name falls back to zero-filling.
            fill_value = 0

        df_clean[column] = series.fillna(fill_value)

    return df_clean


def remove_price_outliers(dataframe, price_column='price', lower_percentile=1, upper_percentile=99):
    """
    Remove outliers from the price column based on percentile thresholds.

    Rows are kept when their price lies between the two percentile values,
    bounds included. Rows with a missing price compare as outside the range
    and are therefore dropped and counted as removed. A short report of how
    many rows were removed and the resulting price range is printed.

    Parameters:
    -----------
    dataframe : pd.DataFrame
        The input DataFrame containing price data. It is not modified.
    price_column : str, default='price'
        Name of the column containing price values
    lower_percentile : int, default=1
        Lower percentile threshold (removes values below this)
    upper_percentile : int, default=99
        Upper percentile threshold (removes values above this)

    Returns:
    --------
    pd.DataFrame
        New DataFrame with price outliers removed
    """
    prices = dataframe[price_column]

    lower_bound = prices.quantile(lower_percentile / 100)
    upper_bound = prices.quantile(upper_percentile / 100)

    # between() is inclusive on both ends, matching ">= lower and <= upper".
    within_bounds = prices.between(lower_bound, upper_bound)
    # Copy the filtered result so callers get an independent frame.
    df_clean = dataframe[within_bounds].copy()

    original_count = len(dataframe)
    removed_count = original_count - len(df_clean)
    # Guard the percentage against an empty input frame (0 / 0).
    removed_pct = removed_count / original_count * 100 if original_count else 0.0

    print(f"Removed {removed_count} outliers ({removed_pct:.2f}%)")
    print(f"Price range: ${lower_bound:.0f} - ${upper_bound:.0f}")

    return df_clean

## Apply Cleaning Functions

In [None]:
# Check missing values before cleaning: per-column count of null entries in the
# raw frame, followed by the total row count for scale.
print("Missing values before cleaning:")
print(df.isnull().sum())
print(f"\nTotal rows: {len(df)}")

In [None]:
# Apply cleaning functions with their defaults: fill missing values first
# (median for numeric columns, 'Unknown' for the rest), then drop rows whose
# price falls outside the 1st-99th percentile range. The raw `df` is left
# as loaded; the result is stored in `df_clean`.
df_clean = handle_missing_values(df)
df_clean = remove_price_outliers(df_clean)

# Verify cleaning results: re-run the per-column null count on the cleaned
# frame and report how many rows remain.
print("\nMissing values after cleaning:")
print(df_clean.isnull().sum())
print(f"\nFinal row count: {len(df_clean)}")

## Exploratory Data Analysis Functions

In [None]:
def get_summary_statistics(dataframe, numeric_only=True):
    """
    Generate summary statistics for a DataFrame: the ``describe()`` table
    (count, mean, std, min, max, percentiles) plus ``skew`` and ``median``
    rows.

    Parameters:
    -----------
    dataframe : pd.DataFrame
        The input DataFrame to analyze
    numeric_only : bool, default=True
        If True, only numeric columns appear in the summary. If False, every
        column is described (non-numeric columns get count/unique/top/freq)
        and their ``skew`` and ``median`` entries are left as NaN.

    Returns:
    --------
    pd.DataFrame
        Summary statistics with numeric values rounded to 2 decimals
    """
    if numeric_only:
        selected = dataframe.select_dtypes(include=[np.number])
        summary = selected.describe()
    else:
        selected = dataframe
        # Without include='all', describe() silently drops non-numeric
        # columns whenever at least one numeric column is present.
        summary = selected.describe(include='all')

    # numeric_only=True keeps skew()/median() from raising on text columns;
    # assigning the resulting Series aligns on column names, so columns
    # without a value simply receive NaN in the new row.
    summary.loc['skew'] = selected.skew(numeric_only=True)
    summary.loc['median'] = selected.median(numeric_only=True)

    return summary.round(2)


def analyze_by_group(dataframe, group_column, value_column, agg_funcs=None):
    """
    Perform grouped analysis on a DataFrame, calculating aggregate statistics
    for a value column grouped by a categorical column.

    Parameters:
    -----------
    dataframe : pd.DataFrame
        The input DataFrame to analyze
    group_column : str
        Column name to group by (categorical)
    value_column : str
        Column name to calculate statistics for (numeric)
    agg_funcs : list or str, optional
        Aggregation function(s) to apply. Defaults to
        ['mean', 'median', 'count'] when omitted. A single function name is
        treated as a one-element list.

    Returns:
    --------
    pd.DataFrame
        One row per group and one column per aggregation, rounded to
        2 decimals and sorted descending by the 'mean' column when it is
        present, otherwise by the first aggregation column
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if agg_funcs is None:
        agg_funcs = ['mean', 'median', 'count']
    elif isinstance(agg_funcs, str):
        # Wrapping keeps the result a DataFrame so it can be sorted by column.
        agg_funcs = [agg_funcs]

    grouped = dataframe.groupby(group_column)[value_column].agg(agg_funcs)

    # Sorting by a hard-coded 'mean' would raise KeyError for custom
    # aggregation lists that do not include it.
    sort_column = 'mean' if 'mean' in grouped.columns else grouped.columns[0]
    grouped = grouped.sort_values(by=sort_column, ascending=False)

    return grouped.round(2)


def get_correlation_analysis(dataframe, target_column=None, threshold=0.3):
    """
    Compute pairwise correlations between the numeric columns of a DataFrame,
    optionally narrowed to one target column.

    Parameters:
    -----------
    dataframe : pd.DataFrame
        The input DataFrame to analyze
    target_column : str, optional
        When given and present among the numeric columns, only correlations
        with this column are returned, strongest first
    threshold : float, default=0.3
        Minimum absolute correlation required for a column to be kept in the
        target-column result

    Returns:
    --------
    pd.DataFrame or pd.Series
        The full correlation matrix, or the filtered correlations with the
        target column; values are rounded to 3 decimals either way
    """
    corr = dataframe.select_dtypes(include=[np.number]).corr()

    # No usable target (not given, or not a numeric column): return the
    # whole matrix.
    if not target_column or target_column not in corr.columns:
        return corr.round(3)

    # Drop the target's correlation with itself, then rank by magnitude.
    against_target = corr[target_column].drop(target_column)
    ranked = against_target.sort_values(key=abs, ascending=False)

    print(f"Correlations with '{target_column}' (|r| >= {threshold}):")
    strong_enough = ranked[ranked.abs() >= threshold]
    return strong_enough.round(3)

## Run Exploratory Analysis

In [None]:
# Summary Statistics
# Describe-style table plus skew and median rows for the numeric columns of the
# cleaned data. The bare call on the last line is what the cell displays.
print("=== Summary Statistics ===\n")
get_summary_statistics(df_clean)

In [None]:
# Grouped Analysis: Price by Neighbourhood Group
# Mean, median and listing count of price per 'neighbourhood_group' value,
# ordered from highest to lowest mean price.
print("=== Price by Borough ===\n")
analyze_by_group(df_clean, 'neighbourhood_group', 'price')

In [None]:
# Grouped Analysis: Price by Room Type
# Same grouped summary as the borough breakdown, keyed on 'room_type' instead.
print("=== Price by Room Type ===\n")
analyze_by_group(df_clean, 'room_type', 'price')

In [None]:
# Correlation Analysis
# Correlations of every other numeric column with 'price'. The threshold is
# lowered from the function's 0.3 default to 0.1 so weaker relationships are
# still listed.
print("=== Correlation Analysis ===\n")
get_correlation_analysis(df_clean, target_column='price', threshold=0.1)