# Airbnb NYC Rental Prices Analysis

**Author:** David Graham

**Dataset:** Airbnb NYC 2019 Listings

## Project Description

This project performs an exploratory data analysis on the Airbnb NYC 2019 dataset to uncover patterns and relationships between listing features and rental prices. Through data visualization and statistical analysis, we aim to identify key factors that influence pricing and discover meaningful insights about the short-term rental market in New York City.

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load Dataset

In [None]:
# Read the listings CSV (path is relative to the current working directory)
# into the DataFrame used by the rest of the notebook.
df = pd.read_csv("AB_NYC_2019.csv")
# Final expression of the cell: preview the first rows as a quick sanity check.
df.head()

## Data Cleaning Functions

In [None]:
def handle_missing_values(dataframe, numeric_strategy='median', text_strategy='Unknown'):
    """
    Fill missing values in a copy of a DataFrame.

    Numeric columns are filled according to ``numeric_strategy``; every other
    column is filled with the ``text_strategy`` placeholder. Columns without
    missing values are left untouched, and the input DataFrame is never
    modified.

    Parameters:
    -----------
    dataframe : pd.DataFrame
        The input DataFrame with missing values
    numeric_strategy : str, default='median'
        Strategy for filling numeric columns: 'median' or 'mean' use the
        column statistic; 'zero' (or any other value) fills with 0
    text_strategy : str, default='Unknown'
        Value used to fill missing entries in non-numeric columns

    Returns:
    --------
    pd.DataFrame
        A new DataFrame with missing values handled. A numeric column that
        is entirely missing stays missing under 'median'/'mean', because its
        statistic is itself NaN.
    """
    df_clean = dataframe.copy()

    for column in df_clean.columns:
        series = df_clean[column]
        if not series.isnull().any():
            continue

        # Classify by dtype *kind* rather than by exact dtype name, so that
        # float32/int32 and nullable Int64/Float64 columns are treated as
        # numeric instead of receiving the text placeholder. Booleans are
        # excluded: a median/mean fill makes no sense for them.
        is_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )

        if not is_numeric:
            fill_value = text_strategy
        elif numeric_strategy == 'median':
            fill_value = series.median()
        elif numeric_strategy == 'mean':
            fill_value = series.mean()
        else:
            # Backward-compatible fallback: 'zero' and any unrecognised
            # strategy both fill with 0.
            fill_value = 0

        df_clean[column] = series.fillna(fill_value)

    return df_clean


def remove_price_outliers(dataframe, price_column='price', lower_percentile=1, upper_percentile=99):
    """
    Remove rows whose price falls outside a percentile band.

    The lower and upper bounds are the given percentiles of the price column;
    rows with a price inside ``[lower_bound, upper_bound]`` (both inclusive)
    are kept. Rows with a missing price fail the range test and are therefore
    dropped and counted as removed. A short summary of the number of removed
    rows and the retained price range is printed.

    Parameters:
    -----------
    dataframe : pd.DataFrame
        The input DataFrame containing price data
    price_column : str, default='price'
        Name of the column containing price values
    lower_percentile : int, default=1
        Lower percentile threshold (removes values below this)
    upper_percentile : int, default=99
        Upper percentile threshold (removes values above this)

    Returns:
    --------
    pd.DataFrame
        A new DataFrame with price outliers removed; the input is not
        modified. An empty input yields an empty result.
    """
    prices = dataframe[price_column]

    # Percentiles are given on a 0-100 scale; quantile() expects 0-1.
    lower_bound = prices.quantile(lower_percentile / 100)
    upper_bound = prices.quantile(upper_percentile / 100)

    original_count = len(dataframe)
    # between() is inclusive on both ends, matching `>= lower` and `<= upper`.
    df_clean = dataframe[prices.between(lower_bound, upper_bound)].copy()

    removed_count = original_count - len(df_clean)
    # Guard the percentage against an empty input, which would otherwise
    # raise ZeroDivisionError while formatting the summary.
    removed_pct = removed_count / original_count * 100 if original_count else 0.0
    print(f"Removed {removed_count} outliers ({removed_pct:.2f}%)")
    print(f"Price range: ${lower_bound:.0f} - ${upper_bound:.0f}")

    return df_clean

## Apply Cleaning Functions

In [None]:
# Baseline report: per-column null counts and the row total prior to cleaning
print("Missing values before cleaning:", df.isnull().sum(), sep="\n")
print(f"\nTotal rows: {len(df)}")

In [None]:
# Run the cleaning pipeline: fill missing values first, then trim price outliers
df_clean = remove_price_outliers(handle_missing_values(df))

# Confirm the outcome: remaining null counts per column and the final size
print("\nMissing values after cleaning:", df_clean.isnull().sum(), sep="\n")
print(f"\nFinal row count: {len(df_clean)}")