# Data Exploration

### 1. Importing Packages and Loading Data

In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns

In [None]:
# Location of the listings CSV. The default is the original local path, so
# behaviour is unchanged on the author's machine; set the LISTINGS_CSV
# environment variable to load the file from elsewhere without editing the
# notebook.
LISTINGS_CSV = Path(
    os.environ.get(
        "LISTINGS_CSV",
        "/Users/matteo/Documents/PersonalProjects/airbnb-pricing-optimization/data/listings-2.csv",
    )
)

# Load the raw listings, then show the column dtypes / non-null counts
# (printed by .info()) and the numeric summary (displayed as the cell output).
df = pd.read_csv(LISTINGS_CSV)
df.info()
df.describe()

### 2. Analyzing Missing Data

In [None]:
# Tally the missing values in every column as a tidy two-column frame:
# one row per column of df, with its name and its NA count.

NAs = df.isna().sum().rename_axis("variable").reset_index(name = "na_count")

# Horizontal bar chart restricted to the columns that have at least one NA.

(
    ggplot(NAs.loc[NAs["na_count"] > 0], aes(x = "variable", y = "na_count"))
    + geom_col()
    + coord_flip()
    + labs(
        title = "Bar Chart of the NA Distribution by Variable",
        x = "Variable",
        y = "Count (Number of NA's)",
    )
)

### 3. Price Analysis (Target Variable)

In [None]:
# drop NA prices (target column, NAs will be useless)

df = df[df["price"].notna()].reset_index(drop = True)

# convert price to a float
# - astype(str) keeps this cell re-runnable: on a second run price is already
#   float, and the .str accessor would otherwise raise AttributeError
# - regex = False treats "$" and "," as literal characters ("$" is an
#   end-of-string anchor when interpreted as a regular expression, and the
#   default of the regex argument differs between pandas versions)

df["price"] = (
    df["price"]
    .astype(str)
    .str.replace("$", "", regex = False)
    .str.replace(",", "", regex = False)
    .astype(float)
)

# create log price column
# log is undefined for non-positive prices: mask them to NaN so log_price
# never contains -inf (which would distort the plots built on it)

df["log_price"] = np.log(df["price"].where(df["price"] > 0))

In [None]:
# Distribution of nightly prices on the log scale (the original prices are
# heavily skewed, so log_price is plotted instead).

(
    ggplot(df, aes(x = "log_price"))
    + geom_histogram(fill = "skyblue", color = "black")
    + labs(
        title = "Histogram of Airbnb Log Prices Per Night",
        subtitle = "Prices have been log transformed to compensate for heavy skew.",
        x = "Log Price (Per Night)",
        y = "Frequency",
    )
)

### 4. Feature Distributions

In [None]:
# TODO: create feature distributions

### 5. Relationships With Price

In [None]:
# Relationship between price and location: one point per listing at its
# longitude / latitude, colored by log price.

(
    ggplot(df, aes(x = "longitude", y = "latitude", color = "log_price"))
    + geom_point()
    + labs(
        title = "Heat Map of Log Prices (Based on Location)",
        x = "Longitude",
        y = "Latitude",
    )
)

In [None]:
# Relationship between room type and price: one box plot of log price per
# room type.

(
    ggplot(df, aes(x = "room_type", y = "log_price"))
    + geom_boxplot()
    + labs(
        title = "Box Plots of Log Price by Room Type",
        x = "Room Type",
        y = "Log Price (Per Night)",
    )
)

In [None]:
# Relationship between key features and price: log price against the number
# of bedrooms, one point per listing.

(
    ggplot(df, aes(x = "bedrooms", y = "log_price"))
    + geom_point()
    + labs(
        title = "Scatterplot of Bedrooms vs Log Price",
        x = "Number of Bedrooms",
        y = "Log Price (Per Night)",
    )
)

In [None]:
# Log price against the number of bathrooms, one point per listing.

(
    ggplot(df, aes(x = "bathrooms", y = "log_price"))
    + geom_point()
    + labs(
        title = "Scatterplot of Bathrooms vs Log Price",
        x = "Number of Bathrooms",
        y = "Log Price (Per Night)",
    )
)

### 6. Correlations

In [None]:
# TODO: analyze correlations