# EDA

In [1]:
# load libraries
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, chi2_contingency
import time
start_time = time.time()

In [2]:
output_dir = "figures"
os.makedirs(output_dir, exist_ok=True)

In [3]:
listings_large_1_df = pd.read_csv("data/LA_2022.10-12/listings.csv.gz")
listings_large_2_df = pd.read_csv("data/LA_2023.1-3/listings.csv.gz")
listings_large_3_df = pd.read_csv("data/LA_2023.4-6/listings.csv.gz")
listings_large_4_df = pd.read_csv("data/LA_2023.7-9/listings.csv.gz")

listings_large_combined_df = pd.concat(
    [listings_large_1_df, listings_large_2_df, listings_large_3_df, listings_large_4_df],
    ignore_index=True
)

listings_large_combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171947 entries, 0 to 171946
Data columns (total 75 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   id                                            171947 non-null  int64  
 1   listing_url                                   171947 non-null  object 
 2   scrape_id                                     171947 non-null  int64  
 3   last_scraped                                  171947 non-null  object 
 4   source                                        171947 non-null  object 
 5   name                                          171943 non-null  object 
 6   description                                   169104 non-null  object 
 7   neighborhood_overview                         99737 non-null   object 
 8   picture_url                                   171947 non-null  object 
 9   host_id                                       17

In [4]:
reviews_large_1_df = pd.read_csv("data/LA_2022.10-12/reviews.csv.gz")
reviews_large_2_df = pd.read_csv("data/LA_2023.1-3/reviews.csv.gz")
reviews_large_3_df = pd.read_csv("data/LA_2023.4-6/reviews.csv.gz")
reviews_large_4_df = pd.read_csv("data/LA_2023.7-9/reviews.csv.gz")

reviews_large_combined_df = pd.concat(
    [reviews_large_1_df, reviews_large_2_df, reviews_large_3_df, reviews_large_4_df],
    ignore_index=True
)

reviews_large_combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5748175 entries, 0 to 5748174
Data columns (total 6 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   listing_id     int64 
 1   id             int64 
 2   date           object
 3   reviewer_id    int64 
 4   reviewer_name  object
 5   comments       object
dtypes: int64(3), object(3)
memory usage: 263.1+ MB


In [5]:
top_listings = listings_large_combined_df.loc[listings_large_combined_df['number_of_reviews'] >= 10]

top_listings = top_listings.sort_values(
    by=['review_scores_rating', 'number_of_reviews'],
    ascending=[False, False]
)

top_listings_df = top_listings[[
    'id', 'name', 'neighbourhood_cleansed',
    'price', 'number_of_reviews', 'review_scores_rating'
]].head(50)

In [6]:
df = listings_large_combined_df.copy()

df['price'] = df['price'].str.replace('[\$,]', '', regex=True).astype(float)

df['host_since'] = pd.to_datetime(df['host_since'])
df['host_experience_days'] = (datetime.now() - df['host_since']).dt.days

df['amenities_count'] = df['amenities'].str.split(',').apply(len)

df['is_superhost'] = df['host_is_superhost'] == 't'
df['instant_bookable'] = df['instant_bookable'] == 't'

In [7]:
top_ids = set(top_listings_df['id'])
df['is_top'] = df['id'].isin(top_ids)

In [8]:
features = [
    'price', 'accommodates', 'bedrooms', 'beds',
    'host_experience_days', 'amenities_count'
]

df['is_top_2'] = df['is_top'].map({True: 'Top 10%', False: 'Bottom 90%'})

summary = df.groupby('is_top_2')[features].agg(['mean', 'median'])

summary.columns = ['_'.join(col).strip() for col in summary.columns.values]

summary = summary.loc[["Top 10%", "Bottom 90%"]]

print(summary.round(2))

            price_mean  price_median  accommodates_mean  accommodates_median  \
is_top_2                                                                       
Top 10%         213.51         168.0               2.55                  2.0   
Bottom 90%      283.55         146.0               3.92                  3.0   

            bedrooms_mean  bedrooms_median  beds_mean  beds_median  \
is_top_2                                                             
Top 10%              1.21              1.0       1.43          1.0   
Bottom 90%           1.90              1.0       2.17          2.0   

            host_experience_days_mean  host_experience_days_median  \
is_top_2                                                             
Top 10%                       3468.01                       3334.0   
Bottom 90%                    2920.78                       3048.0   

            amenities_count_mean  amenities_count_median  
is_top_2                                                  

In [13]:
for col in ['room_type', 'property_type', 'is_superhost', 'instant_bookable']:
    ct = pd.crosstab(df[col], df['is_top_2'], normalize='columns')
    print(f"\nfeature: {col}\n", ct.head(5).round(3))


feature: room_type
 is_top_2         Bottom 90%  Top 10%
room_type                           
Entire home/apt       0.708    0.867
Hotel room            0.002    0.000
Private room          0.273    0.133
Shared room           0.017    0.000

feature: property_type
 is_top_2       Bottom 90%  Top 10%
property_type                     
Barn                0.000    0.000
Boat                0.000    0.000
Bus                 0.000    0.000
Camper/RV           0.004    0.053
Campsite            0.001    0.000

feature: is_superhost
 is_top_2      Bottom 90%  Top 10%
is_superhost                     
False              0.701    0.053
True               0.299    0.947

feature: instant_bookable
 is_top_2          Bottom 90%  Top 10%
instant_bookable                     
False                  0.714     0.88
True                   0.286     0.12


In [None]:
tstat, pval = ttest_ind(
    df.loc[df['is_top'], 'price'],
    df.loc[~df['is_top'], 'price'], 
    equal_var=False
)
print("price t‑test p‑value:", pval)

ct = pd.crosstab(df['is_superhost'], df['is_top'])
chi2, pval, _, _ = chi2_contingency(ct)
print("superhost chi2 p‑value:", pval)

price t‑test p‑value: 4.444411186759661e-06
superhost chi2 p‑value: 9.060479944865947e-34


Note: low p-values represent statistical significance.

In [None]:
end_time = time.time()
print("total notebook execution time: {:.0f} seconds".format(end_time - start_time))

total notebook execution time: 25 seconds
