## Get to know listings data

In [1]:
import pandas as pd

In [2]:
# Columns removed from the raw listings export right after loading.
# One name per line so additions and removals show up as single-line diffs.
columns_to_drop = [
    # listing-level metadata and text fields
    "listing_url",
    "scrape_id",
    "last_scraped",
    "source",
    "description",
    "neighborhood_overview",
    "picture_url",
    # host_* fields
    "host_url",
    "host_name",
    "host_about",
    "host_response_time",
    "host_response_rate",
    "host_thumbnail_url",
    "host_picture_url",
    "host_neighbourhood",
    "host_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    # misc listing attributes
    "neighbourhood_group_cleansed",
    "bathrooms_text",
    "amenities",
    # min/max night variants (the plain minimum_nights / maximum_nights are not listed here)
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    # calendar fields
    "calendar_updated",
    "calendar_last_scraped",
    # review count windows and review dates
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "first_review",
    "last_review",
    # calculated_host_listings_count* fields
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
]

In [4]:
# Read the compressed raw export and immediately discard every column named in
# columns_to_drop. The path is relative, so it is resolved against the kernel's
# current working directory at the time this cell runs.
listings = (
    pd.read_csv("../data/raw/listings.csv.gz")
    .drop(columns=columns_to_drop)
)

In [20]:
import os

# Move the working directory one level up.
# The directory in effect the first time this runs is remembered in
# NOTEBOOK_START_DIR, so executing the cell again (after an edit, or out of
# order) lands in the same place instead of climbing one more level on every
# run, which is what a bare os.chdir("..") does.
# NOTE(review): cells that use "../data/..." relative paths resolve them against
# whatever the working directory is when they run — confirm whether they are
# meant to run before or after this cell.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_START_DIR))

In [5]:
# (rows, columns) of the frame after the unwanted columns were dropped at load time.
listings.shape

(4932, 42)

In [6]:
# Names of the columns that remain after the drop.
listings.columns

Index(['id', 'name', 'host_id', 'host_since', 'host_location',
       'host_acceptance_rate', 'host_is_superhost',
       'host_total_listings_count', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price',
       'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'availability_eoy',
       'number_of_reviews_ly', 'estimated_occupancy_l365d',
       'estimated_revenue_l365d', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'license',
       'instant_bookable', 'reviews_per_month'],
      dtype='object')

In [7]:
# Per-column non-null counts and dtypes before any casting.
# memory_usage='deep' makes pandas measure the real size of object columns
# instead of estimating it.
listings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4932 entries, 0 to 4931
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           4932 non-null   int64  
 1   name                         4932 non-null   object 
 2   host_id                      4932 non-null   int64  
 3   host_since                   4682 non-null   object 
 4   host_location                3267 non-null   object 
 5   host_acceptance_rate         4378 non-null   object 
 6   host_is_superhost            4560 non-null   object 
 7   host_total_listings_count    4682 non-null   float64
 8   host_identity_verified       4682 non-null   object 
 9   neighbourhood                1821 non-null   object 
 10  neighbourhood_cleansed       4932 non-null   object 
 11  latitude                     4932 non-null   float64
 12  longitude                    4932 non-null   float64
 13  property_type     

In [8]:
def _flags_to_bool(raw_flags):
    """Translate 't'/'f' strings to True/False; any other value (including missing) becomes NaN."""
    return raw_flags.map({'t': True, 'f': False})


# NOTE: this cell overwrites columns in place and expects the raw string values.
# The statements run in the same order as before, so the first one still fails
# fast if the cell is executed a second time (.str needs string data).

# "<number>%" text -> fraction.
acceptance_percent = listings["host_acceptance_rate"].str.rstrip('%').astype(float)
listings["host_acceptance_rate"] = acceptance_percent / 100.0

listings["host_is_superhost"] = _flags_to_bool(listings["host_is_superhost"])
listings["host_identity_verified"] = _flags_to_bool(listings["host_identity_verified"])

# "$1,234.00" style text -> float: strip the leading dollar sign and the thousands separators.
price_text = listings["price"].str.lstrip('$').str.replace(',', '')
listings["price"] = price_text.astype(float)

listings["has_availability"] = _flags_to_bool(listings["has_availability"])
listings["instant_bookable"] = _flags_to_bool(listings["instant_bookable"])

In [9]:
# Target dtype per column, grouped by dtype. Key order matches the column order
# of the frame. The capitalised "Int16" is pandas' nullable integer dtype, which
# can hold missing values.
# NOTE(review): Int16 tops out at 32767 — confirm maximum_nights never exceeds
# that in other snapshots of the data.
type_dict = {
    **dict.fromkeys(("id", "host_id"), "object"),
    "host_since": "datetime64[ns]",
    "host_total_listings_count": "Int16",
    **dict.fromkeys(
        ("neighbourhood", "neighbourhood_cleansed", "property_type", "room_type"),
        "category",
    ),
    **dict.fromkeys(
        ("accommodates", "bedrooms", "beds", "minimum_nights", "maximum_nights"),
        "Int16",
    ),
}

In [10]:
# Cast the columns named in type_dict to their target dtypes; columns not listed
# there keep their current dtype.
listings = listings.astype(type_dict)

In [11]:
# Inspect dtypes and deep memory usage again, now that the casts have been applied.
listings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4932 entries, 0 to 4931
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   id                           4932 non-null   object        
 1   name                         4932 non-null   object        
 2   host_id                      4932 non-null   object        
 3   host_since                   4682 non-null   datetime64[ns]
 4   host_location                3267 non-null   object        
 5   host_acceptance_rate         4378 non-null   float64       
 6   host_is_superhost            4560 non-null   object        
 7   host_total_listings_count    4682 non-null   Int16         
 8   host_identity_verified       4682 non-null   object        
 9   neighbourhood                1821 non-null   category      
 10  neighbourhood_cleansed       4932 non-null   category      
 11  latitude                     4932 non-null 

In [28]:
# Sanity check: number of rows whose host_since is later than 2025-06-23
# (presumably the scrape date — TODO confirm). Rows with a missing host_since
# compare as False and are therefore not counted.
joined_after_cutoff = listings["host_since"] > "2025-06-23"
len(listings.loc[joined_after_cutoff])

0

In [12]:
# Persist the cleaned frame without the RangeIndex. The path is relative, so it
# is resolved against the kernel's current working directory when this cell runs.
cleaned_parquet_path = "../data/processed/listings_cleaned.parquet"
listings.to_parquet(cleaned_parquet_path, index=False, engine="pyarrow")

In [4]:
import os

# Move the working directory one level up.
# The directory in effect the first time this runs is remembered in
# NOTEBOOK_START_DIR, so executing the cell again (after an edit, or out of
# order) lands in the same place instead of climbing one more level on every
# run, which is what a bare os.chdir("..") does.
# NOTE(review): cells that use "../data/..." relative paths resolve them against
# whatever the working directory is when they run — confirm whether they are
# meant to run before or after this cell.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_START_DIR))