In [1]:
from io import StringIO

import boto3
import numpy as np
import pandas as pd


In [2]:
# Create an S3 client in us-east-2 and request the raw listings CSV object.
s3 = boto3.client("s3", region_name="us-east-2")
obj = s3.get_object(Bucket="staywise-airbnb-data", Key="airbnb/raw_data/AB_NYC_2019.csv")

# The response's "Body" entry is passed straight to pandas for CSV parsing.
df = pd.read_csv(obj["Body"])

# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
# Replace missing values column by column: text placeholders for the two
# name fields, and 0 for the monthly review rate.
for column, placeholder in (
    ("name", "Unknown Listing"),
    ("host_name", "Unknown Host"),
    ("reviews_per_month", 0),
):
    df[column] = df[column].fillna(placeholder)

# Parse review dates; errors="coerce" turns unparseable values into NaT.
df["last_review"] = pd.to_datetime(df["last_review"], errors="coerce")


In [4]:
# Whole days between each listing's last review and a fixed reference date.
reference_date = pd.Timestamp("2024-01-01")
elapsed_days = (reference_date - df["last_review"]).dt.days

# Rows without a review date (NaT) receive the largest observed gap.
df["days_since_last_review"] = elapsed_days.fillna(elapsed_days.max())


In [5]:
# Cap prices at the 1st and 99th percentiles; values outside that range are
# pulled to the nearest bound rather than dropped.
lower, upper = df["price"].quantile([0.01, 0.99])
df["price"] = df["price"].clip(lower=lower, upper=upper)


In [6]:
# Remove the id, name, host and raw review-date columns from the frame.
df = df.drop(columns=["id", "name", "host_id", "host_name", "last_review"])


In [7]:
# One-hot encode the frame; drop_first=True omits the first level of each
# encoded column.
df = pd.get_dummies(data=df, drop_first=True)

# Preview the encoded frame.
df.head()


Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,days_since_last_review,neighbourhood_group_Brooklyn,...,neighbourhood_Williamsbridge,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,149,1,9,0.21,6,365,1900.0,True,...,False,False,False,False,False,False,False,False,True,False
1,40.75362,-73.98377,225,1,45,0.38,2,355,1686.0,False,...,False,False,False,False,False,False,False,False,False,False
2,40.80902,-73.9419,150,3,0,0.0,1,365,4662.0,False,...,False,False,False,False,False,False,False,False,True,False
3,40.68514,-73.95976,89,1,270,4.64,1,194,1641.0,True,...,False,False,False,False,False,False,False,False,False,False
4,40.79851,-73.94399,80,10,9,0.1,1,0,1869.0,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Serialize the processed frame to CSV in memory and upload it to S3.
# StringIO is imported in the notebook's top import cell.

# Single source of truth for the destination key, so the upload and the
# confirmation message cannot drift apart.
PROCESSED_KEY = "airbnb/processed/cleaned_airbnb.csv"

csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)

s3.put_object(
    Bucket="staywise-airbnb-data",
    Key=PROCESSED_KEY,
    # put_object documents Body as bytes or a file-like object, so encode the
    # CSV text explicitly instead of relying on implicit str handling.
    Body=csv_buffer.getvalue().encode("utf-8"),
)

print(f"Cleaned dataset saved to S3 as {PROCESSED_KEY}")


Cleaned dataset saved to S3 as airbnb/processed/cleaned_airbnb.csv


: 