In [1]:
import pandas as pd
import hvplot.pandas
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
import requests
import tensorflow as tf

In [2]:
df_SD=pd.read_csv("./Resources/listings_SD_zipcode.csv", encoding='utf-8')
df_SD.head()

Unnamed: 0.1,Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,...,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,zipcode
0,0,47475850.0,https://www.airbnb.com/rooms/47475849,20230300000000.0,3/25/2023,previous scrape,"WELCOME to ""WORLD MAP SUITE"" in the heart of P...",Welcome to my home! <br /><br />The “WORLD MAP...,,https://a0.muscache.com/pictures/0e0be955-e28b...,...,5.0,5.0,,t,1,0,1,0,0.04,92109.0
1,1,8.32e+17,https://www.airbnb.com/rooms/832160725075323156,20230300000000.0,3/25/2023,city scrape,"""Pool Villa & Guest House""","""Luxury Pool Villa With Detached Guest House"" ...",Point Loma is a historic upscale residential c...,https://a0.muscache.com/pictures/miso/Hosting-...,...,,,STR-06234L,f,2,2,0,0,,92106.0
2,2,29796730.0,https://www.airbnb.com/rooms/29796733,20230300000000.0,3/25/2023,previous scrape,PB LIFE,Located in the highly sought out Crown Point n...,,https://a0.muscache.com/pictures/e105cd5d-9093...,...,5.0,5.0,,f,1,0,1,0,0.02,92109.0
3,3,51640550.0,https://www.airbnb.com/rooms/51640548,20230300000000.0,3/25/2023,previous scrape,Lovely two story apartment Gaslamp / East Village,Enjoy a stylish experience at this centrally-l...,,https://a0.muscache.com/pictures/8d58b68f-f762...,...,5.0,5.0,,t,1,1,0,0,0.05,92101.0
4,4,7.26e+17,https://www.airbnb.com/rooms/726314173554301905,20230300000000.0,3/25/2023,city scrape,Mid-Century Modern House,This beautiful Mid-Century Modern home is the ...,North Park is a vibrant and diverse neighborho...,https://a0.muscache.com/pictures/miso/Hosting-...,...,,,,f,1,1,0,0,,92104.0


In [3]:
df_SD.columns

Index(['Unnamed: 0', 'id', 'listing_url', 'scrape_id', 'last_scraped',
       'source', 'name', 'description', 'neighborhood_overview', 'picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights

In [4]:
#finding the items in each row 
amenities=df_SD["amenities"].str.replace("[", "",regex=True).str.replace("]","",regex=True).str.replace('"','',regex=True).str.split(", ", expand=True)

amenities=amenities.T
amenities.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12861,12862,12863,12864,12865,12866,12867,12868,12869,12870
0,Baby bath,Cleaning available during stay,Washer,Smoke alarm,Washer,Washer,Self check-in,Refrigerator,Washer,Refrigerator,...,Refrigerator,Washer,Washer,Washer,Washer,Washer,Refrigerator,Bed sheets and pillows,Bed sheets and pillows,Mini fridge
1,Iron,Iron,Hot tub,Security cameras on property,BBQ grill,Refrigerator,Security cameras on property,Iron,Host greets you,Iron,...,Iron,TV,Smoke alarm,Refrigerator,Hot tub,Refrigerator,Iron,Toiletries,Toiletries,Hair dryer
2,Safe,Safe,Smoke alarm,TV,First aid kit,Iron,Keypad,Security cameras on property,Smoke alarm,Security cameras on property,...,Stove,Host greets you,TV,Hair dryer,Breakfast,Iron,Safe,Smoke alarm,Iron,Bed sheets and pillows
3,Lock on bedroom door,Security cameras on property,Dryer,Wifi,Air conditioning,Dryer,,Stove,Security cameras on property,Pack \u2019n play/Travel crib,...,Self check-in,Kitchen,Wifi,Free street parking,Hot water,Dryer,Paid washer \u2013 In building,Coffee maker,Safe,HDTV
4,First aid kit,Table corner guards,Lock on bedroom door,First aid kit,Carbon monoxide alarm,Stove,,Pack \u2019n play/Travel crib,TV,Window guards,...,Carbon monoxide alarm,,Kitchen,Iron,Essentials,Stove,Dedicated workspace,Heated pool \u2014 outdoor,BBQ grill,Iron


In [5]:
#finding unique amenities for each unit (this is just a test to make sure we don't have double entries)
amenities_list_unit=[]
for i in range(0, len(df_SD)):
    temp=amenities[i].unique()
    amenities_list_unit.append(temp)


In [6]:
amenities_count=[]
for i in range(0, len(df_SD)):
   t=len(amenities_list_unit[i])
   amenities_count.append(t)

amenities_count=pd.DataFrame(amenities_count).rename(columns={0:"amenities_count"})
df_SD_2=pd.concat([df_SD , amenities_count], axis=1)
df_SD_2.head()

Unnamed: 0.1,Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,...,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,zipcode,amenities_count
0,0,47475850.0,https://www.airbnb.com/rooms/47475849,20230300000000.0,3/25/2023,previous scrape,"WELCOME to ""WORLD MAP SUITE"" in the heart of P...",Welcome to my home! <br /><br />The “WORLD MAP...,,https://a0.muscache.com/pictures/0e0be955-e28b...,...,5.0,,t,1,0,1,0,0.04,92109.0,33
1,1,8.32e+17,https://www.airbnb.com/rooms/832160725075323156,20230300000000.0,3/25/2023,city scrape,"""Pool Villa & Guest House""","""Luxury Pool Villa With Detached Guest House"" ...",Point Loma is a historic upscale residential c...,https://a0.muscache.com/pictures/miso/Hosting-...,...,,STR-06234L,f,2,2,0,0,,92106.0,81
2,2,29796730.0,https://www.airbnb.com/rooms/29796733,20230300000000.0,3/25/2023,previous scrape,PB LIFE,Located in the highly sought out Crown Point n...,,https://a0.muscache.com/pictures/e105cd5d-9093...,...,5.0,,f,1,0,1,0,0.02,92109.0,17
3,3,51640550.0,https://www.airbnb.com/rooms/51640548,20230300000000.0,3/25/2023,previous scrape,Lovely two story apartment Gaslamp / East Village,Enjoy a stylish experience at this centrally-l...,,https://a0.muscache.com/pictures/8d58b68f-f762...,...,5.0,,t,1,1,0,0,0.05,92101.0,10
4,4,7.26e+17,https://www.airbnb.com/rooms/726314173554301905,20230300000000.0,3/25/2023,city scrape,Mid-Century Modern House,This beautiful Mid-Century Modern home is the ...,North Park is a vibrant and diverse neighborho...,https://a0.muscache.com/pictures/miso/Hosting-...,...,,,f,1,1,0,0,,92104.0,34


In [7]:
#putting all the amenities in all units in one list (there are for sure duplicates)
amenities_list_all=[]
for i in range(0, len(amenities_list_unit)):
    amenities_list_all=list(amenities_list_unit[i])+amenities_list_all


In [8]:
#all uniques amenities in SD
unique_amenities=pd.DataFrame(pd.DataFrame(amenities_list_all)[0].unique())
unique_amenities.head()

Unnamed: 0,0
0,Mini fridge
1,Hair dryer
2,Bed sheets and pillows
3,HDTV
4,Iron


In [9]:
#finding the top amenities
amenities=pd.DataFrame(amenities_list_all).rename(columns={0:"amenities"})
amenities_stat=amenities.groupby("amenities")['amenities'].count().sort_values(ascending=False)
pd.DataFrame(amenities_stat).head(10)

Unnamed: 0_level_0,amenities
amenities,Unnamed: 1_level_1
Smoke alarm,12442
Carbon monoxide alarm,11667
Kitchen,11593
Essentials,11555
Wifi,11505
Hangers,10814
Hair dryer,10604
Dishes and silverware,10521
Hot water,10481
Iron,10246


# Finding the range of amenities and bin them into basic, moderate and luxury

In [10]:
# sort the count of amenities and plot the result
amenities_count_sorted=pd.DataFrame(df_SD_2["amenities_count"].sort_values())
summary_amenities_count=pd.DataFrame(amenities_count_sorted.groupby("amenities_count")["amenities_count"].count())
summary_amenities_count=summary_amenities_count.rename(columns={"amenities_count":"count"})
summary_amenities_count.hvplot.scatter(x="amenities_count", y="count")

### Here are our assumption to categorize the amenities:
Basic amenities as those with less than 20 count of amenities
Moderate amenities as those with count of amenities between 20 and 60, and 
Luxury amenities as those with more than 60 amenities

In [11]:
# categorizing the amenities
df_SD_2["amenities_cat"]="moderate"
df_SD_2.loc[df_SD_2["amenities_count"]<20,"amenities_cat" ]="basic"
df_SD_2.loc[df_SD_2["amenities_count"]>60,"amenities_cat" ]="luxury"
df_SD_2

Unnamed: 0.1,Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,...,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,zipcode,amenities_count,amenities_cat
0,0,4.747585e+07,https://www.airbnb.com/rooms/47475849,2.023030e+13,3/25/2023,previous scrape,"WELCOME to ""WORLD MAP SUITE"" in the heart of P...",Welcome to my home! <br /><br />The “WORLD MAP...,,https://a0.muscache.com/pictures/0e0be955-e28b...,...,,t,1,0,1,0,0.04,92109.0,33,moderate
1,1,8.320000e+17,https://www.airbnb.com/rooms/832160725075323156,2.023030e+13,3/25/2023,city scrape,"""Pool Villa & Guest House""","""Luxury Pool Villa With Detached Guest House"" ...",Point Loma is a historic upscale residential c...,https://a0.muscache.com/pictures/miso/Hosting-...,...,STR-06234L,f,2,2,0,0,,92106.0,81,luxury
2,2,2.979673e+07,https://www.airbnb.com/rooms/29796733,2.023030e+13,3/25/2023,previous scrape,PB LIFE,Located in the highly sought out Crown Point n...,,https://a0.muscache.com/pictures/e105cd5d-9093...,...,,f,1,0,1,0,0.02,92109.0,17,basic
3,3,5.164055e+07,https://www.airbnb.com/rooms/51640548,2.023030e+13,3/25/2023,previous scrape,Lovely two story apartment Gaslamp / East Village,Enjoy a stylish experience at this centrally-l...,,https://a0.muscache.com/pictures/8d58b68f-f762...,...,,t,1,1,0,0,0.05,92101.0,10,basic
4,4,7.260000e+17,https://www.airbnb.com/rooms/726314173554301905,2.023030e+13,3/25/2023,city scrape,Mid-Century Modern House,This beautiful Mid-Century Modern home is the ...,North Park is a vibrant and diverse neighborho...,https://a0.muscache.com/pictures/miso/Hosting-...,...,,f,1,1,0,0,,92104.0,34,moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12866,12866,3.774158e+07,https://www.airbnb.com/rooms/37741577,2.023030e+13,3/25/2023,previous scrape,The Best In Mission Valley 2 Bedrooms MB4,"Each unit has been carefully planned, offering...",,https://a0.muscache.com/pictures/69fc20ce-cec2...,...,,t,5,5,0,0,0.36,92018.0,33,moderate
12867,12867,6.340000e+17,https://www.airbnb.com/rooms/634095272329380456,2.023030e+13,3/25/2023,previous scrape,Studio San Diego,Enjoy a stylish experience at this centrally-l...,,https://a0.muscache.com/pictures/50fd3821-bd3f...,...,,f,1,1,0,0,,92101.0,28,moderate
12868,12868,4.307796e+07,https://www.airbnb.com/rooms/43077965,2.023030e+13,3/25/2023,previous scrape,PB Surf Beachside Inn,,,https://a0.muscache.com/pictures/78370e0c-a5f7...,...,,f,1,0,0,0,,92109.0,18,basic
12869,12869,4.287564e+07,https://www.airbnb.com/rooms/42875640,2.023030e+13,3/25/2023,previous scrape,Pacific Shores Inn,,,https://a0.muscache.com/pictures/2a1c18ba-843f...,...,,f,1,0,0,0,,92109.0,22,moderate


In [12]:
#finding the unique bathrooms type
df_SD_2["bathrooms_text"].unique()

array(['1 private bath', '4 baths', '1.5 baths', '2 baths', '1 bath',
       '0 shared baths', '23 baths', '5 baths', '2 shared baths',
       '8.5 baths', '1 shared bath', '3 baths', '2.5 baths', '11 baths',
       '7 baths', '5.5 baths', '14 baths', '8 baths', '4.5 baths',
       '9.5 baths', '6 baths', '3.5 baths', '9 baths', '7.5 baths',
       '6.5 baths', '33.5 baths', '10 baths', '1.5 shared baths',
       '12.5 baths', '0 baths', '3 shared baths', '2.5 shared baths',
       '4 shared baths', '3.5 shared baths', '6 shared baths', '20 baths',
       nan, '27.5 baths', 'Half-bath', '4.5 shared baths',
       '5.5 shared baths', '6.5 shared baths', '5 shared baths',
       '8 shared baths', 'Shared half-bath'], dtype=object)

In [13]:
#binning the bathrooms
df_SD_2['bathrooms_text'] = df_SD_2['bathrooms_text'].str.replace('Shared half-bath', '0.5')
df_SD_2['bathrooms_text'] = df_SD_2['bathrooms_text'].str.replace('Half-bath', '0.5')
df_SD_2['bathrooms_text'] = df_SD_2['bathrooms_text'].str.replace('shared', '')
df_SD_2['bathrooms_text'] = df_SD_2['bathrooms_text'].str.replace('baths', '')
df_SD_2['bathrooms_text'] = df_SD_2['bathrooms_text'].str.replace('bath', '')
df_SD_2['bathrooms_text'] = df_SD_2['bathrooms_text'].str.replace('private', '')

# Convert data type to float.
df_SD_2['bathrooms_text'] = df_SD_2['bathrooms_text'].astype(float)
df_SD_2['bathrooms_text']

0        1.0
1        4.0
2        1.0
3        1.5
4        2.0
        ... 
12866    2.0
12867    1.0
12868    NaN
12869    NaN
12870    NaN
Name: bathrooms_text, Length: 12871, dtype: float64

In [14]:
# Selecting only the important columns
my_data=df_SD_2[["listing_url",
"last_scraped",
"host_since",
"host_response_rate",
"host_acceptance_rate",
"host_is_superhost",
"host_total_listings_count",
#"host_has_profile_pic",
#"host_identity_verified",
"zipcode",
"room_type",
"accommodates",
"bathrooms_text",
"bedrooms",
"beds",
"amenities_cat",
"minimum_nights",
"maximum_nights",
# "number_of_reviews",
# "review_scores_rating",
# "review_scores_accuracy",
# "review_scores_cleanliness",
# "review_scores_checkin",
# "review_scores_communication",
# "review_scores_location",
# "review_scores_value",
"instant_bookable",
# "availability_30",
# "availability_60",
#"availability_90",
"availability_365",
"price",
"latitude",
"longitude"        
]]
my_data

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,bedrooms,beds,amenities_cat,minimum_nights,maximum_nights,instant_bookable,availability_365,price,latitude,longitude
0,https://www.airbnb.com/rooms/47475849,3/25/2023,12/18/2020,,,f,1.0,92109.0,Private room,2,...,1.0,1.0,moderate,1,3,t,0,"$100,000.00",32.791900,-117.238070
1,https://www.airbnb.com/rooms/832160725075323156,3/25/2023,6/7/2017,100%,86%,f,2.0,92106.0,Entire home/apt,14,...,5.0,10.0,luxury,3,28,f,232,"$17,429.00",32.742533,-117.217675
2,https://www.airbnb.com/rooms/29796733,3/25/2023,7/12/2014,,,f,2.0,92109.0,Private room,2,...,1.0,1.0,basic,2,90,f,0,"$10,000.00",32.796410,-117.228080
3,https://www.airbnb.com/rooms/51640548,3/25/2023,1/30/2017,,,f,1.0,92101.0,Entire home/apt,4,...,1.0,1.0,basic,1,365,t,0,"$10,000.00",32.713020,-117.157940
4,https://www.airbnb.com/rooms/726314173554301905,3/25/2023,5/31/2022,100%,,f,6.0,92104.0,Entire home/apt,8,...,2.0,2.0,moderate,7,30,f,90,"$10,000.00",32.736967,-117.117561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12866,https://www.airbnb.com/rooms/37741577,3/25/2023,5/31/2013,100%,98%,f,9.0,92018.0,Entire home/apt,7,...,2.0,6.0,moderate,1,28,t,0,$19.00,32.768120,-117.153800
12867,https://www.airbnb.com/rooms/634095272329380456,3/25/2023,5/24/2022,,0%,f,1.0,92101.0,Entire home/apt,2,...,1.0,1.0,moderate,1,365,f,0,$10.00,32.710360,-117.153930
12868,https://www.airbnb.com/rooms/43077965,3/25/2023,11/22/2019,,,,3.0,92109.0,Hotel room,0,...,,,basic,1,365,f,0,$0.00,32.800510,-117.257030
12869,https://www.airbnb.com/rooms/42875640,3/25/2023,11/22/2019,,,,3.0,92109.0,Hotel room,0,...,,,moderate,1,365,f,0,$0.00,32.801310,-117.257550


In [15]:
my_data["zipcode"]=my_data["zipcode"].astype(object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_data["zipcode"]=my_data["zipcode"].astype(object)


In [16]:
#finding the type of each column
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12871 entries, 0 to 12870
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   listing_url                12871 non-null  object 
 1   last_scraped               12871 non-null  object 
 2   host_since                 12868 non-null  object 
 3   host_response_rate         11440 non-null  object 
 4   host_acceptance_rate       11952 non-null  object 
 5   host_is_superhost          12864 non-null  object 
 6   host_total_listings_count  12868 non-null  float64
 7   zipcode                    12850 non-null  object 
 8   room_type                  12871 non-null  object 
 9   accommodates               12871 non-null  int64  
 10  bathrooms_text             12866 non-null  float64
 11  bedrooms                   11774 non-null  float64
 12  beds                       12707 non-null  float64
 13  amenities_cat              12871 non-null  obj

In [17]:
# making sure "NaN" is a legitimate numpy NaN and not just a string
df=my_data.replace("NaN", np.nan)
df

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,bedrooms,beds,amenities_cat,minimum_nights,maximum_nights,instant_bookable,availability_365,price,latitude,longitude
0,https://www.airbnb.com/rooms/47475849,3/25/2023,12/18/2020,,,f,1.0,92109.0,Private room,2,...,1.0,1.0,moderate,1,3,t,0,"$100,000.00",32.791900,-117.238070
1,https://www.airbnb.com/rooms/832160725075323156,3/25/2023,6/7/2017,100%,86%,f,2.0,92106.0,Entire home/apt,14,...,5.0,10.0,luxury,3,28,f,232,"$17,429.00",32.742533,-117.217675
2,https://www.airbnb.com/rooms/29796733,3/25/2023,7/12/2014,,,f,2.0,92109.0,Private room,2,...,1.0,1.0,basic,2,90,f,0,"$10,000.00",32.796410,-117.228080
3,https://www.airbnb.com/rooms/51640548,3/25/2023,1/30/2017,,,f,1.0,92101.0,Entire home/apt,4,...,1.0,1.0,basic,1,365,t,0,"$10,000.00",32.713020,-117.157940
4,https://www.airbnb.com/rooms/726314173554301905,3/25/2023,5/31/2022,100%,,f,6.0,92104.0,Entire home/apt,8,...,2.0,2.0,moderate,7,30,f,90,"$10,000.00",32.736967,-117.117561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12866,https://www.airbnb.com/rooms/37741577,3/25/2023,5/31/2013,100%,98%,f,9.0,92018.0,Entire home/apt,7,...,2.0,6.0,moderate,1,28,t,0,$19.00,32.768120,-117.153800
12867,https://www.airbnb.com/rooms/634095272329380456,3/25/2023,5/24/2022,,0%,f,1.0,92101.0,Entire home/apt,2,...,1.0,1.0,moderate,1,365,f,0,$10.00,32.710360,-117.153930
12868,https://www.airbnb.com/rooms/43077965,3/25/2023,11/22/2019,,,,3.0,92109.0,Hotel room,0,...,,,basic,1,365,f,0,$0.00,32.800510,-117.257030
12869,https://www.airbnb.com/rooms/42875640,3/25/2023,11/22/2019,,,,3.0,92109.0,Hotel room,0,...,,,moderate,1,365,f,0,$0.00,32.801310,-117.257550


In [18]:
# converting the "host_response_rate", "host_acceptance_rate" columns from object to float
df[["host_response_rate", "host_acceptance_rate"]]=df[["host_response_rate", "host_acceptance_rate"]].replace("%","", regex=True).astype(float)

In [19]:
#converting the first_review column from object to date
df["host_since"]=pd.to_datetime(df["host_since"])
df["last_scraped"]=pd.to_datetime(df["last_scraped"])
df["years_in_business"]=(((df["last_scraped"]-df["host_since"])) / np.timedelta64(1, 'Y')).round(0).astype(float)
df["years_in_business"]

0         2.0
1         6.0
2         9.0
3         6.0
4         1.0
         ... 
12866    10.0
12867     1.0
12868     3.0
12869     3.0
12870     3.0
Name: years_in_business, Length: 12871, dtype: float64

In [20]:
df["price"]=df["price"].str.replace("$","").str.replace(",","").astype(float)
df

  df["price"]=df["price"].str.replace("$","").str.replace(",","").astype(float)


Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,beds,amenities_cat,minimum_nights,maximum_nights,instant_bookable,availability_365,price,latitude,longitude,years_in_business
0,https://www.airbnb.com/rooms/47475849,2023-03-25,2020-12-18,,,f,1.0,92109.0,Private room,2,...,1.0,moderate,1,3,t,0,100000.0,32.791900,-117.238070,2.0
1,https://www.airbnb.com/rooms/832160725075323156,2023-03-25,2017-06-07,100.0,86.0,f,2.0,92106.0,Entire home/apt,14,...,10.0,luxury,3,28,f,232,17429.0,32.742533,-117.217675,6.0
2,https://www.airbnb.com/rooms/29796733,2023-03-25,2014-07-12,,,f,2.0,92109.0,Private room,2,...,1.0,basic,2,90,f,0,10000.0,32.796410,-117.228080,9.0
3,https://www.airbnb.com/rooms/51640548,2023-03-25,2017-01-30,,,f,1.0,92101.0,Entire home/apt,4,...,1.0,basic,1,365,t,0,10000.0,32.713020,-117.157940,6.0
4,https://www.airbnb.com/rooms/726314173554301905,2023-03-25,2022-05-31,100.0,,f,6.0,92104.0,Entire home/apt,8,...,2.0,moderate,7,30,f,90,10000.0,32.736967,-117.117561,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12866,https://www.airbnb.com/rooms/37741577,2023-03-25,2013-05-31,100.0,98.0,f,9.0,92018.0,Entire home/apt,7,...,6.0,moderate,1,28,t,0,19.0,32.768120,-117.153800,10.0
12867,https://www.airbnb.com/rooms/634095272329380456,2023-03-25,2022-05-24,,0.0,f,1.0,92101.0,Entire home/apt,2,...,1.0,moderate,1,365,f,0,10.0,32.710360,-117.153930,1.0
12868,https://www.airbnb.com/rooms/43077965,2023-03-25,2019-11-22,,,,3.0,92109.0,Hotel room,0,...,,basic,1,365,f,0,0.0,32.800510,-117.257030,3.0
12869,https://www.airbnb.com/rooms/42875640,2023-03-25,2019-11-22,,,,3.0,92109.0,Hotel room,0,...,,moderate,1,365,f,0,0.0,32.801310,-117.257550,3.0


In [21]:
#calculating the revenue
df["revenue"]=(365-df["availability_365"])*df["price"]
df["revenue_per_accommodates"]=df["revenue"]/df["accommodates"]
df

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,minimum_nights,maximum_nights,instant_bookable,availability_365,price,latitude,longitude,years_in_business,revenue,revenue_per_accommodates
0,https://www.airbnb.com/rooms/47475849,2023-03-25,2020-12-18,,,f,1.0,92109.0,Private room,2,...,1,3,t,0,100000.0,32.791900,-117.238070,2.0,36500000.0,1.825000e+07
1,https://www.airbnb.com/rooms/832160725075323156,2023-03-25,2017-06-07,100.0,86.0,f,2.0,92106.0,Entire home/apt,14,...,3,28,f,232,17429.0,32.742533,-117.217675,6.0,2318057.0,1.655755e+05
2,https://www.airbnb.com/rooms/29796733,2023-03-25,2014-07-12,,,f,2.0,92109.0,Private room,2,...,2,90,f,0,10000.0,32.796410,-117.228080,9.0,3650000.0,1.825000e+06
3,https://www.airbnb.com/rooms/51640548,2023-03-25,2017-01-30,,,f,1.0,92101.0,Entire home/apt,4,...,1,365,t,0,10000.0,32.713020,-117.157940,6.0,3650000.0,9.125000e+05
4,https://www.airbnb.com/rooms/726314173554301905,2023-03-25,2022-05-31,100.0,,f,6.0,92104.0,Entire home/apt,8,...,7,30,f,90,10000.0,32.736967,-117.117561,1.0,2750000.0,3.437500e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12866,https://www.airbnb.com/rooms/37741577,2023-03-25,2013-05-31,100.0,98.0,f,9.0,92018.0,Entire home/apt,7,...,1,28,t,0,19.0,32.768120,-117.153800,10.0,6935.0,9.907143e+02
12867,https://www.airbnb.com/rooms/634095272329380456,2023-03-25,2022-05-24,,0.0,f,1.0,92101.0,Entire home/apt,2,...,1,365,f,0,10.0,32.710360,-117.153930,1.0,3650.0,1.825000e+03
12868,https://www.airbnb.com/rooms/43077965,2023-03-25,2019-11-22,,,,3.0,92109.0,Hotel room,0,...,1,365,f,0,0.0,32.800510,-117.257030,3.0,0.0,
12869,https://www.airbnb.com/rooms/42875640,2023-03-25,2019-11-22,,,,3.0,92109.0,Hotel room,0,...,1,365,f,0,0.0,32.801310,-117.257550,3.0,0.0,


In [22]:
df.to_csv("./Resources/cleaned_data_SD.csv")

# check for outliers in the price columns
First, outliers are removed based on the 5%, and 95% limit.
Then, a "price-cat" column was created to tag the units based ont he following criteria:

Above the 75th percentile, 

Between 25th and 50th percentile, or 

Below 25th percentile

In [57]:
#draw the boxplot  price per bedroom
fig=df.sort_values(by='bedrooms').hvplot.box(by='bedrooms',y='price', height=400, width=800)
fig

In [58]:
#draw the boxplot accommodates vs bedrooms
fig=df.sort_values(by='accommodates').hvplot.box(by='accommodates',y='bedrooms', height=400, width=800)
fig

In [59]:
#finding the outliers for price
quantiles_df=df.groupby("bedrooms")["price"].quantile([0.25,0.5,0.75]).unstack(level=1)
quantiles_df.rename(columns={0.25:"0.25_price",0.5:"0.5_price",0.75:"0.75_price"}, inplace=True)
df2=pd.merge(df,quantiles_df, on="bedrooms")
df2["outlier"]=0
IQR=df2["0.75_price"]-df2["0.25_price"]
df2.loc[((df2["price"]<(df2["0.25_price"]-1.5*IQR)) | (df2["price"]>(df2["0.75_price"]+1.5*IQR))),'outlier']=1
df2=df2.loc[df2["outlier"]!=1]
df2

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,price,latitude,longitude,years_in_business,revenue,revenue_per_accommodates,0.25_price,0.5_price,0.75_price,outlier
299,https://www.airbnb.com/rooms/542822559960498876,2023-03-25,2015-07-16,100.0,99.0,t,220.0,92109.0,Entire home/apt,4,...,309.0,32.788400,-117.254020,8.0,84357.0,21089.2500,87.00,125.0,176.00,0
300,https://www.airbnb.com/rooms/49634069,2023-03-25,2021-05-06,100.0,100.0,f,1.0,92109.0,Entire home/apt,4,...,306.0,32.765118,-117.252220,2.0,59058.0,14764.5000,87.00,125.0,176.00,0
301,https://www.airbnb.com/rooms/50448428,2023-03-25,2012-06-17,,,f,1.0,92109.0,Entire home/apt,4,...,304.0,32.806360,-117.234310,11.0,110960.0,27740.0000,87.00,125.0,176.00,0
302,https://www.airbnb.com/rooms/661297807980134024,2023-03-25,2022-06-29,100.0,99.0,f,17.0,92122.0,Entire home/apt,5,...,303.0,32.867460,-117.220970,1.0,110595.0,22119.0000,87.00,125.0,176.00,0
303,https://www.airbnb.com/rooms/44697260,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,302.0,32.741770,-117.251330,9.0,15704.0,3926.0000,87.00,125.0,176.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11768,https://www.airbnb.com/rooms/624714846620622484,2023-03-25,2015-07-16,100.0,99.0,t,220.0,92109.0,Entire home/apt,16,...,724.0,32.771330,-117.250800,8.0,252676.0,15792.2500,858.50,943.0,2421.00,0
11769,https://www.airbnb.com/rooms/714482450713596663,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,1752.0,32.711501,-117.124971,12.0,166440.0,10402.5000,1444.25,1617.0,1667.25,0
11770,https://www.airbnb.com/rooms/811514787237852543,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,1639.0,32.710931,-117.123787,12.0,134398.0,8399.8750,1444.25,1617.0,1667.25,0
11771,https://www.airbnb.com/rooms/40504870,2023-03-25,2015-09-04,100.0,100.0,t,94.0,92103.0,Entire home/apt,16,...,1595.0,32.743750,-117.183700,8.0,301455.0,18840.9375,1444.25,1617.0,1667.25,0


In [60]:
df2.sort_values(by='bedrooms').hvplot.box(y='price', by='bedrooms', width=800, height=400)

In [61]:
#finding the outliers for bedrooms
quantiles_df_2=df.groupby("accommodates")["bedrooms"].quantile([0.25,0.5,0.75]).unstack(level=1)
quantiles_df_2.rename(columns={0.25:"0.25_acc",0.5:"0.5_acc",0.75:"0.75_acc"}, inplace=True)
df2=pd.merge(df2,quantiles_df_2, on="accommodates")
df2["outlier_2"]=0
IQR=df2["0.75_acc"]-df2["0.25_acc"]
df2.loc[((df2["bedrooms"]<(df2["0.25_acc"]-1.5*IQR)) | (df2["bedrooms"]>(df2["0.75_acc"]+1.5*IQR))),'outlier_2']=1
df2=df2.loc[df2["outlier_2"]!=1]
df2

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,revenue,revenue_per_accommodates,0.25_price,0.5_price,0.75_price,outlier,0.25_acc,0.5_acc,0.75_acc,outlier_2
0,https://www.airbnb.com/rooms/542822559960498876,2023-03-25,2015-07-16,100.0,99.0,t,220.0,92109.0,Entire home/apt,4,...,84357.0,21089.250000,87.0,125.0,176.0,0,1.0,1.0,2.0,0
1,https://www.airbnb.com/rooms/49634069,2023-03-25,2021-05-06,100.0,100.0,f,1.0,92109.0,Entire home/apt,4,...,59058.0,14764.500000,87.0,125.0,176.0,0,1.0,1.0,2.0,0
2,https://www.airbnb.com/rooms/50448428,2023-03-25,2012-06-17,,,f,1.0,92109.0,Entire home/apt,4,...,110960.0,27740.000000,87.0,125.0,176.0,0,1.0,1.0,2.0,0
3,https://www.airbnb.com/rooms/44697260,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,15704.0,3926.000000,87.0,125.0,176.0,0,1.0,1.0,2.0,0
4,https://www.airbnb.com/rooms/44697268,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,26576.0,6644.000000,87.0,125.0,176.0,0,1.0,1.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11145,https://www.airbnb.com/rooms/49447451,2023-03-25,2021-04-26,100.0,100.0,t,5.0,92126.0,Entire home/apt,11,...,28500.0,2590.909091,372.0,519.0,779.0,0,3.0,4.0,4.0,0
11146,https://www.airbnb.com/rooms/590550326623850623,2023-03-25,2016-06-27,100.0,82.0,t,13.0,92154.0,Entire home/apt,11,...,32155.0,2923.181818,372.0,519.0,779.0,0,3.0,4.0,4.0,0
11147,https://www.airbnb.com/rooms/8950246,2023-03-25,2013-04-15,90.0,64.0,t,15.0,92109.0,Entire home/apt,11,...,41322.0,3756.545455,372.0,519.0,779.0,0,3.0,4.0,4.0,0
11148,https://www.airbnb.com/rooms/711901151538425615,2023-03-25,2022-09-09,100.0,91.0,f,1.0,92154.0,Entire home/apt,11,...,86296.0,7845.090909,372.0,519.0,779.0,0,3.0,4.0,4.0,0


In [62]:
df2.sort_values(by='accommodates').hvplot.box(y='bedrooms', by='accommodates', width=800, height=400)

In [63]:
#finding the type of each column
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10746 entries, 0 to 11149
Data columns (total 32 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   listing_url                10746 non-null  object        
 1   last_scraped               10746 non-null  datetime64[ns]
 2   host_since                 10745 non-null  datetime64[ns]
 3   host_response_rate         9618 non-null   float64       
 4   host_acceptance_rate       10037 non-null  float64       
 5   host_is_superhost          10746 non-null  object        
 6   host_total_listings_count  10745 non-null  float64       
 7   zipcode                    10730 non-null  object        
 8   room_type                  10746 non-null  object        
 9   accommodates               10746 non-null  int64         
 10  bathrooms_text             10744 non-null  float64       
 11  bedrooms                   10746 non-null  float64       
 12  beds

In [65]:
# Transform the categorical columns using get_dummies
categorical_columns=df2.dtypes[df2.dtypes=="object"].index.tolist()
categorical_columns.remove("listing_url")
categorical_dummies = pd.get_dummies(df2[categorical_columns])

# Display the transformed data
categorical_dummies

  uniques = Index(uniques)


Unnamed: 0,host_is_superhost_f,host_is_superhost_t,zipcode_2037.0,zipcode_22000.0,zipcode_91239.0,zipcode_91902.0,zipcode_91910.0,zipcode_91911.0,zipcode_91913.0,zipcode_91914.0,...,zipcode_92182.0,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,amenities_cat_basic,amenities_cat_luxury,amenities_cat_moderate,instant_bookable_f,instant_bookable_t
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11145,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
11146,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
11147,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
11148,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0


In [66]:
# scaling the numeric columns
numeric_columns=df2.dtypes[df2.dtypes!="object"].index.tolist()
# have to remove the id, lat and long columns out
data=df2[numeric_columns].loc[:,~df2[numeric_columns].columns.isin(["listing_url","host_since","last_scraped","latitude","longitude",
                                                                    "0.25_price","0.5_price","0.75_price",
                                                                    "0.25_acc","0.5_acc","0.75_acc",
                                                                    "outlier"])]
Scaled_df=pd.DataFrame(StandardScaler().fit_transform(data), columns=data.columns)
Scaled_df


Unnamed: 0,host_response_rate,host_acceptance_rate,host_total_listings_count,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,maximum_nights,availability_365,price,years_in_business,revenue,revenue_per_accommodates,outlier_2
0,0.304656,0.418262,0.346243,-0.331850,-0.648829,-0.809058,-0.372942,-0.300710,-0.272742,-0.747667,0.143634,0.571956,0.704198,1.333573,0.0
1,0.304656,0.469582,-0.232350,-0.331850,-0.648829,-0.809058,-0.372942,-0.211193,-1.001901,-0.107000,0.132463,-1.402978,0.261670,0.608812,0.0
2,,,-0.232350,-0.331850,-0.648829,-0.809058,-0.838682,-0.211193,-1.039014,-1.484435,0.125015,1.559422,1.169535,2.095690,0.0
3,,0.469582,-0.216498,-0.331850,-0.648829,-0.809058,-0.372942,-0.300710,1.386422,1.022176,0.117567,0.901111,-0.496674,-0.633184,0.0
4,,0.469582,-0.216498,-0.331850,-0.648829,-0.809058,-0.372942,-0.300710,1.386422,0.733876,0.117567,0.901111,-0.306502,-0.321725,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10741,0.304656,0.469582,-0.221782,1.825288,0.364474,1.634260,0.558537,-0.255952,1.386422,0.677818,0.110120,-1.402978,-0.272848,-0.786173,0.0
10742,0.304656,-0.454171,-0.200646,1.825288,0.364474,1.634260,2.421495,-0.255952,-0.272742,0.565701,0.091501,0.242800,-0.208915,-0.748098,0.0
10743,-0.556376,-1.377925,-0.195362,1.825288,0.364474,1.634260,1.490016,-0.300710,1.386422,0.301426,0.076605,1.230267,-0.048566,-0.652602,0.0
10744,0.304656,0.007705,-0.232350,1.825288,0.871126,1.634260,0.558537,-0.300710,1.386422,-1.140076,-0.009042,-1.732133,0.738115,-0.184091,0.0


In [67]:
# combining the numeric and categorical columns
df_scaled_dummies=pd.concat([Scaled_df.reset_index(drop=True),categorical_dummies.reset_index(drop=True)], axis=1)
df_scaled_dummies

Unnamed: 0,host_response_rate,host_acceptance_rate,host_total_listings_count,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,maximum_nights,availability_365,...,zipcode_92182.0,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,amenities_cat_basic,amenities_cat_luxury,amenities_cat_moderate,instant_bookable_f,instant_bookable_t
0,0.304656,0.418262,0.346243,-0.331850,-0.648829,-0.809058,-0.372942,-0.300710,-0.272742,-0.747667,...,0,1,0,0,0,0,0,1,1,0
1,0.304656,0.469582,-0.232350,-0.331850,-0.648829,-0.809058,-0.372942,-0.211193,-1.001901,-0.107000,...,0,1,0,0,0,0,0,1,0,1
2,,,-0.232350,-0.331850,-0.648829,-0.809058,-0.838682,-0.211193,-1.039014,-1.484435,...,0,1,0,0,0,0,0,1,1,0
3,,0.469582,-0.216498,-0.331850,-0.648829,-0.809058,-0.372942,-0.300710,1.386422,1.022176,...,0,1,0,0,0,0,0,1,0,1
4,,0.469582,-0.216498,-0.331850,-0.648829,-0.809058,-0.372942,-0.300710,1.386422,0.733876,...,0,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10741,0.304656,0.469582,-0.221782,1.825288,0.364474,1.634260,0.558537,-0.255952,1.386422,0.677818,...,0,1,0,0,0,0,0,1,1,0
10742,0.304656,-0.454171,-0.200646,1.825288,0.364474,1.634260,2.421495,-0.255952,-0.272742,0.565701,...,0,1,0,0,0,0,0,1,1,0
10743,-0.556376,-1.377925,-0.195362,1.825288,0.364474,1.634260,1.490016,-0.300710,1.386422,0.301426,...,0,1,0,0,0,0,0,1,1,0
10744,0.304656,0.007705,-0.232350,1.825288,0.871126,1.634260,0.558537,-0.300710,1.386422,-1.140076,...,0,1,0,0,0,0,0,1,1,0


## Creating a PCA model to cluster the data 

In [68]:
model_pca=PCA(n_components=3)
# Use the PCA model with `fit_transform` to reduce to 
# two principal components.
pca_data=model_pca.fit_transform(df_scaled_dummies.dropna())
# View the first five rows of the DataFrame. 
df_pca=pd.DataFrame(pca_data,columns=["PC1","PC2", "PC3"])
print(model_pca.explained_variance_ratio_)
df_pca

[0.28332001 0.13202161 0.08782388]


Unnamed: 0,PC1,PC2,PC3
0,-0.615241,2.044897,-0.102044
1,-0.738257,0.490105,-1.202265
2,-1.246588,-0.575499,-0.139991
3,-1.302164,-3.246545,6.527605
4,-1.286875,-1.713010,-1.074852
...,...,...,...
9429,1.839464,-1.334836,-0.858910
9430,2.598086,-0.984743,-0.270594
9431,2.313163,-0.769360,1.640015
9432,2.275846,0.226147,-0.359158


In [69]:
# merging the original data with PCA. This is not very straightforward as we need to drop rows with NA
# in the numricl columns

categorical_columns_2=categorical_columns
categorical_columns_2.append("listing_url")

list_1=df2[numeric_columns].columns.tolist()

for i in ["host_since","last_scraped","latitude","longitude",
          "0.25_price","0.5_price","0.75_price",
          "0.25_acc","0.5_acc","0.75_acc",
          "outlier"]:
    list_1.remove(i)

list_1.append("listing_url")
list_1

# need to drop the NAN in rows. This is a requirement for PCA
df_SD_2=pd.merge(df2[list_1].dropna(),df2[categorical_columns_2], on="listing_url")
df_SD_pc=pd.concat([df_SD_2.reset_index(drop=True),df_pca.reset_index(drop=True)], axis=1)
df_SD_pc


Unnamed: 0,host_response_rate,host_acceptance_rate,host_total_listings_count,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,maximum_nights,availability_365,...,outlier_2,listing_url,host_is_superhost,zipcode,room_type,amenities_cat,instant_bookable,PC1,PC2,PC3
0,100.0,99.0,220.0,4,1.0,1.0,2.0,1,365,92,...,0,https://www.airbnb.com/rooms/542822559960498876,t,92109.0,Entire home/apt,moderate,f,-0.615241,2.044897,-0.102044
1,100.0,100.0,1.0,4,1.0,1.0,2.0,3,31,172,...,0,https://www.airbnb.com/rooms/49634069,f,92109.0,Entire home/apt,moderate,t,-0.738257,0.490105,-1.202265
2,100.0,96.0,3.0,4,1.0,1.0,1.0,2,100,310,...,0,https://www.airbnb.com/rooms/36100887,f,92102.0,Entire home/apt,moderate,f,-1.246588,-0.575499,-0.139991
3,0.0,17.0,162.0,4,1.0,1.0,1.0,1,1125,356,...,0,https://www.airbnb.com/rooms/52218020,f,92101.0,Entire home/apt,basic,f,-1.302164,-3.246545,6.527605
4,100.0,100.0,1.0,4,1.0,1.0,1.0,4,365,364,...,0,https://www.airbnb.com/rooms/717954471984575654,f,92101.0,Entire home/apt,luxury,f,-1.286875,-1.713010,-1.074852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9429,100.0,100.0,5.0,11,2.0,4.0,4.0,2,1125,270,...,0,https://www.airbnb.com/rooms/49447451,t,92126.0,Entire home/apt,moderate,f,1.839464,-1.334836,-0.858910
9430,100.0,82.0,13.0,11,2.0,4.0,8.0,2,365,256,...,0,https://www.airbnb.com/rooms/590550326623850623,t,92154.0,Entire home/apt,moderate,f,2.598086,-0.984743,-0.270594
9431,90.0,64.0,15.0,11,2.0,4.0,6.0,1,1125,223,...,0,https://www.airbnb.com/rooms/8950246,t,92109.0,Entire home/apt,moderate,f,2.313163,-0.769360,1.640015
9432,100.0,91.0,1.0,11,2.5,4.0,4.0,1,1125,43,...,0,https://www.airbnb.com/rooms/711901151538425615,f,92154.0,Entire home/apt,moderate,f,2.275846,0.226147,-0.359158


In [70]:
# finding pca loading
loadings = model_pca.components_.T * np.sqrt(model_pca.explained_variance_)
loading_df = pd.DataFrame(loadings, columns=['PC1', 'PC2', 'PC3'], index=df_scaled_dummies.columns).sort_values(by="PC1", ascending=False)
loading_df

Unnamed: 0,PC1,PC2,PC3
bedrooms,0.944773,-0.093461,-0.022559
accommodates,0.916133,-0.148956,-0.091269
beds,0.901614,-0.126911,-0.049319
price,0.894333,0.037927,0.083287
bathrooms_text,0.831205,-0.069558,0.043381
...,...,...,...
amenities_cat_basic,-0.031210,-0.012926,0.033820
amenities_cat_moderate,-0.033688,-0.004589,0.004257
zipcode_92101.0,-0.039464,-0.020775,0.009199
minimum_nights,-0.087193,-0.024881,0.271167


In [71]:
df_scaled_dummies.columns

Index(['host_response_rate', 'host_acceptance_rate',
       'host_total_listings_count', 'accommodates', 'bathrooms_text',
       'bedrooms', 'beds', 'minimum_nights', 'maximum_nights',
       'availability_365', 'price', 'years_in_business', 'revenue',
       'revenue_per_accommodates', 'outlier_2', 'host_is_superhost_f',
       'host_is_superhost_t', 'zipcode_2037.0', 'zipcode_22000.0',
       'zipcode_91239.0', 'zipcode_91902.0', 'zipcode_91910.0',
       'zipcode_91911.0', 'zipcode_91913.0', 'zipcode_91914.0',
       'zipcode_91915.0', 'zipcode_91932.0', 'zipcode_91941.0',
       'zipcode_91942.0', 'zipcode_91945.0', 'zipcode_91950.0',
       'zipcode_91977.0', 'zipcode_92014.0', 'zipcode_92017.0',
       'zipcode_92018.0', 'zipcode_92025.0', 'zipcode_92027.0',
       'zipcode_92029.0', 'zipcode_92037.0', 'zipcode_92039.0',
       'zipcode_92071.0', 'zipcode_92075.0', 'zipcode_92091.0',
       'zipcode_92093.0', 'zipcode_92101.0', 'zipcode_92102.0',
       'zipcode_92103.0', 'zipco

In [72]:
#finding the unique bedrooms after dropping the nans
df_SD_pc['bedrooms'].sort_values().unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.])

In [73]:
#plotting pc1 vs pc2 and color based on bedrooms
figure1=df_SD_pc.sort_values(by='bedrooms').hvplot.scatter(x="PC1",y="PC2", hover_cols="bedrooms",
                                                   by="bedrooms",width=800, height=600 )

figure1 

In [74]:
#plotting pc1 vs pc2 and color based on accomodates
figure2=df_SD_pc.sort_values(by='accommodates').hvplot.scatter(x="PC1",y="PC2", hover_cols="accommodates",
                                                   by="accommodates",width=800, height=600 )

figure2 

In [75]:
#plotting pc1 vs pc2 and color based on beds
figure3=df_SD_pc.sort_values(by='bathrooms_text').hvplot.scatter(x="PC1",y="PC2", hover_cols="bathrooms_text",
                                                   by="bathrooms_text",width=800, height=600 )

figure3 

In [76]:
#plotting pc1 vs pc2 and color based on minimum_nights
figure3=df_SD_pc.sort_values(by='minimum_nights').hvplot.scatter(x="PC1",y="PC3", hover_cols="minimum_nights",
                                                   by="minimum_nights",width=800, height=600 )

figure3 

# 
The above plots suggests to bin the number of bedrooms,bathrooms, accommodates, and min numbe rof stay to avaoid noises in the data

Bedrooms will be bin to: 1, 2, 3, 4, 5, >6

Bathrooms will bin as follows: 1, 2, 3, 4, 5, >6

Accomodates will bin as follows:1-4, 5-8, 9-12, 13-16

Min-night will bin as follows:1, 2, 3, 4, 5, 6-30, >30

In [77]:
df3=df2.copy()
df3["bedrooms_cat"]=">6"
df3["bathrooms_cat"]=">6"
df3["accommodates_cat"]="13-16"
df3["min_night_cat"]=">6"
df3["max_night_cat"]=">365 days"
df3["years_in_business_cat"]=">10"


bedroom_count=[1,2,3,4,5]
df3.loc[df3["bedrooms"].isin(bedroom_count),"bedrooms_cat"]=df3["bedrooms"]

df3.loc[df3["bathrooms_text"].apply(np.ceil)==1,"bathrooms_cat"]=1
df3.loc[df3["bathrooms_text"].apply(np.ceil)==2,"bathrooms_cat"]=2
df3.loc[df3["bathrooms_text"].apply(np.ceil)==3,"bathrooms_cat"]=3
df3.loc[df3["bathrooms_text"].apply(np.ceil)==4,"bathrooms_cat"]=4
df3.loc[df3["bathrooms_text"].apply(np.ceil)==5,"bathrooms_cat"]=5

df3.loc[df3["accommodates"]<=4,"accommodates_cat"]="1-4"
df3.loc[((df3["accommodates"]>=5) & (df3["accommodates"]<=8)),"accommodates_cat"]="5-8"
df3.loc[((df3["accommodates"]>=9) & (df3["accommodates"]<=12)),"accommodates_cat"]="9-12"

min_night_count=[1,2,3,4,5]
df3.loc[df3["minimum_nights"].isin(min_night_count),"min_night_cat"]=df3["minimum_nights"]


df3.loc[df3["maximum_nights"]<=10,"max_night_cat"]="<10 days"
df3.loc[((df3["maximum_nights"]>=11) & (df3["maximum_nights"]<=30)),"max_night_cat"]="10-30 days"
df3.loc[((df3["maximum_nights"]>=31) & (df3["maximum_nights"]<=90)),"max_night_cat"]="31-90 days"
df3.loc[((df3["maximum_nights"]>=91) & (df3["maximum_nights"]<=365)),"max_night_cat"]="90-365 days"

df3.loc[((df3["years_in_business"]<1)),"years_in_business_cat"]="0"
df3.loc[((df3["years_in_business"]>=1) & (df3["years_in_business"]<=5)),"years_in_business_cat"]="1-5"
df3.loc[((df3["years_in_business"]>=6) & (df3["years_in_business"]<=10)),"years_in_business_cat"]="6-10"


df3

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,0.25_acc,0.5_acc,0.75_acc,outlier_2,bedrooms_cat,bathrooms_cat,accommodates_cat,min_night_cat,max_night_cat,years_in_business_cat
0,https://www.airbnb.com/rooms/542822559960498876,2023-03-25,2015-07-16,100.0,99.0,t,220.0,92109.0,Entire home/apt,4,...,1.0,1.0,2.0,0,1.0,1,1-4,1,90-365 days,6-10
1,https://www.airbnb.com/rooms/49634069,2023-03-25,2021-05-06,100.0,100.0,f,1.0,92109.0,Entire home/apt,4,...,1.0,1.0,2.0,0,1.0,1,1-4,3,31-90 days,1-5
2,https://www.airbnb.com/rooms/50448428,2023-03-25,2012-06-17,,,f,1.0,92109.0,Entire home/apt,4,...,1.0,1.0,2.0,0,1.0,1,1-4,3,10-30 days,>10
3,https://www.airbnb.com/rooms/44697260,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,1.0,1.0,2.0,0,1.0,1,1-4,1,>365 days,6-10
4,https://www.airbnb.com/rooms/44697268,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,1.0,1.0,2.0,0,1.0,1,1-4,1,>365 days,6-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11145,https://www.airbnb.com/rooms/49447451,2023-03-25,2021-04-26,100.0,100.0,t,5.0,92126.0,Entire home/apt,11,...,3.0,4.0,4.0,0,4.0,2,9-12,2,>365 days,1-5
11146,https://www.airbnb.com/rooms/590550326623850623,2023-03-25,2016-06-27,100.0,82.0,t,13.0,92154.0,Entire home/apt,11,...,3.0,4.0,4.0,0,4.0,2,9-12,2,90-365 days,6-10
11147,https://www.airbnb.com/rooms/8950246,2023-03-25,2013-04-15,90.0,64.0,t,15.0,92109.0,Entire home/apt,11,...,3.0,4.0,4.0,0,4.0,2,9-12,1,>365 days,6-10
11148,https://www.airbnb.com/rooms/711901151538425615,2023-03-25,2022-09-09,100.0,91.0,f,1.0,92154.0,Entire home/apt,11,...,3.0,4.0,4.0,0,4.0,3,9-12,1,>365 days,1-5


In [78]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10746 entries, 0 to 11149
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   listing_url                10746 non-null  object        
 1   last_scraped               10746 non-null  datetime64[ns]
 2   host_since                 10745 non-null  datetime64[ns]
 3   host_response_rate         9618 non-null   float64       
 4   host_acceptance_rate       10037 non-null  float64       
 5   host_is_superhost          10746 non-null  object        
 6   host_total_listings_count  10745 non-null  float64       
 7   zipcode                    10730 non-null  object        
 8   room_type                  10746 non-null  object        
 9   accommodates               10746 non-null  int64         
 10  bathrooms_text             10744 non-null  float64       
 11  bedrooms                   10746 non-null  float64       
 12  beds

In [79]:
#creating the revenue_cat column
quantiles_df_revenue=df3.groupby("bedrooms_cat")["revenue"].quantile([0.25,0.5,0.75]).unstack(level=1)
quantiles_df_revenue.rename(columns={0.25:"0.25_revenue",0.5:"0.5_revenue",0.75:"0.75_revenue"}, inplace=True)
df3=pd.merge(df3,quantiles_df_revenue, on="bedrooms_cat")
df3["revenue_cat"]="<50th"
# # df3.loc[df3["revenue"]<df3["0.25_revenue"], "revenue_cat"]="<25th"
df3.loc[df3["revenue"]>df3["0.5_revenue"], "revenue_cat"]=">50th"
df3

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,bedrooms_cat,bathrooms_cat,accommodates_cat,min_night_cat,max_night_cat,years_in_business_cat,0.25_revenue,0.5_revenue,0.75_revenue,revenue_cat
0,https://www.airbnb.com/rooms/542822559960498876,2023-03-25,2015-07-16,100.0,99.0,t,220.0,92109.0,Entire home/apt,4,...,1.0,1,1-4,1,90-365 days,6-10,7040.0,21060.0,36120.0,>50th
1,https://www.airbnb.com/rooms/49634069,2023-03-25,2021-05-06,100.0,100.0,f,1.0,92109.0,Entire home/apt,4,...,1.0,1,1-4,3,31-90 days,1-5,7040.0,21060.0,36120.0,>50th
2,https://www.airbnb.com/rooms/50448428,2023-03-25,2012-06-17,,,f,1.0,92109.0,Entire home/apt,4,...,1.0,1,1-4,3,10-30 days,>10,7040.0,21060.0,36120.0,>50th
3,https://www.airbnb.com/rooms/44697260,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,1.0,1,1-4,1,>365 days,6-10,7040.0,21060.0,36120.0,<50th
4,https://www.airbnb.com/rooms/44697268,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,1.0,1,1-4,1,>365 days,6-10,7040.0,21060.0,36120.0,>50th
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10741,https://www.airbnb.com/rooms/714482450713596663,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,>6,>6,13-16,3,90-365 days,>10,62797.0,139011.0,287862.5,>50th
10742,https://www.airbnb.com/rooms/811514787237852543,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,>6,>6,13-16,3,10-30 days,>10,62797.0,139011.0,287862.5,<50th
10743,https://www.airbnb.com/rooms/40504870,2023-03-25,2015-09-04,100.0,100.0,t,94.0,92103.0,Entire home/apt,16,...,>6,>6,13-16,2,10-30 days,6-10,62797.0,139011.0,287862.5,>50th
10744,https://www.airbnb.com/rooms/769579906600955220,2023-03-25,2015-12-13,100.0,99.0,t,4.0,92102.0,Entire home/apt,13,...,>6,4,13-16,3,90-365 days,6-10,62797.0,139011.0,287862.5,>50th


In [80]:
#creating the revenue_per_acc_cat column
quantiles_df_revenue_acc=df3.groupby("bedrooms_cat")["revenue_per_accommodates"].quantile([0.25,0.5,0.75]).unstack(level=1)
quantiles_df_revenue_acc.rename(columns={0.25:"0.25_revenue_acc",0.5:"0.5_revenue_acc",0.75:"0.75_revenue_acc"}, inplace=True)
df3=pd.merge(df3,quantiles_df_revenue_acc, on="bedrooms_cat")
df3["revenue_cat_acc"]="<50th"
#df3.loc[df3["revenue_per_accommodates"]<df3["0.25_revenue_acc"], "revenue_cat_acc"]="<25th"
#df3.loc[((df3["revenue_per_accommodates"]<df3["0.5_revenue_acc"]) & (df3["revenue_per_accommodates"]>df3["0.3_revenue_acc"])), "revenue_cat_acc"]="30th-50th"
df3.loc[df3["revenue_per_accommodates"]>df3["0.5_revenue_acc"], "revenue_cat_acc"]=">50th"
df3

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,max_night_cat,years_in_business_cat,0.25_revenue,0.5_revenue,0.75_revenue,revenue_cat,0.25_revenue_acc,0.5_revenue_acc,0.75_revenue_acc,revenue_cat_acc
0,https://www.airbnb.com/rooms/542822559960498876,2023-03-25,2015-07-16,100.0,99.0,t,220.0,92109.0,Entire home/apt,4,...,90-365 days,6-10,7040.0,21060.0,36120.0,>50th,2892.500000,8824.500,15295.000000,>50th
1,https://www.airbnb.com/rooms/49634069,2023-03-25,2021-05-06,100.0,100.0,f,1.0,92109.0,Entire home/apt,4,...,31-90 days,1-5,7040.0,21060.0,36120.0,>50th,2892.500000,8824.500,15295.000000,>50th
2,https://www.airbnb.com/rooms/50448428,2023-03-25,2012-06-17,,,f,1.0,92109.0,Entire home/apt,4,...,10-30 days,>10,7040.0,21060.0,36120.0,>50th,2892.500000,8824.500,15295.000000,>50th
3,https://www.airbnb.com/rooms/44697260,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,>365 days,6-10,7040.0,21060.0,36120.0,<50th,2892.500000,8824.500,15295.000000,<50th
4,https://www.airbnb.com/rooms/44697268,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,>365 days,6-10,7040.0,21060.0,36120.0,>50th,2892.500000,8824.500,15295.000000,<50th
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10741,https://www.airbnb.com/rooms/714482450713596663,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,90-365 days,>10,62797.0,139011.0,287862.5,>50th,4194.821429,9069.375,18110.114583,>50th
10742,https://www.airbnb.com/rooms/811514787237852543,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,10-30 days,>10,62797.0,139011.0,287862.5,<50th,4194.821429,9069.375,18110.114583,<50th
10743,https://www.airbnb.com/rooms/40504870,2023-03-25,2015-09-04,100.0,100.0,t,94.0,92103.0,Entire home/apt,16,...,10-30 days,6-10,62797.0,139011.0,287862.5,>50th,4194.821429,9069.375,18110.114583,>50th
10744,https://www.airbnb.com/rooms/769579906600955220,2023-03-25,2015-12-13,100.0,99.0,t,4.0,92102.0,Entire home/apt,13,...,90-365 days,6-10,62797.0,139011.0,287862.5,>50th,4194.821429,9069.375,18110.114583,>50th


In [81]:
#creating the price_cat column
quantiles_df_price=df3.groupby("bedrooms_cat")["price"].quantile([0.25,0.5,0.75]).unstack(level=1)
quantiles_df_price.rename(columns={0.25:"0.25_price_bedcat",0.5:"0.5_price_bedcat",0.75:"0.75_price_bedcat"}, inplace=True)
quantiles_df_price

df3=pd.merge(df3,quantiles_df_price, on="bedrooms_cat")

df3["price_cat"]="<50th"
# # #df3.loc[df2["price"]<df3["0.25_price_bedcat"], "price_cat"]="<25th"
df3.loc[df3["price"]>df3["0.5_price_bedcat"], "price_cat"]=">50th"
df3

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,0.75_revenue,revenue_cat,0.25_revenue_acc,0.5_revenue_acc,0.75_revenue_acc,revenue_cat_acc,0.25_price_bedcat,0.5_price_bedcat,0.75_price_bedcat,price_cat
0,https://www.airbnb.com/rooms/542822559960498876,2023-03-25,2015-07-16,100.0,99.0,t,220.0,92109.0,Entire home/apt,4,...,36120.0,>50th,2892.500000,8824.500,15295.000000,>50th,85.0,120.0,164.0,>50th
1,https://www.airbnb.com/rooms/49634069,2023-03-25,2021-05-06,100.0,100.0,f,1.0,92109.0,Entire home/apt,4,...,36120.0,>50th,2892.500000,8824.500,15295.000000,>50th,85.0,120.0,164.0,>50th
2,https://www.airbnb.com/rooms/50448428,2023-03-25,2012-06-17,,,f,1.0,92109.0,Entire home/apt,4,...,36120.0,>50th,2892.500000,8824.500,15295.000000,>50th,85.0,120.0,164.0,>50th
3,https://www.airbnb.com/rooms/44697260,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,36120.0,<50th,2892.500000,8824.500,15295.000000,<50th,85.0,120.0,164.0,>50th
4,https://www.airbnb.com/rooms/44697268,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,36120.0,>50th,2892.500000,8824.500,15295.000000,<50th,85.0,120.0,164.0,>50th
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10741,https://www.airbnb.com/rooms/714482450713596663,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,287862.5,>50th,4194.821429,9069.375,18110.114583,>50th,764.5,1034.5,1803.0,>50th
10742,https://www.airbnb.com/rooms/811514787237852543,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,287862.5,<50th,4194.821429,9069.375,18110.114583,<50th,764.5,1034.5,1803.0,>50th
10743,https://www.airbnb.com/rooms/40504870,2023-03-25,2015-09-04,100.0,100.0,t,94.0,92103.0,Entire home/apt,16,...,287862.5,>50th,4194.821429,9069.375,18110.114583,>50th,764.5,1034.5,1803.0,>50th
10744,https://www.airbnb.com/rooms/769579906600955220,2023-03-25,2015-12-13,100.0,99.0,t,4.0,92102.0,Entire home/apt,13,...,287862.5,>50th,4194.821429,9069.375,18110.114583,>50th,764.5,1034.5,1803.0,>50th


In [82]:
#save the clean categorical data to csv
df3.to_csv("./Resources/cleaned_data_SD_final.csv")

In [83]:
df3

Unnamed: 0,listing_url,last_scraped,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,zipcode,room_type,accommodates,...,0.75_revenue,revenue_cat,0.25_revenue_acc,0.5_revenue_acc,0.75_revenue_acc,revenue_cat_acc,0.25_price_bedcat,0.5_price_bedcat,0.75_price_bedcat,price_cat
0,https://www.airbnb.com/rooms/542822559960498876,2023-03-25,2015-07-16,100.0,99.0,t,220.0,92109.0,Entire home/apt,4,...,36120.0,>50th,2892.500000,8824.500,15295.000000,>50th,85.0,120.0,164.0,>50th
1,https://www.airbnb.com/rooms/49634069,2023-03-25,2021-05-06,100.0,100.0,f,1.0,92109.0,Entire home/apt,4,...,36120.0,>50th,2892.500000,8824.500,15295.000000,>50th,85.0,120.0,164.0,>50th
2,https://www.airbnb.com/rooms/50448428,2023-03-25,2012-06-17,,,f,1.0,92109.0,Entire home/apt,4,...,36120.0,>50th,2892.500000,8824.500,15295.000000,>50th,85.0,120.0,164.0,>50th
3,https://www.airbnb.com/rooms/44697260,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,36120.0,<50th,2892.500000,8824.500,15295.000000,<50th,85.0,120.0,164.0,>50th
4,https://www.airbnb.com/rooms/44697268,2023-03-25,2014-03-30,,100.0,t,7.0,92107.0,Entire home/apt,4,...,36120.0,>50th,2892.500000,8824.500,15295.000000,<50th,85.0,120.0,164.0,>50th
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10741,https://www.airbnb.com/rooms/714482450713596663,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,287862.5,>50th,4194.821429,9069.375,18110.114583,>50th,764.5,1034.5,1803.0,>50th
10742,https://www.airbnb.com/rooms/811514787237852543,2023-03-25,2011-09-20,99.0,98.0,f,34.0,92102.0,Entire home/apt,16,...,287862.5,<50th,4194.821429,9069.375,18110.114583,<50th,764.5,1034.5,1803.0,>50th
10743,https://www.airbnb.com/rooms/40504870,2023-03-25,2015-09-04,100.0,100.0,t,94.0,92103.0,Entire home/apt,16,...,287862.5,>50th,4194.821429,9069.375,18110.114583,>50th,764.5,1034.5,1803.0,>50th
10744,https://www.airbnb.com/rooms/769579906600955220,2023-03-25,2015-12-13,100.0,99.0,t,4.0,92102.0,Entire home/apt,13,...,287862.5,>50th,4194.821429,9069.375,18110.114583,>50th,764.5,1034.5,1803.0,>50th


In [84]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10746 entries, 0 to 10745
Data columns (total 50 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   listing_url                10746 non-null  object        
 1   last_scraped               10746 non-null  datetime64[ns]
 2   host_since                 10745 non-null  datetime64[ns]
 3   host_response_rate         9618 non-null   float64       
 4   host_acceptance_rate       10037 non-null  float64       
 5   host_is_superhost          10746 non-null  object        
 6   host_total_listings_count  10745 non-null  float64       
 7   zipcode                    10730 non-null  object        
 8   room_type                  10746 non-null  object        
 9   accommodates               10746 non-null  int64         
 10  bathrooms_text             10744 non-null  float64       
 11  bedrooms                   10746 non-null  float64       
 12  beds

In [85]:
df3["accommodates_cat"].value_counts().sort_values()

13-16     365
9-12      889
5-8      3566
1-4      5926
Name: accommodates_cat, dtype: int64

In [86]:
df3["min_night_cat"].value_counts().sort_values()

5      261
4      491
>6    2111
3     2209
1     2524
2     3150
Name: min_night_cat, dtype: int64

In [87]:
df3["bedrooms_cat"].value_counts().sort_values()

>6      158
5.0     295
4.0     761
3.0    1721
2.0    2858
1.0    4953
Name: bedrooms_cat, dtype: int64

In [88]:
df3["bathrooms_cat"].value_counts().sort_values()

>6     124
5      152
4      338
3     1249
2     2865
1     6018
Name: bathrooms_cat, dtype: int64

In [51]:

df3["years_in_business_cat"].value_counts().sort_values()

0        296
>10      784
1-5     3522
6-10    6549
Name: years_in_business_cat, dtype: int64

In [52]:
df3["max_night_cat"].value_counts().sort_values()

<10 days        504
31-90 days     1055
10-30 days     2219
>365 days      3652
90-365 days    3721
Name: max_night_cat, dtype: int64

In [53]:

df3["years_in_business"].value_counts().sort_values()

15.0       1
14.0      10
13.0      32
12.0     253
0.0      296
11.0     487
3.0      589
2.0      602
4.0      697
5.0      789
10.0     841
1.0      845
9.0      975
6.0     1403
8.0     1564
7.0     1766
Name: years_in_business, dtype: int64