# Notebook 3: Preprocessing

_For USD-599 Capstone Project by Hunter Blum, Kyle Esteban Dalope, and Nicholas Lee (Summer 2023)_

***

**Content Overview:**
1. Missing Value Handling

In [1]:
# Library Imports
import pandas as pd
import numpy as np

## Missing Value Handling

In [2]:
# Load the gzip-compressed CSV written by the previous notebook.
clean_data_path = "../Data/clean_df.csv.gz"
preproc_df = pd.read_csv(clean_data_path, compression="gzip")

# Show a single row as a quick check of the columns that were loaded.
preproc_df.head(1)

Unnamed: 0,host_listings_count,property_type,room_type,bathrooms,bedrooms,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_maximum_nights,...,calculated_host_listings_count,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,zipcode,median_income_dollars,property_type_binary,private,sentiment,review_score_weighted
0,52.0,Entire home,Entire home/apt,2.0,2.0,728.0,2,28,1.0,1125.0,...,30,0,0,3.77,92109,95170.0,house,1,0.900577,150.04


In [3]:
# Count missing values per column before any imputation.
preproc_df.isnull().sum()

host_listings_count                               51
property_type                                      0
room_type                                          0
bathrooms                                          6
bedrooms                                        1538
price                                              0
minimum_nights                                     0
maximum_nights                                     0
minimum_minimum_nights                             2
maximum_maximum_nights                             2
has_availability                                   0
availability_30                                    0
availability_365                                   0
instant_bookable                                   0
calculated_host_listings_count                     0
calculated_host_listings_count_private_rooms       0
calculated_host_listings_count_shared_rooms        0
reviews_per_month                               3186
zipcode                                       

In [4]:
# Which property types do the listings with a missing bedroom count belong to?
preproc_df.loc[preproc_df["bedrooms"].isna(), "property_type"].value_counts()

property_type
Entire rental unit             645
Entire guesthouse              190
Entire condo                   174
Entire guest suite             136
Entire serviced apartment      102
Entire home                     72
Entire loft                     44
Camper/RV                       23
Entire cottage                  19
Private room in rental unit     18
Entire bungalow                 17
Tiny home                       15
Private room in home            12
Room in boutique hotel          11
Private room in resort          10
Entire vacation home             9
Room in hotel                    8
Private room in guest suite      6
Private room in guesthouse       4
Private room in condo            3
Private room in camper/rv        3
Private room in hostel           2
Entire place                     2
Barn                             2
Private room in bungalow         1
Private room in cottage          1
Private room in loft             1
Casa particular                  1
Entire

It can reasonably be assumed that listings whose property type contains "room" or "RV" have a single bedroom, so their missing bedroom values can be filled with 1.

In [5]:
# Inspect listings whose property type mentions "rental unit", largest
# bedroom counts first (rows with a missing bedroom count sort to the end).
is_rental_unit = preproc_df["property_type"].astype(str).str.contains(r"rental unit")
preproc_df[is_rental_unit].sort_values("bedrooms", ascending=False)

Unnamed: 0,host_listings_count,property_type,room_type,bathrooms,bedrooms,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_maximum_nights,...,calculated_host_listings_count,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,zipcode,median_income_dollars,property_type_binary,private,sentiment,review_score_weighted
4465,88.0,Entire rental unit,Entire home/apt,7.0,11.0,1595.0,2,28,2.0,28.0,...,72,0,0,,92103,84756.0,house,1,0.998420,
99,659.0,Entire rental unit,Entire home/apt,8.0,9.0,972.0,1,365,2.0,365.0,...,32,0,0,0.25,92109,95170.0,house,1,0.998444,14.01
3454,659.0,Entire rental unit,Entire home/apt,5.0,9.0,1165.0,1,365,2.0,365.0,...,32,0,0,0.56,92103,84756.0,house,1,0.999799,85.00
1551,63.0,Entire rental unit,Entire home/apt,8.0,9.0,849.0,1,365,2.0,365.0,...,58,0,0,0.14,92109,95170.0,house,1,0.989029,14.01
3509,72.0,Entire rental unit,Entire home/apt,5.0,9.0,977.0,1,365,2.0,365.0,...,66,0,0,0.54,92103,84756.0,house,1,0.999802,90.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17271,1.0,Entire rental unit,Entire home/apt,1.0,,39.0,1,1125,1.0,1125.0,...,1,0,0,2.94,92173,54003.0,house,1,0.019995,236.18
17429,3.0,Entire rental unit,Entire home/apt,1.0,,110.0,4,180,4.0,180.0,...,3,0,0,0.12,91942,72145.0,house,1,0.999057,10.00
17562,43.0,Entire rental unit,Entire home/apt,1.0,,133.0,7,150,7.0,1125.0,...,36,0,0,1.63,92075,111500.0,house,1,0.999643,224.10
17736,3.0,Entire rental unit,Entire home/apt,1.0,,140.0,4,1125,4.0,1125.0,...,3,0,0,1.85,92075,111500.0,house,1,0.999657,654.36


In [21]:
# Listings whose property_type contains "room", "Room" or "RV" are treated as
# single-bedroom, so any missing bedroom count among them is set to 1.
room_keywords = ["room", "Room", "RV"]
room_mask = preproc_df["property_type"].astype(str).str.contains("|".join(room_keywords))

# Index labels of the matching listings, kept as a plain list.
room_idx = room_mask[room_mask].index.tolist()

# Only the missing entries are written; existing bedroom counts are untouched.
preproc_df.loc[room_mask & preproc_df["bedrooms"].isna(), "bedrooms"] = 1

In [22]:
# Tabulate how often each (bathrooms, bedrooms) combination occurs.
# Rows with a missing value in either column are left out by groupby.
bed_bath_df = (
    preproc_df
    .groupby(["bathrooms", "bedrooms"])
    .size()
    .reset_index(name="count")
)

# Bedroom distribution among one-bathroom listings, to find the most common count.
bed_bath_df.loc[bed_bath_df["bathrooms"].eq(1)]

Unnamed: 0,bathrooms,bedrooms,count
5,1.0,1.0,8932
6,1.0,2.0,2314
7,1.0,3.0,356
8,1.0,4.0,14
9,1.0,5.0,3
10,1.0,6.0,1
11,1.0,7.0,1
12,1.0,9.0,1
13,1.0,11.0,1
14,1.0,12.0,3


In [28]:
# Bedroom distribution among listings that report zero bathrooms.
bed_bath_df.loc[bed_bath_df["bathrooms"].eq(0)]

Unnamed: 0,bathrooms,bedrooms,count
0,0.0,1.0,47
1,0.0,2.0,5
2,0.0,3.0,1
3,0.0,9.0,1


Based on the above, a listing with zero or one bathroom most likely has only one bedroom. Therefore, if a listing has zero or one bathroom, any missing bedroom value will be filled with 1.

In [31]:
# Listings with zero or one bathroom: a missing bedroom count is set to 1.
few_bath_mask = preproc_df["bathrooms"].isin([0, 1])

# Index labels of the zero/one-bathroom listings, kept as a plain list.
bathroom_1_idx = few_bath_mask[few_bath_mask].index.tolist()

# Only the missing entries are written; existing bedroom counts are untouched.
preproc_df.loc[few_bath_mask & preproc_df["bedrooms"].isna(), "bedrooms"] = 1
# After this step only a handful of bedroom values remain missing; the next
# cell lists them.
# This fills all but 10 missing bedroom values

In [32]:
# Listings that still have no bedroom count after the rule-based fills.
preproc_df.loc[preproc_df["bedrooms"].isna()]

Unnamed: 0,host_listings_count,property_type,room_type,bathrooms,bedrooms,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_maximum_nights,...,calculated_host_listings_count,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,zipcode,median_income_dollars,property_type_binary,private,sentiment,review_score_weighted
11666,0.0,Entire home,Entire home/apt,2.0,,499.0,2,999,2.0,1125.0,...,15,4,0,,92126,105284.0,house,1,0.999438,
11667,0.0,Entire home,Entire home/apt,2.0,,499.0,2,365,2.0,1125.0,...,15,4,0,0.2,92126,105284.0,house,1,0.9994,8.0
13799,29.0,Entire home,Entire home/apt,6.0,,1175.0,2,21,2.0,21.0,...,26,0,0,,92104,79929.0,house,1,0.994432,
14509,7.0,Tiny home,Entire home/apt,2.0,,99.0,1,1125,1.0,1125.0,...,7,5,0,5.39,92122,88073.0,house,0,0.573923,528.64
16290,191.0,Entire home,Entire home/apt,2.0,,151.0,30,1125,30.0,1125.0,...,9,0,0,0.85,91910,77005.0,house,1,0.999634,136.01


In [35]:
# Bedroom distribution among two-bathroom listings (most of the remaining
# missing rows have two bathrooms).
bed_bath_df.loc[bed_bath_df["bathrooms"].eq(2)]

Unnamed: 0,bathrooms,bedrooms,count
16,2.0,1.0,249
17,2.0,2.0,2331
18,2.0,3.0,1696
19,2.0,4.0,497
20,2.0,5.0,48
21,2.0,6.0,4
22,2.0,8.0,1
23,2.0,20.0,1
24,2.0,34.0,1


**TODO:** Are the listings with 20 and 34 bedrooms (and only 2 bathrooms) data-entry errors? Decide whether these values should be fixed.

In [37]:
# The five remaining gaps are filled one by one, keyed by index label.
# Per the original notes, values come from looking the listing up via the URL
# in the original data, or from a rule of thumb where stated.
manual_bedroom_fills = {
    11666: 1,  # original note: host is charging $5,000 a night (price column shows 499.0 - TODO confirm)
    11667: 2,
    13799: 2,
    14509: 2,  # a tiny home logically cannot have more than 2 bedrooms
    16290: 2,  # most 2-bathroom places in the data have 2 bedrooms
}

for listing_idx, n_bedrooms in manual_bedroom_fills.items():
    preproc_df.loc[listing_idx, "bedrooms"] = n_bedrooms

In [38]:
# Missing values still present in each column after the bedroom fills.
preproc_df.isna().sum()

host_listings_count                               51
property_type                                      0
room_type                                          0
bathrooms                                          6
bedrooms                                           0
price                                              0
minimum_nights                                     0
maximum_nights                                     0
minimum_minimum_nights                             2
maximum_maximum_nights                             2
has_availability                                   0
availability_30                                    0
availability_365                                   0
instant_bookable                                   0
calculated_host_listings_count                     0
calculated_host_listings_count_private_rooms       0
calculated_host_listings_count_shared_rooms        0
reviews_per_month                               3186
zipcode                                       

In [64]:
# Re-check for listings with a missing bedroom count.
# NOTE(review): this cell's execution count is out of sequence and its saved
# output still shows missing bedrooms, although the preceding null summary
# reports 0 for that column - the output looks stale; re-run from a fresh kernel.
preproc_df.loc[preproc_df["bedrooms"].isna()]

Unnamed: 0,host_listings_count,property_type,room_type,bathrooms,bedrooms,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_maximum_nights,...,calculated_host_listings_count,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,zipcode,median_income_dollars,property_type_binary,private,sentiment,review_score_weighted
4,4.0,Camper/RV,Entire home/apt,,,62.0,1,1125,1.0,1125.0,...,3,1,0,0.12,92109,95170.0,house,0,0.062897,5.00
32,44.0,Entire rental unit,Entire home/apt,1.0,,139.0,1,1125,1.0,1125.0,...,40,0,0,1.30,92109,95170.0,house,1,0.987491,69.00
35,0.0,Camper/RV,Entire home/apt,0.0,,189.0,2,365,2.0,365.0,...,3,0,0,0.10,92109,95170.0,house,0,0.998417,2.00
36,0.0,Camper/RV,Entire home/apt,0.0,,179.0,2,365,2.0,365.0,...,3,0,0,,92109,95170.0,house,0,0.998387,
53,2.0,Entire condo,Entire home/apt,1.0,,270.0,1,30,1.0,1125.0,...,2,0,0,2.33,92109,95170.0,house,1,0.975706,30.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18486,1.0,Entire guesthouse,Entire home/apt,1.0,,157.0,2,45,2.0,45.0,...,1,0,0,,92114,79917.0,house,1,0.979519,
18501,1.0,Tiny home,Entire home/apt,1.0,,65.0,2,1125,2.0,1125.0,...,1,0,0,,92114,79917.0,house,0,0.991848,
18508,1.0,Entire guest suite,Entire home/apt,1.0,,75.0,2,40,2.0,40.0,...,1,0,0,1.88,92114,79917.0,house,1,0.997124,283.86
18519,1.0,Entire guest suite,Entire home/apt,1.0,,78.0,2,1125,2.0,1125.0,...,1,0,0,2.42,92114,79917.0,house,1,0.005117,511.98
