# San Franciso

In [1]:
import pandas as pd
import numpy as np

In [2]:
listings_detailed = pd.read_csv('usa/San Francisco/listings_detailed.csv')

In [3]:
listings_detailed.shape

(6936, 75)

# Delete Unnecessary Columns

In [4]:
# Columns you want to keep
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location', 
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Dropping the columns that are not in 'columns_to_keep'
listings_detailed = listings_detailed[columns_to_keep]

In [5]:
listings_detailed.shape

(6936, 29)

In [6]:
# Save as a new csv file
listings_detailed.to_csv('listings_detailed_clean_sf.csv', index=False)

# Remove Entries where 'price' and 'accomodates' is zero / NA

## Pre-processing on 'price' column

In [7]:
# Remove the dollar sign from the price column and change the type to float
listings_detailed['price'] = listings_detailed['price'].replace('[\$,]', '', regex=True).astype(float)

In [8]:
# Test: Correct filtering expression
filtered_df = listings_detailed[listings_detailed['price'] == 10]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count


In [9]:
price_zero_or_na_count = listings_detailed['price'].isnull().sum() + (listings_detailed['price'] == 0).sum()
print(price_zero_or_na_count)


3


In [10]:
# Filter listings where price is either null or equals 0
listings_detailed.loc[listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
4519,48693144,392829961,Emily,2021-03-16,,,,0%,,4,...,0,,,,,"[""Bed sheets and pillows"", ""Free wifi"", ""ROIL ...",0.0,0,,1
4690,48973911,394984266,Hotel,2021-03-31,,within an hour,100%,100%,,5,...,0,,,,,"[""Toiletries"", ""Free wifi"", ""Limited housekeep...",0.0,10,4.2,1
5450,52498132,2577011,Sarah,2012-06-07,"San Francisco, CA",within an hour,100%,71%,t,2,...,2,,1 private bath,1.0,1.0,"[""Essentials"", ""Carbon monoxide alarm"", ""Share...",0.0,3,5.0,2


In [11]:
# Filter listings where accommodates is either null or equals 0
listings_detailed.loc[listings_detailed['accommodates'].isnull() | (listings_detailed['accommodates'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
4519,48693144,392829961,Emily,2021-03-16,,,,0%,,4,...,0,,,,,"[""Bed sheets and pillows"", ""Free wifi"", ""ROIL ...",0.0,0,,1
4690,48973911,394984266,Hotel,2021-03-31,,within an hour,100%,100%,,5,...,0,,,,,"[""Toiletries"", ""Free wifi"", ""Limited housekeep...",0.0,10,4.2,1


## Create Function to Identify the Zero/NA values and Remove them

In [12]:
def clean_listings(df):
    """
    Removes listings from the dataframe where price is zero/NA or accommodates is zero/NA.
    Counts the number of listings removed based on these criteria.
    
    Parameters:
    - df: A pandas DataFrame with the listings data.
    
    Returns:
    - cleaned_df: The cleaned DataFrame with the specified entries removed.
    - count_removed: The number of listings removed based on the criteria.
    """
    
    # Convert price from string to float
    df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
    
    # Find entries to remove based on conditions
    to_remove = df[(df['price'] == 0) | (df['price'].isna()) | (df['accommodates'] == 0) | (df['accommodates'].isna())]
    
    # Count the number of listings to remove
    count_removed = len(to_remove)

    print(f"Number of listings removed: {count_removed}")
    
    # Remove the entries from the DataFrame
    cleaned_df = df.drop(to_remove.index)
    
    return cleaned_df


In [13]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 3


In [14]:
cleaned_listings.shape

(6933, 29)

# Count 'amenities'

In [15]:
import ast
def count_amenities(amenities_str):
    try:
        # Convert the string representation of the list back into a list
        amenities_list = ast.literal_eval(amenities_str)
        # Return the count of items in the list
        return len(amenities_list)
    except (ValueError, SyntaxError):
        # In case of any error during conversion, return 0 (or you may choose to return NaN)
        return 0

# Apply the function to each row in the 'amenities' column and create a new column 'amenities_count'
cleaned_listings['amenities_count'] = cleaned_listings['amenities'].apply(count_amenities)

In [16]:
cleaned_listings['amenities_count']

0       51
1       15
2       21
3       35
4       35
        ..
6931    44
6932    42
6933    23
6934    13
6935    14
Name: amenities_count, Length: 6933, dtype: int64

# Add 'city' column

In [17]:
# Add in a city column to each dataframe so that we can use this as part of the primary key/use it to conduct groupby for
# future EDA or additional analysis as final table will contain all of our listings data
cleaned_listings['city'] = 'San Francisco'

# Impute 'bathroom_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [18]:
cleaned_listings['bathrooms_text'].unique()

array(['1 bath', '4 shared baths', '1.5 baths', '1 private bath',
       '1.5 shared baths', '2 baths', nan, '3 shared baths', 'Half-bath',
       '2 shared baths', '2.5 baths', '3 baths', '1 shared bath',
       '3.5 baths', 'Shared half-bath', '0 shared baths',
       'Private half-bath', '0 baths', '10 shared baths', '10 baths',
       '4.5 baths', '4 baths', '2.5 shared baths', '5 shared baths',
       '8 shared baths', '6 shared baths', '3.5 shared baths', '5 baths',
       '6.5 shared baths', '5.5 baths', '6.5 baths', '8.5 baths',
       '6 baths'], dtype=object)

In [19]:
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].astype(str)
# Convert 'bathrooms_text' column to strings, treating NaN as empty strings
cleaned_listings['bathrooms_text'] = listings_detailed['bathrooms_text'].fillna('').astype(str)
cleaned_listings
# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [20]:
# Replace specific strings in 'bathrooms_text' column
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace({
    'Half-bath': '0.5 baths',
    'Private half-bath': '0.5 baths',
    'Shared half-bath': '0.5 baths'
})

# Check the unique values to verify the changes
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['1 bath' '4 shared baths' '1.5 baths' '1 private bath' '1.5 shared baths'
 '2 baths' '' '3 shared baths' '0.5 baths' '2 shared baths' '2.5 baths'
 '3 baths' '1 shared bath' '3.5 baths' '0 shared baths' '0 baths'
 '10 shared baths' '10 baths' '4.5 baths' '4 baths' '2.5 shared baths'
 '5 shared baths' '8 shared baths' '6 shared baths' '3.5 shared baths'
 '5 baths' '6.5 shared baths' '5.5 baths' '6.5 baths' '8.5 baths'
 '6 baths']


In [21]:
count_half_baths = (cleaned_listings['bathrooms_text'] == '0.5 baths').sum()
count_half_baths

15

### Check for the Nan for bathrooms_text

In [22]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city
11,144978,216682,Jay,2010-08-29,"San Francisco, CA",within a few hours,100%,100%,t,9,...,,1.0,1.0,"[""Essentials"", ""Bed linens"", ""Carbon monoxide ...",55.0,121,4.67,9,19,San Francisco
70,596042,216682,Jay,2010-08-29,"San Francisco, CA",within a few hours,100%,100%,t,9,...,,1.0,1.0,"[""Oven"", ""Cooking basics"", ""Host greets you"", ...",60.0,101,4.7,9,24,San Francisco
162,745957,576273,Darren,2011-05-10,"San Anselmo, CA",,,,f,3,...,,1.0,1.0,"[""Washer"", ""Dryer"", ""Wifi"", ""Heating"", ""Kitchen""]",500.0,0,,3,5,San Francisco
163,747167,576273,Darren,2011-05-10,"San Anselmo, CA",,,,f,3,...,,1.0,1.0,"[""Washer"", ""Dryer"", ""Wifi"", ""Heating"", ""Kitchen""]",600.0,0,,3,5,San Francisco
284,1031899,5678214,Glenn,2013-03-29,"San Francisco, CA",within a few hours,100%,100%,f,2,...,,1.0,,"[""Dryer"", ""Dedicated workspace"", ""Kitchen""]",110.0,7,4.86,2,3,San Francisco
289,1273394,4836128,Vivian,2013-01-24,"Kentfield, CA",within an hour,100%,,f,2,...,,1.0,,"[""Courtyard view"", ""TV"", ""Free parking on prem...",500.0,0,,2,10,San Francisco
317,1094764,172460,Leila,2010-07-19,"Raleigh, NC",within a few hours,100%,54%,f,2,...,,1.0,1.0,[],150.0,1,5.0,2,0,San Francisco
4496,48327527,369574451,Angela,2020-09-27,"San Francisco, CA",a few days or more,0%,0%,f,1,...,,1.0,1.0,"[""Cooking basics"", ""Shampoo"", ""TV"", ""Free park...",47.0,1,5.0,1,28,San Francisco
4723,50169742,131200418,Lois,2017-05-20,"San Francisco, CA",within an hour,100%,95%,t,49,...,,1.0,1.0,"[""Oven"", ""Self check-in"", ""Luggage dropoff all...",55.0,7,4.71,25,35,San Francisco
5296,52426464,420182099,Sam,2021-08-25,"San Francisco, CA",within an hour,100%,100%,t,10,...,,1.0,1.0,"[""Cooking basics"", ""Shampoo"", ""TV"", ""Dining ta...",43.0,4,5.0,10,32,San Francisco


In [23]:
empty_bathrooms_text_ids = cleaned_listings[cleaned_listings['bathrooms_text'] == '']['id']
print(len(empty_bathrooms_text_ids))

12


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms'
2. if the 'bedrooms' column is also NA, fill in NAs with 1/'beds' column

In [24]:
def fill_missing_bathrooms(df):
    """
    Fills missing or empty 'bathrooms_text' values in the DataFrame.
    - Uses the 'bedrooms' value if available.
    - If 'bedrooms' is also missing, uses half the 'beds' value.
    """
    # Loop through each row in DataFrame
    for index, row in df.iterrows():
        # Check if 'bathrooms_text' is NaN or an empty string
        if pd.isnull(row['bathrooms_text']) or row['bathrooms_text'] == '':
            # Use 'bedrooms' if not NaN and not 0
            if not pd.isnull(row['bedrooms']) and row['bedrooms'] != 0:
                df.at[index, 'bathrooms_text'] = str(row['bedrooms']) + ' bath'
            # If 'bedrooms' is NaN or 0, use half the 'beds' value
            elif not pd.isnull(row['beds']) and row['beds'] != 0:
                # Calculate half the beds and format it to match the bathroom text style
                half_beds_as_baths = row['beds'] / 2
                df.at[index, 'bathrooms_text'] = str(half_beds_as_baths) + ' bath'
            # If both 'bedrooms' and 'beds' are not available, set a default value
            else:
                df.at[index, 'bathrooms_text'] = '1 bath' # Default value or consider another logic
                
    return df


In [25]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [26]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
0,958,1.0,2.0,1 bath
1,5858,2.0,2.0,1 bath
2,8142,1.0,1.0,4 shared baths
3,8339,2.0,2.0,1.5 baths
4,8739,1.0,1.0,1 private bath
...,...,...,...,...
6931,818195233784141985,1.0,1.0,1 bath
6932,818231243893035313,1.0,1.0,1 bath
6933,818396267693629070,1.0,1.0,1 shared bath
6934,818474518435984613,1.0,1.0,1 shared bath


In [27]:
# # check for the previous NA rows in bathrooms text
filled_cleaned_listings[filled_cleaned_listings['id'].isin(empty_bathrooms_text_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city
11,144978,216682,Jay,2010-08-29,"San Francisco, CA",within a few hours,100%,100%,t,9,...,1.0 bath,1.0,1.0,"[""Essentials"", ""Bed linens"", ""Carbon monoxide ...",55.0,121,4.67,9,19,San Francisco
70,596042,216682,Jay,2010-08-29,"San Francisco, CA",within a few hours,100%,100%,t,9,...,1.0 bath,1.0,1.0,"[""Oven"", ""Cooking basics"", ""Host greets you"", ...",60.0,101,4.7,9,24,San Francisco
162,745957,576273,Darren,2011-05-10,"San Anselmo, CA",,,,f,3,...,1.0 bath,1.0,1.0,"[""Washer"", ""Dryer"", ""Wifi"", ""Heating"", ""Kitchen""]",500.0,0,,3,5,San Francisco
163,747167,576273,Darren,2011-05-10,"San Anselmo, CA",,,,f,3,...,1.0 bath,1.0,1.0,"[""Washer"", ""Dryer"", ""Wifi"", ""Heating"", ""Kitchen""]",600.0,0,,3,5,San Francisco
284,1031899,5678214,Glenn,2013-03-29,"San Francisco, CA",within a few hours,100%,100%,f,2,...,1.0 bath,1.0,,"[""Dryer"", ""Dedicated workspace"", ""Kitchen""]",110.0,7,4.86,2,3,San Francisco
289,1273394,4836128,Vivian,2013-01-24,"Kentfield, CA",within an hour,100%,,f,2,...,1.0 bath,1.0,,"[""Courtyard view"", ""TV"", ""Free parking on prem...",500.0,0,,2,10,San Francisco
317,1094764,172460,Leila,2010-07-19,"Raleigh, NC",within a few hours,100%,54%,f,2,...,1.0 bath,1.0,1.0,[],150.0,1,5.0,2,0,San Francisco
4496,48327527,369574451,Angela,2020-09-27,"San Francisco, CA",a few days or more,0%,0%,f,1,...,1.0 bath,1.0,1.0,"[""Cooking basics"", ""Shampoo"", ""TV"", ""Free park...",47.0,1,5.0,1,28,San Francisco
4723,50169742,131200418,Lois,2017-05-20,"San Francisco, CA",within an hour,100%,95%,t,49,...,1.0 bath,1.0,1.0,"[""Oven"", ""Self check-in"", ""Luggage dropoff all...",55.0,7,4.71,25,35,San Francisco
5296,52426464,420182099,Sam,2021-08-25,"San Francisco, CA",within an hour,100%,100%,t,10,...,1.0 bath,1.0,1.0,"[""Cooking basics"", ""Shampoo"", ""TV"", ""Dining ta...",43.0,4,5.0,10,32,San Francisco


Note: Looking back to the oringinal data original data, the entries with id '49229073' was deleted in the begging as it had 0 accomodates 

### Function to make numerical values for # of bath

In [28]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process.
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    # Extract numerical part from the specified column and convert to float
    df['num_bath'] = df[column_name].str.extract('([0-9]+\.?[0-9]*)').astype(float)
    return df

In [29]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

      bathrooms_text  num_bath
0             1 bath       1.0
1             1 bath       1.0
2     4 shared baths       4.0
3          1.5 baths       1.5
4     1 private bath       1.0
...              ...       ...
6931          1 bath       1.0
6932          1 bath       1.0
6933   1 shared bath       1.0
6934   1 shared bath       1.0
6935          1 bath       1.0

[6933 rows x 2 columns]


In [30]:
updated_df['num_bath'].isnull().sum()

0

In [31]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [32]:
updated_df['num_bath'].unique()

array([ 1. ,  4. ,  1.5,  2. ,  3. ,  0.5,  2.5,  3.5,  0. , 10. ,  4.5,
        5. ,  8. ,  6. ,  6.5,  5.5,  8.5])

In [33]:
# delete bathrooms column
updated_df.drop('bathrooms', axis=1, inplace=True)

In [34]:
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
0,958,1169,Holly,2008-07-31,"San Francisco, CA",within an hour,100%,100%,t,1,...,1.0,2.0,"[""Oven"", ""Self check-in"", ""Cooking basics"", ""L...",202.0,383,4.76,1,51,San Francisco,1.0
1,5858,8904,Philip And Tania,2009-03-02,"San Francisco, CA",within a few hours,80%,71%,f,2,...,2.0,2.0,"[""Essentials"", ""Carbon monoxide alarm"", ""Hange...",235.0,111,4.68,1,15,San Francisco,1.0
2,8142,21994,Aaron,2009-06-17,"San Francisco, CA",within an hour,100%,73%,f,13,...,1.0,1.0,"[""Security cameras on property"", ""Host greets ...",56.0,9,4.67,13,21,San Francisco,4.0
3,8339,24215,Rosy,2009-07-02,"San Francisco, CA",within a few hours,100%,0%,f,2,...,2.0,2.0,"[""Oven"", ""Cooking basics"", ""TV"", ""Smoke alarm""...",575.0,28,4.75,2,35,San Francisco,1.5
4,8739,7149,Ivan & Wendy,2009-01-27,"San Francisco, CA",within an hour,100%,90%,t,2,...,1.0,1.0,"[""Paid parking garage off premises"", ""Luggage ...",110.0,770,4.74,2,35,San Francisco,1.0


# Impute Bedrooms and Beds

In [35]:
updated_df['bedrooms'].unique()

array([ 1.,  2.,  3., nan,  4.,  5.,  6.,  7.,  8.,  9., 11.])

In [36]:
updated_df['beds'].unique()

array([ 2.,  1.,  3.,  6.,  5.,  4.,  7., nan,  8., 10., 12.,  9., 16.,
       11.])

In [37]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
10,10578,37049,Andrew,2009-09-08,"San Francisco, CA",within a day,100%,,f,1,...,,1.0,"[""Essentials"", ""Carbon monoxide alarm"", ""Hange...",79.0,18,4.75,1,14,San Francisco,1.0
20,504146,2486135,Laura,2012-05-28,"San Francisco, CA",within a few hours,100%,93%,t,1,...,,2.0,"[""Self check-in"", ""Luggage dropoff allowed"", ""...",295.0,111,4.78,1,39,San Francisco,1.0
28,18231,70224,Arlene Helfand,2010-01-13,"San Francisco, CA",within a few hours,100%,60%,f,1,...,,2.0,"[""Oven"", ""Cooking basics"", ""Shampoo"", ""Host gr...",95.0,69,4.75,1,31,San Francisco,1.0
34,175090,127367,Michelle,2010-05-18,"Sonoma, CA",within a few hours,100%,87%,f,8,...,,2.0,"[""Self check-in"", ""Cooking basics"", ""Luggage d...",120.0,26,4.31,6,26,San Francisco,1.0
35,175102,127367,Michelle,2010-05-18,"Sonoma, CA",within a few hours,100%,87%,f,8,...,,2.0,"[""Oven"", ""Self check-in"", ""Cooking basics"", ""L...",130.0,54,4.49,6,30,San Francisco,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6807,808622297875325704,389697794,David,2021-02-22,"North Myrtle Beach, SC",within an hour,99%,70%,t,795,...,,2.0,"[""Self check-in"", ""Cooking basics"", ""Luggage d...",243.0,0,,25,32,San Francisco,1.0
6808,808622402293119774,389697794,David,2021-02-22,"North Myrtle Beach, SC",within an hour,99%,70%,t,795,...,,2.0,"[""Self check-in"", ""Cooking basics"", ""Luggage d...",243.0,0,,25,32,San Francisco,1.0
6852,831285327472387448,415840703,Anyplace,2021-07-29,"San Francisco, CA",within an hour,100%,80%,t,66,...,,1.0,"[""Free resort access"", ""Self check-in"", ""Cooki...",135.0,0,,24,62,San Francisco,1.0
6853,831287715236252567,415840703,Anyplace,2021-07-29,"San Francisco, CA",within an hour,100%,80%,t,66,...,,1.0,"[""Free resort access"", ""Self check-in"", ""Cooki...",135.0,0,,24,61,San Francisco,1.0


In [38]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
253,1201159,6561127,Omid,2013-05-24,"San Francisco, CA",within a few hours,100%,100%,t,1,...,,,"[""Oven"", ""Cooking basics"", ""Luggage dropoff al...",80.0,44,4.70,1,36,San Francisco,1.0
284,1031899,5678214,Glenn,2013-03-29,"San Francisco, CA",within a few hours,100%,100%,f,2,...,1.0,,"[""Dryer"", ""Dedicated workspace"", ""Kitchen""]",110.0,7,4.86,2,3,San Francisco,1.0
289,1273394,4836128,Vivian,2013-01-24,"Kentfield, CA",within an hour,100%,,f,2,...,1.0,,"[""Courtyard view"", ""TV"", ""Free parking on prem...",500.0,0,,2,10,San Francisco,1.0
356,137129,639671,Mark,2011-05-29,"San Francisco, CA",within a few hours,100%,38%,f,2,...,3.0,,"[""Oven"", ""Cooking basics"", ""Dining table"", ""Sm...",695.0,2,4.50,1,36,San Francisco,2.5
371,1531273,104075,Jeanne,2010-04-06,"San Francisco, CA",,,,f,1,...,1.0,,"[""Oven"", ""Self check-in"", ""Cooking basics"", ""L...",235.0,104,4.55,1,30,San Francisco,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5528,52759127,157999191,Berlin,2017-11-09,"Lafayette, CA",within an hour,100%,98%,f,24,...,1.0,,"[""Essentials"", ""Carbon monoxide alarm"", ""Hange...",55.0,5,3.40,24,17,San Francisco,1.0
5973,700262606800867046,17213536,Kris,2014-06-24,"Carlsbad, CA",within a day,100%,,f,1,...,1.0,,"[""Elevator"", ""Washer"", ""Wifi""]",100.0,0,,1,3,San Francisco,0.0
6782,827557831626079015,219930816,Sonder (San Francisco),2018-10-10,"San Francisco, CA",within an hour,100%,99%,f,18,...,,,"[""Essentials"", ""Bed linens"", ""Coffee maker"", ""...",289.0,0,,18,16,San Francisco,1.0
6784,827563145102538925,219930816,Sonder (San Francisco),2018-10-10,"San Francisco, CA",within an hour,100%,99%,f,18,...,,,"[""Essentials"", ""Bed linens"", ""Coffee maker"", ""...",222.0,0,,18,16,San Francisco,1.0


In [39]:
def update_bedrooms_and_beds(df):
    """
    Updates the 'bedrooms' and 'beds' columns based on the conditions:
    1. If 'bedrooms' is null, set it to the rounded up value of 'num_bath'.
    2. If 'beds' is null, copy the value from 'bedrooms'.
    """
    # Rule 1: If bedrooms is NaN, round up 'num_bath' to nearest integer and assign to 'bedrooms'
    df.loc[df['bedrooms'].isnull(), 'bedrooms'] = np.ceil(df['num_bath'])
    
    # Rule 2: If beds is NaN, copy the value from 'bedrooms'
    df.loc[df['beds'].isnull(), 'beds'] = df['bedrooms']
    
    return df


In [40]:
updated_df = update_bedrooms_and_beds(updated_df)


In [41]:
# check for the previous NA rows in bedrooms
updated_df[updated_df['id'].isin(empty_bathrooms_text_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
11,144978,216682,Jay,2010-08-29,"San Francisco, CA",within a few hours,100%,100%,t,9,...,1.0,1.0,"[""Essentials"", ""Bed linens"", ""Carbon monoxide ...",55.0,121,4.67,9,19,San Francisco,1.0
70,596042,216682,Jay,2010-08-29,"San Francisco, CA",within a few hours,100%,100%,t,9,...,1.0,1.0,"[""Oven"", ""Cooking basics"", ""Host greets you"", ...",60.0,101,4.7,9,24,San Francisco,1.0
162,745957,576273,Darren,2011-05-10,"San Anselmo, CA",,,,f,3,...,1.0,1.0,"[""Washer"", ""Dryer"", ""Wifi"", ""Heating"", ""Kitchen""]",500.0,0,,3,5,San Francisco,1.0
163,747167,576273,Darren,2011-05-10,"San Anselmo, CA",,,,f,3,...,1.0,1.0,"[""Washer"", ""Dryer"", ""Wifi"", ""Heating"", ""Kitchen""]",600.0,0,,3,5,San Francisco,1.0
284,1031899,5678214,Glenn,2013-03-29,"San Francisco, CA",within a few hours,100%,100%,f,2,...,1.0,1.0,"[""Dryer"", ""Dedicated workspace"", ""Kitchen""]",110.0,7,4.86,2,3,San Francisco,1.0
289,1273394,4836128,Vivian,2013-01-24,"Kentfield, CA",within an hour,100%,,f,2,...,1.0,1.0,"[""Courtyard view"", ""TV"", ""Free parking on prem...",500.0,0,,2,10,San Francisco,1.0
317,1094764,172460,Leila,2010-07-19,"Raleigh, NC",within a few hours,100%,54%,f,2,...,1.0,1.0,[],150.0,1,5.0,2,0,San Francisco,1.0
4496,48327527,369574451,Angela,2020-09-27,"San Francisco, CA",a few days or more,0%,0%,f,1,...,1.0,1.0,"[""Cooking basics"", ""Shampoo"", ""TV"", ""Free park...",47.0,1,5.0,1,28,San Francisco,1.0
4723,50169742,131200418,Lois,2017-05-20,"San Francisco, CA",within an hour,100%,95%,t,49,...,1.0,1.0,"[""Oven"", ""Self check-in"", ""Luggage dropoff all...",55.0,7,4.71,25,35,San Francisco,1.0
5296,52426464,420182099,Sam,2021-08-25,"San Francisco, CA",within an hour,100%,100%,t,10,...,1.0,1.0,"[""Cooking basics"", ""Shampoo"", ""TV"", ""Dining ta...",43.0,4,5.0,10,32,San Francisco,1.0


In [42]:
updated_df['beds'].unique()

array([ 2.,  1.,  3.,  6.,  5.,  4.,  7.,  8., 10., 12.,  0.,  9., 16.,
       11.])

In [43]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [44]:
updated_df['bedrooms'].unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  0.,  8.,  9., 11.])

In [45]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_since

In [46]:
# check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%B %Y")
updated_df['host_since'].fillna(most_recent_date, inplace=True)


There are 0 missing values in the "host_since" column.


In [47]:
updated_df['host_since'].isnull().sum()

0

In [48]:
updated_df['host_since'].unique()

array(['2008-07-31', '2009-03-02', '2009-06-17', ..., '2023-01-31',
       '2023-02-03', '2023-02-02'], dtype=object)

In [49]:
updated_df[updated_df['host_since'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_location

In [50]:
# check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'
updated_df['host_location'].fillna("unknown", inplace=True)

There are 1219 missing values in the "host_location" column.


In [51]:
updated_df['host_location'].isnull().sum()

0

In [52]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

1219

In [53]:
updated_df['host_location'].unique()

array(['San Francisco, CA', 'Palm Springs, CA', 'Austin, TX',
       'Sonoma, CA', 'Santa Fe, NM', 'Dublin, CA', 'Mariposa, CA',
       'United States', 'San Anselmo, CA', 'California, United States',
       'Los Angeles, CA', 'Castro Valley, CA', 'unknown',
       'Connecticut, United States', 'Seattle, WA', 'Kentfield, CA',
       'Raleigh, NC', 'Berkeley, CA', 'Sausalito, CA', 'Oakland, CA',
       'Chicago, IL', 'Palo Alto, CA', 'Portland, OR', 'Holland, MI',
       'Napa, CA', 'Sacramento, CA', 'Bozeman, MT', 'Portola Valley, CA',
       'Bali, Indonesia', 'Wilson, WY', 'Belmont, CA', 'Menlo Park, CA',
       'Reno, NV', 'New York, NY', 'Troy, NY', 'Redwood City, CA',
       'New Orleans, LA', 'Bellevue, WA', 'Atherton, CA',
       'New York, United States', 'Honolulu, HI', 'Denver, CO',
       'Auckland, New Zealand', 'Santa Barbara, CA', 'Boulder, CO',
       'San Diego, CA', 'Brisbane, CA', 'Danville, CA', 'Penn Valley, CA',
       'Discovery Bay, CA', 'Aptos, CA', 'Kansas City

In [54]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [55]:
updated_df[updated_df['host_location'] ==' ']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_is_superhost

In [56]:
# check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

updated_df['host_is_superhost'].fillna("f", inplace=True)

There are 7 missing values in the "host_is_superhost" column.


In [57]:
updated_df['host_is_superhost'].isnull().sum()

0

In [58]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

4065

In [59]:
updated_df['host_is_superhost'].unique()

array(['t', 'f'], dtype=object)

# host_listings_count

In [60]:
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

updated_df['host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "host_listings_count" column.


In [61]:
updated_df['host_listings_count'].isnull().sum()

0

In [62]:
updated_df['host_listings_count'].unique()

array([   1,    2,   13,    9,    5,    6,    7,    8,    4,    3,   36,
         31,   11,   22,   57,   20,   10,   46,   18,   16,   17,   79,
        119,   19,   40,   29,   91,  120,  154,   25,   49,   12, 1512,
         15,  249,   43,   14,   21,  489,   33,   32,   61,  130, 4774,
         27,   50,   41,   77,   23,   73,  421,   44,  658,   24,   26,
         37,   67,   68,   51,  101,  226,  245,  168,  197,  636,  289,
        521,   47,   39,   66,  368,  173,  192,  172, 3282,  152,   34,
        430,  374,   95,   38,   42,  273, 2600,   30, 2508,  279,  165,
         64,  795,   28])

# host_total_listings_count

In [63]:
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

updated_df['host_total_listings_count'].fillna(1, inplace=True)


There are 0 missing values in the "host_total_listings_count" column.


In [64]:
updated_df['host_total_listings_count'].isnull().sum()

0

In [65]:
updated_df['host_total_listings_count'].unique()

array([   1,    2,   14,    6,    3,    9,    5,   12,   17,    4,   70,
         13,    8,   15,    7,   48,   56,   11,   10,   20,   65,   81,
         16,   61,  129,   62,   18,   33,   66,   87,  122,   31,   34,
        113,  138,  154,   37,   92, 1962,   25,   26,   30,   45,  403,
         51,   40,   75,  918,   46,   50,   63,  205,   21,  169, 5310,
         38,   27,   24,   28,   77,   19,   90,   22,  397,   55,  128,
         83,  105,   39,  123,  570,   44,  709,   59,   86,   94,   97,
        979,  288,   43,  177,   47,  211,  736,  415,  840,  207,  838,
        664,  222, 4522,  118,   52,  323,   29,  179,  770,  375,  108,
         64,  236,  163,  374, 8278, 2750,  290,  167,   58,   68,  841,
        252])

# host_verifications

In [66]:
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

updated_df['host_verifications'].fillna("None", inplace=True)


There are 0 missing values in the "host_verifications" column.


In [67]:
updated_df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['phone', 'work_email']", "['email']"], dtype=object)

In [68]:
updated_df[updated_df['host_verifications'] == '[]']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [69]:
# Replace '[]' with None in 'host_verifications' column
updated_df['host_verifications'] = updated_df['host_verifications'].replace('[]', "None")


In [70]:
updated_df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['phone', 'work_email']", "['email']"], dtype=object)

# host_identity_verified

In [71]:
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

updated_df['host_identity_verified'].fillna("f", inplace=True)


There are 0 missing values in the "host_identity_verified" column.


In [72]:
updated_df['host_identity_verified'].unique()

array(['t', 'f'], dtype=object)

# calculated_host_listings_count

In [73]:
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

updated_df['calculated_host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "calculated_host_listings_count" column.


In [74]:
updated_df['calculated_host_listings_count'].unique()

array([  1,  13,   2,   9,   5,   6,   3,  28,  22,   4,  29,   7,  17,
        10,  46,  12,  16,  25,  19,  15,  14,  27,  43, 154,  24,  31,
        20,   8,  61,  41,  38, 138,  21,  11,  18,  55,  23,  36,  26])

# host_name

In [75]:
updated_df[updated_df['host_name'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_has_profile_pic

In [76]:
updated_df[updated_df['host_has_profile_pic'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [77]:
updated_df['host_has_profile_pic'].unique()

array(['t', 'f'], dtype=object)

In [78]:
na = updated_df['host_has_profile_pic'].isnull().sum()
print(f'There are {na} missing values in the "host_has_profile_pic" column.')

updated_df['host_has_profile_pic'].fillna('f', inplace=True)

There are 0 missing values in the "host_has_profile_pic" column.


# Neighborhood

In [79]:
updated_df['neighbourhood'].unique()

array(['San Francisco, California, United States', nan,
       'san francisco, California, United States',
       'San Francisco, Hayes Valley, California, United States',
       'Noe Valley - San Francisco, California, United States',
       'San Francisco , Ca, United States',
       'Tiburon, California, United States',
       'Pacifica, California, United States'], dtype=object)

In [80]:
updated_df['neighbourhood'].isnull().sum()

2084

In [81]:
updated_df['neighbourhood_cleansed'].unique()

array(['Western Addition', 'Bernal Heights', 'Haight Ashbury', 'Mission',
       'Outer Richmond', 'Bayview', 'Inner Richmond', 'Nob Hill',
       'Pacific Heights', 'Outer Mission', 'Diamond Heights', 'Marina',
       'Financial District', 'Twin Peaks', 'Castro/Upper Market',
       'Russian Hill', 'Noe Valley', 'Chinatown', 'Glen Park',
       'North Beach', 'Downtown/Civic Center', 'Excelsior',
       'Potrero Hill', 'Inner Sunset', 'Parkside', 'Outer Sunset',
       'South of Market', 'Lakeshore', 'West of Twin Peaks',
       'Presidio Heights', 'Ocean View', 'Visitacion Valley',
       'Crocker Amazon', 'Presidio', 'Seacliff'], dtype=object)

### We are going to drop neighbourhood column for SF Dataset

In [82]:
updated_df.drop('neighbourhood', axis=1, inplace=True)

# Final look into missing values

In [83]:
updated_df.isnull().sum()

id                                   0
host_id                              0
host_name                            0
host_since                           0
host_location                        0
host_response_time                1018
host_response_rate                1018
host_acceptance_rate               801
host_is_superhost                    0
host_listings_count                  0
host_total_listings_count            0
host_verifications                   0
host_has_profile_pic                 0
host_identity_verified               0
neighbourhood_cleansed               0
latitude                             0
longitude                            0
room_type                            0
accommodates                         0
bathrooms_text                       0
bedrooms                             0
beds                                 0
amenities                            0
price                                0
number_of_reviews                    0
review_scores_value      

# Save the Final Dataframe

In [85]:
updated_df.to_csv('data/listings_detailed_after_na_sf.csv', index=False)