# Seattle

In [1]:
import pandas as pd
import numpy as np

In [2]:
# calendar = pd.read_csv('usa/Seattle/calendar.csv')
# listings = pd.read_csv('usa/Seattle/listings.csv')
listings_detailed = pd.read_csv('usa/Seattle/listings_detailed.csv')

In [3]:
listings_detailed.shape

(6376, 75)

# Delete Unnecessary Columns

In [4]:
# Columns you want to keep
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location',
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Keep only the columns listed in 'columns_to_keep'.
# .copy() makes the result an independent frame, so the column assignments made
# later in the notebook do not operate on a slice of the original data.
listings_detailed = listings_detailed[columns_to_keep].copy()

In [5]:
listings_detailed.shape

(6376, 29)

In [6]:
# Save as a new csv file
# NOTE(review): this file is written right after the column selection, before the
# price/accommodates filtering and the imputations further down, so despite the
# "clean" in its name it contains the unfiltered rows - confirm this is intended.
listings_detailed.to_csv('listings_detailed_clean_seattle.csv', index=False)

# Remove Entries where 'price' or 'accommodates' is zero / NA

## Pre-processing on 'price' column

In [7]:
# Remove the dollar sign and thousands separators from the price column and change the type to float.
# The pattern is a raw string: '\$' inside a normal string literal is an invalid escape
# sequence, which newer Python versions warn about.
listings_detailed['price'] = listings_detailed['price'].replace(r'[\$,]', '', regex=True).astype(float)

In [8]:
# Test: Correct filtering expression
filtered_df = listings_detailed[listings_detailed['price'] == 10]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
1944,34505734,60645407,Alyssa,2016-02-27,"Seattle, WA",within an hour,75%,93%,t,1,...,6,,1 bath,2.0,2.0,"[""Long term stays allowed"", ""Washer"", ""Dryer"",...",10.0,96,4.85,1


In [9]:
# Treat a missing price as 0 so one comparison counts both the NA and the zero entries
price_zero_or_na_count = listings_detailed['price'].fillna(0).eq(0).sum()
print(price_zero_or_na_count)


1


In [10]:
# Filter listings where price is either null or equals 0
listings_detailed.loc[listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
2920,49229073,267653164,Hotel Theodore,2019-06-10,"Seattle, WA",within an hour,100%,88%,,14,...,0,,,,,"[""Air conditioning"", ""Crib"", ""First aid kit"", ...",0.0,179,4.67,1


In [11]:
# Filter listings where accommodates is either null or equals 0
listings_detailed.loc[listings_detailed['accommodates'].isnull() | (listings_detailed['accommodates'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
2920,49229073,267653164,Hotel Theodore,2019-06-10,"Seattle, WA",within an hour,100%,88%,,14,...,0,,,,,"[""Air conditioning"", ""Crib"", ""First aid kit"", ...",0.0,179,4.67,1


## Create Function to Identify the Zero/NA values and Remove them

In [12]:
def clean_listings(df):
    """
    Removes listings where price is zero/NA or accommodates is zero/NA.

    The input DataFrame is not modified. The number of removed listings is
    printed (it is not returned).

    Parameters:
    - df: A pandas DataFrame with the listings data. Must contain the columns
      'price' (currency strings such as "$1,250.00", or already numeric) and
      'accommodates'.

    Returns:
    - cleaned_df: A new DataFrame without the invalid entries, whose 'price'
      column is converted to float.
    """
    # Convert price from string to float without touching the caller's frame.
    # Raw string: '\$' in a normal string literal is an invalid escape sequence.
    price = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # Flag entries to remove based on the conditions
    invalid = (
        price.isna() | (price == 0)
        | df['accommodates'].isna() | (df['accommodates'] == 0)
    )

    # Count the number of listings to remove
    count_removed = int(invalid.sum())

    print(f"Number of listings removed: {count_removed}")

    # Filter by position (boolean mask) rather than by index label, so rows that
    # happen to share an index label with an invalid row are not dropped too.
    cleaned_df = df.loc[~invalid].copy()
    cleaned_df['price'] = price.loc[~invalid]

    return cleaned_df


In [13]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 1


In [14]:
cleaned_listings.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
0,6606,14942,Joyce,2009-04-26,"Seattle, WA",within a few hours,100%,94%,f,5,...,1,,1 bath,1.0,1.0,"[""Air conditioning"", ""Hair dryer"", ""Bed linens...",99.0,159,4.57,3
1,9419,30559,Angielena,2009-08-09,"Seattle, WA",within an hour,100%,98%,t,9,...,2,,3 shared baths,1.0,2.0,"[""Long term stays allowed"", ""Shared patio or b...",55.0,175,4.65,9
2,9531,31481,Cassie,2009-08-13,"Seattle, WA",within a few hours,100%,77%,t,2,...,4,,1 bath,2.0,3.0,"[""Long term stays allowed"", ""Conditioner"", ""Cl...",175.0,71,4.9,2
3,9534,31481,Cassie,2009-08-13,"Seattle, WA",within a few hours,100%,77%,t,2,...,3,,1 bath,2.0,2.0,"[""Long term stays allowed"", ""Conditioner"", ""Cl...",135.0,73,4.94,2
4,9596,14942,Joyce,2009-04-26,"Seattle, WA",within a few hours,100%,94%,f,5,...,4,,1 bath,1.0,4.0,"[""Stove"", ""Hair dryer"", ""Shampoo"", ""Coffee mak...",130.0,97,4.56,3


In [15]:
cleaned_listings.shape

(6375, 29)

# Count 'amenities'

In [16]:
import ast
def count_amenities(amenities_str):
    """
    Count the amenities stored in one cell of the 'amenities' column.

    Parameters:
    - amenities_str: The string representation of a list, e.g. '["Washer", "Dryer"]'.
      An actual list/tuple/set is also accepted and counted directly.

    Returns:
    - int: Number of items in the list; 0 when the value is missing, cannot be
      parsed, or does not describe a collection of amenities.
    """
    if isinstance(amenities_str, str):
        try:
            # Convert the string representation of the list back into a list
            amenities = ast.literal_eval(amenities_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Unparseable text counts as "no amenities"
            return 0
    else:
        amenities = amenities_str

    # Only real collections are counted: a parsed number has no len(), and the
    # len() of a parsed string would be its character count, not an amenity count.
    if isinstance(amenities, (list, tuple, set)):
        return len(amenities)
    return 0

# Apply the function to each row in the 'amenities' column and create a new column 'amenities_count'
cleaned_listings['amenities_count'] = cleaned_listings['amenities'].apply(count_amenities)

In [17]:
cleaned_listings['amenities_count']

0       27
1       31
2       63
3       57
4       28
        ..
6371    54
6372    41
6373    44
6374    11
6375    51
Name: amenities_count, Length: 6375, dtype: int64

# Add 'city' column

In [18]:
# Add in a city column to each dataframe so that we can use this as part of the primary key/use it to conduct groupby for
# future EDA or additional analysis as final table will contain all of our listings data
cleaned_listings['city'] = 'Seattle'

# Impute 'bathrooms_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [19]:
cleaned_listings['bathrooms_text'].unique()

array(['1 bath', '3 shared baths', '2 baths', '1.5 baths', '2.5 baths',
       '4 shared baths', '1 shared bath', '3 baths', '1 private bath',
       '1.5 shared baths', '2 shared baths', '3.5 baths', '4 baths',
       'Half-bath', '2.5 shared baths', '3.5 shared baths',
       '0 shared baths', '4.5 baths', '16 shared baths', '0 baths',
       '6 baths', 'Private half-bath', '6.5 baths', '7 baths',
       'Shared half-bath'], dtype=object)

In [20]:
# Convert 'bathrooms_text' column to strings, treating NaN as empty strings.
# fillna('') has to run before astype(str): casting first would turn NaN into the
# literal text 'nan', which the empty-string checks further down would not catch.
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].fillna('').astype(str)
# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [21]:
# All half-bath labels carry no digit, so map each of them to the same numeric-style text
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace(
    to_replace=['Half-bath', 'Private half-bath', 'Shared half-bath'],
    value='0.5 baths',
)

# Print the distinct values that remain to confirm the labels are gone
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['1 bath' '3 shared baths' '2 baths' '1.5 baths' '2.5 baths'
 '4 shared baths' '1 shared bath' '3 baths' '1 private bath'
 '1.5 shared baths' '2 shared baths' '3.5 baths' '4 baths' '0.5 baths'
 '2.5 shared baths' '3.5 shared baths' '0 shared baths' '4.5 baths'
 '16 shared baths' '0 baths' '6 baths' '6.5 baths' '7 baths']


In [22]:
count_half_baths = (cleaned_listings['bathrooms_text'] == '0.5 baths').sum()
count_half_baths

12

### Check for NaN values in bathrooms_text

In [23]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms' value
2. If the 'bedrooms' value is also NA (or 0), fill in with half of the 'beds' value; if 'beds' is unavailable too, default to '1 bath'

In [24]:
def fill_missing_bathrooms(df):
    """
    Replace missing (NaN) or empty 'bathrooms_text' entries, modifying df in place.

    For each affected row the replacement text is chosen in this order:
    1. the 'bedrooms' value, when it is present and non-zero;
    2. half of the 'beds' value, when it is present and non-zero;
    3. the fixed default '1 bath'.

    Returns the same DataFrame object that was passed in.
    """
    def _usable(count):
        # A count can be used only when it is present and not zero
        return not (pd.isnull(count) or count == 0)

    for idx, listing in df.iterrows():
        current_text = listing['bathrooms_text']

        # Rows that already have a bathroom description are left alone
        if not (pd.isnull(current_text) or current_text == ''):
            continue

        if _usable(listing['bedrooms']):
            replacement = str(listing['bedrooms']) + ' bath'
        elif _usable(listing['beds']):
            replacement = str(listing['beds'] / 2) + ' bath'
        else:
            replacement = '1 bath'

        df.at[idx, 'bathrooms_text'] = replacement

    return df



In [25]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [26]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
0,6606,1.0,1.0,1 bath
1,9419,1.0,2.0,3 shared baths
2,9531,2.0,3.0,1 bath
3,9534,2.0,2.0,1 bath
4,9596,1.0,4.0,1 bath
...,...,...,...,...
6371,1732441,1.0,1.0,1 bath
6372,1737244,1.0,1.0,1 shared bath
6373,1764100,1.0,2.0,1 bath
6374,1773803,1.0,1.0,1 bath


In [27]:
# check for the previous NA rows in bathrooms text
filled_cleaned_listings[
    (filled_cleaned_listings['id'] == 49229073)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city


Note: Looking back at the original data, the entry with id '49229073' was deleted at the beginning because it had 0 'accommodates'.

### Function to make numerical values for # of bath

In [28]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    The first number found in each text (e.g. '2.5' in '2.5 shared baths') is used.
    Texts without any digit that mention a half bath (e.g. 'Half-bath',
    'Private half-bath') are mapped to 0.5; anything else without a digit stays NaN.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process (modified in place).
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    text = df[column_name]

    # Raw string so the escaped dot is not an invalid escape sequence in the literal;
    # expand=False returns a Series (one capture group) instead of a one-column DataFrame.
    num_bath = text.str.extract(r'([0-9]+\.?[0-9]*)', expand=False).astype(float)

    # Half-bath labels contain no digit, so the extraction above leaves them as NaN
    half_only = num_bath.isna() & text.str.contains('half', case=False, na=False)

    df['num_bath'] = num_bath.mask(half_only, 0.5)
    return df

In [29]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

      bathrooms_text  num_bath
0             1 bath       1.0
1     3 shared baths       3.0
2             1 bath       1.0
3             1 bath       1.0
4             1 bath       1.0
...              ...       ...
6371          1 bath       1.0
6372   1 shared bath       1.0
6373          1 bath       1.0
6374          1 bath       1.0
6375       1.5 baths       1.5

[6375 rows x 2 columns]


In [30]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
0,6606,14942,Joyce,2009-04-26,"Seattle, WA",within a few hours,100%,94%,f,5,...,1.0,1.0,"[""Air conditioning"", ""Hair dryer"", ""Bed linens...",99.0,159,4.57,3,27,Seattle,1.0
1,9419,30559,Angielena,2009-08-09,"Seattle, WA",within an hour,100%,98%,t,9,...,1.0,2.0,"[""Long term stays allowed"", ""Shared patio or b...",55.0,175,4.65,9,31,Seattle,3.0
2,9531,31481,Cassie,2009-08-13,"Seattle, WA",within a few hours,100%,77%,t,2,...,2.0,3.0,"[""Long term stays allowed"", ""Conditioner"", ""Cl...",175.0,71,4.90,2,63,Seattle,1.0
3,9534,31481,Cassie,2009-08-13,"Seattle, WA",within a few hours,100%,77%,t,2,...,2.0,2.0,"[""Long term stays allowed"", ""Conditioner"", ""Cl...",135.0,73,4.94,2,57,Seattle,1.0
4,9596,14942,Joyce,2009-04-26,"Seattle, WA",within a few hours,100%,94%,f,5,...,1.0,4.0,"[""Stove"", ""Hair dryer"", ""Shampoo"", ""Coffee mak...",130.0,97,4.56,3,28,Seattle,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6371,1732441,1214247,Marcell,2011-09-27,"Seattle, WA",within a few hours,100%,77%,f,1,...,1.0,1.0,"[""Long term stays allowed"", ""Washer"", ""Stove"",...",85.0,174,4.92,1,54,Seattle,1.0
6372,1737244,9148911,Constance,2013-09-30,"Seattle, WA",,,50%,f,1,...,1.0,1.0,"[""Children\u2019s dinnerware"", ""Washer"", ""Drye...",71.0,40,4.72,1,41,Seattle,1.0
6373,1764100,3074414,Melissa,2012-07-27,"Seattle, WA",within an hour,98%,94%,f,118,...,1.0,2.0,"[""Long term stays allowed"", ""Cleaning products...",115.0,52,4.27,117,44,Seattle,1.0
6374,1773803,8026420,Sheldon,2013-08-08,"Seattle, WA",within a day,100%,0%,f,2,...,1.0,1.0,"[""Washer"", ""Kitchen"", ""Dryer"", ""Smoke alarm"", ...",92.0,304,4.30,2,11,Seattle,1.0


In [31]:
updated_df['num_bath'].isnull().sum()

0

In [32]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [33]:
updated_df['num_bath'].unique()

array([ 1. ,  3. ,  2. ,  1.5,  2.5,  4. ,  3.5,  0.5,  0. ,  4.5, 16. ,
        6. ,  6.5,  7. ])

In [34]:
# delete bathrooms column
# errors='ignore' keeps this cell safe to re-run: once the column is gone a second
# run would otherwise raise a KeyError.
updated_df.drop(columns='bathrooms', inplace=True, errors='ignore')

In [35]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
0,6606,14942,Joyce,2009-04-26,"Seattle, WA",within a few hours,100%,94%,f,5,...,1.0,1.0,"[""Air conditioning"", ""Hair dryer"", ""Bed linens...",99.0,159,4.57,3,27,Seattle,1.0
1,9419,30559,Angielena,2009-08-09,"Seattle, WA",within an hour,100%,98%,t,9,...,1.0,2.0,"[""Long term stays allowed"", ""Shared patio or b...",55.0,175,4.65,9,31,Seattle,3.0
2,9531,31481,Cassie,2009-08-13,"Seattle, WA",within a few hours,100%,77%,t,2,...,2.0,3.0,"[""Long term stays allowed"", ""Conditioner"", ""Cl...",175.0,71,4.90,2,63,Seattle,1.0
3,9534,31481,Cassie,2009-08-13,"Seattle, WA",within a few hours,100%,77%,t,2,...,2.0,2.0,"[""Long term stays allowed"", ""Conditioner"", ""Cl...",135.0,73,4.94,2,57,Seattle,1.0
4,9596,14942,Joyce,2009-04-26,"Seattle, WA",within a few hours,100%,94%,f,5,...,1.0,4.0,"[""Stove"", ""Hair dryer"", ""Shampoo"", ""Coffee mak...",130.0,97,4.56,3,28,Seattle,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6371,1732441,1214247,Marcell,2011-09-27,"Seattle, WA",within a few hours,100%,77%,f,1,...,1.0,1.0,"[""Long term stays allowed"", ""Washer"", ""Stove"",...",85.0,174,4.92,1,54,Seattle,1.0
6372,1737244,9148911,Constance,2013-09-30,"Seattle, WA",,,50%,f,1,...,1.0,1.0,"[""Children\u2019s dinnerware"", ""Washer"", ""Drye...",71.0,40,4.72,1,41,Seattle,1.0
6373,1764100,3074414,Melissa,2012-07-27,"Seattle, WA",within an hour,98%,94%,f,118,...,1.0,2.0,"[""Long term stays allowed"", ""Cleaning products...",115.0,52,4.27,117,44,Seattle,1.0
6374,1773803,8026420,Sheldon,2013-08-08,"Seattle, WA",within a day,100%,0%,f,2,...,1.0,1.0,"[""Washer"", ""Kitchen"", ""Dryer"", ""Smoke alarm"", ...",92.0,304,4.30,2,11,Seattle,1.0


# Impute Bedrooms and Beds

In [36]:
updated_df['bedrooms'].unique()

array([ 1.,  2.,  3.,  6., nan,  4.,  5.,  7.,  8., 11.])

In [37]:
updated_df['beds'].unique()

array([ 1.,  2.,  3.,  4.,  5.,  7.,  8., nan,  6., 14., 12., 10., 15.,
        9., 16., 11.])

In [38]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
55,607788,103427,Jacqueline M,2010-04-04,"San Francisco, CA",within a day,61%,8%,f,187,...,,1.0,"[""Washer"", ""Kitchen"", ""Private entrance"", ""Dry...",232.0,2,3.00,2,14,Seattle,1.0
74,723846,3737272,Cheryl & Steve,2012-10-02,"Seattle, WA",within an hour,100%,100%,t,1,...,,1.0,"[""Long term stays allowed"", ""Conditioner"", ""Cl...",129.0,179,4.80,1,60,Seattle,1.0
87,1820794,9533123,Pam,2013-10-20,"Seattle, WA",within an hour,100%,96%,t,1,...,,1.0,"[""Long term stays allowed"", ""Washer"", ""Air con...",134.0,561,4.93,1,42,Seattle,1.0
104,2150760,10977007,Michelle,2014-01-02,"Seattle, WA",within a few hours,100%,79%,t,2,...,,1.0,"[""Long term stays allowed"", ""Hair dryer"", ""Bed...",125.0,198,4.74,1,31,Seattle,1.0
111,2358140,3074414,Melissa,2012-07-27,"Seattle, WA",within an hour,98%,94%,f,118,...,,1.0,"[""Cleaning products"", ""Stove"", ""Bed linens"", ""...",80.0,43,4.21,117,32,Seattle,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6259,846355703375944157,114858194,Elisabeth,2017-02-04,"Seattle, WA",within an hour,100%,50%,f,3,...,,1.0,"[""Long term stays allowed"", ""Conditioner"", ""Cl...",105.0,1,5.00,3,40,Seattle,1.0
6356,1589681,681242,Heather,2011-06-09,"Seattle, WA",within an hour,100%,100%,f,1,...,,1.0,"[""Conditioner"", ""Cleaning products"", ""Shared p...",79.0,381,4.75,1,47,Seattle,1.0
6361,1599010,3074414,Melissa,2012-07-27,"Seattle, WA",within an hour,98%,94%,f,118,...,,1.0,"[""Long term stays allowed"", ""Stove"", ""Bed line...",89.0,22,4.27,117,35,Seattle,1.0
6362,1603124,3074414,Melissa,2012-07-27,"Seattle, WA",within an hour,98%,94%,f,118,...,,1.0,"[""Stove"", ""Bed linens"", ""Shampoo"", ""Security c...",105.0,23,4.26,117,26,Seattle,1.0


In [39]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
156,3022308,3074414,Melissa,2012-07-27,"Seattle, WA",within an hour,98%,94%,f,118,...,1.0,,"[""Long term stays allowed"", ""Cleaning products...",95.0,8,3.25,117,39,Seattle,1.0
229,4279578,22218808,Donna,2014-10-06,"Seattle, WA",within a few hours,100%,93%,t,2,...,,,"[""Public or shared beach access"", ""First aid k...",84.0,406,4.85,1,39,Seattle,1.0
261,4993710,25746073,Shawna,2015-01-07,"Seattle, WA",,,,f,1,...,1.0,,"[""Stove"", ""Shared patio or balcony"", ""First ai...",120.0,153,4.84,1,32,Seattle,1.0
294,5749958,3751963,Alan & Katence,2012-10-03,"Seattle, WA",within a day,50%,67%,f,1,...,2.0,,"[""Washer"", ""Air conditioning"", ""Dryer"", ""Stove...",166.0,86,4.85,1,31,Seattle,1.0
623,12625278,16718874,Patrice,2014-06-12,"Seattle, WA",within a day,100%,50%,f,1,...,1.0,,"[""Washer"", ""Carbon monoxide alarm"", ""Kitchen"",...",53.0,170,4.79,1,14,Seattle,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4895,671669265235442364,219526255,Sonder (Seattle),2018-10-08,"Seattle, WA",within an hour,99%,99%,f,18,...,,,"[""Washer"", ""Air conditioning"", ""Dryer"", ""Stove...",160.0,4,5.00,18,30,Seattle,1.0
4896,671670971425563724,219526255,Sonder (Seattle),2018-10-08,"Seattle, WA",within an hour,99%,99%,f,18,...,,,"[""Washer"", ""Air conditioning"", ""Dryer"", ""Stove...",167.0,86,4.53,18,30,Seattle,1.0
4897,671679008224438796,219526255,Sonder (Seattle),2018-10-08,"Seattle, WA",within an hour,99%,99%,f,18,...,,,"[""Washer"", ""Air conditioning"", ""Dryer"", ""Stove...",236.0,83,4.47,18,30,Seattle,1.0
4899,671687362388947426,219526255,Sonder (Seattle),2018-10-08,"Seattle, WA",within an hour,99%,99%,f,18,...,1.0,,"[""Washer"", ""Air conditioning"", ""Dryer"", ""Stove...",151.0,2,4.50,18,30,Seattle,1.0


In [40]:
def update_bedrooms_and_beds(df):
    """
    Fill the gaps in the 'bedrooms' and 'beds' columns of df and return df.

    - A missing 'bedrooms' value becomes 'num_bath' rounded up to the next integer.
    - A missing 'beds' value becomes the (already filled) 'bedrooms' value of the same row.
    """
    # Candidate bedroom counts derived from the number of bathrooms
    bedrooms_from_baths = np.ceil(df['num_bath'])
    df['bedrooms'] = df['bedrooms'].fillna(bedrooms_from_baths)

    # Beds fall back to the bedroom count, which now includes the values filled above
    df['beds'] = df['beds'].fillna(df['bedrooms'])

    return df


In [41]:
updated_df = update_bedrooms_and_beds(updated_df)


In [42]:
# Look up specific listing ids to inspect their (previously NA) bedrooms values
updated_df[
    updated_df['id'].isin([
        6413,
        78884,
        219168,
        224603,
        343889,
        844959293264278924,
        834072988781055677,
    ])
]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [43]:
# check for the previous NA rows in beds
# updated_df[
#     (updated_df['id'] == 49229073)
# ]

In [44]:
updated_df['beds'].unique()

array([ 1.,  2.,  3.,  4.,  5.,  7.,  8.,  6., 14., 12., 10., 15.,  9.,
       16.,  0., 11.])

In [45]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [46]:
updated_df['bedrooms'].unique()

array([ 1.,  2.,  3.,  6.,  4.,  5.,  7.,  8.,  0., 16., 11.])

In [47]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_since

In [48]:
# check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date.
# The fill value is formatted as 'YYYY-MM-DD' so it matches the existing values in
# the column; a "Month Year" text would leave the column with mixed date formats.
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%Y-%m-%d")
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and does not update the frame
# when pandas Copy-on-Write is active.
updated_df['host_since'] = updated_df['host_since'].fillna(most_recent_date)


There are 0 missing values in the "host_since" column.


In [49]:
updated_df['host_since'].isnull().sum()

0

In [50]:
updated_df['host_since'].unique()

array(['2009-04-26', '2009-08-09', '2009-08-13', ..., '2013-09-20',
       '2011-09-27', '2013-09-30'], dtype=object)

In [51]:
updated_df[updated_df['host_since'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_location

In [52]:
# check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and does not update the frame
# when pandas Copy-on-Write is active.
updated_df['host_location'] = updated_df['host_location'].fillna("unknown")

There are 900 missing values in the "host_location" column.


In [53]:
updated_df['host_location'].isnull().sum()

0

In [54]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

900

In [55]:
updated_df['host_location'].unique()

array(['Seattle, WA', 'Carlsbad, CA', 'Grosse Pointe Farms, MI',
       'unknown', 'Vashon, WA', 'Silver Lake, WA',
       'Port Coquitlam, Canada', 'San Francisco, CA', 'Luxembourg',
       'Washington, United States', 'Anacortes, WA', 'Mount Vernon, WA',
       'Berkeley, CA', 'Bend, OR', 'Bellevue, WA', 'Kapaa, HI',
       'San Diego, CA', 'Gig Harbor, WA', 'Honolulu, HI',
       'Woodinville, WA', 'Edmonds, WA', 'Mountlake Terrace, WA',
       'San Jose, CA', 'Palo Alto, CA', 'Prescott Valley, AZ',
       'Holland, MI', 'Washington, CA', 'Santa Fe, NM',
       'St Petersburg, FL', 'United States', 'Bow, WA', 'Camas, WA',
       'Santa Clarita, CA', 'Soldotna, AK', 'New South Wales, Australia',
       'Wenatchee, WA', 'Tacoma, WA', 'Renton, WA', 'Vancouver, Canada',
       'Kirkland, WA', 'Redmond, WA', 'Issaquah, WA', 'Merrick, NY',
       'Normandy Park, WA', 'Kent, WA', 'Sammamish, WA',
       'University Place, WA', 'Houston, TX', 'New York, NY',
       'South Windsor, CT', 'Chi

In [56]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [57]:
updated_df[updated_df['host_location'] ==' ']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_is_superhost

In [58]:
# check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

# Missing flags are filled with "f".
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and does not update the frame
# when pandas Copy-on-Write is active.
updated_df['host_is_superhost'] = updated_df['host_is_superhost'].fillna("f")

There are 0 missing values in the "host_is_superhost" column.


In [59]:
updated_df['host_is_superhost'].isnull().sum()

0

In [60]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

3432

In [61]:
updated_df['host_is_superhost'].unique()

array(['f', 't'], dtype=object)

# host_listings_count

In [62]:
# check the number of missing values
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

# Missing counts are filled with 1.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and does not update the frame
# when pandas Copy-on-Write is active.
updated_df['host_listings_count'] = updated_df['host_listings_count'].fillna(1)

There are 0 missing values in the "host_listings_count" column.


In [63]:
updated_df['host_listings_count'].isnull().sum()

0

In [64]:
updated_df['host_listings_count'].unique()

array([   5,    9,    2,   10,    1,    4,    3,   12,   11,    6,  187,
        118,    7,    8,   27, 1629,   17,   13,   14,   36,   48,   38,
         22,   44,   51,   21,   15,   91, 1054,   79,   50,   16,   41,
        156,   66,   18, 1502,  417,   28,   39,   49, 4807,   19,  275,
         26,  201,  202,   23, 3302,   81,   57,   83,   71,  763,  154,
         42,  519,   25,   53,  174,  246,  105,   31,  291,   20,  385,
         77,  635,  287,   24,  151,   67,   40])

# host_total_listings_count

In [65]:
# check the number of missing values
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

# Missing counts are filled with 1.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and does not update the frame
# when pandas Copy-on-Write is active.
updated_df['host_total_listings_count'] = updated_df['host_total_listings_count'].fillna(1)


There are 0 missing values in the "host_total_listings_count" column.


In [66]:
updated_df['host_total_listings_count'].isnull().sum()

0

In [67]:
updated_df['host_total_listings_count'].unique()

array([   5,   10,    2,   15,    4,    1,    3,   21,    6,   11,   13,
         12,   22,  286,  179,    7,    8,   27,    9,   28,   14,   20,
       1930,   19,   60,   17,   42,   64,   44,   33,   24,  120,   63,
         23,  232,  274,   18, 1859,  112,   72,   16,   59,  483,   75,
         37, 1967,  117,   54,  569,   55,   50,   68, 5370,   92,   25,
        560,   48,   35,  239,  223,   26, 4535,   77,   83,  115,   85,
        151, 1486,   49,  326,  149,  802,  101,  668,  294,  164,   73,
         30,  567,  465,   91,  690,  378,  418,  247,   31,   40])

# host_verifications

In [68]:
# check the number of missing values
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

# Missing entries are filled with the text "None".
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and does not update the frame
# when pandas Copy-on-Write is active.
updated_df['host_verifications'] = updated_df['host_verifications'].fillna("None")


There are 0 missing values in the "host_verifications" column.


In [69]:
updated_df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['email']", "['phone', 'work_email']"], dtype=object)

In [70]:
updated_df[updated_df['host_verifications'] == '[]']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [71]:
# Replace '[]' with None in 'host_verifications' column
updated_df['host_verifications'] = updated_df['host_verifications'].replace('[]', "None")


In [72]:
updated_df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['email']", "['phone', 'work_email']"], dtype=object)

# host_identity_verified

In [73]:
# check the number of missing values
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

# Missing flags are filled with "f".
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and does not update the frame
# when pandas Copy-on-Write is active.
updated_df['host_identity_verified'] = updated_df['host_identity_verified'].fillna("f")


There are 0 missing values in the "host_identity_verified" column.


In [74]:
updated_df['host_identity_verified'].unique()

array(['t', 'f'], dtype=object)

# calculated_host_listings_count

In [75]:
# check the number of missing values
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

# Missing counts are filled with 1.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and does not update the frame
# when pandas Copy-on-Write is active.
updated_df['calculated_host_listings_count'] = updated_df['calculated_host_listings_count'].fillna(1)

There are 0 missing values in the "calculated_host_listings_count" column.


In [76]:
# Show the distinct values present in 'calculated_host_listings_count'.
listings_count = updated_df['calculated_host_listings_count']
listings_count.unique()

array([  3,   9,   2,  10,   1,   4,   5, 117,   7,   6,  11,   8,  27,
        32,  16,  14,  18,  15,  39,  49,  52, 112, 408,  21,  20,  87,
        26,  47,  12,  53,  17,  13,  54])

# host_name

In [77]:
# Display the rows where 'host_name' is missing.
missing_host_name = updated_df['host_name'].isnull()
updated_df[missing_host_name]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_has_profile_pic

In [78]:
# Display the rows where 'host_has_profile_pic' is missing.
missing_profile_pic = updated_df['host_has_profile_pic'].isnull()
updated_df[missing_profile_pic]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [79]:
# Show the distinct values present in 'host_has_profile_pic'.
host_has_profile_pic = updated_df['host_has_profile_pic']
host_has_profile_pic.unique()

array(['t', 'f'], dtype=object)

In [80]:
# Report how many 'host_has_profile_pic' entries are missing before imputing.
na = updated_df['host_has_profile_pic'].isnull().sum()
print(f'There are {na} missing values in the "host_has_profile_pic" column.')

# Missing flags are filled with 'f'. The result is assigned back to the frame
# because fillna(inplace=True) on a selected column acts on an intermediate
# object: pandas warns about it and it has no effect under Copy-on-Write.
updated_df['host_has_profile_pic'] = updated_df['host_has_profile_pic'].fillna('f')

There are 0 missing values in the "host_has_profile_pic" column.


# Neighborhood

In [81]:
# Show the distinct values present in the 'neighbourhood' column.
neighbourhood = updated_df['neighbourhood']
neighbourhood.unique()

array(['Seattle, Washington, United States', nan,
       'Ballard, Seattle, Washington, United States',
       'West Seattle, Washington, United States',
       'Ballard Seattle, Washington, United States',
       'Capitol Hill, Seattle, Washington, United States',
       'West Seattle, Washington, Washington, United States',
       'Seattle, Wa, United States',
       'Shoreline, Washington, United States',
       'Seattle , Washington, United States'], dtype=object)

In [82]:
# Count the rows where 'neighbourhood' is missing.
missing_neighbourhood = updated_df['neighbourhood'].isnull()
missing_neighbourhood.sum()

1705

In [83]:
# Show the distinct values present in the 'neighbourhood_cleansed' column.
neighbourhood_cleansed = updated_df['neighbourhood_cleansed']
neighbourhood_cleansed.unique()

array(['Wallingford', 'Georgetown', 'Fairmount Park', 'Whittier Heights',
       'Eastlake', 'Fremont', 'Green Lake', 'Portage Bay',
       'Lower Queen Anne', 'Phinney Ridge', 'Crown Hill', 'Columbia City',
       'Lawton Park', 'Mid-Beacon Hill', 'Alki', 'North Queen Anne',
       'West Queen Anne', 'First Hill', 'Broadway', 'Stevens',
       'North Admiral', 'International District', 'North Beacon Hill',
       'Greenwood', 'Cedar Park', 'Mount Baker', 'Mann', 'Genesee',
       'Central Business District', 'Ravenna', 'Belltown',
       'University District', 'Harrison/Denny-Blaine', 'South Delridge',
       'Atlantic', 'Broadview', 'Maple Leaf', 'East Queen Anne',
       'Pioneer Square', 'Leschi', 'Seward Park', 'West Woodland',
       'Adams', 'Bitter Lake', 'Brighton', 'Interbay', 'Madrona',
       'View Ridge', 'North Delridge', 'Pike-Market', 'High Point',
       'Yesler Terrace', 'Laurelhurst', 'Fauntleroy', 'Harbor Island',
       'Windermere', 'Minor', 'Rainier Beach', 'Vict

### We are going to drop the `neighbourhood` column for the Seattle dataset

In [84]:
# Remove the 'neighbourhood' column from the frame in place.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been dropped.
updated_df.drop(columns='neighbourhood', inplace=True, errors='ignore')

# Final look at missing values

In [85]:
# Count the remaining missing values in every column.
null_mask = updated_df.isnull()
null_mask.sum()

id                                   0
host_id                              0
host_name                            0
host_since                           0
host_location                        0
host_response_time                 720
host_response_rate                 720
host_acceptance_rate               455
host_is_superhost                    0
host_listings_count                  0
host_total_listings_count            0
host_verifications                   0
host_has_profile_pic                 0
host_identity_verified               0
neighbourhood_cleansed               0
latitude                             0
longitude                            0
room_type                            0
accommodates                         0
bathrooms_text                       0
bedrooms                             0
beds                                 0
amenities                            0
price                                0
number_of_reviews                    0
review_scores_value      

# Save the Final Dataframe

In [88]:
# Write the cleaned frame to disk without the index column.
# to_csv does not create missing directories, so the target folder is created
# first; otherwise the save fails on a fresh checkout where 'data/' is absent.
from pathlib import Path

output_path = Path('data/listings_detailed_after_na_seattle.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
updated_df.to_csv(output_path, index=False)