# Denver

In [38]:
import pandas as pd
import numpy as np

In [39]:
listings_detailed = pd.read_csv('usa/Denver/listings_detailed-denver.csv')

In [40]:
listings_detailed.shape

(5362, 75)

# Delete Unnecessary Columns

In [41]:
# Columns retained for the analysis; every other column of the raw CSV is dropped.
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location', 
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so later column assignments (e.g. the 'price' conversion)
# do not trigger SettingWithCopyWarning.
listings_detailed = listings_detailed[columns_to_keep].copy()

In [42]:
listings_detailed.shape

(5362, 29)

# Remove Entries where 'price' or 'accommodates' is zero / NA

## Pre-processing on 'price' column

In [43]:
# Remove the dollar sign and thousands separators from the price column and change the type to float.
# The pattern is a raw string: '\$' in a normal string literal is an invalid escape sequence.
listings_detailed['price'] = listings_detailed['price'].replace(r'[\$,]', '', regex=True).astype(float)

In [44]:
# Test: Correct filtering expression
filtered_df = listings_detailed[listings_detailed['price'] == 10]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
1624,42757920,340401226,Jake,2020-03-05,,within a day,60%,33%,f,1,...,2,,1 private bath,1.0,,"[""Private living room"", ""Iron"", ""Essentials"", ...",10.0,1,5.0,1
1870,45725617,176075890,Debra,2018-03-01,,a few days or more,29%,46%,f,2,...,1,,0 baths,,,[],10.0,8,4.88,2
4665,761561137710874547,392360868,Lincolnwood,2021-03-13,,within an hour,96%,96%,t,15,...,3,,1 bath,1.0,1.0,"[""Dishes and silverware"", ""Radiant heating"", ""...",10.0,0,,15


In [45]:
price_zero_or_na_count = listings_detailed['price'].isnull().sum() + (listings_detailed['price'] == 0).sum()
print(price_zero_or_na_count)


1


In [46]:
# Filter listings where price is either null or equals 0
listings_detailed.loc[listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
1651,43095918,323075243,Louie,2019-12-31,,,,,,8,...,0,,,,,"[""Bed sheets and pillows"", ""Complimentary self...",0.0,0,,1


In [47]:
# Filter listings where accommodates is either null or equals 0
listings_detailed.loc[listings_detailed['accommodates'].isnull() | (listings_detailed['accommodates'] == 0)]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
1651,43095918,323075243,Louie,2019-12-31,,,,,,8,...,0,,,,,"[""Bed sheets and pillows"", ""Complimentary self...",0.0,0,,1


## Create Function to Identify the Zero/NA values and Remove them

In [48]:
def clean_listings(df):
    """
    Removes listings where price is zero/NA or accommodates is zero/NA.

    The 'price' column is converted to float (stripping '$' and ',') before the
    check. The number of removed listings is printed, not returned.

    Parameters:
    - df: A pandas DataFrame with 'price' and 'accommodates' columns.
          It is not modified.

    Returns:
    - cleaned_df: A new DataFrame with the offending rows removed and 'price'
                  stored as float.
    """
    # Convert price to float on a separate Series so the caller's frame is untouched.
    # The pattern is a raw string because '\$' is an invalid escape in a plain literal;
    # a column that is already numeric passes through unchanged.
    price = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # assign() returns a new DataFrame, keeping 'price' in its original position
    cleaned_df = df.assign(price=price)

    # Rows that fail any of the validity criteria
    invalid = (
        cleaned_df['price'].isna()
        | (cleaned_df['price'] == 0)
        | cleaned_df['accommodates'].isna()
        | (cleaned_df['accommodates'] == 0)
    )

    count_removed = int(invalid.sum())
    print(f"Number of listings removed: {count_removed}")

    # Drop by index label, as the original implementation did
    return cleaned_df.drop(index=cleaned_df.index[invalid])


In [49]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 1


In [50]:
cleaned_listings.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
0,177,615,Joe,2008-07-07,"Denver, CO",within a day,100%,100%,t,2,...,2,,1 bath,,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",79.0,120,4.78,2
1,1223612,6674631,Cynthia,2013-05-31,"Denver, CO",within a few hours,100%,70%,t,3,...,3,,1 bath,1.0,2.0,"[""Hot water"", ""Iron"", ""Essentials"", ""Heating"",...",58.0,184,4.76,3
2,1313699,7138728,Ann,2013-06-26,"Denver, CO",,,83%,f,1,...,10,,2 baths,5.0,5.0,"[""Dishes and silverware"", ""Dryer"", ""Essentials...",218.0,24,4.92,1
3,1327856,7206373,Leila And Alex,2013-06-30,"Denver, CO",within an hour,100%,100%,t,2,...,6,,2 baths,2.0,2.0,"[""Dishes and silverware"", ""Dryer"", ""Indoor fir...",139.0,174,4.75,1
4,1402409,1382900,Pamela,2011-11-07,"Denver, CO",within an hour,100%,97%,t,1,...,10,,2 baths,5.0,6.0,"[""Dishes and silverware"", ""Dryer"", ""Outlet cov...",256.0,142,4.95,1


In [51]:
cleaned_listings.shape

(5361, 29)

# Impute 'bathrooms_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [52]:
cleaned_listings['bathrooms_text'].unique()

array(['1 bath', '2 baths', '1 shared bath', '1 private bath',
       '1.5 shared baths', '1.5 baths', '2.5 baths', '3 baths',
       '4.5 baths', '4 baths', '5 baths', '3.5 baths', '2 shared baths',
       '5.5 baths', '17 shared baths', nan, '0 baths', '9.5 baths',
       '4 shared baths', 'Half-bath', '2.5 shared baths',
       '0 shared baths', '6 baths', '6.5 baths', 'Shared half-bath'],
      dtype=object)

In [53]:
# Convert 'bathrooms_text' to strings, treating NaN as empty strings.
# fillna('') must run before astype(str); otherwise NaN becomes the literal text 'nan'.
# The source is cleaned_listings itself, so the cell does not depend on another frame's index.
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].fillna('').astype(str)

# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [54]:
# Map every textual half-bath label onto the numeric-style label '0.5 baths'
half_bath_labels = ['Half-bath', 'Private half-bath', 'Shared half-bath']
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace(
    half_bath_labels, '0.5 baths'
)

# Show the distinct values that remain, to confirm the replacement
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['1 bath' '2 baths' '1 shared bath' '1 private bath' '1.5 shared baths'
 '1.5 baths' '2.5 baths' '3 baths' '4.5 baths' '4 baths' '5 baths'
 '3.5 baths' '2 shared baths' '5.5 baths' '17 shared baths' '' '0 baths'
 '9.5 baths' '4 shared baths' '0.5 baths' '2.5 shared baths'
 '0 shared baths' '6 baths' '6.5 baths']


### Check for NaN values in bathrooms_text

In [55]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
732,24180313,160449770,Ember,2017-11-28,,within an hour,93%,99%,t,6,...,1,,,1.0,1.0,"[""Dishes and silverware"", ""Dryer"", ""Coffee mak...",57.0,224,4.75,6
1873,45756783,213457607,Borauzima,2018-09-04,"Chicago, IL",,,,f,1,...,2,,,1.0,1.0,"[""Dishes and silverware"", ""Dryer"", ""Essentials...",119.0,0,,1
5209,849934100277878013,486880651,Tailored Stays,2022-11-08,,within an hour,100%,96%,f,11,...,4,,,1.0,2.0,"[""Dishes and silverware"", ""Dryer"", ""Smoking al...",125.0,0,,2
5329,283162,1475137,Lee,2011-12-05,"Portland, OR",within a few hours,67%,60%,t,2,...,4,,,2.0,3.0,"[""Dryer"", ""Pool"", ""Elevator"", ""Heating"", ""Gym""...",165.0,4,4.5,1


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms' value
2. If 'bedrooms' is also NA (or 0), fill in with half of the 'beds' value; if 'beds' is unavailable too, default to '1 bath'

In [56]:
def fill_missing_bathrooms(df):
    """
    Fills missing or empty 'bathrooms_text' values.

    For each row whose 'bathrooms_text' is NaN or '':
    - uses the 'bedrooms' value if it is present and non-zero;
    - otherwise uses half the 'beds' value if it is present and non-zero;
    - otherwise falls back to '1 bath'.

    Parameters:
    - df: A pandas DataFrame with 'bathrooms_text', 'bedrooms' and 'beds' columns.
          It is not modified.

    Returns:
    - A new DataFrame with 'bathrooms_text' filled in.
    """
    # Work on a copy so the caller's frame is not changed behind its back
    filled = df.copy()

    text = filled['bathrooms_text']
    missing = text.isna() | (text == '')

    has_bedrooms = filled['bedrooms'].notna() & (filled['bedrooms'] != 0)
    has_beds = filled['beds'].notna() & (filled['beds'] != 0)

    # Mutually exclusive fill rules, in priority order
    from_bedrooms = missing & has_bedrooms
    from_beds = missing & ~has_bedrooms & has_beds
    use_default = missing & ~has_bedrooms & ~has_beds

    filled.loc[from_bedrooms, 'bathrooms_text'] = (
        filled.loc[from_bedrooms, 'bedrooms'].astype(str) + ' bath'
    )
    filled.loc[from_beds, 'bathrooms_text'] = (
        (filled.loc[from_beds, 'beds'] / 2).astype(str) + ' bath'
    )
    # Neither 'bedrooms' nor 'beds' is usable: assume a single bathroom
    filled.loc[use_default, 'bathrooms_text'] = '1 bath'

    return filled



In [57]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [58]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
0,177,,1.0,1 bath
1,1223612,1.0,2.0,1 bath
2,1313699,5.0,5.0,2 baths
3,1327856,2.0,2.0,2 baths
4,1402409,5.0,6.0,2 baths
...,...,...,...,...
5357,1069351,2.0,2.0,1.5 baths
5358,1074386,1.0,1.0,1 bath
5359,1082572,1.0,2.0,1 bath
5360,1087609,,,1 bath


In [59]:
# Look up the listings whose 'bathrooms_text' was previously missing
previously_missing_ids = [24180313, 45756783, 849934100277878013, 283162]
filled_cleaned_listings[filled_cleaned_listings['id'].isin(previously_missing_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
732,24180313,160449770,Ember,2017-11-28,,within an hour,93%,99%,t,6,...,1,,1.0 bath,1.0,1.0,"[""Dishes and silverware"", ""Dryer"", ""Coffee mak...",57.0,224,4.75,6
1873,45756783,213457607,Borauzima,2018-09-04,"Chicago, IL",,,,f,1,...,2,,1.0 bath,1.0,1.0,"[""Dishes and silverware"", ""Dryer"", ""Essentials...",119.0,0,,1
5209,849934100277878013,486880651,Tailored Stays,2022-11-08,,within an hour,100%,96%,f,11,...,4,,1.0 bath,1.0,2.0,"[""Dishes and silverware"", ""Dryer"", ""Smoking al...",125.0,0,,2
5329,283162,1475137,Lee,2011-12-05,"Portland, OR",within a few hours,67%,60%,t,2,...,4,,2.0 bath,2.0,3.0,"[""Dryer"", ""Pool"", ""Elevator"", ""Heating"", ""Gym""...",165.0,4,4.5,1


### Function to make numerical values for # of bath

In [60]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    The first number in each text value (e.g. '2.5' in '2.5 shared baths') is used.
    Values with no number that mention 'half' (e.g. 'Half-bath') become 0.5;
    any other value with no number becomes NaN.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process (modified in place).
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    text = df[column_name]

    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    # expand=False returns a Series rather than a one-column DataFrame.
    num_bath = text.str.extract(r'([0-9]+\.?[0-9]*)', expand=False).astype(float)

    # Textual half-bath labels carry no digits; count them as half a bathroom
    is_half_bath = text.str.contains('half', case=False, na=False)
    num_bath = num_bath.mask(num_bath.isna() & is_half_bath, 0.5)

    df['num_bath'] = num_bath
    return df

In [61]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

     bathrooms_text  num_bath
0            1 bath       1.0
1            1 bath       1.0
2           2 baths       2.0
3           2 baths       2.0
4           2 baths       2.0
...             ...       ...
5357      1.5 baths       1.5
5358         1 bath       1.0
5359         1 bath       1.0
5360         1 bath       1.0
5361         1 bath       1.0

[5361 rows x 2 columns]


In [62]:
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
0,177,615,Joe,2008-07-07,"Denver, CO",within a day,100%,100%,t,2,...,,1 bath,,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",79.0,120,4.78,2,1.0
1,1223612,6674631,Cynthia,2013-05-31,"Denver, CO",within a few hours,100%,70%,t,3,...,,1 bath,1.0,2.0,"[""Hot water"", ""Iron"", ""Essentials"", ""Heating"",...",58.0,184,4.76,3,1.0
2,1313699,7138728,Ann,2013-06-26,"Denver, CO",,,83%,f,1,...,,2 baths,5.0,5.0,"[""Dishes and silverware"", ""Dryer"", ""Essentials...",218.0,24,4.92,1,2.0
3,1327856,7206373,Leila And Alex,2013-06-30,"Denver, CO",within an hour,100%,100%,t,2,...,,2 baths,2.0,2.0,"[""Dishes and silverware"", ""Dryer"", ""Indoor fir...",139.0,174,4.75,1,2.0
4,1402409,1382900,Pamela,2011-11-07,"Denver, CO",within an hour,100%,97%,t,1,...,,2 baths,5.0,6.0,"[""Dishes and silverware"", ""Dryer"", ""Outlet cov...",256.0,142,4.95,1,2.0


In [63]:
updated_df['num_bath'].isnull().sum()

0

In [64]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [65]:
updated_df['num_bath'].unique()

array([ 1. ,  2. ,  1.5,  2.5,  3. ,  4.5,  4. ,  5. ,  3.5,  5.5, 17. ,
        0. ,  9.5,  0.5,  6. ,  6.5])

In [66]:
# Delete the raw 'bathrooms' column.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell re-runnable:
# a second execution no longer raises KeyError.
updated_df = updated_df.drop(columns='bathrooms', errors='ignore')

In [67]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
0,177,615,Joe,2008-07-07,"Denver, CO",within a day,100%,100%,t,2,...,2,1 bath,,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",79.0,120,4.78,2,1.0
1,1223612,6674631,Cynthia,2013-05-31,"Denver, CO",within a few hours,100%,70%,t,3,...,3,1 bath,1.0,2.0,"[""Hot water"", ""Iron"", ""Essentials"", ""Heating"",...",58.0,184,4.76,3,1.0
2,1313699,7138728,Ann,2013-06-26,"Denver, CO",,,83%,f,1,...,10,2 baths,5.0,5.0,"[""Dishes and silverware"", ""Dryer"", ""Essentials...",218.0,24,4.92,1,2.0
3,1327856,7206373,Leila And Alex,2013-06-30,"Denver, CO",within an hour,100%,100%,t,2,...,6,2 baths,2.0,2.0,"[""Dishes and silverware"", ""Dryer"", ""Indoor fir...",139.0,174,4.75,1,2.0
4,1402409,1382900,Pamela,2011-11-07,"Denver, CO",within an hour,100%,97%,t,1,...,10,2 baths,5.0,6.0,"[""Dishes and silverware"", ""Dryer"", ""Outlet cov...",256.0,142,4.95,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5357,1069351,3577356,Bonnie,2012-09-15,"Denver, CO",within a day,50%,50%,f,1,...,4,1.5 baths,2.0,2.0,"[""Dishes and silverware"", ""Dryer"", ""Cooking ba...",100.0,5,5.00,1,1.5
5358,1074386,5910663,Tony,2013-04-13,"Denver, CO",within an hour,100%,100%,t,2,...,2,1 bath,1.0,1.0,"[""Dishes and silverware"", ""Dryer"", ""Coffee mak...",208.0,107,4.57,2,1.0
5359,1082572,879998,Jennifer,2011-07-27,,within an hour,100%,96%,t,7,...,3,1 bath,1.0,2.0,"[""Dishes and silverware"", ""Dryer"", ""Smoking al...",150.0,46,4.88,4,1.0
5360,1087609,5124037,Darja,2013-02-17,"Denver, CO",within a few hours,100%,31%,t,1,...,2,1 bath,,,"[""Dishes and silverware"", ""Dryer"", ""Coffee mak...",94.0,23,4.48,1,1.0


# Impute Bedrooms and Beds

In [68]:
updated_df['bedrooms'].unique()

array([nan,  1.,  5.,  2.,  3.,  4.,  6.,  7.,  9.,  8.])

In [73]:
updated_df['beds'].unique()

array([ 1.,  2.,  5.,  6.,  3.,  4.,  7., 13.,  9., nan,  8., 18., 10.,
       11., 12., 16., 14., 17.])

In [74]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
0,177,615,Joe,2008-07-07,"Denver, CO",within a day,100%,100%,t,2,...,2,1 bath,,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",79.0,120,4.78,2,1.0
11,1737365,3378084,Alexandra,2012-08-26,"Denver, CO",within a few hours,100%,75%,t,2,...,2,1 bath,,1.0,"[""Dishes and silverware"", ""Dryer"", ""Coffee mak...",100.0,74,4.76,2,1.0
25,2216969,10917089,Jeffrey,2013-12-30,United States,within an hour,100%,100%,t,5,...,2,1 bath,,1.0,"[""Dishes and silverware"", ""Essentials"", ""Heati...",70.0,156,4.90,4,1.0
100,8366762,9153888,Brian,2013-09-30,,within an hour,100%,97%,f,9,...,2,1 bath,,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",55.0,144,4.59,1,1.0
118,8941796,46741655,Chris,2015-10-16,"Denver, CO",within a few hours,100%,85%,t,2,...,2,1 bath,,1.0,"[""Coffee maker"", ""Essentials"", ""Heating"", ""HDT...",72.0,711,4.90,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5262,4753876,24532369,Ryan & Tina,2014-12-05,"Denver, CO",within an hour,100%,99%,t,2,...,2,1 bath,,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",95.0,482,4.82,1,1.0
5264,4766503,23195393,Laurel And Paul,2014-10-31,"Denver, CO",within an hour,100%,100%,t,1,...,2,1 bath,,2.0,"[""Dishes and silverware"", ""Essentials"", ""Cloth...",75.0,295,4.94,1,1.0
5281,5558992,26125748,Adi & Edwin,2015-01-14,"Denver, CO",within an hour,100%,100%,t,1,...,2,1 bath,,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",80.0,157,4.82,1,1.0
5321,1940,2150,Joanne,2008-08-16,"Denver, CO",within an hour,100%,100%,t,1,...,2,1 bath,,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",87.0,122,4.86,1,1.0


In [75]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
418,18002228,24326234,Christina,2014-11-30,"Denver, CO",within an hour,100%,100%,f,3,...,5,1 bath,2.0,,"[""Dishes and silverware"", ""Dryer"", ""Essentials...",120.0,232,4.83,3,1.0
573,21241714,65787978,Luis,2016-04-04,"Denver, CO",within an hour,100%,100%,f,3,...,2,1.5 shared baths,1.0,,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",30.0,91,4.74,3,1.5
574,21241977,65787978,Luis,2016-04-04,"Denver, CO",within an hour,100%,100%,f,3,...,2,1.5 shared baths,1.0,,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",30.0,103,4.55,3,1.5
659,22727793,160449770,Ember,2017-11-28,,within an hour,93%,99%,t,6,...,16,17 shared baths,1.0,,"[""Dishes and silverware"", ""Dryer"", ""Coffee mak...",53.0,99,4.85,6,17.0
741,24452403,176075890,Debra,2018-03-01,,a few days or more,29%,46%,f,2,...,1,0 baths,,,[],74.0,34,4.68,2,0.0
748,24631282,55376910,Kelsey,2016-01-20,"Denver, CO",within an hour,100%,100%,t,2,...,2,1 bath,1.0,,"[""Dishes and silverware"", ""Dryer"", ""Coffee mak...",129.0,52,4.73,2,1.0
1428,39296756,291640238,Bahman,2019-09-03,"Denver, CO",a few days or more,0%,0%,f,1,...,2,1 bath,1.0,,"[""Dryer"", ""Paid parking on premises"", ""Pool"", ...",85.0,1,,1,1.0
1441,39392919,219524979,Sonder (Denver),2018-10-08,"Denver, CO",within an hour,97%,99%,f,12,...,2,1 bath,,,"[""Dishes and silverware"", ""Dryer"", ""Coffee mak...",158.0,213,4.52,12,1.0
1568,41745650,23332515,Beth,2014-11-04,"Denver, CO",within an hour,100%,98%,t,1,...,2,1 bath,,,"[""Dishes and silverware"", ""Carbon monoxide ala...",74.0,136,4.86,1,1.0
1578,41976040,219524979,Sonder (Denver),2018-10-08,"Denver, CO",within an hour,97%,99%,f,12,...,2,1 bath,,,"[""Dishes and silverware"", ""Dryer"", ""Coffee mak...",150.0,417,4.63,12,1.0


In [76]:
def update_bedrooms_and_beds(df):
    """
    Fill gaps in 'bedrooms' and 'beds' in place and return the same DataFrame.

    - A missing 'bedrooms' value becomes 'num_bath' rounded up to a whole number.
    - A missing 'beds' value becomes the row's (possibly just filled) 'bedrooms' value.
    """
    # Step 1: bedrooms <- ceil(num_bath) wherever bedrooms is missing
    bedrooms_missing = df['bedrooms'].isnull()
    rounded_baths = np.ceil(df['num_bath'])
    df.loc[bedrooms_missing, 'bedrooms'] = rounded_baths

    # Step 2: beds <- bedrooms wherever beds is missing (runs after step 1 on purpose)
    beds_missing = df['beds'].isnull()
    df.loc[beds_missing, 'beds'] = df['bedrooms']

    return df


In [77]:
updated_df = update_bedrooms_and_beds(updated_df)


In [80]:
# Look up a few listings to inspect their 'bedrooms' / 'beds' values after imputation
ids_to_inspect = [42757920, 50422078, 689085319927829820]
updated_df[updated_df['id'].isin(ids_to_inspect)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
1624,42757920,340401226,Jake,2020-03-05,,within a day,60%,33%,f,1,...,2,1 private bath,1.0,1.0,"[""Private living room"", ""Iron"", ""Essentials"", ...",10.0,1,5.0,1,1.0
2474,50422078,406033071,Austin,2021-06-08,,within a few hours,71%,94%,f,15,...,2,2 shared baths,1.0,1.0,"[""Dishes and silverware"", ""Smoking allowed"", ""...",30.0,2,3.0,12,2.0
4147,689085319927829820,41414078,Melissa,2015-08-13,"Denver, CO",within an hour,100%,94%,f,1,...,6,2 baths,3.0,3.0,"[""Dishes and silverware"", ""Gas stove"", ""Indoor...",639.0,10,5.0,1,2.0


In [81]:
updated_df['beds'].unique()

array([ 1.,  2.,  5.,  6.,  3.,  4.,  7., 13.,  9.,  8., 18.,  0., 10.,
       11., 12., 16., 14., 17.])

In [82]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [83]:
updated_df['bedrooms'].unique()

array([1., 5., 2., 3., 4., 6., 7., 0., 9., 8.])

In [84]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_since

In [85]:
# check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date.
# The fill value is formatted as 'YYYY-MM-DD' to match the existing values in the column.
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%Y-%m-%d")
# Assign the result back: calling fillna(inplace=True) on a column selection is
# chained assignment and is not guaranteed to update the DataFrame.
updated_df['host_since'] = updated_df['host_since'].fillna(most_recent_date)


There are 0 missing values in the "host_since" column.


In [86]:
updated_df['host_since'].isnull().sum()

0

In [90]:
updated_df[updated_df['host_since'] == 'N/A']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [87]:
updated_df['host_since'].unique()

array(['2008-07-07', '2013-05-31', '2013-06-26', ..., '2012-09-21',
       '2013-02-21', '2012-09-15'], dtype=object)

# host_location

In [91]:
# check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'.
# Assign back instead of fillna(inplace=True) on a column selection (chained assignment).
updated_df['host_location'] = updated_df['host_location'].fillna("unknown")

There are 0 missing values in the "host_location" column.


In [92]:
updated_df['host_location'].isnull().sum()

0

In [93]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

752

In [94]:
updated_df['host_location'].unique()

array(['Denver, CO', 'United States', 'Colorado, United States',
       'Aspen, CO', 'unknown', 'Seattle, WA', 'Silverthorne, CO',
       'Englewood, CO', 'Fort Collins, CO', 'Aurora, CO',
       'Santa Monica, CA', 'Newport Beach, CA', 'Louisville, CO',
       'Tucson, AZ', 'Las Vegas, NV', 'London, United Kingdom',
       'Bocas del Toro Province, Panama', 'Atlanta, GA', 'Windsor, CO',
       'San Francisco, CA', 'Highlands Ranch, CO', 'Austin, TX',
       'Castle Rock, CO', 'Boulder, CO', 'Golden, CO', 'Calabasas, CA',
       'New York, NY', 'Eagle, CO', 'Westminster, CO', 'Chattanooga, TN',
       'Scottsdale, AZ', 'Wheat Ridge, CO', 'Nashville, TN',
       'Vila Nova de Poiares, Portugal', 'Philadelphia, PA',
       'Evergreen, CO', 'Lakewood, CO', 'Minneapolis, MN',
       'Greenwood Village, CO', 'San Juan, Puerto Rico', 'Tempe, AZ',
       'Casablanca, Morocco', 'Vail, CO', 'Tampa, FL',
       'Steamboat Springs, CO', 'Centennial, CO', 'Yorba Linda, CA',
       'Valley Center, 

In [95]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_is_superhost

In [96]:
# check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

# Treat hosts with unknown status as non-superhosts ('f').
# Assign back instead of fillna(inplace=True) on a column selection (chained assignment).
updated_df['host_is_superhost'] = updated_df['host_is_superhost'].fillna("f")

There are 0 missing values in the "host_is_superhost" column.


In [97]:
updated_df['host_is_superhost'].isnull().sum()

0

In [98]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

2799

In [99]:
updated_df['host_is_superhost'].unique()

array(['t', 'f'], dtype=object)

# host_listings_count

In [100]:
# check the number of missing values
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

# Missing counts default to 1.
# Assign back instead of fillna(inplace=True) on a column selection (chained assignment).
updated_df['host_listings_count'] = updated_df['host_listings_count'].fillna(1)

There are 0 missing values in the "host_listings_count" column.


In [101]:
updated_df['host_listings_count'].isnull().sum()

0

In [102]:
updated_df['host_listings_count'].unique()

array([   2,    3,    1,    8,    4,    5,    6,   11,    9,   14,   18,
         21,   17,    7,   10,   40,   68,   24,   77,   53,   96,   23,
         46,   71,   22,  253,   27,   15,   32,   16,   39,   26,   12,
         80,   19,  246,  826,  178,  777,   38, 3305, 4818,   57,  146,
         13,   44,   31,   28,  157,   62,  171,   36, 2314,  348,  572,
         65,   33])

# host_total_listings_count

In [103]:
# check the number of missing values
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

# Missing counts default to 1.
# Assign back instead of fillna(inplace=True) on a column selection (chained assignment).
updated_df['host_total_listings_count'] = updated_df['host_total_listings_count'].fillna(1)


There are 0 missing values in the "host_total_listings_count" column.


In [104]:
updated_df['host_total_listings_count'].isnull().sum()

0

In [105]:
updated_df['host_total_listings_count'].unique()

array([   2,    3,    1,    4,   11,    8,    5,    6,    7,   31,    9,
         17,   10,   16,   13,   28,   38,   65,   73,   29,   35,   14,
         93,   32,   63,   15,   33,  110,   36,   61,   41,   25,  729,
         48,   12,   18,   56,   77,   94,   20,   21,  294, 1280,   26,
        433, 1534,   45,  107,   24, 4537, 5399,   84,  281,   23,   19,
         22,  124,   34,   58,  208,   44,   68,  181,   86, 7847,  384,
       1018,   57,   71])

# host_verifications

In [106]:
# check the number of missing values
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

# Missing verification lists are recorded as the string "None".
# Assign back instead of fillna(inplace=True) on a column selection (chained assignment).
updated_df['host_verifications'] = updated_df['host_verifications'].fillna("None")


There are 0 missing values in the "host_verifications" column.


In [107]:
updated_df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['phone', 'work_email']", "['email']"], dtype=object)

# host_identity_verified

In [108]:
# check the number of missing values
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

# Treat hosts with unknown verification status as not verified ('f').
# Assign back instead of fillna(inplace=True) on a column selection (chained assignment).
updated_df['host_identity_verified'] = updated_df['host_identity_verified'].fillna("f")


There are 0 missing values in the "host_identity_verified" column.


In [109]:
updated_df['host_identity_verified'].unique()

array(['f', 't'], dtype=object)

# calculated_host_listings_count

In [110]:
# check the number of missing values
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

# Missing counts default to 1.
# Assign back instead of fillna(inplace=True) on a column selection (chained assignment).
updated_df['calculated_host_listings_count'] = updated_df['calculated_host_listings_count'].fillna(1)

There are 0 missing values in the "calculated_host_listings_count" column.


In [111]:
updated_df['calculated_host_listings_count'].unique()

array([  2,   3,   1,   7,   4,   5,   6,  11,  21,  25,  20,  10,  18,
        43,  12,  14,   9,  16,   8,  13,  53,  93,  26,  38,  86, 253,
        17,  15,  44,  24,  48])

# host_name

In [112]:
updated_df[updated_df['host_name'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_has_profile_pic

In [113]:
updated_df[updated_df['host_has_profile_pic'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [114]:
updated_df['host_has_profile_pic'].unique()

array(['t', 'f'], dtype=object)

# Neighborhood

In [115]:
updated_df['neighbourhood'].unique()

array(['Denver, Colorado, United States', nan,
       'Englewood, Colorado, United States',
       'Wheat Ridge, Colorado, United States',
       'DENVER , Colorado, United States',
       'Aurora, Colorado, United States',
       'Wheat ridge, Colorado, United States',
       'Stapleton, Denver, Colorado, United States',
       'Lakewood, Colorado, United States',
       'Denver , Colorado, United States', 'Denver , Co, United States',
       'Denver, United States', 'Lakewood , Colorado, United States',
       'Edgewater, Colorado, United States',
       'Littleton, Colorado, United States'], dtype=object)

In [116]:
updated_df['neighbourhood'].isnull().sum()

1501

In [117]:
updated_df['neighbourhood_cleansed'].unique()

array(['Virginia Village', 'Cheesman Park', 'Platt Park', 'Capitol Hill',
       'West Colfax', 'Berkeley', 'West Highland', 'Speer',
       'South Park Hill', 'Sloan Lake', 'Highland', 'Washington Park',
       'Congress Park', 'CBD', 'University', 'Clayton', 'City Park West',
       'Lincoln Park', 'Hilltop', 'Country Club', 'Union Station',
       'Cory - Merrill', 'Athmar Park', 'Stapleton', 'Five Points',
       'Windsor', 'Baker', 'Gateway - Green Valley Ranch', 'Cole',
       'Rosedale', 'Whittier', 'Fort Logan', 'Sunnyside', 'City Park',
       'Skyland', 'North Capitol Hill', 'Wellshire', 'Goldsmith',
       'Hampden', 'Hampden South', 'Washington Park West', 'Hale',
       'North Park Hill', 'University Park', 'Jefferson Park',
       'Barnum West', 'Overland', 'Barnum', 'University Hills',
       'Lowry Field', 'East Colfax', 'Washington Virginia Vale',
       'Cherry Creek', 'Regis', 'Northeast Park Hill', 'Civic Center',
       'Montclair', 'Villa Park', 'Ruby Hill', 'Indi

In [118]:
updated_df['neighbourhood_cleansed'].isnull().sum()

0

### We are going to drop neighbourhood column for Denver Dataset

In [119]:
# 'neighbourhood' is dropped in favour of 'neighbourhood_cleansed'.
# Reassigning with errors='ignore' keeps the cell re-runnable (no KeyError on a second run).
updated_df = updated_df.drop(columns='neighbourhood', errors='ignore')

# host_response_time

In [120]:
updated_df['host_response_time'].unique()

array(['within a day', 'within a few hours', nan, 'within an hour',
       'a few days or more'], dtype=object)

# host_response_rate

In [121]:
updated_df['host_response_rate'].unique()

array(['100%', nan, '90%', '96%', '97%', '88%', '73%', '80%', '86%',
       '67%', '0%', '92%', '75%', '50%', '93%', '98%', '29%', '25%',
       '83%', '70%', '99%', '91%', '60%', '89%', '87%', '94%', '33%',
       '71%', '78%', '40%', '74%', '64%'], dtype=object)

# host_acceptance_rate

In [122]:
updated_df['host_acceptance_rate'].unique()

array(['100%', '70%', '83%', '97%', '99%', '87%', '75%', nan, '77%',
       '84%', '20%', '59%', '67%', '80%', '0%', '88%', '44%', '91%',
       '79%', '95%', '93%', '98%', '50%', '85%', '86%', '33%', '81%',
       '96%', '94%', '52%', '89%', '82%', '90%', '40%', '92%', '58%',
       '60%', '64%', '73%', '69%', '78%', '63%', '56%', '46%', '25%',
       '57%', '22%', '43%', '29%', '53%', '38%', '71%', '55%', '36%',
       '76%', '68%', '54%', '14%', '49%', '30%', '74%', '62%', '42%',
       '41%', '19%', '65%', '27%', '61%', '31%'], dtype=object)

# Adding new columns for city

Add a new 'city' column set to 'Denver' for every row, so this dataset can be joined with other cities' data later.



In [123]:
updated_df['city'] = 'Denver'
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city
0,177,615,Joe,2008-07-07,"Denver, CO",within a day,100%,100%,t,2,...,1 bath,1.0,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",79.0,120,4.78,2,1.0,Denver
1,1223612,6674631,Cynthia,2013-05-31,"Denver, CO",within a few hours,100%,70%,t,3,...,1 bath,1.0,2.0,"[""Hot water"", ""Iron"", ""Essentials"", ""Heating"",...",58.0,184,4.76,3,1.0,Denver
2,1313699,7138728,Ann,2013-06-26,"Denver, CO",,,83%,f,1,...,2 baths,5.0,5.0,"[""Dishes and silverware"", ""Dryer"", ""Essentials...",218.0,24,4.92,1,2.0,Denver
3,1327856,7206373,Leila And Alex,2013-06-30,"Denver, CO",within an hour,100%,100%,t,2,...,2 baths,2.0,2.0,"[""Dishes and silverware"", ""Dryer"", ""Indoor fir...",139.0,174,4.75,1,2.0,Denver
4,1402409,1382900,Pamela,2011-11-07,"Denver, CO",within an hour,100%,97%,t,1,...,2 baths,5.0,6.0,"[""Dishes and silverware"", ""Dryer"", ""Outlet cov...",256.0,142,4.95,1,2.0,Denver


# Count number of amenities

In [124]:
import ast

# Each 'amenities' cell is a string holding a list literal,
# e.g. '["Wifi", "Kitchen"]'. Parse it with ast.literal_eval and keep only the
# number of entries. Computing the count in one chained expression avoids
# writing a temporary column of Python lists into the frame and then removing
# it again with an in-place drop.
updated_df['amenities_count'] = (
    updated_df['amenities']
    .apply(ast.literal_eval)  # string -> list
    .apply(len)               # list -> number of amenities
)


In [125]:
# Preview the first rows to confirm the new amenities_count column is present.
updated_df.head(5)

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city,amenities_count
0,177,615,Joe,2008-07-07,"Denver, CO",within a day,100%,100%,t,2,...,1.0,1.0,"[""Dishes and silverware"", ""Coffee maker"", ""Ess...",79.0,120,4.78,2,1.0,Denver,29
1,1223612,6674631,Cynthia,2013-05-31,"Denver, CO",within a few hours,100%,70%,t,3,...,1.0,2.0,"[""Hot water"", ""Iron"", ""Essentials"", ""Heating"",...",58.0,184,4.76,3,1.0,Denver,12
2,1313699,7138728,Ann,2013-06-26,"Denver, CO",,,83%,f,1,...,5.0,5.0,"[""Dishes and silverware"", ""Dryer"", ""Essentials...",218.0,24,4.92,1,2.0,Denver,49
3,1327856,7206373,Leila And Alex,2013-06-30,"Denver, CO",within an hour,100%,100%,t,2,...,2.0,2.0,"[""Dishes and silverware"", ""Dryer"", ""Indoor fir...",139.0,174,4.75,1,2.0,Denver,35
4,1402409,1382900,Pamela,2011-11-07,"Denver, CO",within an hour,100%,97%,t,1,...,5.0,6.0,"[""Dishes and silverware"", ""Dryer"", ""Outlet cov...",256.0,142,4.95,1,2.0,Denver,74


In [126]:
# Sanity check: print the raw 'amenities' string of one listing so its entries
# can be counted by hand and compared with that row's amenities_count.
see = updated_df[updated_df['id'].eq(1223612)]

# First (expected: only) matching row; the value is still the unparsed string.
amenities_text = see['amenities'].iat[0]
print(amenities_text)

["Hot water", "Iron", "Essentials", "Heating", "Extra pillows and blankets", "Hangers", "Wifi", "Shampoo", "Indoor fireplace", "Hair dryer", "Kitchen", "Bed linens"]


# Final look at missing values

In [127]:
# Count the missing values in each column.
updated_df.isna().sum()

id                                  0
host_id                             0
host_name                           0
host_since                          0
host_location                       0
host_response_time                675
host_response_rate                675
host_acceptance_rate              436
host_is_superhost                   0
host_listings_count                 0
host_total_listings_count           0
host_verifications                  0
host_has_profile_pic                0
host_identity_verified              0
neighbourhood_cleansed              0
latitude                            0
longitude                           0
room_type                           0
accommodates                        0
bathrooms_text                      0
bedrooms                            0
beds                                0
amenities                           0
price                               0
number_of_reviews                   0
review_scores_value               963
calculated_h

In [128]:
from pathlib import Path

# Save the cleaned listings as a new CSV file (without the index column).
# The output folder is created first because to_csv raises OSError when the
# target directory does not exist, e.g. on a fresh checkout of the project.
output_path = Path('clean-data/listings_detailed_clean_denver.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
updated_df.to_csv(output_path, index=False)