# Austin

In [655]:
import pandas as pd
import numpy as np

In [656]:
# calendar = pd.read_csv('usa/Seattle/calendar.csv')
# listings = pd.read_csv('usa/Seattle/listings.csv')
listings_detailed = pd.read_csv('usa/Austin/listings_detailed.csv')

In [657]:
listings_detailed.shape

(14368, 75)

# Delete Unnecessary Columns

In [658]:
# Columns you want to keep
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location', 
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Dropping the columns that are not in 'columns_to_keep'
listings_detailed = listings_detailed[columns_to_keep]

In [659]:
listings_detailed.shape

(14368, 29)

In [660]:
# Save as a new csv file
listings_detailed.to_csv('listings_detailed_clean_austin.csv', index=False)

# Remove Entries where 'price' and 'accomodates' is zero / NA

## Pre-processing on 'price' column

In [661]:
# Remove the dollar sign from the price column and change the type to float
listings_detailed['price'] = listings_detailed['price'].replace('[\$,]', '', regex=True).astype(float)

In [662]:
# Test: Correct filtering expression
filtered_df = listings_detailed[listings_detailed['price'] == 10]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
3050,20647325,28367024,John,2015-02-25,"Stuttgart, Germany",,,,f,1.0,...,3,,1 shared bath,1.0,1.0,"[""Free parking on premises"", ""Backyard"", ""Dish...",10.0,1,5.0,1
3063,20691399,51672309,Jamie,2015-12-18,"Austin, TX",,,,f,1.0,...,2,,1 private bath,1.0,1.0,"[""Security cameras on property"", ""Washer"", ""Es...",10.0,0,,1
3067,20713150,3864713,Jeff,2012-10-14,"New York, NY",,,,f,1.0,...,1,,1 shared bath,1.0,1.0,"[""Carbon monoxide alarm"", ""Essentials"", ""TV"", ...",10.0,0,,1
3322,21946737,160198833,Matt,2017-11-26,,,,,f,1.0,...,2,,1 shared bath,1.0,1.0,"[""Essentials"", ""Heating"", ""TV"", ""Pets allowed""...",10.0,0,,1
5114,36770125,3440175,James,2012-09-01,"Austin, TX",within an hour,100%,100%,f,9.0,...,2,,1 bath,,1.0,"[""Free parking on premises"", ""Heating"", ""Backy...",10.0,23,4.91,8
5987,42664390,192261997,Christian,2018-05-29,"Ensenada, Mexico",,,100%,f,1.0,...,4,,1 private bath,,1.0,"[""Washer"", ""Free parking on premises"", ""TV"", ""...",10.0,1,5.0,1


In [663]:
price_zero_or_na_count = listings_detailed['price'].isnull().sum() + (listings_detailed['price'] == 0).sum()
print(price_zero_or_na_count)


2


In [664]:
# Filter listings where price is either null or equals 0
listings_detailed.loc[listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
5998,42738847,314291208,Hotel Granduca,2019-12-03,"Austin, TX",,,,,5.0,...,0,,,,,"[""Bottled water"", ""Garden"", ""First aid kit"", ""...",0.0,0,,1
6413,45895289,260188286,Hotel Saint Cecilia,2019-05-06,,,,0%,,8.0,...,0,,,,,"[""Pack \u2019n play/Travel crib"", ""Heating"", ""...",0.0,0,,1


In [665]:
# Filter listings where accommodates is either null or equals 0
listings_detailed.loc[listings_detailed['accommodates'].isnull() | (listings_detailed['accommodates'] == 0)]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
5998,42738847,314291208,Hotel Granduca,2019-12-03,"Austin, TX",,,,,5.0,...,0,,,,,"[""Bottled water"", ""Garden"", ""First aid kit"", ""...",0.0,0,,1
6413,45895289,260188286,Hotel Saint Cecilia,2019-05-06,,,,0%,,8.0,...,0,,,,,"[""Pack \u2019n play/Travel crib"", ""Heating"", ""...",0.0,0,,1


## Create Function to Identify the Zero/NA values and Remove them

In [666]:
def clean_listings(df):
    """
    Removes listings from the dataframe where price is zero/NA or accommodates is zero/NA.
    Counts the number of listings removed based on these criteria.
    
    Parameters:
    - df: A pandas DataFrame with the listings data.
    
    Returns:
    - cleaned_df: The cleaned DataFrame with the specified entries removed.
    - count_removed: The number of listings removed based on the criteria.
    """
    
    # Convert price from string to float
    df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
    
    # Find entries to remove based on conditions
    to_remove = df[(df['price'] == 0) | (df['price'].isna()) | (df['accommodates'] == 0) | (df['accommodates'].isna())]
    
    # Count the number of listings to remove
    count_removed = len(to_remove)

    print(f"Number of listings removed: {count_removed}")
    
    # Remove the entries from the DataFrame
    cleaned_df = df.drop(to_remove.index)
    
    return cleaned_df


In [667]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 2


In [668]:
cleaned_listings.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
0,5456,8028,Sylvia,2009-02-16,"Austin, TX",within an hour,100%,95%,t,2.0,...,3,,1 bath,1.0,2.0,"[""Heating"", ""Backyard"", ""Bed linens"", ""Hot wat...",176.0,630,4.79,2
1,5769,8186,Elizabeth,2009-02-19,"Austin, TX",within a few hours,100%,95%,t,1.0,...,2,,1 shared bath,1.0,1.0,"[""Private backyard"", ""Free parking on premises...",42.0,275,4.92,1
2,6413,13879,Todd,2009-04-17,"Austin, TX",,,100%,t,1.0,...,2,,1 bath,,1.0,"[""Heating"", ""Outdoor dining area"", ""Dove body ...",109.0,122,4.93,1
3,6448,14156,Amy,2009-04-20,"Austin, TX",within an hour,100%,100%,t,1.0,...,2,,1 bath,1.0,2.0,"[""Pack \u2019n play/Travel crib"", ""Free parkin...",240.0,295,4.9,1
4,8502,25298,Karen,2009-07-11,"Austin, TX",within a day,60%,70%,f,1.0,...,2,,1 bath,1.0,1.0,"[""Essentials"", ""Heating"", ""Central air conditi...",85.0,48,4.57,1


In [669]:
cleaned_listings.shape

(14366, 29)

# Impute 'bathroom_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [670]:
cleaned_listings['bathrooms_text'].unique()

array(['1 bath', '1 shared bath', '2 baths', '1.5 baths', '2.5 baths',
       '3 baths', '1 private bath', nan, '3.5 baths', '0 baths',
       '4 baths', '4.5 baths', '2.5 shared baths', '0 shared baths',
       '1.5 shared baths', 'Half-bath', '5 baths', '8 baths', '5.5 baths',
       '2 shared baths', 'Private half-bath', '6.5 baths', '6 baths',
       '3 shared baths', '4 shared baths', '10.5 baths', '17 baths',
       '7.5 baths', '7 baths', '8.5 baths', '11.5 baths',
       'Shared half-bath', '9.5 baths', '3.5 shared baths', '10 baths'],
      dtype=object)

In [671]:
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].astype(str)
# Convert 'bathrooms_text' column to strings, treating NaN as empty strings
cleaned_listings['bathrooms_text'] = listings_detailed['bathrooms_text'].fillna('').astype(str)
cleaned_listings
# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [672]:
# Replace specific strings in 'bathrooms_text' column
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace({
    'Half-bath': '0.5 baths',
    'Private half-bath': '0.5 baths',
    'Shared half-bath': '0.5 baths'
})

# Check the unique values to verify the changes
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['1 bath' '1 shared bath' '2 baths' '1.5 baths' '2.5 baths' '3 baths'
 '1 private bath' '' '3.5 baths' '0 baths' '4 baths' '4.5 baths'
 '2.5 shared baths' '0 shared baths' '1.5 shared baths' '0.5 baths'
 '5 baths' '8 baths' '5.5 baths' '2 shared baths' '6.5 baths' '6 baths'
 '3 shared baths' '4 shared baths' '10.5 baths' '17 baths' '7.5 baths'
 '7 baths' '8.5 baths' '11.5 baths' '9.5 baths' '3.5 shared baths'
 '10 baths']


### Check for the Nan for bathrooms_text

In [673]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
70,247770,1298575,Gloria,2011-10-16,"Austin, TX",,,,f,1.0,...,5,,,3.0,2.0,"[""Washer"", ""Free parking on premises"", ""Heatin...",175.0,1,5.0,1
84,314152,1614097,Denise,2012-01-16,"Austin, TX",,,,f,1.0,...,4,,,2.0,,[],350.0,0,,1
105,337125,261883,Carolyn,2010-10-14,"Austin, TX",,,,f,1.0,...,2,,,1.0,2.0,"[""Washer"", ""Essentials"", ""Fire extinguisher"", ...",80.0,33,4.88,1
126,353955,1792021,Mykel,2012-02-22,"Austin, TX",,,,f,1.0,...,2,,,2.0,1.0,"[""Washer"", ""Free parking on premises"", ""Heatin...",225.0,1,5.0,1
131,355232,1798084,Jeffrey,2012-02-23,"Austin, TX",,,0%,f,1.0,...,7,,,3.0,7.0,"[""Washer"", ""Free parking on premises"", ""Heatin...",1000.0,3,5.0,1
197,666768,2350380,Nessa,2012-05-11,"Austin, TX",within a day,86%,21%,f,3.0,...,2,,,1.0,1.0,"[""Washer"", ""Free parking on premises"", ""Heatin...",100.0,8,4.88,3
220,711149,3659932,Ken,2012-09-24,"Austin, TX",,,0%,f,2.0,...,6,,,3.0,4.0,"[""Washer"", ""Essentials"", ""Extra pillows and bl...",1100.0,0,,1
290,929579,5004628,Lauren,2013-02-07,"Austin, TX",,,,f,1.0,...,2,,,2.0,,[],800.0,0,,1
313,957405,5206723,Jeannette,2013-02-24,"Austin, TX",,,,f,1.0,...,4,,,1.0,1.0,"[""Wifi"", ""TV"", ""Kitchen""]",350.0,0,,1
326,978089,5352009,Daniel,2013-03-06,"Austin, TX",,,,f,1.0,...,2,,,1.0,,[],200.0,1,5.0,1


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms'
2. if the 'bedrooms' column is also NA, fill in NAs with 1/'beds' column

In [674]:
def fill_missing_bathrooms(df):
    """
    Fills missing or empty 'bathrooms_text' values in the DataFrame.
    - Uses the 'bedrooms' value if available.
    - If 'bedrooms' is also missing, uses half the 'beds' value.
    """
    # Loop through each row in DataFrame
    for index, row in df.iterrows():
        # Check if 'bathrooms_text' is NaN or an empty string
        if pd.isnull(row['bathrooms_text']) or row['bathrooms_text'] == '':
            # Use 'bedrooms' if not NaN and not 0
            if not pd.isnull(row['bedrooms']) and row['bedrooms'] != 0:
                df.at[index, 'bathrooms_text'] = str(row['bedrooms']) + ' bath'
            # If 'bedrooms' is NaN or 0, use half the 'beds' value
            elif not pd.isnull(row['beds']) and row['beds'] != 0:
                # Calculate half the beds and format it to match the bathroom text style
                half_beds_as_baths = row['beds'] / 2
                df.at[index, 'bathrooms_text'] = str(half_beds_as_baths) + ' bath'
            # If both 'bedrooms' and 'beds' are not available, set a default value
            else:
                df.at[index, 'bathrooms_text'] = '1 bath' # Default value or consider another logic
                
    return df



In [675]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [676]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
0,5456,1.0,2.0,1 bath
1,5769,1.0,1.0,1 shared bath
2,6413,,1.0,1 bath
3,6448,1.0,2.0,1 bath
4,8502,1.0,1.0,1 bath
...,...,...,...,...
14363,847159076072987428,4.0,5.0,2 baths
14364,847171709264672413,,1.0,1 bath
14365,847178203609366885,2.0,3.0,1 bath
14366,847256590826352221,2.0,4.0,2 baths


In [677]:
# check for the previous NA rows in bathrooms text
filled_cleaned_listings[
    (filled_cleaned_listings['id'] == 247770) | 
    (filled_cleaned_listings['id'] == 314152) | 
    (filled_cleaned_listings['id'] == 337125) | 
    (filled_cleaned_listings['id'] == 353955) |
    (filled_cleaned_listings['id'] == 666768) | 
    (filled_cleaned_listings['id'] == 355232) | 
    (filled_cleaned_listings['id'] == 711149) 
]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
70,247770,1298575,Gloria,2011-10-16,"Austin, TX",,,,f,1.0,...,5,,3.0 bath,3.0,2.0,"[""Washer"", ""Free parking on premises"", ""Heatin...",175.0,1,5.0,1
84,314152,1614097,Denise,2012-01-16,"Austin, TX",,,,f,1.0,...,4,,2.0 bath,2.0,,[],350.0,0,,1
105,337125,261883,Carolyn,2010-10-14,"Austin, TX",,,,f,1.0,...,2,,1.0 bath,1.0,2.0,"[""Washer"", ""Essentials"", ""Fire extinguisher"", ...",80.0,33,4.88,1
126,353955,1792021,Mykel,2012-02-22,"Austin, TX",,,,f,1.0,...,2,,2.0 bath,2.0,1.0,"[""Washer"", ""Free parking on premises"", ""Heatin...",225.0,1,5.0,1
131,355232,1798084,Jeffrey,2012-02-23,"Austin, TX",,,0%,f,1.0,...,7,,3.0 bath,3.0,7.0,"[""Washer"", ""Free parking on premises"", ""Heatin...",1000.0,3,5.0,1
197,666768,2350380,Nessa,2012-05-11,"Austin, TX",within a day,86%,21%,f,3.0,...,2,,1.0 bath,1.0,1.0,"[""Washer"", ""Free parking on premises"", ""Heatin...",100.0,8,4.88,3
220,711149,3659932,Ken,2012-09-24,"Austin, TX",,,0%,f,2.0,...,6,,3.0 bath,3.0,4.0,"[""Washer"", ""Essentials"", ""Extra pillows and bl...",1100.0,0,,1


### Function to make numerical values for # of bath

In [678]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process.
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    # Extract numerical part from the specified column and convert to float
    df['num_bath'] = df[column_name].str.extract('([0-9]+\.?[0-9]*)').astype(float)
    return df

In [679]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

      bathrooms_text  num_bath
0             1 bath       1.0
1      1 shared bath       1.0
2             1 bath       1.0
3             1 bath       1.0
4             1 bath       1.0
...              ...       ...
14363        2 baths       2.0
14364         1 bath       1.0
14365         1 bath       1.0
14366        2 baths       2.0
14367         1 bath       1.0

[14366 rows x 2 columns]


In [680]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
0,5456,8028,Sylvia,2009-02-16,"Austin, TX",within an hour,100%,95%,t,2.0,...,,1 bath,1.0,2.0,"[""Heating"", ""Backyard"", ""Bed linens"", ""Hot wat...",176.0,630,4.79,2,1.0
1,5769,8186,Elizabeth,2009-02-19,"Austin, TX",within a few hours,100%,95%,t,1.0,...,,1 shared bath,1.0,1.0,"[""Private backyard"", ""Free parking on premises...",42.0,275,4.92,1,1.0
2,6413,13879,Todd,2009-04-17,"Austin, TX",,,100%,t,1.0,...,,1 bath,,1.0,"[""Heating"", ""Outdoor dining area"", ""Dove body ...",109.0,122,4.93,1,1.0
3,6448,14156,Amy,2009-04-20,"Austin, TX",within an hour,100%,100%,t,1.0,...,,1 bath,1.0,2.0,"[""Pack \u2019n play/Travel crib"", ""Free parkin...",240.0,295,4.90,1,1.0
4,8502,25298,Karen,2009-07-11,"Austin, TX",within a day,60%,70%,f,1.0,...,,1 bath,1.0,1.0,"[""Essentials"", ""Heating"", ""Central air conditi...",85.0,48,4.57,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14363,847159076072987428,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,,2 baths,4.0,5.0,"[""Washer"", ""Fire extinguisher"", ""Free parking ...",129.0,0,,41,2.0
14364,847171709264672413,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,,1 bath,,1.0,"[""Heating"", ""Bed linens"", ""Dishwasher"", ""Hot w...",54.0,0,,41,1.0
14365,847178203609366885,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,,1 bath,2.0,3.0,"[""Free parking on premises"", ""Heating"", ""Bed l...",145.0,0,,41,1.0
14366,847256590826352221,499116561,Edwin,2023-02-01,"Phoenix, AZ",within an hour,100%,100%,f,2.0,...,,2 baths,2.0,4.0,"[""Free parking on premises"", ""Heating"", ""Bed l...",144.0,0,,1,2.0


In [681]:
updated_df['num_bath'].isnull().sum()

0

In [682]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [683]:
updated_df['num_bath'].unique()

array([ 1. ,  2. ,  1.5,  2.5,  3. ,  3.5,  0. ,  4. ,  4.5,  0.5,  5. ,
        8. ,  5.5,  6.5,  6. , 10.5, 17. ,  7.5,  7. ,  8.5, 11.5,  9.5,
       10. ])

In [684]:
# delete bathrooms column
updated_df.drop('bathrooms', axis=1, inplace=True)

In [685]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
0,5456,8028,Sylvia,2009-02-16,"Austin, TX",within an hour,100%,95%,t,2.0,...,3,1 bath,1.0,2.0,"[""Heating"", ""Backyard"", ""Bed linens"", ""Hot wat...",176.0,630,4.79,2,1.0
1,5769,8186,Elizabeth,2009-02-19,"Austin, TX",within a few hours,100%,95%,t,1.0,...,2,1 shared bath,1.0,1.0,"[""Private backyard"", ""Free parking on premises...",42.0,275,4.92,1,1.0
2,6413,13879,Todd,2009-04-17,"Austin, TX",,,100%,t,1.0,...,2,1 bath,,1.0,"[""Heating"", ""Outdoor dining area"", ""Dove body ...",109.0,122,4.93,1,1.0
3,6448,14156,Amy,2009-04-20,"Austin, TX",within an hour,100%,100%,t,1.0,...,2,1 bath,1.0,2.0,"[""Pack \u2019n play/Travel crib"", ""Free parkin...",240.0,295,4.90,1,1.0
4,8502,25298,Karen,2009-07-11,"Austin, TX",within a day,60%,70%,f,1.0,...,2,1 bath,1.0,1.0,"[""Essentials"", ""Heating"", ""Central air conditi...",85.0,48,4.57,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14363,847159076072987428,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,10,2 baths,4.0,5.0,"[""Washer"", ""Fire extinguisher"", ""Free parking ...",129.0,0,,41,2.0
14364,847171709264672413,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,2,1 bath,,1.0,"[""Heating"", ""Bed linens"", ""Dishwasher"", ""Hot w...",54.0,0,,41,1.0
14365,847178203609366885,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,5,1 bath,2.0,3.0,"[""Free parking on premises"", ""Heating"", ""Bed l...",145.0,0,,41,1.0
14366,847256590826352221,499116561,Edwin,2023-02-01,"Phoenix, AZ",within an hour,100%,100%,f,2.0,...,6,2 baths,2.0,4.0,"[""Free parking on premises"", ""Heating"", ""Bed l...",144.0,0,,1,2.0


# Impute Bedrooms and Beds

In [686]:
updated_df['bedrooms'].unique()

array([ 1., nan,  2.,  3.,  5.,  4.,  8.,  6.,  7., 13., 15., 23., 14.,
        9., 10., 12.])

In [687]:
updated_df['beds'].unique()

array([  2.,   1.,   4.,   3.,  14.,   5.,  13.,  nan,   9.,   6.,   7.,
        10.,   8.,  16.,  12.,  15.,  22.,  11.,  36.,  18.,  39.,  61.,
       132.,  17.,  26.,  24.,  21.,  20.,  23.,  25.])

In [688]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
2,6413,13879,Todd,2009-04-17,"Austin, TX",,,100%,t,1.0,...,2,1 bath,,1.0,"[""Heating"", ""Outdoor dining area"", ""Dove body ...",109.0,122,4.93,1,1.0
33,78884,2157243,Sean,2012-04-17,"Austin, TX",within an hour,100%,100%,t,8.0,...,2,1 bath,,2.0,"[""Paid dryer \u2013 In building"", ""Free parkin...",148.0,190,4.75,8,1.0
63,219168,1134580,Kalu,2011-09-09,"Austin, TX",,,,f,1.0,...,2,1 bath,,1.0,"[""Essentials"", ""Free parking on premises"", ""He...",250.0,8,4.88,1,1.0
65,224603,1169129,David And Kristy,2011-09-16,"Austin, TX",within a day,100%,70%,t,2.0,...,2,1 bath,,1.0,"[""Free parking on premises"", ""Bed linens"", ""Ho...",112.0,114,4.87,2,1.0
113,343889,1744639,Darcy,2012-02-13,"Austin, TX",within a few hours,100%,100%,f,1.0,...,2,1 bath,,1.0,"[""Clothing storage: wardrobe"", ""Free parking o...",47.0,35,4.69,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14319,844959293264278924,482174715,Suite Life,2022-10-04,,within an hour,99%,76%,t,620.0,...,4,1 bath,,2.0,"[""Heating"", ""Bed linens"", ""Dishwasher"", ""Hot w...",460.0,0,,30,1.0
14320,844959370029376106,482174715,Suite Life,2022-10-04,,within an hour,99%,76%,t,620.0,...,4,1 bath,,2.0,"[""Heating"", ""Bed linens"", ""Dishwasher"", ""Hot w...",460.0,0,,30,1.0
14321,844959440493704150,482174715,Suite Life,2022-10-04,,within an hour,99%,76%,t,620.0,...,4,1 bath,,2.0,"[""Heating"", ""Bed linens"", ""Dishwasher"", ""Hot w...",479.0,0,,30,1.0
14322,844959652169052749,482174715,Suite Life,2022-10-04,,within an hour,99%,76%,t,620.0,...,4,1 bath,,2.0,"[""Heating"", ""Bed linens"", ""Dishwasher"", ""Hot w...",460.0,0,,30,1.0


In [689]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
84,314152,1614097,Denise,2012-01-16,"Austin, TX",,,,f,1.0,...,4,2.0 bath,2.0,,[],350.0,0,,1,2.0
290,929579,5004628,Lauren,2013-02-07,"Austin, TX",,,,f,1.0,...,2,2.0 bath,2.0,,[],800.0,0,,1,2.0
326,978089,5352009,Daniel,2013-03-06,"Austin, TX",,,,f,1.0,...,2,1.0 bath,1.0,,[],200.0,1,5.00,1,1.0
474,2045501,10487205,Kirk And Linley,2013-12-07,"Austin, TX",,,0%,f,1.0,...,8,3 baths,3.0,,"[""Washer"", ""Free parking on premises"", ""Heatin...",975.0,5,5.00,1,3.0
516,2299148,3951061,Benjamin,2012-10-23,"Austin, TX",within a day,60%,52%,f,3.0,...,2,1 private bath,1.0,,"[""Free parking on premises"", ""Heating"", ""Backy...",55.0,6,4.33,3,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13488,811145863352252309,497674468,Sih-Chi,2023-01-23,,within an hour,100%,100%,f,1.0,...,6,2 baths,2.0,,"[""Carbon monoxide alarm"", ""Washer"", ""Fire exti...",204.0,0,,1,2.0
13489,811148816301434481,477499127,Drew,2022-08-31,"Louisville, KY",within an hour,100%,99%,t,6.0,...,11,4 baths,,,"[""Free parking on premises"", ""Heating"", ""Backy...",168.0,2,4.50,6,4.0
13491,811154636754995673,477499127,Drew,2022-08-31,"Louisville, KY",within an hour,100%,99%,t,6.0,...,11,4 baths,,,"[""Free parking on premises"", ""Heating"", ""Backy...",135.0,3,5.00,6,4.0
13944,834072988781055677,416217784,Anna,2021-07-31,,,,,f,1.0,...,12,5 baths,,,"[""Outdoor kitchen"", ""Free parking on premises""...",899.0,0,,1,5.0


In [690]:
def update_bedrooms_and_beds(df):
    """
    Updates the 'bedrooms' and 'beds' columns based on the conditions:
    1. If 'bedrooms' is null, set it to the rounded up value of 'num_bath'.
    2. If 'beds' is null, copy the value from 'bedrooms'.
    """
    # Rule 1: If bedrooms is NaN, round up 'num_bath' to nearest integer and assign to 'bedrooms'
    df.loc[df['bedrooms'].isnull(), 'bedrooms'] = np.ceil(df['num_bath'])
    
    # Rule 2: If beds is NaN, copy the value from 'bedrooms'
    df.loc[df['beds'].isnull(), 'beds'] = df['bedrooms']
    
    return df


In [691]:
updated_df = update_bedrooms_and_beds(updated_df)


In [692]:
# check for the previous NA rows in bedrooms
updated_df[
    (updated_df['id'] == 6413) | 
    (updated_df['id'] == 78884) | 
    (updated_df['id'] == 219168) | 
    (updated_df['id'] == 224603) |
    (updated_df['id'] == 343889) | 
    (updated_df['id'] == 844959293264278924) | 
    (updated_df['id'] == 834072988781055677) 
]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
2,6413,13879,Todd,2009-04-17,"Austin, TX",,,100%,t,1.0,...,2,1 bath,1.0,1.0,"[""Heating"", ""Outdoor dining area"", ""Dove body ...",109.0,122,4.93,1,1.0
33,78884,2157243,Sean,2012-04-17,"Austin, TX",within an hour,100%,100%,t,8.0,...,2,1 bath,1.0,2.0,"[""Paid dryer \u2013 In building"", ""Free parkin...",148.0,190,4.75,8,1.0
63,219168,1134580,Kalu,2011-09-09,"Austin, TX",,,,f,1.0,...,2,1 bath,1.0,1.0,"[""Essentials"", ""Free parking on premises"", ""He...",250.0,8,4.88,1,1.0
65,224603,1169129,David And Kristy,2011-09-16,"Austin, TX",within a day,100%,70%,t,2.0,...,2,1 bath,1.0,1.0,"[""Free parking on premises"", ""Bed linens"", ""Ho...",112.0,114,4.87,2,1.0
113,343889,1744639,Darcy,2012-02-13,"Austin, TX",within a few hours,100%,100%,f,1.0,...,2,1 bath,1.0,1.0,"[""Clothing storage: wardrobe"", ""Free parking o...",47.0,35,4.69,1,1.0
13944,834072988781055677,416217784,Anna,2021-07-31,,,,,f,1.0,...,12,5 baths,5.0,5.0,"[""Outdoor kitchen"", ""Free parking on premises""...",899.0,0,,1,5.0
14319,844959293264278924,482174715,Suite Life,2022-10-04,,within an hour,99%,76%,t,620.0,...,4,1 bath,1.0,2.0,"[""Heating"", ""Bed linens"", ""Dishwasher"", ""Hot w...",460.0,0,,30,1.0


In [693]:
# check for the previous NA rows in beds
updated_df[
    (updated_df['id'] == 929579) | 
    (updated_df['id'] == 978089) | 
    (updated_df['id'] == 2045501) | 
    (updated_df['id'] == 811145863352252309) |
    (updated_df['id'] == 841380979307790669) | 
    (updated_df['id'] == 834072988781055677) | 
    (updated_df['id'] == 811154636754995673) 
]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
290,929579,5004628,Lauren,2013-02-07,"Austin, TX",,,,f,1.0,...,2,2.0 bath,2.0,2.0,[],800.0,0,,1,2.0
326,978089,5352009,Daniel,2013-03-06,"Austin, TX",,,,f,1.0,...,2,1.0 bath,1.0,1.0,[],200.0,1,5.0,1,1.0
474,2045501,10487205,Kirk And Linley,2013-12-07,"Austin, TX",,,0%,f,1.0,...,8,3 baths,3.0,3.0,"[""Washer"", ""Free parking on premises"", ""Heatin...",975.0,5,5.0,1,3.0
13488,811145863352252309,497674468,Sih-Chi,2023-01-23,,within an hour,100%,100%,f,1.0,...,6,2 baths,2.0,2.0,"[""Carbon monoxide alarm"", ""Washer"", ""Fire exti...",204.0,0,,1,2.0
13491,811154636754995673,477499127,Drew,2022-08-31,"Louisville, KY",within an hour,100%,99%,t,6.0,...,11,4 baths,4.0,4.0,"[""Free parking on premises"", ""Heating"", ""Backy...",135.0,3,5.0,6,4.0
13944,834072988781055677,416217784,Anna,2021-07-31,,,,,f,1.0,...,12,5 baths,5.0,5.0,"[""Outdoor kitchen"", ""Free parking on premises""...",899.0,0,,1,5.0
14163,841380979307790669,202724860,Anna,2018-07-16,"Austin, TX",within an hour,80%,100%,f,1.0,...,2,1 bath,1.0,1.0,"[""Free parking on premises"", ""Indoor fireplace...",100.0,1,5.0,1,1.0


In [694]:
updated_df['beds'].unique()

array([  2.,   1.,   4.,   3.,  14.,   5.,  13.,   9.,   6.,   7.,  10.,
         8.,  16.,  12.,  15.,  22.,  11.,  36.,  18.,  39.,  61., 132.,
        17.,  26.,  24.,  21.,  20.,  23.,  25.])

In [695]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [696]:
updated_df['bedrooms'].unique()

array([ 1.,  2.,  3.,  5.,  4.,  8.,  6.,  7., 13.,  0., 15., 23., 14.,
        9., 10., 12.])

In [697]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_since

In [698]:
# check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%B %Y")
updated_df['host_since'].fillna(most_recent_date, inplace=True)


There are 2 missing values in the "host_since" column.


In [699]:
updated_df['host_since'].isnull().sum()

0

In [700]:
updated_df['host_since'].unique()

array(['2009-02-16', '2009-02-19', '2009-04-17', ..., '2023-03-12',
       '2023-03-13', '2014-04-04'], dtype=object)

In [701]:
updated_df[updated_df['host_since'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_location

In [702]:
# check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'
updated_df['host_location'].fillna("unknown", inplace=True)

There are 2062 missing values in the "host_location" column.


In [703]:
updated_df['host_location'].isnull().sum()

0

In [704]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

2062

In [705]:
updated_df['host_location'].unique()

array(['Austin, TX', 'unknown', 'New Orleans, LA', 'Dallas, TX',
       'Houston, TX', 'Fair Oaks Ranch, TX', 'Eugene, OR',
       'San Francisco, CA', 'New York, United States',
       'Dripping Springs, TX', 'Honolulu, HI', 'Boston, MA',
       'Texas, United States', 'Washington, DC', 'Bastrop, TX',
       'Gonzales, TX', 'Round Rock, TX', 'New York, NY', 'Santa Rosa, CA',
       'Oakland, CA', 'San Antonio, TX', 'Thun, Switzerland',
       'Ontario, Canada', 'Denver, CO', 'Minneapolis, MN', 'Edinburg, TX',
       'Richmond, VA', 'Longview, TX', 'Manchaca, TX',
       'New Braunfels, TX', 'Pagosa Springs, CO', 'Salado, TX',
       'Waco, TX', 'Blacklick, OH', 'Lucerne, Switzerland',
       'Westfield, IN', 'Portland, OR', 'Nashville, TN',
       'Los Angeles, CA', 'Vigo, Spain', 'Framingham, MA',
       'Baltimore, MD', 'Blacksburg, VA', 'Santa Clarita, CA',
       'Omaha, NE', 'Miami, FL', 'Australia', 'Brechin, United Kingdom',
       'Indianapolis, IN', 'United States', 'Paris, F

In [706]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [707]:
updated_df[updated_df['host_location'] ==' ']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_is_superhost

In [708]:
# check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

updated_df['host_is_superhost'].fillna("f", inplace=True)

There are 3 missing values in the "host_is_superhost" column.


In [709]:
updated_df['host_is_superhost'].isnull().sum()

0

In [710]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

9409

In [711]:
updated_df['host_is_superhost'].unique()

array(['t', 'f'], dtype=object)

# host_listings_count

In [712]:
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

updated_df['host_listings_count'].fillna(1, inplace=True)

There are 2 missing values in the "host_listings_count" column.


In [713]:
updated_df['host_listings_count'].isnull().sum()

0

In [714]:
updated_df['host_listings_count'].unique()

array([2.000e+00, 1.000e+00, 3.000e+00, 8.000e+00, 1.200e+01, 1.000e+01,
       5.000e+00, 2.500e+01, 4.000e+00, 2.700e+01, 1.700e+01, 7.000e+00,
       6.000e+00, 9.000e+00, 2.300e+01, 1.100e+01, 6.500e+01, 1.400e+01,
       3.500e+01, 8.000e+01, 3.900e+01, 2.800e+01, 1.900e+01, 1.300e+01,
       4.900e+01, 8.700e+01, 3.400e+01, 9.200e+01, 4.400e+01, 2.900e+01,
       1.160e+02, 7.650e+02, 1.629e+03, 1.500e+01, 3.040e+02, 1.190e+02,
       2.600e+01, 3.880e+02, 4.000e+01, 3.200e+01, 4.300e+01, 3.800e+01,
       1.670e+02, 5.900e+01, 8.500e+01, 2.930e+02, 2.400e+01, 7.800e+01,
       1.910e+02, 4.100e+01, 6.800e+01, 4.290e+02, 3.540e+02, 3.900e+02,
       4.100e+02, 1.550e+02, 1.600e+01, 3.700e+01, 4.490e+02, 4.200e+01,
       9.800e+01, 3.710e+02, 5.000e+01, 7.400e+01, 4.800e+01, 2.460e+02,
       6.840e+02, 2.870e+02, 1.800e+01, 1.700e+02, 1.100e+02, 3.294e+03,
       1.680e+02, 5.800e+01, 8.040e+02, 5.600e+01, 3.600e+01, 1.200e+02,
       4.600e+01, 6.630e+02, 3.840e+02, 4.806e+03, 

# host_total_listings_count

In [715]:
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

updated_df['host_total_listings_count'].fillna(1, inplace=True)


There are 2 missing values in the "host_total_listings_count" column.


In [716]:
updated_df['host_total_listings_count'].isnull().sum()

0

In [717]:
updated_df['host_total_listings_count'].unique()

array([4.000e+00, 1.000e+00, 2.000e+00, 5.000e+00, 3.000e+00, 3.000e+01,
       9.000e+00, 1.900e+01, 3.300e+01, 1.800e+01, 1.000e+01, 6.000e+00,
       3.400e+01, 6.900e+01, 2.800e+01, 1.300e+01, 8.000e+00, 8.500e+01,
       7.000e+00, 2.100e+01, 4.200e+01, 1.500e+01, 1.200e+01, 1.400e+01,
       4.700e+01, 2.600e+01, 1.060e+02, 3.500e+01, 2.200e+01, 2.500e+01,
       1.600e+01, 1.100e+01, 3.900e+01, 1.400e+02, 4.000e+01, 2.900e+01,
       2.400e+01, 1.700e+01, 5.100e+01, 2.000e+01, 5.400e+01, 3.100e+01,
       8.800e+01, 2.300e+01, 7.700e+01, 7.900e+01, 2.260e+02, 3.700e+01,
       9.700e+01, 3.200e+01, 3.920e+02, 1.486e+03, 1.930e+03, 4.210e+02,
       2.050e+02, 2.700e+01, 5.120e+02, 7.100e+01, 1.770e+02, 2.620e+02,
       3.600e+01, 4.800e+01, 9.200e+01, 3.870e+02, 2.470e+02, 8.000e+01,
       4.680e+02, 4.400e+01, 1.280e+02, 8.970e+02, 3.910e+02, 7.700e+02,
       5.700e+02, 2.670e+02, 4.100e+01, 4.910e+02, 1.340e+02, 1.360e+02,
       6.400e+02, 1.670e+02, 1.050e+02, 2.940e+02, 

# host_verifications

In [718]:
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

updated_df['host_verifications'].fillna("None", inplace=True)


There are 2 missing values in the "host_verifications" column.


In [719]:
updated_df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone', 'work_email']", "['phone']", "['email']",
       "['email', 'work_email']", '[]', 'None',
       "['email', 'phone', 'photographer', 'work_email']"], dtype=object)

In [720]:
updated_df[updated_df['host_verifications'] == '[]']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
739,3979017,20622469,Katie,2014-08-27,"Austin, TX",,,,f,1.0,...,2,1 bath,1.0,1.0,"[""Carbon monoxide alarm"", ""Washer"", ""Essential...",72.0,128,4.78,1,1.0
5762,41692584,323798409,Edwin & Rebecca,2020-01-02,"Austin, TX",within an hour,100%,71%,f,2.0,...,4,1 bath,3.0,2.0,"[""Free parking on premises"", ""Heating"", ""Bed l...",115.0,20,4.45,2,1.0
7891,51805850,323798409,Edwin & Rebecca,2020-01-02,"Austin, TX",within an hour,100%,71%,f,2.0,...,4,2 baths,3.0,2.0,"[""Outdoor dining area"", ""Bed linens"", ""Dishwas...",121.0,4,3.25,2,2.0
9883,613816981640175932,456682563,April,2022-04-29,"Austin, TX",,,,f,1.0,...,7,3 baths,3.0,3.0,"[""Free parking on premises"", ""Outdoor dining a...",1315.0,23,4.91,1,3.0


In [721]:
# Replace '[]' with None in 'host_verifications' column
updated_df['host_verifications'] = updated_df['host_verifications'].replace('[]', "None")


In [722]:
updated_df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone', 'work_email']", "['phone']", "['email']",
       "['email', 'work_email']", 'None',
       "['email', 'phone', 'photographer', 'work_email']"], dtype=object)

# host_identity_verified

In [723]:
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

updated_df['host_identity_verified'].fillna("f", inplace=True)


There are 2 missing values in the "host_identity_verified" column.


In [724]:
updated_df['host_identity_verified'].unique()

array(['t', 'f'], dtype=object)

# calculated_host_listings_count

In [725]:
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

updated_df['calculated_host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "calculated_host_listings_count" column.


In [726]:
updated_df['calculated_host_listings_count'].unique()

array([  2,   1,   3,   8,   4,  12,  10,   5,  11,  22,  17,  16,   6,
        27,  13,   9,  25,  48,   7,  80,  20,  19,  24,  14,  86,  77,
        28,  34,  64,  54,  23,  32,  51,  37,  62,  26,  29,  74, 162,
        21,  41, 103,  46,  30, 134,  52,  15,  35,  56,  57,  18, 102])

# host_name

In [727]:
updated_df[updated_df['host_name'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
834,4356661,21791967,,March 2023,unknown,,,,f,1.0,...,2,1 bath,1.0,1.0,"[""Carbon monoxide alarm"", ""Washer"", ""Fire exti...",75.0,1,,1,1.0
1422,8214182,21556779,,March 2023,unknown,,,,f,1.0,...,1,1 bath,1.0,1.0,"[""Washer"", ""Essentials"", ""Fire extinguisher"", ...",27.0,4,5.0,1,1.0


# host_has_profile_pic

In [728]:
updated_df[updated_df['host_has_profile_pic'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
834,4356661,21791967,,March 2023,unknown,,,,f,1.0,...,2,1 bath,1.0,1.0,"[""Carbon monoxide alarm"", ""Washer"", ""Fire exti...",75.0,1,,1,1.0
1422,8214182,21556779,,March 2023,unknown,,,,f,1.0,...,1,1 bath,1.0,1.0,"[""Washer"", ""Essentials"", ""Fire extinguisher"", ...",27.0,4,5.0,1,1.0


In [729]:
updated_df['host_has_profile_pic'].unique()

array(['t', 'f', nan], dtype=object)

In [730]:
na = updated_df['host_has_profile_pic'].isnull().sum()
print(f'There are {na} missing values in the "host_has_profile_pic" column.')

updated_df['host_has_profile_pic'].fillna('f', inplace=True)

There are 2 missing values in the "host_has_profile_pic" column.


# Neighborhood

In [731]:
updated_df['neighbourhood'].unique()

array(['Austin, Texas, United States', nan, 'Austin Texas, United States',
       'West Lake Hills, Texas, United States',
       'Lakeway, Texas, United States',
       'Sunset Valley, Texas, United States',
       'Dripping Springs, Texas, United States',
       'Bouldin, Austin, Texas, United States',
       'Rollingwood, Texas, United States',
       'Westlake Hills, Austin, Texas, United States',
       'Austin , Texas, United States', 'Del Valle, Texas, United States',
       'The Hills, Texas, United States', 'Austin, United States',
       'Clarksville, Texas, United States', 'Austin, Tx, United States',
       'Lake Travis, Texas, United States',
       'Round Rock, Texas, United States',
       'Travis County, Texas, United States',
       'Bee Cave, Texas, United States', 'Manchaca, Texas, United States',
       'Leander, Texas, United States',
       'Cedar Park, Texas, United States'], dtype=object)

In [732]:
updated_df['neighbourhood'].isnull().sum()

5598

In [733]:
updated_df['neighbourhood_cleansed'].unique()

array([78702, 78729, 78704, 78741, 78745, 78703, 78757, 78731, 78758,
       78705, 78727, 78722, 78733, 78701, 78723, 78752, 78751, 78736,
       78732, 78746, 78754, 78730, 78724, 78756, 78725, 78749, 78759,
       78721, 78737, 78748, 78734, 78744, 78738, 78726, 78753, 78735,
       78728, 78739, 78750, 78742, 78747, 78717, 78712, 78719])

### We are going to drop neighbourhood column for Austin Dataset

In [734]:
updated_df.drop('neighbourhood', axis=1, inplace=True)

# 

# host_response_time

In [737]:
updated_df['host_response_time'].unique()

array(['within an hour', 'within a few hours', nan, 'within a day',
       'a few days or more'], dtype=object)

# host_response_rate

In [738]:
updated_df['host_response_rate'].unique()

array(['100%', nan, '60%', '83%', '80%', '86%', '70%', '98%', '90%',
       '91%', '99%', '92%', '0%', '89%', '97%', '50%', '88%', '67%',
       '93%', '75%', '40%', '82%', '96%', '33%', '95%', '85%', '71%',
       '20%', '94%', '22%', '30%', '84%', '57%', '25%', '47%', '87%',
       '43%', '72%', '17%', '29%', '18%', '68%', '56%', '63%', '62%',
       '76%', '64%', '74%', '11%', '39%'], dtype=object)

# host_acceptance_rate

In [741]:
updated_df['host_acceptance_rate'].unique()

array(['95%', '100%', '70%', '94%', '98%', '91%', '86%', nan, '25%',
       '71%', '80%', '82%', '83%', '87%', '97%', '50%', '92%', '62%',
       '67%', '0%', '58%', '60%', '81%', '99%', '84%', '96%', '43%',
       '15%', '89%', '69%', '73%', '88%', '93%', '21%', '75%', '78%',
       '17%', '90%', '77%', '85%', '29%', '33%', '20%', '56%', '39%',
       '66%', '65%', '52%', '72%', '46%', '68%', '54%', '61%', '63%',
       '40%', '23%', '57%', '53%', '45%', '74%', '76%', '59%', '36%',
       '47%', '64%', '79%', '55%', '38%', '44%', '22%', '24%', '30%',
       '11%', '27%', '12%', '41%', '16%', '13%', '31%', '48%', '14%',
       '49%', '32%', '18%', '37%'], dtype=object)

# Adding new columns for city

Add new column and put 'Austin' for all rows for later joining purpose



In [742]:
updated_df['city'] = 'Austin'


In [743]:
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city
0,5456,8028,Sylvia,2009-02-16,"Austin, TX",within an hour,100%,95%,t,2.0,...,1 bath,1.0,2.0,"[""Heating"", ""Backyard"", ""Bed linens"", ""Hot wat...",176.0,630,4.79,2,1.0,Austin
1,5769,8186,Elizabeth,2009-02-19,"Austin, TX",within a few hours,100%,95%,t,1.0,...,1 shared bath,1.0,1.0,"[""Private backyard"", ""Free parking on premises...",42.0,275,4.92,1,1.0,Austin
2,6413,13879,Todd,2009-04-17,"Austin, TX",,,100%,t,1.0,...,1 bath,1.0,1.0,"[""Heating"", ""Outdoor dining area"", ""Dove body ...",109.0,122,4.93,1,1.0,Austin
3,6448,14156,Amy,2009-04-20,"Austin, TX",within an hour,100%,100%,t,1.0,...,1 bath,1.0,2.0,"[""Pack \u2019n play/Travel crib"", ""Free parkin...",240.0,295,4.9,1,1.0,Austin
4,8502,25298,Karen,2009-07-11,"Austin, TX",within a day,60%,70%,f,1.0,...,1 bath,1.0,1.0,"[""Essentials"", ""Heating"", ""Central air conditi...",85.0,48,4.57,1,1.0,Austin


# Count number of amenities

In [744]:
import ast

# Convert the 'amenities' string to a list using ast.literal_eval
updated_df['amenities_list'] = updated_df['amenities'].apply(lambda x: ast.literal_eval(x))

# Count the number of amenities in each list and create a new column with these counts
updated_df['amenities_count'] = updated_df['amenities_list'].apply(lambda x: len(x))

updated_df.drop('amenities_list', axis=1, inplace=True)


In [745]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city,amenities_count
0,5456,8028,Sylvia,2009-02-16,"Austin, TX",within an hour,100%,95%,t,2.0,...,1.0,2.0,"[""Heating"", ""Backyard"", ""Bed linens"", ""Hot wat...",176.0,630,4.79,2,1.0,Austin,27
1,5769,8186,Elizabeth,2009-02-19,"Austin, TX",within a few hours,100%,95%,t,1.0,...,1.0,1.0,"[""Private backyard"", ""Free parking on premises...",42.0,275,4.92,1,1.0,Austin,36
2,6413,13879,Todd,2009-04-17,"Austin, TX",,,100%,t,1.0,...,1.0,1.0,"[""Heating"", ""Outdoor dining area"", ""Dove body ...",109.0,122,4.93,1,1.0,Austin,48
3,6448,14156,Amy,2009-04-20,"Austin, TX",within an hour,100%,100%,t,1.0,...,1.0,2.0,"[""Pack \u2019n play/Travel crib"", ""Free parkin...",240.0,295,4.90,1,1.0,Austin,61
4,8502,25298,Karen,2009-07-11,"Austin, TX",within a day,60%,70%,f,1.0,...,1.0,1.0,"[""Essentials"", ""Heating"", ""Central air conditi...",85.0,48,4.57,1,1.0,Austin,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14363,847159076072987428,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,4.0,5.0,"[""Washer"", ""Fire extinguisher"", ""Free parking ...",129.0,0,,41,2.0,Austin,8
14364,847171709264672413,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,1.0,1.0,"[""Heating"", ""Bed linens"", ""Dishwasher"", ""Hot w...",54.0,0,,41,1.0,Austin,38
14365,847178203609366885,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,2.0,3.0,"[""Free parking on premises"", ""Heating"", ""Bed l...",145.0,0,,41,1.0,Austin,43
14366,847256590826352221,499116561,Edwin,2023-02-01,"Phoenix, AZ",within an hour,100%,100%,f,2.0,...,2.0,4.0,"[""Free parking on premises"", ""Heating"", ""Bed l...",144.0,0,,1,2.0,Austin,39


In [746]:
# sanity check

updated_df[updated_df['id'] == 847159076072987428]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city,amenities_count
14363,847159076072987428,276920863,Susana,2019-07-17,"Austin, TX",within an hour,95%,100%,f,43.0,...,4.0,5.0,"[""Washer"", ""Fire extinguisher"", ""Free parking ...",129.0,0,,41,2.0,Austin,8


In [767]:
# sanity check
updated_df.loc[updated_df['id'] == 847159076072987428, ['amenities', 'amenities_count']]


Unnamed: 0,amenities,amenities_count
14363,"[""Washer"", ""Fire extinguisher"", ""Free parking ...",8


In [777]:
see = updated_df.loc[updated_df['id'] == 847159076072987428]


amenities_text = see['amenities'].iloc[0]

# Print the extracted 'amenities' list
print(amenities_text)

["Washer", "Fire extinguisher", "Free parking on premises", "TV", "Smoke alarm", "Wifi", "Kitchen", "Air conditioning"]


# Final look into missing values

In [778]:
updated_df.isnull().sum()

id                                   0
host_id                              0
host_name                            2
host_since                           0
host_location                        0
host_response_time                3791
host_response_rate                3791
host_acceptance_rate              3188
host_is_superhost                    0
host_listings_count                  0
host_total_listings_count            0
host_verifications                   0
host_has_profile_pic                 0
host_identity_verified               0
neighbourhood_cleansed               0
latitude                             0
longitude                            0
room_type                            0
accommodates                         0
bathrooms_text                       0
bedrooms                             0
beds                                 0
amenities                            0
price                                0
number_of_reviews                    0
review_scores_value      

In [779]:
# Save as a new csv file
updated_df.to_csv('clean-data/listings_detailed_clean_austin.csv', index=False)