# Nashville

In [162]:
import pandas as pd
import numpy as np

In [163]:
listings_detailed = pd.read_csv('usa/Nashville/listings_detailed-nashville.csv')

In [164]:
listings_detailed.shape

(8548, 75)

# Delete Unnecessary Columns

In [165]:
# Columns you want to keep
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location', 
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Dropping the columns that are not in 'columns_to_keep'
listings_detailed = listings_detailed[columns_to_keep]

In [166]:
listings_detailed.shape

(8548, 29)

# Remove Entries where 'price' and 'accomodates' is zero / NA

## Pre-processing on 'price' column

In [167]:
# Remove the dollar sign from the price column and change the type to float
listings_detailed['price'] = listings_detailed['price'].replace('[\$,]', '', regex=True).astype(float)

In [168]:
# Test: Correct filtering expression
filtered_df = listings_detailed[listings_detailed['price'] == 10]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
1649,27633928,133709475,Eddy,2017-06-06,"Nashville, TN",,,,f,1,...,1,,1 bath,1.0,2.0,"[""Iron"", ""Free parking on premises"", ""Wifi"", ""...",10.0,1,,1
7227,746483094722568519,475332725,Dwell,2022-08-16,,within an hour,100%,100%,t,4,...,10,,2.5 baths,3.0,5.0,"[""Iron"", ""Dishwasher"", ""Private entrance"", ""Ke...",10.0,2,5.0,4
7479,766061472708688520,475332725,Dwell,2022-08-16,,within an hour,100%,100%,t,4,...,10,,3.5 baths,3.0,7.0,"[""Iron"", ""Dishwasher"", ""Private entrance"", ""Ke...",10.0,1,5.0,4
8476,834225962533411195,475332725,Dwell,2022-08-16,,within an hour,100%,100%,t,4,...,8,,3.5 baths,3.0,3.0,"[""Iron"", ""Dishwasher"", ""Freezer"", ""Stove"", ""Mi...",10.0,0,,4


In [169]:
price_zero_or_na_count = listings_detailed['price'].isnull().sum() + (listings_detailed['price'] == 0).sum()
print(price_zero_or_na_count)


1


In [170]:
# Filter listings where price is either null or equals 0
listings_detailed.loc[listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
3857,49069442,267913366,Hotel,2019-06-11,"Nashville, TN",within an hour,100%,92%,,4,...,0,,,,,"[""Airport shuttle"", ""Free wifi"", ""TV"", ""Fitnes...",0.0,431,4.23,1


In [171]:
# Filter listings where accommodates is either null or equals 0
listings_detailed.loc[listings_detailed['accommodates'].isnull() | (listings_detailed['accommodates'] == 0)]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
3857,49069442,267913366,Hotel,2019-06-11,"Nashville, TN",within an hour,100%,92%,,4,...,0,,,,,"[""Airport shuttle"", ""Free wifi"", ""TV"", ""Fitnes...",0.0,431,4.23,1


## Create Function to Identify the Zero/NA values and Remove them

In [172]:
def clean_listings(df):
    """
    Removes listings from the dataframe where price is zero/NA or accommodates is zero/NA.
    Counts the number of listings removed based on these criteria.
    
    Parameters:
    - df: A pandas DataFrame with the listings data.
    
    Returns:
    - cleaned_df: The cleaned DataFrame with the specified entries removed.
    - count_removed: The number of listings removed based on the criteria.
    """
    
    # Convert price from string to float
    df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
    
    # Find entries to remove based on conditions
    to_remove = df[(df['price'] == 0) | (df['price'].isna()) | (df['accommodates'] == 0) | (df['accommodates'].isna())]
    
    # Count the number of listings to remove
    count_removed = len(to_remove)

    print(f"Number of listings removed: {count_removed}")
    
    # Remove the entries from the DataFrame
    cleaned_df = df.drop(to_remove.index)
    
    return cleaned_df


In [173]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 1


In [174]:
cleaned_listings.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
0,6422,12172,Michele,2009-04-03,"Nashville, TN",within an hour,100%,0%,f,1,...,1,,1 private bath,1.0,1.0,"[""Lock on bedroom door"", ""Iron"", ""Washer \u201...",43.0,674,4.98,1
1,39870,171184,Evelyn,2010-07-18,"Nashville, TN",within an hour,100%,92%,t,1,...,2,,1 private bath,1.0,1.0,"[""Iron"", ""Keypad"", ""Luggage dropoff allowed"", ...",70.0,349,4.94,1
2,3648549,931636,Debby,2011-08-06,"Nashville, TN",within an hour,100%,100%,t,1,...,4,,1 bath,1.0,3.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",143.0,141,4.93,1
3,72906,176117,Richard,2010-07-21,"Nashville, TN",within an hour,100%,100%,t,1,...,4,,1 bath,2.0,2.0,"[""Iron"", ""Luggage dropoff allowed"", ""Wifi"", ""W...",100.0,677,4.89,1
4,421290,1684051,Rick,2012-02-01,"Nashville, TN",within an hour,98%,99%,t,7,...,2,,1 bath,1.0,1.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",261.0,979,4.93,7


In [175]:
cleaned_listings.shape

(8547, 29)

# Impute 'bathroom_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [176]:
cleaned_listings['bathrooms_text'].unique()

array(['1 private bath', '1 bath', '2.5 baths', '1 shared bath',
       '1.5 baths', '3 baths', '2 baths', '1.5 shared baths', '6 baths',
       '3.5 baths', '4.5 baths', '7 baths', '4 baths', '3.5 shared baths',
       '5.5 baths', '8 baths', '0 baths', '2.5 shared baths', '5 baths',
       '2 shared baths', '9.5 baths', 'Private half-bath', '7.5 baths',
       '14 baths', '3 shared baths', '6.5 baths', nan, '12 baths',
       '17 baths', '9 baths', '16 baths', '4 shared baths', '10.5 baths',
       '12.5 baths', '13.5 baths', '8.5 baths', '18 baths', '17.5 baths',
       '0 shared baths'], dtype=object)

In [177]:
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].astype(str)
# Convert 'bathrooms_text' column to strings, treating NaN as empty strings
cleaned_listings['bathrooms_text'] = listings_detailed['bathrooms_text'].fillna('').astype(str)
cleaned_listings
# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [178]:
# Replace specific strings in 'bathrooms_text' column
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace({
    'Half-bath': '0.5 baths',
    'Private half-bath': '0.5 baths',
    'Shared half-bath': '0.5 baths'
})

# Check the unique values to verify the changes
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['1 private bath' '1 bath' '2.5 baths' '1 shared bath' '1.5 baths'
 '3 baths' '2 baths' '1.5 shared baths' '6 baths' '3.5 baths' '4.5 baths'
 '7 baths' '4 baths' '3.5 shared baths' '5.5 baths' '8 baths' '0 baths'
 '2.5 shared baths' '5 baths' '2 shared baths' '9.5 baths' '0.5 baths'
 '7.5 baths' '14 baths' '3 shared baths' '6.5 baths' '' '12 baths'
 '17 baths' '9 baths' '16 baths' '4 shared baths' '10.5 baths'
 '12.5 baths' '13.5 baths' '8.5 baths' '18 baths' '17.5 baths'
 '0 shared baths']


### Check for the Nan for bathrooms_text

In [179]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
2382,35644278,44450435,Ciara,2015-09-17,"Nashville, TN",within an hour,92%,100%,t,10,...,2,,,1.0,1.0,"[""Bed linens"", ""Wifi"", ""Coffee maker"", ""Iron"",...",244.0,7,4.43,10
6923,721757973601684008,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,16,,,4.0,8.0,"[""Wine glasses"", ""Bed linens"", ""Wifi"", ""Coffee...",545.0,0,,84
6924,721759643866892259,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,16,,,6.0,9.0,"[""Wine glasses"", ""Bed linens"", ""Wifi"", ""Coffee...",884.0,1,3.0,84
6925,721761310299636935,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,16,,,6.0,10.0,"[""Wine glasses"", ""Bed linens"", ""Wifi"", ""Coffee...",762.0,3,5.0,84
6943,721776642303422756,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,16,,,18.0,30.0,"[""Wine glasses"", ""Bed linens"", ""Wifi"", ""Dedica...",2905.0,0,,84


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms'
2. if the 'bedrooms' column is also NA, fill in NAs with 1/'beds' column

In [180]:
def fill_missing_bathrooms(df):
    """
    Fills missing or empty 'bathrooms_text' values in the DataFrame.
    - Uses the 'bedrooms' value if available.
    - If 'bedrooms' is also missing, uses half the 'beds' value.
    """
    # Loop through each row in DataFrame
    for index, row in df.iterrows():
        # Check if 'bathrooms_text' is NaN or an empty string
        if pd.isnull(row['bathrooms_text']) or row['bathrooms_text'] == '':
            # Use 'bedrooms' if not NaN and not 0
            if not pd.isnull(row['bedrooms']) and row['bedrooms'] != 0:
                df.at[index, 'bathrooms_text'] = str(row['bedrooms']) + ' bath'
            # If 'bedrooms' is NaN or 0, use half the 'beds' value
            elif not pd.isnull(row['beds']) and row['beds'] != 0:
                # Calculate half the beds and format it to match the bathroom text style
                half_beds_as_baths = row['beds'] / 2
                df.at[index, 'bathrooms_text'] = str(half_beds_as_baths) + ' bath'
            # If both 'bedrooms' and 'beds' are not available, set a default value
            else:
                df.at[index, 'bathrooms_text'] = '1 bath' # Default value or consider another logic
                
    return df



In [181]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [182]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
0,6422,1.0,1.0,1 private bath
1,39870,1.0,1.0,1 private bath
2,3648549,1.0,3.0,1 bath
3,72906,2.0,2.0,1 bath
4,421290,1.0,1.0,1 bath
...,...,...,...,...
8543,848693383117239438,3.0,3.0,1 bath
8544,849122847900478398,6.0,6.0,6.5 baths
8545,849220798724539462,2.0,3.0,1 bath
8546,849386345308910137,2.0,2.0,1 bath


In [183]:
# check for the previous NA rows in bathrooms text
filled_cleaned_listings[
    (filled_cleaned_listings['id'] == 35644278) | 
    (filled_cleaned_listings['id'] == 721757973601684008) | 
    (filled_cleaned_listings['id'] == 721759643866892259) | 
    (filled_cleaned_listings['id'] == 721761310299636935) |
    (filled_cleaned_listings['id'] == 721776642303422756) 
]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
2382,35644278,44450435,Ciara,2015-09-17,"Nashville, TN",within an hour,92%,100%,t,10,...,2,,1.0 bath,1.0,1.0,"[""Bed linens"", ""Wifi"", ""Coffee maker"", ""Iron"",...",244.0,7,4.43,10
6923,721757973601684008,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,16,,4.0 bath,4.0,8.0,"[""Wine glasses"", ""Bed linens"", ""Wifi"", ""Coffee...",545.0,0,,84
6924,721759643866892259,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,16,,6.0 bath,6.0,9.0,"[""Wine glasses"", ""Bed linens"", ""Wifi"", ""Coffee...",884.0,1,3.0,84
6925,721761310299636935,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,16,,6.0 bath,6.0,10.0,"[""Wine glasses"", ""Bed linens"", ""Wifi"", ""Coffee...",762.0,3,5.0,84
6943,721776642303422756,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,16,,18.0 bath,18.0,30.0,"[""Wine glasses"", ""Bed linens"", ""Wifi"", ""Dedica...",2905.0,0,,84


### Function to make numerical values for # of bath

In [184]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process.
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    # Extract numerical part from the specified column and convert to float
    df['num_bath'] = df[column_name].str.extract('([0-9]+\.?[0-9]*)').astype(float)
    return df

In [185]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

      bathrooms_text  num_bath
0     1 private bath       1.0
1     1 private bath       1.0
2             1 bath       1.0
3             1 bath       1.0
4             1 bath       1.0
...              ...       ...
8543          1 bath       1.0
8544       6.5 baths       6.5
8545          1 bath       1.0
8546          1 bath       1.0
8547         2 baths       2.0

[8547 rows x 2 columns]


In [186]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
0,6422,12172,Michele,2009-04-03,"Nashville, TN",within an hour,100%,0%,f,1,...,,1 private bath,1.0,1.0,"[""Lock on bedroom door"", ""Iron"", ""Washer \u201...",43.0,674,4.98,1,1.0
1,39870,171184,Evelyn,2010-07-18,"Nashville, TN",within an hour,100%,92%,t,1,...,,1 private bath,1.0,1.0,"[""Iron"", ""Keypad"", ""Luggage dropoff allowed"", ...",70.0,349,4.94,1,1.0
2,3648549,931636,Debby,2011-08-06,"Nashville, TN",within an hour,100%,100%,t,1,...,,1 bath,1.0,3.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",143.0,141,4.93,1,1.0
3,72906,176117,Richard,2010-07-21,"Nashville, TN",within an hour,100%,100%,t,1,...,,1 bath,2.0,2.0,"[""Iron"", ""Luggage dropoff allowed"", ""Wifi"", ""W...",100.0,677,4.89,1,1.0
4,421290,1684051,Rick,2012-02-01,"Nashville, TN",within an hour,98%,99%,t,7,...,,1 bath,1.0,1.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",261.0,979,4.93,7,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8543,848693383117239438,93273148,Savannah,2016-09-03,"Nashville, TN",within a few hours,100%,,f,1,...,,1 bath,3.0,3.0,"[""Free parking on premises"", ""Wifi"", ""Washer"",...",101.0,0,,1,1.0
8544,849122847900478398,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,,6.5 baths,6.0,6.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Freezer...",411.0,0,,84,6.5
8545,849220798724539462,484163143,James,2022-10-18,,within an hour,100%,98%,t,5,...,,1 bath,2.0,3.0,"[""Iron"", ""Dishwasher"", ""Stove"", ""Luggage dropo...",112.0,0,,5,1.0
8546,849386345308910137,355260892,Chloe,2020-07-09,"Norman, OK",within a few hours,100%,100%,f,1,...,,1 bath,2.0,2.0,"[""Free parking on premises"", ""Security cameras...",78.0,0,,1,1.0


In [187]:
updated_df['num_bath'].isnull().sum()

0

In [188]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [189]:
updated_df['num_bath'].unique()

array([ 1. ,  2.5,  1.5,  3. ,  2. ,  6. ,  3.5,  4.5,  7. ,  4. ,  5.5,
        8. ,  0. ,  5. ,  9.5,  0.5,  7.5, 14. ,  6.5, 12. , 17. ,  9. ,
       16. , 10.5, 12.5, 13.5,  8.5, 18. , 17.5])

In [190]:
# delete bathrooms column
updated_df.drop('bathrooms', axis=1, inplace=True)

In [191]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
0,6422,12172,Michele,2009-04-03,"Nashville, TN",within an hour,100%,0%,f,1,...,1,1 private bath,1.0,1.0,"[""Lock on bedroom door"", ""Iron"", ""Washer \u201...",43.0,674,4.98,1,1.0
1,39870,171184,Evelyn,2010-07-18,"Nashville, TN",within an hour,100%,92%,t,1,...,2,1 private bath,1.0,1.0,"[""Iron"", ""Keypad"", ""Luggage dropoff allowed"", ...",70.0,349,4.94,1,1.0
2,3648549,931636,Debby,2011-08-06,"Nashville, TN",within an hour,100%,100%,t,1,...,4,1 bath,1.0,3.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",143.0,141,4.93,1,1.0
3,72906,176117,Richard,2010-07-21,"Nashville, TN",within an hour,100%,100%,t,1,...,4,1 bath,2.0,2.0,"[""Iron"", ""Luggage dropoff allowed"", ""Wifi"", ""W...",100.0,677,4.89,1,1.0
4,421290,1684051,Rick,2012-02-01,"Nashville, TN",within an hour,98%,99%,t,7,...,2,1 bath,1.0,1.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",261.0,979,4.93,7,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8543,848693383117239438,93273148,Savannah,2016-09-03,"Nashville, TN",within a few hours,100%,,f,1,...,4,1 bath,3.0,3.0,"[""Free parking on premises"", ""Wifi"", ""Washer"",...",101.0,0,,1,1.0
8544,849122847900478398,338667310,AvantStay Nashville,2020-02-24,,within an hour,99%,99%,t,84,...,16,6.5 baths,6.0,6.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Freezer...",411.0,0,,84,6.5
8545,849220798724539462,484163143,James,2022-10-18,,within an hour,100%,98%,t,5,...,6,1 bath,2.0,3.0,"[""Iron"", ""Dishwasher"", ""Stove"", ""Luggage dropo...",112.0,0,,5,1.0
8546,849386345308910137,355260892,Chloe,2020-07-09,"Norman, OK",within a few hours,100%,100%,f,1,...,4,1 bath,2.0,2.0,"[""Free parking on premises"", ""Security cameras...",78.0,0,,1,1.0


# Impute Bedrooms and Beds

In [192]:
updated_df['bedrooms'].unique()

array([ 1.,  2.,  3.,  4.,  8., nan,  5.,  7.,  6., 10., 14., 12., 15.,
       16.,  9., 11., 18., 13.])

In [193]:
updated_df['beds'].unique()

array([  1.,   3.,   2.,   5.,   4.,  nan,   9.,  10.,   7.,   6.,   8.,
        12.,  11.,  13.,  15.,  22.,  14.,  17.,  16.,  32.,  31.,  18.,
        24.,  27.,  30.,  20.,  19.,  40.,  26., 111.,  29.,  28.,  21.,
        25.,  36.,  23.])

In [194]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
73,1546787,6855810,UrbanNashville,2013-06-11,"Nashville, TN",within an hour,100%,99%,f,152,...,3,1 bath,,2.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",105.0,473,4.84,88,1.0
74,4033674,14657565,Daphne,2014-04-23,"Nashville, TN",within an hour,100%,99%,f,5,...,3,1 bath,,1.0,"[""Iron"", ""Keypad"", ""Stove"", ""Luggage dropoff a...",86.0,853,4.68,5,1.0
79,5808960,30139823,Jessica,2015-03-28,"Nashville, TN",within an hour,100%,100%,t,1,...,4,1 bath,,2.0,"[""Wine glasses"", ""Bluetooth sound system"", ""Ir...",147.0,391,4.93,1,1.0
94,4170603,16831211,Chris,2014-06-15,"Nashville, TN",within an hour,100%,100%,t,1,...,4,1 bath,,,"[""Iron"", ""Keypad"", ""Free parking on premises"",...",92.0,376,4.79,1,1.0
119,6198601,9012461,Geoff,2013-09-23,"Nashville, TN",within an hour,100%,100%,t,2,...,2,1 bath,,1.0,"[""Wifi"", ""Fire extinguisher"", ""Essentials"", ""F...",139.0,1,5.00,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8438,833910053772427759,411546857,Anna,2021-07-07,,within an hour,98%,41%,t,663,...,4,1 bath,,2.0,"[""Iron"", ""Dishwasher"", ""Building staff"", ""Lugg...",210.0,0,,24,1.0
8442,833910159540696334,411546857,Anna,2021-07-07,,within an hour,98%,41%,t,663,...,4,1 bath,,2.0,"[""Iron"", ""Dishwasher"", ""Building staff"", ""Lugg...",210.0,0,,24,1.0
8452,833910234875189939,411546857,Anna,2021-07-07,,within an hour,98%,41%,t,663,...,4,1 bath,,2.0,"[""Iron"", ""Dishwasher"", ""Building staff"", ""Lugg...",210.0,0,,24,1.0
8455,833910325675645025,411546857,Anna,2021-07-07,,within an hour,98%,41%,t,663,...,4,1 bath,,2.0,"[""Iron"", ""Dishwasher"", ""Building staff"", ""Lugg...",210.0,0,,24,1.0


In [195]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
23,3703925,441439,Angela,2011-03-14,"Nashville, TN",within an hour,100%,100%,f,4,...,3,1 bath,1.0,,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Keypad""...",120.0,615,4.73,4,1.0
94,4170603,16831211,Chris,2014-06-15,"Nashville, TN",within an hour,100%,100%,t,1,...,4,1 bath,,,"[""Iron"", ""Keypad"", ""Free parking on premises"",...",92.0,376,4.79,1,1.0
322,5217231,16087739,Lisa,2014-05-28,"Nashville, TN",within an hour,100%,98%,t,3,...,8,2.5 baths,3.0,,"[""Iron"", ""Dishwasher"", ""Stove"", ""Free parking ...",302.0,124,4.8,2,2.5
816,16363762,13337952,Josh,2014-03-20,"Nashville, TN",within an hour,100%,86%,t,1,...,4,1 bath,1.0,,"[""Wine glasses"", ""Iron"", ""Free dryer \u2013 In...",75.0,217,4.85,1,1.0
1716,28631887,211231188,Studio 154 Luxury,2018-08-22,"Nashville, TN",within an hour,90%,100%,f,11,...,2,1 private bath,,,"[""Iron"", ""Dishwasher"", ""Keypad"", ""Stove"", ""TV""...",529.0,1,5.0,11,1.0
2185,33493801,221074792,Chad,2018-10-17,"Nashville, TN",,,,f,2,...,2,1 private bath,1.0,,"[""Lock on bedroom door"", ""Iron"", ""TV"", ""Wifi"",...",71.0,0,,2,1.0
2414,36277473,224851351,Sonder (Nashville),2018-11-08,"Nashville, TN",within an hour,100%,99%,f,65,...,2,1 bath,,,"[""Iron"", ""Dishwasher"", ""Keypad"", ""Stove"", ""TV""...",184.0,141,4.43,65,1.0
2827,40254286,18765050,E,2014-07-23,"Nashville, TN",within an hour,90%,91%,f,4,...,1,1 shared bath,1.0,,"[""Lock on bedroom door"", ""Stove"", ""TV"", ""Free ...",75.0,3,4.0,2,1.0
2844,41268083,31306951,Jessica,2015-04-15,"Nashville, TN",within an hour,93%,56%,t,9,...,1,3 shared baths,4.0,,"[""Dishwasher"", ""Stove"", ""Free parking on premi...",34.0,2,4.5,9,3.0
2916,40990752,31306951,Jessica,2015-04-15,"Nashville, TN",within an hour,93%,56%,t,9,...,1,1 shared bath,1.0,,"[""Dishwasher"", ""Stove"", ""Free parking on premi...",33.0,1,5.0,9,1.0


In [196]:
def update_bedrooms_and_beds(df):
    """
    Updates the 'bedrooms' and 'beds' columns based on the conditions:
    1. If 'bedrooms' is null, set it to the rounded up value of 'num_bath'.
    2. If 'beds' is null, copy the value from 'bedrooms'.
    """
    # Rule 1: If bedrooms is NaN, round up 'num_bath' to nearest integer and assign to 'bedrooms'
    df.loc[df['bedrooms'].isnull(), 'bedrooms'] = np.ceil(df['num_bath'])
    
    # Rule 2: If beds is NaN, copy the value from 'bedrooms'
    df.loc[df['beds'].isnull(), 'beds'] = df['bedrooms']
    
    return df


In [197]:
updated_df = update_bedrooms_and_beds(updated_df)


In [198]:
# check for the previous NA rows in bedrooms
updated_df[
    (updated_df['id'] == 48471281) | 
    (updated_df['id'] == 826812096887579137) | 
    (updated_df['id'] == 844502066450874816) | 
    (updated_df['id'] == 711505580789078313) |
    (updated_df['id'] == 706060594582363588) | 
    (updated_df['id'] == 634837162467437440) | 
    (updated_df['id'] == 52796577) 
]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
3780,48471281,8038534,Berry,2013-08-08,"Nashville, TN",within an hour,100%,99%,t,4,...,2,1 bath,1.0,1.0,"[""Private entrance"", ""Keypad"", ""TV"", ""Free par...",106.0,152,4.86,3,1.0
4678,52796577,224851351,Sonder (Nashville),2018-11-08,"Nashville, TN",within an hour,100%,99%,f,65,...,6,2 baths,2.0,2.0,"[""Iron"", ""Dishwasher"", ""Keypad"", ""Stove"", ""TV""...",364.0,215,4.58,65,2.0
6060,634837162467437440,88959664,Julian,2016-08-09,"Nashville, TN",within an hour,100%,100%,t,45,...,10,3.5 baths,3.0,3.0,"[""Wine glasses"", ""Bluetooth sound system"", ""Ir...",283.0,25,4.88,45,3.5
6744,711505580789078313,9730317,Alexa & Emily,2013-10-31,"Nashville, TN",within an hour,100%,99%,t,37,...,10,3 baths,1.0,1.0,"[""Free dryer \u2013 In unit"", ""Dishwasher"", ""P...",201.0,1,4.0,19,3.0
6781,706060594582363588,152708736,Wendy,2017-09-30,"Tennessee, United States",,,,f,1,...,1,0 baths,2.0,2.0,"[""Wifi"", ""Washer""]",40.0,0,,1,0.0
8315,826812096887579137,501053494,Alexandra,2023-02-14,,within an hour,100%,100%,f,20,...,2,1 private bath,1.0,1.0,"[""Free parking on premises"", ""Security cameras...",112.0,1,5.0,20,1.0
8474,844502066450874816,253095,Cynthia,2010-10-04,"Los Angeles, CA",within a few hours,100%,97%,f,2,...,1,0 shared baths,2.0,2.0,"[""Clothing storage: closet and walk-in closet""...",57.0,0,,2,0.0


In [199]:
updated_df['beds'].unique()

array([  1.,   3.,   2.,   5.,   4.,   9.,  10.,   7.,   6.,   8.,  12.,
        11.,  13.,  15.,  22.,  14.,  17.,  16.,  32.,  31.,  18.,  24.,
        27.,  30.,  20.,  19.,  40.,  26., 111.,  29.,  28.,  21.,  25.,
        36.,  23.])

In [200]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [201]:
updated_df['bedrooms'].unique()

array([ 1.,  2.,  3.,  4.,  8.,  5.,  7.,  6., 10.,  0., 14., 12., 15.,
       16.,  9., 11., 18., 13.])

In [202]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_since

In [203]:
# check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%B %Y")
updated_df['host_since'].fillna(most_recent_date, inplace=True)


There are 0 missing values in the "host_since" column.


In [204]:
updated_df['host_since'].isnull().sum()

0

In [205]:
updated_df['host_since'].unique()

array(['2009-04-03', '2010-07-18', '2011-08-06', ..., '2019-11-20',
       '2015-04-23', '2016-06-03'], dtype=object)

# host_location

In [206]:
# check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'
updated_df['host_location'].fillna("unknown", inplace=True)

There are 1453 missing values in the "host_location" column.


In [207]:
updated_df['host_location'].isnull().sum()

0

In [208]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

1453

In [209]:
updated_df['host_location'].unique()

array(['Nashville, TN', 'Franklin, TN', 'unknown', 'Seattle, WA',
       'Louisville, KY', 'Tennessee, United States', 'Portland, OR',
       'Orlando, FL', 'Oak View, CA', 'New York, NY', 'Asheville, NC',
       'San Francisco, CA', 'United States', 'Sarasota, FL', 'Denver, CO',
       'Los Gatos, CA', 'Washington, DC', 'Flagstaff, AZ', 'Cromberg, CA',
       'Mount Juliet, TN', 'Hendersonville, TN', 'Austin, TX',
       'Goodlettsville, TN', 'Brentwood, TN', 'Paris, TN',
       'Mikonos, Greece', 'Los Angeles, CA', 'Tampa, FL', 'Chandler, AZ',
       'Chattanooga, TN', 'Cocoa Beach, FL', 'Texas, United States',
       'Edmonton, Canada', 'Royal Oak, MI', 'Atlanta, GA', 'Brandon, MS',
       'Gallatin, TN', 'Huntsville, AL', 'Huntsville, Canada',
       'Columbus, GA', 'La Vergne, TN', 'Greenbrier, TN',
       'Tallahassee, FL', 'Berkeley, CA', 'Kingston Springs, TN',
       'Oak Park, IL', 'Bluffton, SC', 'Fort Collins, CO',
       'Clarksville, TN', 'Manhattan Beach, CA', 'Pegram, T

In [210]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_is_superhost

In [211]:
# check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

updated_df['host_is_superhost'].fillna("f", inplace=True)

There are 0 missing values in the "host_is_superhost" column.


In [212]:
updated_df['host_is_superhost'].isnull().sum()

0

In [213]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

4157

In [214]:
updated_df['host_is_superhost'].unique()

array(['f', 't'], dtype=object)

# host_listings_count

In [215]:
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

updated_df['host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "host_listings_count" column.


In [216]:
updated_df['host_listings_count'].isnull().sum()

0

In [217]:
updated_df['host_listings_count'].unique()

array([   1,    7,   19,   18,    2,    3,    4,   45,   27,   10,   15,
         20,   11,    5,  152,   17,    8,  432,    6,   16,   89,   12,
         28,    9,   51,   22,  133,  202,   13,   54,   57,   98,   33,
         21,   26,  293,  205,   30,  727,   24,  305,   23,   46,   14,
         31,  533,  147,  142,   36,  116,  120,   40,   37,   65,  409,
        315,   29,  190,  230,   47,   39,   25,   84,   66,  449,  287,
        246,   87,   43,  148,   38,  630,   52, 3297,  101,  319,  314,
        631,   34,   92,   35,   85,   41,  663,  104,   62,  422,  406,
        256,   73,  143,   59,  132,  372, 2504,   58,   76,  127, 2329,
         44,  234,   74,   42,   79,  212,  804,  348,  156,  298,  150,
         88])

# host_total_listings_count

In [218]:
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

updated_df['host_total_listings_count'].fillna(1, inplace=True)


There are 0 missing values in the "host_total_listings_count" column.


In [219]:
updated_df['host_total_listings_count'].isnull().sum()

0

In [220]:
updated_df['host_total_listings_count'].unique()

array([   1,    3,    7,   24,   19,    2,    4,   10,   61,    5,   37,
         15,   26,    8,    6,  186,   21,    9,  897,   11,   16,   18,
        148,   20,   12,   45,   27,   43,   57,   56,   13,   17,   71,
         22,   32,   23,  279,  330,   14,   54,  107,   44,   36,  269,
         78,   34,  387,  366,   46, 1453,  392,   58,   51,   48,   31,
         29,   85,  662,   65,  351,  213,  104,  301,  171,   66,   47,
        118,  570,  465,   79,  462,   62,   42,  102,   68,  158,  491,
        300,  335,  294,  163,   50,   89,  303,  156,  736, 4531,  161,
       1151,  167,  522, 1302,   25,  110,   52,   92,   91,  123,  197,
         33,  672,  125,   69,   64,  714,  641,  270,  258,  140,  103,
        199,  130,   75,  150,  372, 2775,  100,   59,   76,  146, 7800,
        244,   81,   60,   74,   28,  849,  353,  178,  308,  326,   30,
         97])

# host_verifications

In [221]:
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

updated_df['host_verifications'].fillna("None", inplace=True)


There are 0 missing values in the "host_verifications" column.


In [222]:
updated_df['host_verifications'].unique()

array(["['phone']", "['email', 'phone']",
       "['email', 'phone', 'work_email']", "['phone', 'work_email']",
       "['email']", "['email', 'phone', 'photographer']"], dtype=object)

# host_identity_verified

In [223]:
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

updated_df['host_identity_verified'].fillna("f", inplace=True)


There are 0 missing values in the "host_identity_verified" column.


In [224]:
updated_df['host_identity_verified'].unique()

array(['t', 'f'], dtype=object)

# calculated_host_listings_count

In [225]:
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

updated_df['calculated_host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "calculated_host_listings_count" column.


In [226]:
updated_df['calculated_host_listings_count'].unique()

array([  1,   7,   8,  18,   2,   3,   4,  45,  14,  10,  11,  88,   5,
         6,  19,  13,  15,  76,  20,  22,  31,  27,   9, 131, 185,  12,
        54,  56,  96,  30,  21,  44,  24,  36,  42,  28,  32,  33,  65,
       173,  29,  55,  39,  84,  43,  51,  53,  72, 151,  37,  73,  57,
        40])

# host_name

In [227]:
updated_df[updated_df['host_name'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_has_profile_pic

In [228]:
updated_df[updated_df['host_has_profile_pic'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [229]:
updated_df['host_has_profile_pic'].unique()

array(['t', 'f'], dtype=object)

# Neighborhood

In [230]:
updated_df['neighbourhood'].unique()

array(['Nashville, Tennessee, United States', nan,
       'Brentwood, Tennessee, United States',
       'Goodlettsville, Tennessee, United States',
       'Hermitage, Tennessee, United States',
       'Nashville , Tennessee, United States',
       'Hermitage , Tennessee, United States',
       'Nashville\nHermitage, Tennessee, United States',
       'Nashville Hermitage, Tennessee, United States',
       'Pegram, Tennessee, United States', 'United States',
       'Antioch, Tennessee, United States',
       'Nashville , Tn, United States', 'Nashville, United States',
       'Greenbrier, Tennessee, United States',
       'NASHVILLE, Tennessee, United States',
       'Berry Hill, Tennessee, United States',
       'Old Hickory, Tennessee, United States',
       'Madison, Tennessee, United States',
       'Wilson County, Tennessee, United States',
       'Nashville-Davidson, Davidson County, Tennessee, United States',
       'Ashland City, Tennessee, United States'], dtype=object)

In [231]:
updated_df['neighbourhood'].isnull().sum()

2419

In [232]:
updated_df['neighbourhood_cleansed'].unique()

array(['District 6', 'District 18', 'District 25', 'District 19',
       'District 14', 'District 12', 'District 17', 'District 21',
       'District 24', 'District 7', 'District 5', 'District 23',
       'District 13', 'District 10', 'District 30', 'District 16',
       'District 15', 'District 26', 'District 33', 'District 20',
       'District 2', 'District 34', 'District 27', 'District 4',
       'District 8', 'District 29', 'District 9', 'District 11',
       'District 35', 'District 1', 'District 22', 'District 31',
       'District 3', 'District 28', 'District 32'], dtype=object)

In [233]:
updated_df['neighbourhood_cleansed'].isnull().sum()

0

### We are going to drop neighbourhood column for Nashville Dataset

In [234]:
updated_df.drop('neighbourhood', axis=1, inplace=True)

# host_response_time

In [235]:
updated_df['host_response_time'].unique()

array(['within an hour', nan, 'within a few hours', 'within a day',
       'a few days or more'], dtype=object)

# host_response_rate

In [236]:
updated_df['host_response_rate'].unique()

array(['100%', '98%', nan, '99%', '0%', '92%', '95%', '90%', '96%', '97%',
       '88%', '67%', '87%', '50%', '86%', '75%', '94%', '83%', '60%',
       '89%', '80%', '91%', '71%', '70%', '93%', '74%', '76%', '69%',
       '79%', '85%', '82%', '48%', '33%', '63%', '65%', '84%', '61%',
       '78%', '43%', '81%', '40%', '73%', '51%', '10%', '25%', '57%'],
      dtype=object)

# host_acceptance_rate

In [237]:
updated_df['host_acceptance_rate'].unique()

array(['0%', '92%', '100%', '99%', '78%', '98%', '68%', nan, '96%', '87%',
       '95%', '89%', '94%', '67%', '80%', '97%', '51%', '93%', '38%',
       '86%', '88%', '83%', '42%', '81%', '71%', '90%', '52%', '91%',
       '73%', '75%', '50%', '79%', '84%', '58%', '40%', '85%', '74%',
       '33%', '46%', '13%', '82%', '77%', '53%', '30%', '69%', '70%',
       '60%', '27%', '56%', '64%', '47%', '45%', '61%', '65%', '43%',
       '20%', '6%', '22%', '66%', '44%', '11%', '39%', '41%', '25%',
       '48%', '63%', '72%', '10%', '76%', '21%', '36%', '57%', '23%',
       '31%', '55%', '62%', '14%', '16%', '28%', '37%', '18%'],
      dtype=object)

# Adding new columns for city

Add new column and put 'Nashville' for all rows for later joining purpose



In [238]:
updated_df['city'] = 'Nashville'
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city
0,6422,12172,Michele,2009-04-03,"Nashville, TN",within an hour,100%,0%,f,1,...,1 private bath,1.0,1.0,"[""Lock on bedroom door"", ""Iron"", ""Washer \u201...",43.0,674,4.98,1,1.0,Nashville
1,39870,171184,Evelyn,2010-07-18,"Nashville, TN",within an hour,100%,92%,t,1,...,1 private bath,1.0,1.0,"[""Iron"", ""Keypad"", ""Luggage dropoff allowed"", ...",70.0,349,4.94,1,1.0,Nashville
2,3648549,931636,Debby,2011-08-06,"Nashville, TN",within an hour,100%,100%,t,1,...,1 bath,1.0,3.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",143.0,141,4.93,1,1.0,Nashville
3,72906,176117,Richard,2010-07-21,"Nashville, TN",within an hour,100%,100%,t,1,...,1 bath,2.0,2.0,"[""Iron"", ""Luggage dropoff allowed"", ""Wifi"", ""W...",100.0,677,4.89,1,1.0,Nashville
4,421290,1684051,Rick,2012-02-01,"Nashville, TN",within an hour,98%,99%,t,7,...,1 bath,1.0,1.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",261.0,979,4.93,7,1.0,Nashville


# Count number of amenities

In [239]:
import ast

# Convert the 'amenities' string to a list using ast.literal_eval
updated_df['amenities_list'] = updated_df['amenities'].apply(lambda x: ast.literal_eval(x))

# Count the number of amenities in each list and create a new column with these counts
updated_df['amenities_count'] = updated_df['amenities_list'].apply(lambda x: len(x))

updated_df.drop('amenities_list', axis=1, inplace=True)


In [240]:
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city,amenities_count
0,6422,12172,Michele,2009-04-03,"Nashville, TN",within an hour,100%,0%,f,1,...,1.0,1.0,"[""Lock on bedroom door"", ""Iron"", ""Washer \u201...",43.0,674,4.98,1,1.0,Nashville,52
1,39870,171184,Evelyn,2010-07-18,"Nashville, TN",within an hour,100%,92%,t,1,...,1.0,1.0,"[""Iron"", ""Keypad"", ""Luggage dropoff allowed"", ...",70.0,349,4.94,1,1.0,Nashville,23
2,3648549,931636,Debby,2011-08-06,"Nashville, TN",within an hour,100%,100%,t,1,...,1.0,3.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",143.0,141,4.93,1,1.0,Nashville,39
3,72906,176117,Richard,2010-07-21,"Nashville, TN",within an hour,100%,100%,t,1,...,2.0,2.0,"[""Iron"", ""Luggage dropoff allowed"", ""Wifi"", ""W...",100.0,677,4.89,1,1.0,Nashville,27
4,421290,1684051,Rick,2012-02-01,"Nashville, TN",within an hour,98%,99%,t,7,...,1.0,1.0,"[""Wine glasses"", ""Iron"", ""Body soap"", ""Private...",261.0,979,4.93,7,1.0,Nashville,40


In [241]:
# sanity check

see = updated_df.loc[updated_df['id'] == 39870]


amenities_text = see['amenities'].iloc[0]

# Print the extracted 'amenities' list
print(amenities_text)

["Iron", "Keypad", "Luggage dropoff allowed", "Free parking on premises", "Wifi", "Washer", "Coffee maker", "Essentials", "Refrigerator", "Self check-in", "Extra pillows and blankets", "Hair dryer", "Shampoo", "Fire extinguisher", "Hot water", "Dryer", "Smoke alarm", "Air conditioning", "Carbon monoxide alarm", "Bed linens", "Microwave", "Hangers", "Heating"]


# Final look into missing values

In [242]:
updated_df.isnull().sum()

id                                   0
host_id                              0
host_name                            0
host_since                           0
host_location                        0
host_response_time                 801
host_response_rate                 801
host_acceptance_rate               615
host_is_superhost                    0
host_listings_count                  0
host_total_listings_count            0
host_verifications                   0
host_has_profile_pic                 0
host_identity_verified               0
neighbourhood_cleansed               0
latitude                             0
longitude                            0
room_type                            0
accommodates                         0
bathrooms_text                       0
bedrooms                             0
beds                                 0
amenities                            0
price                                0
number_of_reviews                    0
review_scores_value      

In [243]:
# Save as a new csv file
updated_df.to_csv('clean-data/listings_detailed_clean_nashville.csv', index=False)