# Chicago

In [253]:
import pandas as pd
import numpy as np

In [254]:
# Load the detailed Chicago listings CSV (the path is relative to the current working directory)
listings_detailed = pd.read_csv('usa/Chicago/listings_detailed.csv')

In [255]:
listings_detailed.shape

(7747, 75)

# Delete Unnecessary Columns

In [256]:
# Columns to keep for the rest of the analysis
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location',
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Keep only the columns listed in 'columns_to_keep'.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on a slice of the raw data.
listings_detailed = listings_detailed[columns_to_keep].copy()

In [257]:
listings_detailed.shape

(7747, 29)

# Remove Entries where 'price' or 'accommodates' is zero / NA

## Pre-processing on 'price' column

In [258]:
# Remove the dollar sign and thousands separators from the price column and change the type to float.
# The pattern is a raw string: '\$' inside a normal string literal is an invalid escape sequence.
listings_detailed['price'] = listings_detailed['price'].replace(r'[\$,]', '', regex=True).astype(float)

In [259]:
# Sanity check of the conversion: show the rows whose numeric price equals 10
filtered_df = listings_detailed[listings_detailed['price'] == 10]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
975,20235190,127235673,Alex,2017-04-23,"Chicago, IL",within an hour,92%,100%,f,10,...,2,,1.5 shared baths,1.0,1.0,"[""Self check-in"", ""32\"" HDTV with HBO Max, Hul...",10.0,51,4.72,8
1029,20869579,127235673,Alex,2017-04-23,"Chicago, IL",within an hour,92%,100%,f,10,...,2,,1 shared bath,1.0,1.0,"[""Self check-in"", ""32\"" HDTV with HBO Max, Hul...",10.0,37,4.33,8
3200,45487005,6962723,Shane,2013-06-17,United States,,,,f,1,...,1,,Half-bath,,1.0,"[""Heating"", ""Air conditioning"", ""Outdoor dinin...",10.0,0,,1
5152,600622184724170928,100782278,Chicago Furnished,2016-10-22,,within a few hours,98%,97%,t,34,...,5,,1 bath,2.0,2.0,"[""Samsung stainless steel gas stove"", ""Fast wi...",10.0,1,5.0,34
7017,779953812225353399,170551048,Lyn,2018-01-29,"Chicago, IL",within an hour,100%,99%,t,23,...,2,,1 bath,,1.0,"[""Laundromat nearby"", ""Radiant heating"", ""Dedi...",10.0,1,1.0,23


In [260]:
# Count the listings whose price is missing or zero (the two conditions never overlap)
price_missing_or_zero_mask = listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)
price_zero_or_na_count = price_missing_or_zero_mask.sum()
print(price_zero_or_na_count)


2


In [261]:
# Display the listings whose price is missing or equal to 0
price_is_null = listings_detailed['price'].isnull()
price_is_zero = listings_detailed['price'] == 0
listings_detailed.loc[price_is_null | price_is_zero]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
2778,41740623,273328293,The Hoxton,2019-07-03,,within an hour,100%,100%,,2,...,2,,,,,"[""Bed sheets and pillows"", ""45\"" HDTV"", ""First...",0.0,68,4.69,1
2966,43078540,315027620,Found Hotel Chicago River North,2019-12-06,"Chicago, IL",within an hour,100%,100%,f,6,...,0,,,,,"[""Hair dryer"", ""Smoke alarm"", ""Carbon monoxide...",0.0,875,4.33,2


In [262]:
# Display the listings whose 'accommodates' is missing or equal to 0
accommodates_is_null = listings_detailed['accommodates'].isnull()
accommodates_is_zero = listings_detailed['accommodates'] == 0
listings_detailed.loc[accommodates_is_null | accommodates_is_zero]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
2966,43078540,315027620,Found Hotel Chicago River North,2019-12-06,"Chicago, IL",within an hour,100%,100%,f,6,...,0,,,,,"[""Hair dryer"", ""Smoke alarm"", ""Carbon monoxide...",0.0,875,4.33,2


## Create Function to Identify the Zero/NA values and Remove them

In [263]:
def clean_listings(df):
    """
    Removes listings where price is zero/NA or accommodates is zero/NA.

    The input DataFrame is not modified: the price conversion and the row
    removal are applied to a copy. The number of removed listings is printed
    (it is not part of the return value).

    Parameters:
    - df: A pandas DataFrame with the listings data. It must contain the
      'price' and 'accommodates' columns; 'price' may already be numeric or
      hold strings such as '$1,200.00'.

    Returns:
    - cleaned_df: A new DataFrame with the specified entries removed and
      'price' stored as float.
    """
    # Convert price from string to float (raw string: '\$' is not a valid
    # escape sequence in a normal string literal)
    price = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # Rows to remove: zero/missing price or zero/missing accommodates
    invalid = (
        price.isna()
        | (price == 0)
        | df['accommodates'].isna()
        | (df['accommodates'] == 0)
    )

    # Count the number of listings to remove
    count_removed = int(invalid.sum())
    print(f"Number of listings removed: {count_removed}")

    # Select by boolean mask rather than dropping by index label, so rows that
    # merely share an index label with an invalid row are not removed too.
    keep = ~invalid
    cleaned_df = df.loc[keep].copy()
    # .to_numpy() assigns by position and avoids index alignment
    cleaned_df['price'] = price.loc[keep].to_numpy()

    return cleaned_df


In [264]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 2


In [265]:
cleaned_listings.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
0,2384,2613,Rebecca,2008-08-29,"Chicago, IL",within an hour,100%,97%,t,1,...,1,,1 shared bath,1.0,1.0,"[""Dedicated workspace"", ""Host greets you"", ""Ba...",90.0,212,4.93,1
1,1772920,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,7,,2 baths,3.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",379.0,13,4.58,6
2,1773021,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,9,,2 baths,4.0,5.0,"[""Laundromat nearby"", ""Oven"", ""Baking sheet"", ...",479.0,26,4.85,6
3,1773025,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,9,,2 baths,4.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",479.0,21,4.76,6
4,1810118,9483312,Ryan,2013-10-17,"Chicago, IL",within an hour,100%,100%,t,2,...,3,,1 private bath,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Chil...",79.0,355,4.88,2


In [266]:
cleaned_listings.shape

(7745, 29)

# Impute 'bathrooms_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [267]:
cleaned_listings['bathrooms_text'].unique()

array(['1 shared bath', '2 baths', '1 private bath', '1 bath',
       '1.5 shared baths', '11 shared baths', '1.5 baths', '2.5 baths',
       '3 baths', '2 shared baths', '0 shared baths', nan, '3.5 baths',
       '2.5 shared baths', '3 shared baths', '4 baths',
       'Shared half-bath', '4.5 baths', '0 baths', '5 baths',
       'Private half-bath', '11.5 shared baths', '4 shared baths',
       '6.5 baths', '7 baths', '5.5 baths', '6 baths', '12.5 baths',
       'Half-bath', '10 baths', '9.5 baths', '7.5 baths', '8 baths',
       '8.5 baths'], dtype=object)

In [268]:
# Convert 'bathrooms_text' column to strings, treating NaN as empty strings.
# fillna('') must run before astype(str); otherwise NaN becomes the literal text 'nan'.
# The column is taken from cleaned_listings itself, so no index alignment with another frame is needed.
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].fillna('').astype(str)
# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [269]:
# Map the half-bath labels, which carry no number, to '0.5 baths'
half_bath_labels = ['Half-bath', 'Private half-bath', 'Shared half-bath']
half_bath_mapping = dict.fromkeys(half_bath_labels, '0.5 baths')
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace(half_bath_mapping)

# Print the distinct values to confirm the replacement
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['1 shared bath' '2 baths' '1 private bath' '1 bath' '1.5 shared baths'
 '11 shared baths' '1.5 baths' '2.5 baths' '3 baths' '2 shared baths'
 '0 shared baths' '' '3.5 baths' '2.5 shared baths' '3 shared baths'
 '4 baths' '0.5 baths' '4.5 baths' '0 baths' '5 baths' '11.5 shared baths'
 '4 shared baths' '6.5 baths' '7 baths' '5.5 baths' '6 baths' '12.5 baths'
 '10 baths' '9.5 baths' '7.5 baths' '8 baths' '8.5 baths']


### Check for missing (empty) values in bathrooms_text

In [270]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
124,4793938,24705990,Rick,2014-12-10,"Chicago, IL",within an hour,90%,100%,t,11,...,4,,,2.0,2.0,"[""Self check-in"", ""Children\u2019s books and t...",239.0,82,4.84,11
1629,28510810,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,2,,,1.0,1.0,"[""Heating - split type ductless system"", ""Pack...",178.0,5,4.6,10
1630,28510957,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,2,,,1.0,1.0,"[""Heating - split type ductless system"", ""Beek...",164.0,17,4.65,10
1631,28511244,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,3,,,1.0,1.0,"[""Heating - split type ductless system"", ""Pack...",190.0,3,5.0,10
1632,28511296,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,2,,,1.0,1.0,"[""Heating - split type ductless system"", ""Pack...",180.0,4,4.75,10
1633,28511402,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,3,,,1.0,1.0,"[""Pack \u2019n play/Travel crib - available up...",178.0,22,4.77,10
1634,28511620,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,2,,,1.0,1.0,"[""Heating - split type ductless system"", ""Pack...",202.0,10,4.8,10
1635,28511708,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,3,,,1.0,1.0,"[""Heating - split type ductless system"", ""Pack...",160.0,37,4.76,10
1636,28511827,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,2,,,1.0,1.0,"[""Pack \u2019n play/Travel crib - available up...",173.0,5,4.6,10
1637,28511937,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,2,,,1.0,1.0,"[""Pack \u2019n play/Travel crib - available up...",185.0,21,4.76,10


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms' value when it is available and non-zero
2. If 'bedrooms' is NA or 0, use half of the 'beds' value ('beds' / 2); if 'beds' is also NA or 0, default to '1 bath'

In [271]:
def fill_missing_bathrooms(df):
    """
    Fills missing or empty 'bathrooms_text' values in the DataFrame.

    For each row whose 'bathrooms_text' is NaN or '', the first rule that
    applies is used:
    1. 'bedrooms' is present and non-zero -> '<bedrooms> bath'
    2. 'beds' is present and non-zero     -> '<beds / 2> bath'
    3. otherwise                          -> '1 bath'

    Numbers are rendered with str(), so float columns give text such as
    '2.0 bath'.

    Parameters:
    - df: pd.DataFrame with 'bathrooms_text', 'bedrooms' and 'beds' columns.

    Returns:
    - The same DataFrame object, updated in place.
    """
    text = df['bathrooms_text']

    # Positional boolean masks (NumPy arrays) are used instead of a row loop
    # with df.at, so duplicated index labels cannot touch unrelated rows.
    needs_fill = (text.isna() | (text == '')).to_numpy(dtype=bool)
    bedrooms_usable = (df['bedrooms'].notna() & (df['bedrooms'] != 0)).to_numpy(dtype=bool)
    beds_usable = (df['beds'].notna() & (df['beds'] != 0)).to_numpy(dtype=bool)

    from_bedrooms = needs_fill & bedrooms_usable
    from_beds = needs_fill & ~bedrooms_usable & beds_usable
    from_default = needs_fill & ~bedrooms_usable & ~beds_usable

    # Rule 1: use the bedroom count as the bathroom count
    if from_bedrooms.any():
        df.loc[from_bedrooms, 'bathrooms_text'] = [
            f"{bedrooms} bath" for bedrooms in df.loc[from_bedrooms, 'bedrooms'].tolist()
        ]

    # Rule 2: 'bedrooms' is NaN or 0, so use half the number of beds
    if from_beds.any():
        df.loc[from_beds, 'bathrooms_text'] = [
            f"{beds / 2} bath" for beds in df.loc[from_beds, 'beds'].tolist()
        ]

    # Rule 3: neither 'bedrooms' nor 'beds' is usable, so fall back to a default
    if from_default.any():
        df.loc[from_default, 'bathrooms_text'] = '1 bath'

    return df



In [272]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [273]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
0,2384,1.0,1.0,1 shared bath
1,1772920,3.0,5.0,2 baths
2,1773021,4.0,5.0,2 baths
3,1773025,4.0,5.0,2 baths
4,1810118,1.0,1.0,1 private bath
...,...,...,...,...
7742,25879,2.0,3.0,1 bath
7743,28749,3.0,3.0,2 baths
7744,37738,1.0,1.0,1.5 shared baths
7745,71930,1.0,1.0,1 shared bath


In [274]:
# check for the previous NA rows in bathrooms text
bathroom_check_ids = [
    4793938, 28510810, 28510957, 28511244,
    28511296, 28511402, 1732652,
]
filled_cleaned_listings[filled_cleaned_listings['id'].isin(bathroom_check_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
124,4793938,24705990,Rick,2014-12-10,"Chicago, IL",within an hour,90%,100%,t,11,...,4,,2.0 bath,2.0,2.0,"[""Self check-in"", ""Children\u2019s books and t...",239.0,82,4.84,11
1629,28510810,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,2,,1.0 bath,1.0,1.0,"[""Heating - split type ductless system"", ""Pack...",178.0,5,4.6,10
1630,28510957,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,2,,1.0 bath,1.0,1.0,"[""Heating - split type ductless system"", ""Beek...",164.0,17,4.65,10
1631,28511244,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,3,,1.0 bath,1.0,1.0,"[""Heating - split type ductless system"", ""Pack...",190.0,3,5.0,10
1632,28511296,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,2,,1.0 bath,1.0,1.0,"[""Heating - split type ductless system"", ""Pack...",180.0,4,4.75,10
1633,28511402,81779,Laura,2010-02-16,"Chicago, IL",within an hour,100%,99%,t,10,...,3,,1.0 bath,1.0,1.0,"[""Pack \u2019n play/Travel crib - available up...",178.0,22,4.77,10
7734,1732652,3965428,Rob,2012-10-24,"Chicago, IL",within an hour,100%,74%,t,83,...,2,,1.0 bath,1.0,1.0,"[""Self check-in"", ""Oven"", ""First aid kit"", ""Be...",153.0,7,4.5,75


### Function to make numerical values for # of bath

In [275]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    The first number found in each text value (digits with an optional
    decimal part, e.g. '1.5 shared baths' -> 1.5) is stored as a float.
    Values containing no number become NaN.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process. It is modified in place.
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    # Raw string: '\.' is not a valid escape sequence in a normal string literal.
    # expand=False returns a Series for the single capture group, which is the
    # right shape for a single-column assignment.
    df['num_bath'] = (
        df[column_name]
        .str.extract(r'([0-9]+\.?[0-9]*)', expand=False)
        .astype(float)
    )
    return df

In [276]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

        bathrooms_text  num_bath
0        1 shared bath       1.0
1              2 baths       2.0
2              2 baths       2.0
3              2 baths       2.0
4       1 private bath       1.0
...                ...       ...
7742            1 bath       1.0
7743           2 baths       2.0
7744  1.5 shared baths       1.5
7745     1 shared bath       1.0
7746     1 shared bath       1.0

[7745 rows x 2 columns]


In [277]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
0,2384,2613,Rebecca,2008-08-29,"Chicago, IL",within an hour,100%,97%,t,1,...,,1 shared bath,1.0,1.0,"[""Dedicated workspace"", ""Host greets you"", ""Ba...",90.0,212,4.93,1,1.0
1,1772920,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,,2 baths,3.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",379.0,13,4.58,6,2.0
2,1773021,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,,2 baths,4.0,5.0,"[""Laundromat nearby"", ""Oven"", ""Baking sheet"", ...",479.0,26,4.85,6,2.0
3,1773025,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,,2 baths,4.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",479.0,21,4.76,6,2.0
4,1810118,9483312,Ryan,2013-10-17,"Chicago, IL",within an hour,100%,100%,t,2,...,,1 private bath,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Chil...",79.0,355,4.88,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7742,25879,101521,Red,2010-03-31,"Chicago, IL",within a few hours,100%,75%,f,8,...,,1 bath,2.0,3.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",86.0,51,4.35,3,1.0
7743,28749,27506,Lauri,2009-07-25,Italy,within a few hours,100%,85%,t,1,...,,2 baths,3.0,3.0,"[""Self check-in"", ""Oven"", ""Baking sheet"", ""Fir...",157.0,169,4.67,1,2.0
7744,37738,162364,Mat And Randy,2010-07-09,"Chicago, IL",,,,f,2,...,,1.5 shared baths,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",110.0,250,4.98,2,1.5
7745,71930,334241,Michael And Veronica,2011-01-03,"Chicago, IL",within an hour,90%,97%,t,2,...,,1 shared bath,1.0,1.0,"[""Laundromat nearby"", ""Dedicated workspace"", ""...",84.0,101,4.80,2,1.0


In [278]:
updated_df['num_bath'].isnull().sum()

0

In [279]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [280]:
updated_df['num_bath'].unique()

array([ 1. ,  2. ,  1.5, 11. ,  2.5,  3. ,  0. ,  3.5,  4. ,  0.5,  4.5,
        5. , 11.5,  6.5,  7. ,  5.5,  6. , 12.5, 10. ,  9.5,  7.5,  8. ,
        8.5])

In [281]:
# Delete the 'bathrooms' column.
# errors='ignore' keeps the cell re-runnable: without it, a second run raises
# KeyError because the column has already been removed.
updated_df.drop(columns='bathrooms', inplace=True, errors='ignore')

In [282]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
0,2384,2613,Rebecca,2008-08-29,"Chicago, IL",within an hour,100%,97%,t,1,...,1,1 shared bath,1.0,1.0,"[""Dedicated workspace"", ""Host greets you"", ""Ba...",90.0,212,4.93,1,1.0
1,1772920,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,7,2 baths,3.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",379.0,13,4.58,6,2.0
2,1773021,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,9,2 baths,4.0,5.0,"[""Laundromat nearby"", ""Oven"", ""Baking sheet"", ...",479.0,26,4.85,6,2.0
3,1773025,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,9,2 baths,4.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",479.0,21,4.76,6,2.0
4,1810118,9483312,Ryan,2013-10-17,"Chicago, IL",within an hour,100%,100%,t,2,...,3,1 private bath,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Chil...",79.0,355,4.88,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7742,25879,101521,Red,2010-03-31,"Chicago, IL",within a few hours,100%,75%,f,8,...,6,1 bath,2.0,3.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",86.0,51,4.35,3,1.0
7743,28749,27506,Lauri,2009-07-25,Italy,within a few hours,100%,85%,t,1,...,6,2 baths,3.0,3.0,"[""Self check-in"", ""Oven"", ""Baking sheet"", ""Fir...",157.0,169,4.67,1,2.0
7744,37738,162364,Mat And Randy,2010-07-09,"Chicago, IL",,,,f,2,...,2,1.5 shared baths,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",110.0,250,4.98,2,1.5
7745,71930,334241,Michael And Veronica,2011-01-03,"Chicago, IL",within an hour,90%,97%,t,2,...,2,1 shared bath,1.0,1.0,"[""Laundromat nearby"", ""Dedicated workspace"", ""...",84.0,101,4.80,2,1.0


# Impute Bedrooms and Beds

In [283]:
updated_df['bedrooms'].unique()

array([ 1.,  3.,  4.,  2.,  6., nan,  5.,  7.,  9., 12., 10.,  8., 11.])

In [284]:
updated_df['beds'].unique()

array([ 1.,  5.,  2.,  4.,  3., 10.,  8.,  9.,  6., 11.,  7., nan, 18.,
       17., 14., 15., 12., 13., 21., 20., 19., 24., 16.])

In [285]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
62,3485081,11848299,Anton,2014-02-01,"Chicago, IL",,,100%,f,2,...,3,1 bath,,1.0,"[""Oven"", ""Host greets you"", ""Private entrance""...",112.0,180,4.50,2,1.0
69,3666681,3965428,Rob,2012-10-24,"Chicago, IL",within an hour,100%,74%,t,83,...,2,1 bath,,1.0,"[""Self check-in"", ""Oven"", ""Carbon monoxide ala...",170.0,14,4.46,75,1.0
167,5861971,11710780,Robert,2014-01-27,"Chicago, IL",within an hour,100%,99%,t,2,...,4,1 bath,,3.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",72.0,172,4.69,2,1.0
190,6363328,33154489,Sara,2015-05-11,"Chicago, IL",within a few hours,93%,54%,f,1,...,2,1 bath,,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",85.0,205,4.82,1,1.0
217,6588681,5212119,Mark,2013-02-24,"Chicago, IL",within an hour,100%,97%,t,1,...,2,1 bath,,1.0,"[""Self check-in"", ""Children\u2019s books and t...",82.0,429,4.87,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7607,843634450900497936,144369871,Cinny,2017-08-04,,,,,f,6,...,2,1 bath,,1.0,"[""Smoke alarm"", ""Carbon monoxide alarm"", ""Kitc...",91.0,0,,2,1.0
7617,844403551944794676,252408635,Andie,2019-03-31,"Chicago, IL",within an hour,98%,99%,f,34,...,2,1 bath,,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",78.0,0,,31,1.0
7619,845594853964391358,343454544,Janay,2020-04-08,"Chicago, IL",within a day,50%,80%,f,2,...,2,1 bath,,2.0,"[""Dedicated workspace"", ""Oven"", ""Host greets y...",89.0,0,,1,1.0
7625,846400873610326525,490752710,Nell,2022-12-08,"Chicago, IL",within an hour,96%,100%,f,3,...,2,1 bath,,1.0,"[""Oven"", ""First aid kit"", ""Bed linens"", ""Condi...",90.0,0,,2,1.0


In [286]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
687,15496152,45828803,Heather + JP,2015-10-05,"Chicago, IL",within an hour,100%,99%,t,2,...,3,1 bath,,,"[""Self check-in"", ""Laundromat nearby"", ""Dedica...",112.0,269,4.90,2,1.0
712,15735536,25553049,Emily,2015-01-03,"Elmwood Park, IL",within an hour,100%,91%,t,2,...,5,1.5 baths,3.0,,"[""Self check-in"", ""Oven"", ""Baking sheet"", ""Pri...",190.0,32,4.97,2,1.5
1001,20503258,68127227,Rocio,2016-04-20,"Chicago, IL",,,,f,1,...,2,1 bath,1.0,,"[""Hair dryer"", ""Smoke alarm"", ""Carbon monoxide...",1200.0,0,,1,1.0
1220,23376509,174206662,MaTennessee,2018-02-19,"Chicago, IL",,,,f,3,...,2,1 shared bath,1.0,,"[""Children\u2019s books and toys"", ""Oven"", ""Fi...",50.0,11,4.55,3,1.0
1223,23467251,691000,Nonya,2011-06-11,"Chicago, IL",within a few hours,100%,96%,f,11,...,4,2 shared baths,1.0,,"[""Self check-in"", ""Oven"", ""Baking sheet"", ""Fir...",25.0,32,4.91,11,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7470,832728479157812136,25662899,Missy,2015-01-05,"Chicago, IL",,,,f,3,...,1,1 bath,1.0,,"[""Self check-in"", ""Dedicated workspace"", ""Firs...",72.0,0,,1,1.0
7491,834183947687691462,8244394,Lina,2013-08-18,"Chicago, IL",within an hour,100%,100%,f,1,...,3,1 bath,1.0,,"[""Self check-in"", ""Dedicated workspace"", ""Blen...",67.0,0,,1,1.0
7498,835066647677257371,85647159,Masato Or Akiko,2016-07-24,"Chicago, IL",within an hour,100%,100%,f,5,...,3,1 private bath,,,"[""Self check-in"", ""Children\u2019s books and t...",50.0,1,5.00,5,1.0
7520,837651273534264559,435105119,Sonder (Chicago),2021-12-07,"Chicago, IL",within an hour,96%,99%,f,31,...,4,1 bath,1.0,,"[""Self check-in"", ""Fast wifi \u2013 56 Mbps"", ...",277.0,0,,31,1.0


In [287]:
def update_bedrooms_and_beds(df):
    """
    Imputes missing 'bedrooms' and 'beds' values in place.

    - A missing 'bedrooms' value becomes 'num_bath' rounded up to the next integer.
    - A missing 'beds' value becomes the (possibly just imputed) 'bedrooms' value.

    Returns the same DataFrame that was passed in.
    """
    # Bedrooms first: rows without a bedroom count take the ceiling of the bathroom count
    bedrooms_missing = df['bedrooms'].isna()
    bath_count_rounded_up = np.ceil(df['num_bath'])
    df.loc[bedrooms_missing, 'bedrooms'] = bath_count_rounded_up

    # Beds second, so the values filled above are available as the source
    beds_missing = df['beds'].isna()
    df.loc[beds_missing, 'beds'] = df['bedrooms']

    return df


In [288]:
updated_df = update_bedrooms_and_beds(updated_df)


In [289]:
# check for the previous NA rows in bedrooms
bedroom_check_ids = [
    3485081, 3666681, 5861971, 6363328, 6588681,
    843634450900497936, 844403551944794676,
]
updated_df[updated_df['id'].isin(bedroom_check_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
62,3485081,11848299,Anton,2014-02-01,"Chicago, IL",,,100%,f,2,...,3,1 bath,1.0,1.0,"[""Oven"", ""Host greets you"", ""Private entrance""...",112.0,180,4.5,2,1.0
69,3666681,3965428,Rob,2012-10-24,"Chicago, IL",within an hour,100%,74%,t,83,...,2,1 bath,1.0,1.0,"[""Self check-in"", ""Oven"", ""Carbon monoxide ala...",170.0,14,4.46,75,1.0
167,5861971,11710780,Robert,2014-01-27,"Chicago, IL",within an hour,100%,99%,t,2,...,4,1 bath,1.0,3.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",72.0,172,4.69,2,1.0
190,6363328,33154489,Sara,2015-05-11,"Chicago, IL",within a few hours,93%,54%,f,1,...,2,1 bath,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",85.0,205,4.82,1,1.0
217,6588681,5212119,Mark,2013-02-24,"Chicago, IL",within an hour,100%,97%,t,1,...,2,1 bath,1.0,1.0,"[""Self check-in"", ""Children\u2019s books and t...",82.0,429,4.87,1,1.0
7607,843634450900497936,144369871,Cinny,2017-08-04,,,,,f,6,...,2,1 bath,1.0,1.0,"[""Smoke alarm"", ""Carbon monoxide alarm"", ""Kitc...",91.0,0,,2,1.0
7617,844403551944794676,252408635,Andie,2019-03-31,"Chicago, IL",within an hour,98%,99%,f,34,...,2,1 bath,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",78.0,0,,31,1.0


In [290]:
updated_df['beds'].unique()

array([ 1.,  5.,  2.,  4.,  3., 10.,  8.,  9.,  6., 11.,  7., 18., 17.,
       14., 15., 12., 13., 21., 20., 19., 24., 16.])

In [291]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [292]:
updated_df['bedrooms'].unique()

array([ 1.,  3.,  4.,  2.,  6.,  5.,  7.,  9., 12., 10.,  8.,  0., 11.])

In [293]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_since

In [294]:
# check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date.
# The fill value is formatted as YYYY-MM-DD, matching the existing values in the
# column (a "%B %Y" value such as "March 2023" would mix two formats).
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%Y-%m-%d")
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
updated_df['host_since'] = updated_df['host_since'].fillna(most_recent_date)


There are 0 missing values in the "host_since" column.


In [295]:
updated_df['host_since'].isnull().sum()

0

In [296]:
updated_df['host_since'].unique()

array(['2008-08-29', '2013-10-08', '2013-10-17', ..., '2009-10-18',
       '2009-07-25', '2011-01-03'], dtype=object)

# host_location

In [297]:
# check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
updated_df['host_location'] = updated_df['host_location'].fillna("unknown")

There are 1391 missing values in the "host_location" column.


In [298]:
updated_df['host_location'].isnull().sum()

0

In [299]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

1391

In [300]:
updated_df['host_location'].unique()

array(['Chicago, IL', 'Northfield, IL', 'Boulder, CO', 'Boise, ID',
       'United States', 'Illinois, United States', 'Long Beach, CA',
       'Honolulu, HI', 'Lisle, IL', 'Connecticut, United States',
       'Munster, IN', 'unknown', 'Madrid, Spain', 'Vancouver, Canada',
       'Rockville, MD', 'Evanston, Illinois', 'Barrington, IL',
       'Evergreen Park, IL', 'Oak Park, IL', 'Northbrook, IL',
       'Aurora, IL', 'Austin, TX', 'Tucson, AZ', 'Jacksonville, FL',
       'Anchorage, AK', 'Walnut, CA', 'Los Angeles, CA', 'Charlotte, NC',
       'Naperville, IL', 'Las Vegas, NV', 'Birmingham, MI',
       'Saint Ignatius, MT', 'Pinehurst, NC', 'Phoenix, AZ',
       'Atlanta, GA', 'Downers Grove, IL', 'Linda, CA',
       'Elmwood Park, IL', 'Kalamazoo, MI', 'Carolina, Puerto Rico',
       'Naples, FL', 'Paris, France', 'Salt Lake City, UT',
       'Schaumburg, IL', 'Glen Ellyn, IL', 'Montreal, Canada',
       'Denver, CO', 'Andover, MA', 'Whitewater, WI', 'San Antonio, TX',
       'Tampa,

In [301]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_is_superhost

In [302]:
# check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

# Treat a missing superhost flag as "f".
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
updated_df['host_is_superhost'] = updated_df['host_is_superhost'].fillna("f")

There are 0 missing values in the "host_is_superhost" column.


In [303]:
updated_df['host_is_superhost'].isnull().sum()

0

In [304]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

4896

In [305]:
updated_df['host_is_superhost'].unique()

array(['t', 'f'], dtype=object)

# host_listings_count

In [306]:
# check the number of missing values
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

# Fill a missing listing count with 1.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
updated_df['host_listings_count'] = updated_df['host_listings_count'].fillna(1)

There are 0 missing values in the "host_listings_count" column.


In [307]:
updated_df['host_listings_count'].isnull().sum()

0

In [308]:
# List the distinct values present in host_listings_count
pd.unique(updated_df['host_listings_count'])

array([   1,    6,    2,    8,   83,    3,   17,    5,   41,    4,   10,
          7,   14,   11,   13,    9,   23,   18,   12,   32,  120,   21,
         24,   28,   15,   63,   16,   51,   25,   35,   48,  158,   34,
          0,   22, 4807,   27,   66,  161,   20,   31,  138,  246,   19,
         53,   64, 1328,   62, 2621,   99,   43,   40,   30,   26,   73,
        156,   46,  449,   68,  285,  171])

# host_total_listings_count

In [309]:
# Report how many values are missing before imputing
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

# Fill a missing total listing count with 1.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that form is chained
# assignment, which warns on recent pandas and does not update the
# DataFrame at all once Copy-on-Write is enabled.
updated_df['host_total_listings_count'] = updated_df['host_total_listings_count'].fillna(1)


There are 0 missing values in the "host_total_listings_count" column.


In [310]:
# Confirm that no missing values remain in host_total_listings_count
updated_df['host_total_listings_count'].isna().sum()

0

In [311]:
# List the distinct values present in host_total_listings_count
pd.unique(updated_df['host_total_listings_count'])

array([   1,    6,    2,    3,   12,  224,    5,    4,   11,   31,   42,
          8,   17,    7,   19,   21,   10,   24,   14,   18,   82,   16,
         38,    9,   15,   41,   72,   22,  171,   13,   66,   25,   52,
         44,   99,  139,   33,  186,   37,   23,   26,   29,   54,   96,
         68,  184,    0, 5358,   70,  182,   35,   34,   28,  176,  294,
        104,   60,   67,   55, 2034,   43,   53,   69, 8342,  123,   46,
         20,   64,  118,  327,   56,   36,  491,   47,   86,  564,  423,
         97,   78])

# host_verifications

In [312]:
# Report how many values are missing before imputing
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

# Fill missing verification info with the placeholder string "None".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that form is chained
# assignment, which warns on recent pandas and does not update the
# DataFrame at all once Copy-on-Write is enabled.
# NOTE(review): existing values look like stringified lists
# (e.g. "['email', 'phone']"); confirm that downstream code tolerates the
# plain "None" placeholder.
updated_df['host_verifications'] = updated_df['host_verifications'].fillna("None")


There are 0 missing values in the "host_verifications" column.


In [313]:
# List the distinct values present in host_verifications
pd.unique(updated_df['host_verifications'])

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone', 'work_email']", "['phone']", "['email']"], dtype=object)

# host_identity_verified

In [314]:
# Report how many values are missing before imputing
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

# Treat a missing identity-verification flag as "f" (not verified).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that form is chained
# assignment, which warns on recent pandas and does not update the
# DataFrame at all once Copy-on-Write is enabled.
updated_df['host_identity_verified'] = updated_df['host_identity_verified'].fillna("f")


There are 0 missing values in the "host_identity_verified" column.


In [315]:
# List the distinct values present in host_identity_verified
pd.unique(updated_df['host_identity_verified'])

array(['t', 'f'], dtype=object)

# calculated_host_listings_count

In [316]:
# Report how many values are missing before imputing
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

# Fill a missing calculated listing count with 1.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that form is chained
# assignment, which warns on recent pandas and does not update the
# DataFrame at all once Copy-on-Write is enabled.
updated_df['calculated_host_listings_count'] = updated_df['calculated_host_listings_count'].fillna(1)

There are 0 missing values in the "calculated_host_listings_count" column.


In [317]:
# List the distinct values present in calculated_host_listings_count
pd.unique(updated_df['calculated_host_listings_count'])

array([  1,   6,   2,   8,  75,   3,  15,   5,  21,   4,  10,  14,  11,
        13,   9,  22,  18,  12,   7,  32,  24,  63,  17,  16,  27,  23,
        34,  19, 658,  39,  31,  25,  38,  30,  64,  33,  47,  40])

# host_name

In [318]:
# Show any rows where host_name is missing
updated_df.loc[updated_df['host_name'].isna()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_has_profile_pic

In [319]:
# Show any rows where host_has_profile_pic is missing
updated_df.loc[updated_df['host_has_profile_pic'].isna()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [320]:
# List the distinct values present in host_has_profile_pic
pd.unique(updated_df['host_has_profile_pic'])

array(['t', 'f'], dtype=object)

# Neighborhood

In [321]:
# List the distinct free-text values present in neighbourhood
pd.unique(updated_df['neighbourhood'])

array(['Chicago, Illinois, United States', nan,
       'Oak Park, Illinois, United States',
       'Chicago Heights, Illinois, United States',
       'Chicago , Illinois, United States',
       'Bucktown, Chicago, Illinois, United States',
       'Oak Lawn, Illinois, United States', 'Chicago, United States',
       'Rogers Park, Il, United States',
       'Santa Barbara, California, United States',
       'Chicago, Il, United States', 'Norridge, Illinois, United States',
       'Harwood Heights, Illinois, United States',
       '芝加哥, Illinois, United States',
       'Evergreen Park, Illinois, United States'], dtype=object)

In [322]:
# Count how many rows have no neighbourhood value
updated_df['neighbourhood'].isna().sum()

2296

In [323]:
# List the distinct values present in neighbourhood_cleansed
pd.unique(updated_df['neighbourhood_cleansed'])

array(['Hyde Park', 'Lake View', 'Woodlawn', 'Lincoln Park', 'Avondale',
       'Near South Side', 'Logan Square', 'West Town', 'Humboldt Park',
       'Lower West Side', 'Kenwood', 'North Center', 'Portage Park',
       'Near West Side', 'Lincoln Square', 'Washington Park',
       'South Lawndale', 'Loop', 'Morgan Park', 'Dunning',
       'Near North Side', 'West Lawn', 'South Shore', 'Rogers Park',
       'Edgewater', 'East Garfield Park', 'Irving Park', 'West Ridge',
       'Grand Boulevard', 'Bridgeport', 'Uptown', 'Armour Square',
       'Albany Park', 'South Deering', 'Pullman', 'West Garfield Park',
       'Hermosa', 'Mckinley Park', 'Douglas', 'Hegewisch', 'West Elsdon',
       'Jefferson Park', 'Ashburn', 'Greater Grand Crossing',
       'North Lawndale', 'Norwood Park', 'Garfield Ridge', 'Austin',
       'North Park', 'Belmont Cragin', 'Oakland', 'Archer Heights',
       'Edison Park', 'Brighton Park', 'Englewood', 'Beverly',
       'Chicago Lawn', 'South Chicago', 'West Engl

In [324]:
# Count how many rows have no neighbourhood_cleansed value
updated_df['neighbourhood_cleansed'].isna().sum()

0

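### We drop the `neighbourhood` column for the Chicago dataset: it has 2,296 missing values and inconsistent free-text entries, while `neighbourhood_cleansed` has none missing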

In [325]:
# Remove the 'neighbourhood' column from the working frame.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: executing it a second time
# no longer raises KeyError because the column is already gone.
updated_df = updated_df.drop(columns='neighbourhood', errors='ignore')

# host_response_time

In [328]:
# List the distinct values present in host_response_time (NaN included)
pd.unique(updated_df['host_response_time'])

array(['within an hour', nan, 'within a day', 'a few days or more',
       'within a few hours'], dtype=object)

# host_response_rate

In [329]:
# List the distinct values present in host_response_rate (NaN included)
pd.unique(updated_df['host_response_rate'])

array(['100%', nan, '0%', '94%', '92%', '67%', '50%', '80%', '90%', '98%',
       '13%', '91%', '93%', '20%', '70%', '96%', '88%', '83%', '87%',
       '97%', '33%', '40%', '75%', '95%', '60%', '99%', '84%', '69%',
       '22%', '86%', '89%', '63%', '73%', '79%', '64%', '85%', '25%',
       '71%', '78%', '72%', '17%', '30%', '58%', '82%'], dtype=object)

# host_acceptance_rate

In [331]:
# List the distinct values present in host_acceptance_rate (NaN included)
pd.unique(updated_df['host_acceptance_rate'])

array(['97%', '100%', '18%', '25%', '74%', '89%', '71%', '99%', '85%',
       '92%', nan, '57%', '91%', '94%', '69%', '50%', '96%', '43%', '78%',
       '80%', '95%', '0%', '87%', '81%', '75%', '90%', '20%', '98%',
       '86%', '66%', '73%', '67%', '33%', '76%', '88%', '93%', '82%',
       '46%', '54%', '58%', '55%', '64%', '39%', '52%', '13%', '53%',
       '68%', '83%', '51%', '44%', '60%', '84%', '40%', '23%', '30%',
       '63%', '31%', '17%', '41%', '70%', '42%', '48%', '72%', '36%',
       '56%', '11%', '62%', '79%', '77%', '29%', '61%', '1%', '65%',
       '45%', '21%', '15%'], dtype=object)

# review_scores_value

In [333]:
# List the distinct values present in review_scores_value (NaN included)
pd.unique(updated_df['review_scores_value'])

array([4.93, 4.58, 4.85, 4.76, 4.88, 4.59, 4.61, 3.  , 4.62, 4.7 , 4.67,
       4.84, 4.56, 4.72, 4.48, 4.6 , 4.57, 4.81, 4.79, 4.97, 4.8 , 4.  ,
       4.77,  nan, 4.89, 4.73, 4.63, 4.68, 4.78, 4.75, 4.82, 4.66, 4.5 ,
       4.87, 4.9 , 4.83, 4.91, 4.51, 4.71, 4.17, 5.  , 3.8 , 4.46, 4.86,
       4.52, 4.96, 4.94, 4.45, 4.74, 4.64, 4.69, 4.55, 4.92, 4.98, 4.65,
       4.36, 4.4 , 3.75, 4.95, 4.29, 4.54, 4.42, 4.44, 4.53, 4.43, 4.33,
       3.67, 4.1 , 4.39, 4.3 , 4.34, 3.5 , 4.2 , 4.31, 4.38, 4.21, 4.47,
       3.25, 3.78, 4.32, 3.89, 2.5 , 4.14, 3.33, 4.49, 1.  , 4.25, 4.18,
       3.96, 3.6 , 4.99, 4.35, 4.41, 3.57, 3.83, 4.22, 4.28, 2.  , 4.06,
       4.27, 4.16, 4.23, 4.26, 4.24, 3.7 , 3.93, 3.88, 3.2 , 3.77, 3.73,
       4.08, 3.95, 3.71, 4.13, 4.02, 4.19, 3.4 , 4.37, 3.84, 4.15, 3.82,
       4.09, 4.05, 3.79, 4.11, 3.22, 3.56, 3.43, 3.3 , 3.27, 3.13, 3.17,
       2.67, 3.91])

# Adding a new column for city

Add a new `city` column and set it to 'Chicago' for all rows, for later joining purposes.



In [335]:
# Tag every row with the constant city name 'Chicago'
updated_df.loc[:, 'city'] = 'Chicago'


In [336]:
# Display the frame to check the newly added 'city' column
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city
0,2384,2613,Rebecca,2008-08-29,"Chicago, IL",within an hour,100%,97%,t,1,...,1 shared bath,1.0,1.0,"[""Dedicated workspace"", ""Host greets you"", ""Ba...",90.0,212,4.93,1,1.0,Chicago
1,1772920,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,2 baths,3.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",379.0,13,4.58,6,2.0,Chicago
2,1773021,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,2 baths,4.0,5.0,"[""Laundromat nearby"", ""Oven"", ""Baking sheet"", ...",479.0,26,4.85,6,2.0,Chicago
3,1773025,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,2 baths,4.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",479.0,21,4.76,6,2.0,Chicago
4,1810118,9483312,Ryan,2013-10-17,"Chicago, IL",within an hour,100%,100%,t,2,...,1 private bath,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Chil...",79.0,355,4.88,2,1.0,Chicago
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7742,25879,101521,Red,2010-03-31,"Chicago, IL",within a few hours,100%,75%,f,8,...,1 bath,2.0,3.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",86.0,51,4.35,3,1.0,Chicago
7743,28749,27506,Lauri,2009-07-25,Italy,within a few hours,100%,85%,t,1,...,2 baths,3.0,3.0,"[""Self check-in"", ""Oven"", ""Baking sheet"", ""Fir...",157.0,169,4.67,1,2.0,Chicago
7744,37738,162364,Mat And Randy,2010-07-09,"Chicago, IL",,,,f,2,...,1.5 shared baths,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",110.0,250,4.98,2,1.5,Chicago
7745,71930,334241,Michael And Veronica,2011-01-03,"Chicago, IL",within an hour,90%,97%,t,2,...,1 shared bath,1.0,1.0,"[""Laundromat nearby"", ""Dedicated workspace"", ""...",84.0,101,4.80,2,1.0,Chicago


# Count number of amenities

In [339]:
import ast

# Each 'amenities' value is a string containing a list literal. Parse it with
# ast.literal_eval and store the number of entries in 'amenities_count'.
# The parsed lists are used only for counting and are not kept as a column.
updated_df['amenities_count'] = (
    updated_df['amenities']
    .map(ast.literal_eval)
    .map(len)
)


In [340]:
# Display the frame to check the newly added 'amenities_count' column
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city,amenities_count
0,2384,2613,Rebecca,2008-08-29,"Chicago, IL",within an hour,100%,97%,t,1,...,1.0,1.0,"[""Dedicated workspace"", ""Host greets you"", ""Ba...",90.0,212,4.93,1,1.0,Chicago,48
1,1772920,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,3.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",379.0,13,4.58,6,2.0,Chicago,38
2,1773021,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,4.0,5.0,"[""Laundromat nearby"", ""Oven"", ""Baking sheet"", ...",479.0,26,4.85,6,2.0,Chicago,34
3,1773025,9297431,Inn At Wrigleyville,2013-10-08,"Chicago, IL",within an hour,100%,100%,t,6,...,4.0,5.0,"[""Oven"", ""Baking sheet"", ""Host greets you"", ""F...",479.0,21,4.76,6,2.0,Chicago,34
4,1810118,9483312,Ryan,2013-10-17,"Chicago, IL",within an hour,100%,100%,t,2,...,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Chil...",79.0,355,4.88,2,1.0,Chicago,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7742,25879,101521,Red,2010-03-31,"Chicago, IL",within a few hours,100%,75%,f,8,...,2.0,3.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",86.0,51,4.35,3,1.0,Chicago,40
7743,28749,27506,Lauri,2009-07-25,Italy,within a few hours,100%,85%,t,1,...,3.0,3.0,"[""Self check-in"", ""Oven"", ""Baking sheet"", ""Fir...",157.0,169,4.67,1,2.0,Chicago,44
7744,37738,162364,Mat And Randy,2010-07-09,"Chicago, IL",,,,f,2,...,1.0,1.0,"[""Self check-in"", ""Dedicated workspace"", ""Oven...",110.0,250,4.98,2,1.5,Chicago,40
7745,71930,334241,Michael And Veronica,2011-01-03,"Chicago, IL",within an hour,90%,97%,t,2,...,1.0,1.0,"[""Laundromat nearby"", ""Dedicated workspace"", ""...",84.0,101,4.80,2,1.0,Chicago,43


# Final look at missing values

In [341]:
# Per-column count of the missing values that remain after cleaning
updated_df.isna().sum()

id                                   0
host_id                              0
host_name                            0
host_since                           0
host_location                        0
host_response_time                1000
host_response_rate                1000
host_acceptance_rate               759
host_is_superhost                    0
host_listings_count                  0
host_total_listings_count            0
host_verifications                   0
host_has_profile_pic                 0
host_identity_verified               0
neighbourhood_cleansed               0
latitude                             0
longitude                            0
room_type                            0
accommodates                         0
bathrooms_text                       0
bedrooms                             0
beds                                 0
amenities                            0
price                                0
number_of_reviews                    0
review_scores_value      

In [342]:
import os

# Save the cleaned frame as a new CSV file, without the index column
output_path = 'clean-data/listings_detailed_clean_chicago.csv'
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
updated_df.to_csv(output_path, index=False)