# Los Angeles

In [1]:
import pandas as pd
import numpy as np

In [2]:
listings_detailed = pd.read_csv('usa/Los Angeles/listings_detailed.csv')

In [3]:
listings_detailed.shape

(42451, 75)

# Delete Unnecessary Columns

In [4]:
# Columns you want to keep
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location', 
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Dropping the columns that are not in 'columns_to_keep'
listings_detailed = listings_detailed[columns_to_keep]

In [5]:
listings_detailed.shape

(42451, 29)

In [6]:
# Save as a new csv file
listings_detailed.to_csv('listings_detailed_clean_la.csv', index=False)

# Remove Entries where 'price' and 'accomodates' is zero / NA

## Pre-processing on 'price' column

In [7]:
# Remove the dollar sign from the price column and change the type to float
listings_detailed['price'] = listings_detailed['price'].replace('[\$,]', '', regex=True).astype(float)

In [8]:
# Test: Correct filtering expression
filtered_df = listings_detailed[listings_detailed['price'] == 10]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
3450,8954605,46218841,Spyro,2015-10-10,"Glendale, CA",,,,f,1.0,...,2,,3 baths,4.0,4.0,"[""Ethernet connection"", ""Smoke alarm"", ""Washer...",10.0,4,5.0,1
7121,38491365,177333046,Carmine,2018-03-08,"Long Beach, CA",,,,f,3.0,...,1,,1.5 shared baths,1.0,2.0,"[""Smoke alarm"", ""Washer"", ""Shampoo"", ""Kitchen""...",10.0,5,5.0,3
10565,9476994,49119691,Roberta,2015-11-15,"Pasadena, CA",within a day,50%,76%,f,2.0,...,2,,1.5 baths,1.0,1.0,"[""Ethernet connection"", ""Smoke alarm"", ""Washer...",10.0,61,4.93,2
25253,48400268,14105430,Marcel,2014-04-10,"Los Angeles, CA",,,,f,1.0,...,2,,1 shared bath,1.0,1.0,[],10.0,1,5.0,1
26117,50380922,213959518,Blanca,2018-09-06,"Los Angeles, CA",within an hour,99%,100%,f,16.0,...,2,,1 shared bath,1.0,1.0,"[""Smoke alarm"", ""Shampoo"", ""55\"" HDTV with Net...",10.0,7,4.57,16
28482,50177386,33772749,Will,2015-05-19,"Long Beach, CA",within an hour,100%,75%,t,10.0,...,2,,1 shared bath,1.0,1.0,"[""Smoke alarm"", ""Washer"", ""Shampoo"", ""Kitchen""...",10.0,3,5.0,10
29223,50999233,136613504,Laila,2017-06-23,"Los Angeles, CA",,,,f,1.0,...,2,,1 bath,1.0,1.0,"[""Hot tub"", ""Wifi"", ""Outdoor dining area"", ""Ca...",10.0,0,,1
29479,580384048712284299,64081146,Wendy,2016-03-22,"Long Beach, CA",within a few hours,100%,50%,f,1.0,...,2,,1 private bath,1.0,1.0,"[""Valley view"", ""Central heating"", ""Smoke alar...",10.0,0,,1
38987,23132645,72573217,Allison,2016-05-17,"Los Angeles, CA",,,,f,1.0,...,1,,1 bath,,1.0,"[""Wifi"", ""Gym"", ""Smoke alarm"", ""Washer"", ""Heat...",10.0,0,,1
42423,800992081953714938,4691557,Cary,2013-01-13,"Los Angeles, CA",within an hour,100%,67%,f,2.0,...,3,,1 bath,1.0,2.0,"[""Fast wifi \u2013 417 Mbps"", ""Central heating...",10.0,0,,2


In [9]:
price_zero_or_na_count = listings_detailed['price'].isnull().sum() + (listings_detailed['price'] == 0).sum()
print(price_zero_or_na_count)


14


In [10]:
# Filter listings where price is either null or equals 0
listings_detailed.loc[listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
1574,42829972,299524070,Glendale Express Hotel,2019-10-02,"Glendale, CA",,,,,2.0,...,0,,,,,"[""Smoke alarm"", ""Laundry services"", ""Complimen...",0.0,16,3.5,1
3233,43247438,309273784,LA Best Inn,2019-11-13,"Los Angeles, CA",,,,,4.0,...,0,,,,,"[""Safe"", ""Bed sheets and pillows"", ""Carbon mon...",0.0,1,1.0,1
8128,48535258,391634388,Anuja,2021-03-08,,,,,,2.0,...,0,,,,,[],0.0,0,,1
10049,42384662,296569793,Viceroy L’Ermitage,2019-09-20,,,,100%,,9.0,...,2,,,,,"[""Mini fridge"", ""Baby bath"", ""Smoke alarm"", ""N...",0.0,10,4.2,1
12644,42229033,311439116,Nite Inn @,2019-11-22,,within an hour,100%,100%,,6.0,...,8,,,,,"[""Limited housekeeping \u2014 on request"", ""Mi...",0.0,45,4.2,1
14610,42829631,98347870,Beverly Hills Plaza Hotel & Spa,2016-10-06,,,,,,2.0,...,0,,,,,"[""Onsite restaurant \u2014 Le Petit Caf\u00e9 ...",0.0,0,,1
25927,46396590,375439118,Shore Hotel,2020-11-12,,within an hour,100%,35%,,7.0,...,0,,,,,"[""Limited housekeeping \u2014 on request"", ""Sm...",0.0,3,4.33,1
26189,43096355,306014339,The Prospect Hollywood,2019-11-01,"Los Angeles, CA",within an hour,100%,100%,,2.0,...,0,,,,,"[""Smoke alarm"", ""Complimentary self parking"", ...",0.0,61,4.75,1
27366,40561078,209561894,Gold Diggers,2018-08-14,"Los Angeles, CA",within an hour,100%,98%,,13.0,...,2,,,,,"[""Sonos sound system"", ""Mini fridge"", ""Smoke a...",0.0,111,4.7,2
28165,43247459,323989168,Lexen Hotel North Hollywood,2020-01-03,,,,,,3.0,...,0,,,,,"[""42\"" HDTV"", ""Safe"", ""Bed sheets and pillows""...",0.0,0,,1


In [11]:
# Filter listings where accommodates is either null or equals 0
listings_detailed.loc[listings_detailed['accommodates'].isnull() | (listings_detailed['accommodates'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
1574,42829972,299524070,Glendale Express Hotel,2019-10-02,"Glendale, CA",,,,,2.0,...,0,,,,,"[""Smoke alarm"", ""Laundry services"", ""Complimen...",0.0,16,3.5,1
3233,43247438,309273784,LA Best Inn,2019-11-13,"Los Angeles, CA",,,,,4.0,...,0,,,,,"[""Safe"", ""Bed sheets and pillows"", ""Carbon mon...",0.0,1,1.0,1
8128,48535258,391634388,Anuja,2021-03-08,,,,,,2.0,...,0,,,,,[],0.0,0,,1
14610,42829631,98347870,Beverly Hills Plaza Hotel & Spa,2016-10-06,,,,,,2.0,...,0,,,,,"[""Onsite restaurant \u2014 Le Petit Caf\u00e9 ...",0.0,0,,1
25927,46396590,375439118,Shore Hotel,2020-11-12,,within an hour,100%,35%,,7.0,...,0,,,,,"[""Limited housekeeping \u2014 on request"", ""Sm...",0.0,3,4.33,1
26189,43096355,306014339,The Prospect Hollywood,2019-11-01,"Los Angeles, CA",within an hour,100%,100%,,2.0,...,0,,,,,"[""Smoke alarm"", ""Complimentary self parking"", ...",0.0,61,4.75,1
28165,43247459,323989168,Lexen Hotel North Hollywood,2020-01-03,,,,,,3.0,...,0,,,,,"[""42\"" HDTV"", ""Safe"", ""Bed sheets and pillows""...",0.0,0,,1
39462,46393222,375341994,Daniel,2020-11-11,,within an hour,100%,100%,,5.0,...,0,,,,,"[""Limited housekeeping \u2014 on request"", ""Mi...",0.0,13,4.85,1


## Create Function to Identify the Zero/NA values and Remove them

In [12]:
def clean_listings(df):
    """
    Removes listings from the dataframe where price is zero/NA or accommodates is zero/NA.
    Counts the number of listings removed based on these criteria.
    
    Parameters:
    - df: A pandas DataFrame with the listings data.
    
    Returns:
    - cleaned_df: The cleaned DataFrame with the specified entries removed.
    - count_removed: The number of listings removed based on the criteria.
    """
    
    # Convert price from string to float
    df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
    
    # Find entries to remove based on conditions
    to_remove = df[(df['price'] == 0) | (df['price'].isna()) | (df['accommodates'] == 0) | (df['accommodates'].isna())]
    
    # Count the number of listings to remove
    count_removed = len(to_remove)

    print(f"Number of listings removed: {count_removed}")
    
    # Remove the entries from the DataFrame
    cleaned_df = df.drop(to_remove.index)
    
    return cleaned_df


In [13]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 14


In [14]:
cleaned_listings.shape

(42437, 29)

# Count 'amenities'

In [15]:
import ast
def count_amenities(amenities_str):
    try:
        # Convert the string representation of the list back into a list
        amenities_list = ast.literal_eval(amenities_str)
        # Return the count of items in the list
        return len(amenities_list)
    except (ValueError, SyntaxError):
        # In case of any error during conversion, return 0 (or you may choose to return NaN)
        return 0

# Apply the function to each row in the 'amenities' column and create a new column 'amenities_count'
cleaned_listings['amenities_count'] = cleaned_listings['amenities'].apply(count_amenities)

In [16]:
cleaned_listings['amenities_count']

0        10
1        15
2        19
3        58
4        56
         ..
42446    73
42447    33
42448    19
42449    41
42450    35
Name: amenities_count, Length: 42437, dtype: int64

# Add 'city' column

In [17]:
# Add in a city column to each dataframe so that we can use this as part of the primary key/use it to conduct groupby for
# future EDA or additional analysis as final table will contain all of our listings data
cleaned_listings['city'] = 'Los Angeles'

# Impute 'bathroom_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [18]:
cleaned_listings['bathrooms_text'].unique()

array(['3.5 baths', '1 shared bath', '1 bath', '2 baths', 'Half-bath',
       '2.5 baths', '3 baths', '1 private bath', '1.5 shared baths',
       '1.5 baths', '5 baths', '0 baths', '6 baths', '2 shared baths',
       '4 baths', '7.5 baths', '5.5 baths', '4.5 baths', '0 shared baths',
       '11 baths', '2.5 shared baths', nan, '8 baths', 'Shared half-bath',
       '3 shared baths', '7 baths', '8.5 baths', '8 shared baths',
       '6.5 baths', '9 baths', '11 shared baths', '3.5 shared baths',
       '9.5 baths', '10 baths', '11.5 baths', '5 shared baths',
       '4 shared baths', '8.5 shared baths', 'Private half-bath',
       '5.5 shared baths', '21.5 baths', '13.5 baths', '4.5 shared baths',
       '12 baths', '21 baths', '20 baths', '10 shared baths',
       '12.5 baths', '10.5 baths', '13 baths', '15 baths',
       '11.5 shared baths'], dtype=object)

In [19]:
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].astype(str)
# Convert 'bathrooms_text' column to strings, treating NaN as empty strings
cleaned_listings['bathrooms_text'] = listings_detailed['bathrooms_text'].fillna('').astype(str)
cleaned_listings
# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [20]:
# Replace specific strings in 'bathrooms_text' column
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace({
    'Half-bath': '0.5 baths',
    'Private half-bath': '0.5 baths',
    'Shared half-bath': '0.5 baths'
})

# Check the unique values to verify the changes
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['3.5 baths' '1 shared bath' '1 bath' '2 baths' '0.5 baths' '2.5 baths'
 '3 baths' '1 private bath' '1.5 shared baths' '1.5 baths' '5 baths'
 '0 baths' '6 baths' '2 shared baths' '4 baths' '7.5 baths' '5.5 baths'
 '4.5 baths' '0 shared baths' '11 baths' '2.5 shared baths' '' '8 baths'
 '3 shared baths' '7 baths' '8.5 baths' '8 shared baths' '6.5 baths'
 '9 baths' '11 shared baths' '3.5 shared baths' '9.5 baths' '10 baths'
 '11.5 baths' '5 shared baths' '4 shared baths' '8.5 shared baths'
 '5.5 shared baths' '21.5 baths' '13.5 baths' '4.5 shared baths'
 '12 baths' '21 baths' '20 baths' '10 shared baths' '12.5 baths'
 '10.5 baths' '13 baths' '15 baths' '11.5 shared baths']


In [21]:
count_half_baths = (cleaned_listings['bathrooms_text'] == '0.5 baths').sum()
count_half_baths

95

### Check for the Nan for bathrooms_text

In [22]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city
359,47159462,2288402,Ug,2012-05-04,"Los Angeles, CA",,,,f,2.0,...,,1.0,1.0,"[""Wifi"", ""Carbon monoxide alarm"", ""Smoke alarm...",240.0,0,,2,12,Los Angeles
1084,39705326,271698446,Robert,2019-06-27,"Los Angeles, CA",,,,f,1.0,...,,,,"[""Hangers"", ""Essentials"", ""Pets allowed""]",40.0,1,1.0,1,3,Los Angeles
1140,1053404,5799839,Barbara,2013-04-06,"Los Angeles, CA",,,,f,1.0,...,,1.0,1.0,"[""TV with standard cable"", ""Hot tub"", ""Wifi"", ...",115.0,0,,1,12,Los Angeles
2939,52989672,428985128,Xueping,2021-10-25,,,,,f,2.0,...,,,,"[""Smoke alarm"", ""Shampoo"", ""Kitchen"", ""Hair dr...",120.0,1,1.0,2,27,Los Angeles
4142,708127514159735039,446995407,Vicky,2022-02-26,,within an hour,100%,100%,f,6.0,...,,1.0,1.0,"[""Central heating"", ""Smoke alarm"", ""Washer"", ""...",99.0,16,4.94,6,33,Los Angeles
8247,732599207114852422,446995407,Vicky,2022-02-26,,within an hour,100%,100%,f,6.0,...,,1.0,1.0,"[""Wifi"", ""Indoor fireplace"", ""Carbon monoxide ...",99.0,3,4.67,6,11,Los Angeles
9045,51382976,3097566,Frist,2012-07-30,"Los Angeles, CA",within an hour,98%,80%,f,43.0,...,,1.0,2.0,"[""Smoke alarm"", ""Washer"", ""Shampoo"", ""Hair dry...",65.0,0,,38,33,Los Angeles
10528,405296,2019725,Larisa,2012-03-27,"Los Angeles, CA",within a day,100%,50%,f,1.0,...,,1.0,1.0,"[""TV with standard cable"", ""Hot tub"", ""Wifi"", ...",89.0,32,4.74,1,10,Los Angeles
11893,45383114,69113169,Olt,2016-04-26,"Los Angeles, CA",,,,f,1.0,...,,1.0,1.0,"[""Smoke alarm"", ""Washer"", ""Shampoo"", ""Kitchen""...",145.0,0,,1,29,Los Angeles
12978,69015,341419,Sharda,2011-01-09,"Chennai, India",,,,f,1.0,...,,1.0,1.0,"[""Air conditioning"", ""Elevator""]",901.0,0,,1,2,Los Angeles


In [23]:
empty_bathrooms_text_ids = cleaned_listings[cleaned_listings['bathrooms_text'] == '']['id']
print(len(empty_bathrooms_text_ids))

27


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms'
2. if the 'bedrooms' column is also NA, fill in NAs with 1/'beds' column

In [24]:
def fill_missing_bathrooms(df):
    """
    Fills missing or empty 'bathrooms_text' values in the DataFrame.
    - Uses the 'bedrooms' value if available.
    - If 'bedrooms' is also missing, uses half the 'beds' value.
    """
    # Loop through each row in DataFrame
    for index, row in df.iterrows():
        # Check if 'bathrooms_text' is NaN or an empty string
        if pd.isnull(row['bathrooms_text']) or row['bathrooms_text'] == '':
            # Use 'bedrooms' if not NaN and not 0
            if not pd.isnull(row['bedrooms']) and row['bedrooms'] != 0:
                df.at[index, 'bathrooms_text'] = str(row['bedrooms']) + ' bath'
            # If 'bedrooms' is NaN or 0, use half the 'beds' value
            elif not pd.isnull(row['beds']) and row['beds'] != 0:
                # Calculate half the beds and format it to match the bathroom text style
                half_beds_as_baths = row['beds'] / 2
                df.at[index, 'bathrooms_text'] = str(half_beds_as_baths) + ' bath'
            # If both 'bedrooms' and 'beds' are not available, set a default value
            else:
                df.at[index, 'bathrooms_text'] = '1 bath' # Default value or consider another logic
                
    return df


In [25]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [26]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
0,777451666060243581,4.0,3.0,3.5 baths
1,698097753730921190,1.0,1.0,1 shared bath
2,13063118,1.0,1.0,1 bath
3,39337076,1.0,1.0,1 shared bath
4,13077628,3.0,4.0,2 baths
...,...,...,...,...
42446,17286704,1.0,1.0,1 shared bath
42447,751077061221390362,1.0,1.0,2.5 baths
42448,663030240787490751,1.0,1.0,1 shared bath
42449,15876540,1.0,1.0,1 private bath


In [27]:
# # check for the previous NA rows in bathrooms text
filled_cleaned_listings[filled_cleaned_listings['id'].isin(empty_bathrooms_text_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city
359,47159462,2288402,Ug,2012-05-04,"Los Angeles, CA",,,,f,2.0,...,1.0 bath,1.0,1.0,"[""Wifi"", ""Carbon monoxide alarm"", ""Smoke alarm...",240.0,0,,2,12,Los Angeles
1084,39705326,271698446,Robert,2019-06-27,"Los Angeles, CA",,,,f,1.0,...,1 bath,,,"[""Hangers"", ""Essentials"", ""Pets allowed""]",40.0,1,1.0,1,3,Los Angeles
1140,1053404,5799839,Barbara,2013-04-06,"Los Angeles, CA",,,,f,1.0,...,1.0 bath,1.0,1.0,"[""TV with standard cable"", ""Hot tub"", ""Wifi"", ...",115.0,0,,1,12,Los Angeles
2939,52989672,428985128,Xueping,2021-10-25,,,,,f,2.0,...,1 bath,,,"[""Smoke alarm"", ""Shampoo"", ""Kitchen"", ""Hair dr...",120.0,1,1.0,2,27,Los Angeles
4142,708127514159735039,446995407,Vicky,2022-02-26,,within an hour,100%,100%,f,6.0,...,1.0 bath,1.0,1.0,"[""Central heating"", ""Smoke alarm"", ""Washer"", ""...",99.0,16,4.94,6,33,Los Angeles
8247,732599207114852422,446995407,Vicky,2022-02-26,,within an hour,100%,100%,f,6.0,...,1.0 bath,1.0,1.0,"[""Wifi"", ""Indoor fireplace"", ""Carbon monoxide ...",99.0,3,4.67,6,11,Los Angeles
9045,51382976,3097566,Frist,2012-07-30,"Los Angeles, CA",within an hour,98%,80%,f,43.0,...,1.0 bath,1.0,2.0,"[""Smoke alarm"", ""Washer"", ""Shampoo"", ""Hair dry...",65.0,0,,38,33,Los Angeles
10528,405296,2019725,Larisa,2012-03-27,"Los Angeles, CA",within a day,100%,50%,f,1.0,...,1.0 bath,1.0,1.0,"[""TV with standard cable"", ""Hot tub"", ""Wifi"", ...",89.0,32,4.74,1,10,Los Angeles
11893,45383114,69113169,Olt,2016-04-26,"Los Angeles, CA",,,,f,1.0,...,1.0 bath,1.0,1.0,"[""Smoke alarm"", ""Washer"", ""Shampoo"", ""Kitchen""...",145.0,0,,1,29,Los Angeles
12978,69015,341419,Sharda,2011-01-09,"Chennai, India",,,,f,1.0,...,1.0 bath,1.0,1.0,"[""Air conditioning"", ""Elevator""]",901.0,0,,1,2,Los Angeles


Note: Looking back to the oringinal data original data, the entries with id '49229073' was deleted in the begging as it had 0 accomodates 

### Function to make numerical values for # of bath

In [28]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process.
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    # Extract numerical part from the specified column and convert to float
    df['num_bath'] = df[column_name].str.extract('([0-9]+\.?[0-9]*)').astype(float)
    return df

In [29]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

       bathrooms_text  num_bath
0           3.5 baths       3.5
1       1 shared bath       1.0
2              1 bath       1.0
3       1 shared bath       1.0
4             2 baths       2.0
...               ...       ...
42446   1 shared bath       1.0
42447       2.5 baths       2.5
42448   1 shared bath       1.0
42449  1 private bath       1.0
42450  3 shared baths       3.0

[42437 rows x 2 columns]


In [30]:
updated_df['num_bath'].isnull().sum()

0

In [31]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [32]:
updated_df['num_bath'].unique()

array([ 3.5,  1. ,  2. ,  0.5,  2.5,  3. ,  1.5,  5. ,  0. ,  6. ,  4. ,
        7.5,  5.5,  4.5, 11. ,  8. ,  7. ,  8.5,  6.5,  9. ,  9.5, 10. ,
       11.5, 21.5, 13.5, 12. , 21. , 20. , 12.5, 10.5, 13. , 15. ])

In [33]:
# delete bathrooms column
updated_df.drop('bathrooms', axis=1, inplace=True)

In [34]:
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
0,777451666060243581,490709958,Dana,2022-12-08,,within an hour,100%,0%,f,1.0,...,4.0,3.0,"[""Wifi"", ""Exercise equipment"", ""Carbon monoxid...",399.0,0,,1,10,Los Angeles,3.5
1,698097753730921190,444692513,Hanna,2022-02-12,,within an hour,100%,100%,t,4.0,...,1.0,1.0,"[""Bed linens"", ""Wifi"", ""Carbon monoxide alarm""...",46.0,5,4.4,4,15,Los Angeles,1.0
2,13063118,41736985,Beth,2015-08-17,,within an hour,100%,86%,f,15.0,...,1.0,1.0,"[""Shared hot tub"", ""Wifi"", ""Refrigerator"", ""Gy...",408.0,53,4.83,10,19,Los Angeles,1.0
3,39337076,208375458,Jae,2018-08-09,"Rancho Palos Verdes, CA",within an hour,100%,99%,t,4.0,...,1.0,1.0,"[""Smoke alarm"", ""Freezer"", ""Shampoo"", ""Park vi...",80.0,44,4.82,4,58,Los Angeles,1.0
4,13077628,4182067,Jeremy,2012-11-18,"Malibu, CA",within an hour,100%,,f,1.0,...,3.0,4.0,"[""Sonos sound system"", ""Central heating"", ""Smo...",2138.0,15,4.36,1,56,Los Angeles,2.0


# Impute Bedrooms and Beds

In [35]:
updated_df['bedrooms'].unique()

array([ 4.,  1.,  3., nan,  2.,  5.,  6.,  7.,  8., 11.,  9., 12., 13.,
       10., 24., 23., 16.])

In [36]:
updated_df['beds'].unique()

array([ 3.,  1.,  4.,  2., nan,  5.,  7.,  6.,  9., 11.,  8., 10., 12.,
       20., 19., 15., 13., 16., 24., 32., 17., 14., 21., 50., 18.])

In [37]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
10,49043337,202801114,Guido,2018-07-16,"California, United States",within an hour,100%,100%,f,2.0,...,,,"[""Smoking allowed"", ""Wifi"", ""Pets allowed"", ""K...",1000.0,0,,1,5,Los Angeles,0.5
11,2943450,114555,Annet,2010-04-27,"Lomita, CA",within an hour,100%,97%,t,1.0,...,,2.0,"[""Ethernet connection"", ""Mini fridge"", ""Free d...",195.0,185,4.94,1,77,Los Angeles,1.0
49,663097266091425758,210632612,Scott,2018-08-20,"Huntington Beach, CA",a few days or more,0%,100%,f,1.0,...,,1.0,"[""Security cameras on property""]",35.0,0,,1,1,Los Angeles,0.0
52,840053916898021945,211899887,Mauricio,2018-08-26,"Long Beach, CA",within an hour,89%,98%,f,5.0,...,,1.0,"[""Wifi"", ""Dedicated workspace"", ""Air condition...",94.0,0,,5,5,Los Angeles,1.0
55,820495083192405533,497709887,Lourdson,2023-01-24,,within an hour,100%,,f,1.0,...,,1.0,[],74.0,0,,1,0,Los Angeles,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42337,13032893,54354899,Regina,2016-01-12,"Los Angeles, CA",within an hour,100%,100%,t,2.0,...,,2.0,"[""Ethernet connection"", ""Fast wifi \u2013 329 ...",160.0,368,4.78,1,50,Los Angeles,1.0
42343,51760564,26196192,Jon,2015-01-16,"Los Angeles, CA",within an hour,100%,100%,t,3.0,...,,1.0,"[""Smoke alarm"", ""Freezer"", ""Shampoo"", ""Wine gl...",200.0,5,4.60,3,49,Los Angeles,1.0
42354,14466452,70829936,Michelle,2016-05-07,"Los Angeles, CA",within an hour,100%,89%,f,1.0,...,,4.0,"[""Central heating"", ""Smoke alarm"", ""Freezer"", ...",116.0,182,4.97,1,40,Los Angeles,1.0
42411,20890567,147106089,Hometang,2017-08-21,"Los Angeles, CA",within an hour,100%,97%,t,6.0,...,,2.0,"[""Smoke alarm"", ""Bread maker"", ""Shampoo"", ""Kit...",162.0,147,4.75,4,40,Los Angeles,1.0


In [38]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
10,49043337,202801114,Guido,2018-07-16,"California, United States",within an hour,100%,100%,f,2.0,...,,,"[""Smoking allowed"", ""Wifi"", ""Pets allowed"", ""K...",1000.0,0,,1,5,Los Angeles,0.5
110,48865514,134267499,StayCozy,2017-06-09,"California, United States",within an hour,96%,90%,f,103.0,...,,,"[""Hangers"", ""Wifi"", ""Carbon monoxide alarm"", ""...",65.0,2,5.00,102,14,Los Angeles,1.0
141,42517485,106980640,Rose,2016-12-12,"Los Angeles, CA",within an hour,100%,97%,f,21.0,...,1.0,,"[""Gym"", ""Smoke alarm"", ""Washer"", ""Shampoo"", ""K...",40.0,6,5.00,17,23,Los Angeles,1.0
153,42866036,34666172,Maria,2015-05-31,"Whittier, CA",,,,f,7.0,...,1.0,,"[""Carbon monoxide alarm"", ""Smoke alarm"", ""Lock...",33.0,0,,4,9,Los Angeles,1.0
156,45195301,343535252,J,2020-04-09,"Sierra Madre, CA",,,,f,1.0,...,,,"[""Smoking allowed""]",51.0,0,,1,1,Los Angeles,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42104,16393418,94384746,Justin,2016-09-09,"West Hollywood, CA",within an hour,100%,100%,t,3.0,...,1.0,,"[""Wifi"", ""Carbon monoxide alarm"", ""Smoke alarm...",131.0,45,4.91,3,16,Los Angeles,1.0
42336,49567342,253105222,Sonder (Los Angeles),2019-04-03,"Los Angeles, CA",within an hour,99%,99%,f,45.0,...,1.0,,"[""Bed linens"", ""Carbon monoxide alarm"", ""Smoke...",291.0,9,3.78,30,18,Los Angeles,1.0
42379,44999114,102870708,Joseph,2016-11-07,,within an hour,100%,95%,f,5.0,...,1.0,,"[""Smoke alarm"", ""Paid dryer \u2013 In building...",136.0,9,4.44,5,33,Los Angeles,1.0
42390,807897329584457702,488672310,Juan,2022-11-21,,within an hour,100%,100%,f,2.0,...,2.0,,"[""Central heating"", ""Smoke alarm"", ""Shampoo"", ...",232.0,3,5.00,2,29,Los Angeles,2.0


In [39]:
def update_bedrooms_and_beds(df):
    """
    Updates the 'bedrooms' and 'beds' columns based on the conditions:
    1. If 'bedrooms' is null, set it to the rounded up value of 'num_bath'.
    2. If 'beds' is null, copy the value from 'bedrooms'.
    """
    # Rule 1: If bedrooms is NaN, round up 'num_bath' to nearest integer and assign to 'bedrooms'
    df.loc[df['bedrooms'].isnull(), 'bedrooms'] = np.ceil(df['num_bath'])
    
    # Rule 2: If beds is NaN, copy the value from 'bedrooms'
    df.loc[df['beds'].isnull(), 'beds'] = df['bedrooms']
    
    return df


In [40]:
updated_df = update_bedrooms_and_beds(updated_df)


In [41]:
# check for the previous NA rows in bedrooms
updated_df[updated_df['id'].isin(empty_bathrooms_text_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
359,47159462,2288402,Ug,2012-05-04,"Los Angeles, CA",,,,f,2.0,...,1.0,1.0,"[""Wifi"", ""Carbon monoxide alarm"", ""Smoke alarm...",240.0,0,,2,12,Los Angeles,1.0
1084,39705326,271698446,Robert,2019-06-27,"Los Angeles, CA",,,,f,1.0,...,1.0,1.0,"[""Hangers"", ""Essentials"", ""Pets allowed""]",40.0,1,1.0,1,3,Los Angeles,1.0
1140,1053404,5799839,Barbara,2013-04-06,"Los Angeles, CA",,,,f,1.0,...,1.0,1.0,"[""TV with standard cable"", ""Hot tub"", ""Wifi"", ...",115.0,0,,1,12,Los Angeles,1.0
2939,52989672,428985128,Xueping,2021-10-25,,,,,f,2.0,...,1.0,1.0,"[""Smoke alarm"", ""Shampoo"", ""Kitchen"", ""Hair dr...",120.0,1,1.0,2,27,Los Angeles,1.0
4142,708127514159735039,446995407,Vicky,2022-02-26,,within an hour,100%,100%,f,6.0,...,1.0,1.0,"[""Central heating"", ""Smoke alarm"", ""Washer"", ""...",99.0,16,4.94,6,33,Los Angeles,1.0
8247,732599207114852422,446995407,Vicky,2022-02-26,,within an hour,100%,100%,f,6.0,...,1.0,1.0,"[""Wifi"", ""Indoor fireplace"", ""Carbon monoxide ...",99.0,3,4.67,6,11,Los Angeles,1.0
9045,51382976,3097566,Frist,2012-07-30,"Los Angeles, CA",within an hour,98%,80%,f,43.0,...,1.0,2.0,"[""Smoke alarm"", ""Washer"", ""Shampoo"", ""Hair dry...",65.0,0,,38,33,Los Angeles,1.0
10528,405296,2019725,Larisa,2012-03-27,"Los Angeles, CA",within a day,100%,50%,f,1.0,...,1.0,1.0,"[""TV with standard cable"", ""Hot tub"", ""Wifi"", ...",89.0,32,4.74,1,10,Los Angeles,1.0
11893,45383114,69113169,Olt,2016-04-26,"Los Angeles, CA",,,,f,1.0,...,1.0,1.0,"[""Smoke alarm"", ""Washer"", ""Shampoo"", ""Kitchen""...",145.0,0,,1,29,Los Angeles,1.0
12978,69015,341419,Sharda,2011-01-09,"Chennai, India",,,,f,1.0,...,1.0,1.0,"[""Air conditioning"", ""Elevator""]",901.0,0,,1,2,Los Angeles,1.0


In [42]:
updated_df['beds'].unique()

array([ 3.,  1.,  4.,  2.,  5.,  7.,  6.,  0.,  9., 11.,  8., 10., 12.,
       20., 19., 15., 13., 16., 24., 32., 17., 14., 21., 50., 18.])

In [43]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [44]:
updated_df['bedrooms'].unique()

array([ 4.,  1.,  3.,  2.,  5.,  0.,  6.,  7.,  8., 11.,  9., 12., 13.,
       10., 24., 23., 16.])

In [45]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_since

In [46]:
# check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%B %Y")
updated_df['host_since'].fillna(most_recent_date, inplace=True)


There are 2 missing values in the "host_since" column.


In [47]:
updated_df['host_since'].isnull().sum()

0

In [48]:
updated_df['host_since'].unique()

array(['2022-12-08', '2022-02-12', '2015-08-17', ..., '2012-08-16',
       '2011-03-18', '2023-01-16'], dtype=object)

In [49]:
updated_df[updated_df['host_since'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_location

In [50]:
# check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'
updated_df['host_location'].fillna("unknown", inplace=True)

There are 8453 missing values in the "host_location" column.


In [51]:
updated_df['host_location'].isnull().sum()

0

In [52]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

8453

In [53]:
updated_df['host_location'].unique()

array(['unknown', 'Rancho Palos Verdes, CA', 'Malibu, CA',
       'Los Angeles, CA', 'Chino, CA', 'Lake Los Angeles, CA',
       'Torrance, CA', 'California, United States', 'Lomita, CA',
       'Whittier, CA', 'Lancaster, CA', 'Walnut, CA', 'Russ Place, CA',
       'Santa Clarita, CA', 'San Francisco, CA', 'Marina del Rey, CA',
       'Rowland Heights, CA', 'Toronto, Canada', 'La Mirada, CA',
       'Carson, CA', 'Long Beach, CA', 'Hermosa Beach, CA',
       'Pasadena, CA', 'Huntington Beach, CA', 'Hallandale Beach, FL',
       'Calabasas, CA', 'Jackson, MS', 'Quartz Hill, CA', 'Monrovia, CA',
       'Arcadia, CA', 'Redondo Beach, CA', 'Monterey Park, CA',
       'Fullerton, CA', 'Agoura Hills, CA', 'Portland, OR',
       'Lawndale, CA', 'Gardena, CA', 'Beverly Hills, CA', 'Altadena, CA',
       'Pomona, CA', 'West Hollywood, CA', 'Istanbul, Turkey',
       'El Segundo, CA', 'New York, NY', 'Sierra Madre, CA',
       'Glendora, CA', 'Agua Dulce, CA', 'Costa Mesa, CA',
       'San Gabr

In [54]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [55]:
updated_df[updated_df['host_location'] ==' ']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_is_superhost

In [56]:
# check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

updated_df['host_is_superhost'].fillna("f", inplace=True)

There are 145 missing values in the "host_is_superhost" column.


In [57]:
updated_df['host_is_superhost'].isnull().sum()

0

In [58]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

27983

In [59]:
updated_df['host_is_superhost'].unique()

array(['f', 't'], dtype=object)

# host_listings_count

In [60]:
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

updated_df['host_listings_count'].fillna(1, inplace=True)

There are 2 missing values in the "host_listings_count" column.


In [61]:
updated_df['host_listings_count'].isnull().sum()

0

In [62]:
updated_df['host_listings_count'].unique()

array([1.000e+00, 4.000e+00, 1.500e+01, 3.000e+00, 2.000e+00, 3.600e+01,
       4.500e+01, 1.200e+01, 1.900e+01, 5.000e+00, 3.500e+01, 6.000e+00,
       8.000e+00, 2.400e+01, 1.300e+01, 5.000e+01, 1.700e+01, 9.000e+00,
       6.500e+01, 6.070e+02, 4.000e+01, 5.500e+01, 2.500e+01, 5.800e+01,
       1.030e+02, 7.000e+00, 1.000e+01, 1.005e+03, 2.000e+01, 1.400e+01,
       6.650e+02, 6.800e+01, 2.900e+01, 5.600e+01, 2.100e+01, 1.290e+02,
       6.600e+01, 6.000e+01, 1.430e+02, 7.000e+01, 7.740e+02, 5.100e+01,
       4.800e+01, 3.300e+01, 6.610e+02, 5.400e+01, 2.690e+02, 1.100e+01,
       4.300e+01, 2.200e+01, 2.800e+01, 5.900e+01, 1.600e+01, 3.800e+01,
       3.900e+01, 2.300e+01, 2.700e+01, 2.631e+03, 4.784e+03, 8.900e+01,
       1.510e+03, 9.600e+01, 9.400e+01, 1.150e+02, 5.700e+01, 1.730e+02,
       3.400e+01, 4.600e+01, 3.000e+01, 3.700e+01, 2.600e+01, 1.800e+01,
       7.990e+02, 7.700e+01, 2.072e+03, 6.400e+01, 3.286e+03, 2.040e+02,
       4.200e+01, 7.200e+01, 3.100e+01, 7.300e+01, 

# host_total_listings_count

In [63]:
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

updated_df['host_total_listings_count'].fillna(1, inplace=True)


There are 2 missing values in the "host_total_listings_count" column.


In [64]:
updated_df['host_total_listings_count'].isnull().sum()

0

In [65]:
updated_df['host_total_listings_count'].unique()

array([1.000e+00, 5.000e+00, 2.200e+01, 6.000e+00, 4.000e+00, 1.900e+01,
       2.000e+00, 4.000e+01, 3.000e+00, 1.850e+02, 3.300e+01, 9.000e+00,
       2.300e+01, 1.120e+02, 1.100e+01, 8.800e+01, 8.000e+00, 4.200e+01,
       7.000e+00, 2.400e+01, 6.500e+01, 1.800e+01, 8.700e+01, 1.200e+01,
       2.900e+01, 6.900e+01, 9.820e+02, 7.400e+01, 1.170e+02, 2.800e+01,
       1.700e+02, 1.000e+01, 1.007e+03, 2.500e+01, 6.760e+02, 9.100e+01,
       1.060e+02, 2.700e+01, 8.750e+02, 5.600e+01, 7.200e+01, 2.450e+02,
       7.700e+01, 2.100e+01, 1.600e+01, 6.100e+01, 2.130e+02, 3.700e+01,
       1.496e+03, 9.900e+01, 1.190e+02, 5.800e+01, 1.745e+03, 5.700e+01,
       1.300e+01, 3.100e+02, 3.600e+01, 5.500e+01, 1.700e+01, 3.000e+01,
       5.900e+01, 1.090e+02, 6.300e+01, 3.200e+01, 8.316e+03, 5.323e+03,
       2.950e+02, 1.964e+03, 1.500e+01, 1.070e+02, 2.200e+02, 3.500e+01,
       1.400e+01, 1.830e+02, 1.670e+02, 3.100e+01, 1.240e+02, 4.500e+01,
       4.600e+01, 6.640e+02, 1.590e+02, 1.130e+02, 

# host_verifications

In [66]:
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

updated_df['host_verifications'].fillna("None", inplace=True)


There are 0 missing values in the "host_verifications" column.


In [67]:
updated_df['host_verifications'].unique()

array(["['phone']", "['email', 'phone']",
       "['email', 'phone', 'work_email']", "['phone', 'work_email']",
       "['email']", '[]', 'None'], dtype=object)

In [68]:
updated_df[updated_df['host_verifications'] == '[]']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
4193,20470416,299073616,Patrik,2019-09-30,unknown,,,0%,f,2.0,...,5.0,6.0,"[""Sauna"", ""Gym"", ""Steam room"", ""Washer"", ""Brea...",9995.0,0,,2,28,Los Angeles,5.5
5156,24231542,299073616,Patrik,2019-09-30,unknown,,,0%,f,2.0,...,3.0,3.0,"[""Dryer"", ""Free parking garage on premises"", ""...",6995.0,0,,2,20,Los Angeles,4.5
12536,2110830,4086579,Pablo,2012-11-07,"Los Angeles, CA",,,,f,1.0,...,1.0,1.0,"[""TV with standard cable"", ""Hot tub"", ""Wifi"", ...",135.0,2,5.0,1,12,Los Angeles,1.5
14688,44595088,360916271,Rebecca,2020-08-06,"Long Beach, CA",,,,f,1.0,...,3.0,2.0,"[""Wifi"", ""Gym"", ""Carbon monoxide alarm"", ""Smok...",180.0,0,,1,16,Los Angeles,3.0
18725,47472448,263297093,Dorna,2019-05-21,"Los Angeles, CA",,,100%,f,1.0,...,2.0,2.0,"[""Wifi"", ""Carbon monoxide alarm"", ""Smoke alarm...",375.0,1,5.0,1,19,Los Angeles,2.0
22659,39757424,305582795,Rada,2019-10-30,unknown,,,,f,1.0,...,1.0,1.0,"[""Wifi"", ""Carbon monoxide alarm"", ""Smoke alarm...",129.0,0,,1,16,Los Angeles,1.0
29530,20473154,217695242,Helen,2018-09-28,unknown,,,31%,f,1.0,...,3.0,4.0,"[""Ironing board"", ""Washer"", ""Breakfast bar"", ""...",2295.0,3,,1,28,Los Angeles,3.5
32475,42646093,299739906,Alex,2019-10-03,unknown,,,,f,2.0,...,2.0,1.0,"[""Hot tub"", ""Wifi"", ""Gym"", ""Smoke alarm"", ""Was...",100.0,0,,1,16,Los Angeles,1.0
33487,43179913,172946356,Nery,2018-02-12,"Los Angeles, CA",,,,f,1.0,...,1.0,1.0,"[""Smoking allowed"", ""Wifi"", ""Heating"", ""Essent...",200.0,0,,1,14,Los Angeles,1.0


In [69]:
# Replace '[]' with None in 'host_verifications' column
updated_df['host_verifications'] = updated_df['host_verifications'].replace('[]', "None")


In [70]:
updated_df['host_verifications'].unique()

array(["['phone']", "['email', 'phone']",
       "['email', 'phone', 'work_email']", "['phone', 'work_email']",
       "['email']", 'None'], dtype=object)

# host_identity_verified

In [71]:
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

updated_df['host_identity_verified'].fillna("f", inplace=True)


There are 2 missing values in the "host_identity_verified" column.


In [72]:
updated_df['host_identity_verified'].unique()

array(['t', 'f'], dtype=object)

# calculated_host_listings_count

In [73]:
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

updated_df['calculated_host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "calculated_host_listings_count" column.


In [74]:
updated_df['calculated_host_listings_count'].unique()

array([   1,    4,   10,    2,   22,    3,   12,    5,   28,    8,   24,
         11,   27,    6,   64,   37,   55,   19,   54,  102,    7, 1003,
         13,  663,   66,   43,   29,   56,    9,   17,  129,   15,   60,
         70,   14,   79,   47,   16,   38,   21,   18,   59,   39,   30,
        717,   35,  134,   87,   42,   20,  110,   34,   32,   48,   31,
         25,   68,   46,   45,   40,   73,   80,   62,  136,   23,   52,
         78,   36,   63,   76,   51,   81,   26,   50,   75,   57,  109])

# host_name

In [75]:
updated_df[updated_df['host_name'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
11232,10034835,23571738,,March 2023,unknown,,,,f,1.0,...,1.0,1.0,"[""Wifi"", ""Carbon monoxide alarm"", ""Smoke alarm...",41.0,2,,1,12,Los Angeles,1.0
35307,6900415,36159573,,March 2023,unknown,,,,f,1.0,...,1.0,2.0,"[""Wifi"", ""Washer"", ""Essentials"", ""Fire extingu...",99.0,0,,1,8,Los Angeles,1.0


# host_has_profile_pic

In [76]:
updated_df[updated_df['host_has_profile_pic'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
11232,10034835,23571738,,March 2023,unknown,,,,f,1.0,...,1.0,1.0,"[""Wifi"", ""Carbon monoxide alarm"", ""Smoke alarm...",41.0,2,,1,12,Los Angeles,1.0
35307,6900415,36159573,,March 2023,unknown,,,,f,1.0,...,1.0,2.0,"[""Wifi"", ""Washer"", ""Essentials"", ""Fire extingu...",99.0,0,,1,8,Los Angeles,1.0


In [77]:
updated_df['host_has_profile_pic'].unique()

array(['f', 't', nan], dtype=object)

In [78]:
na = updated_df['host_has_profile_pic'].isnull().sum()
print(f'There are {na} missing values in the "host_has_profile_pic" column.')

updated_df['host_has_profile_pic'].fillna('f', inplace=True)

There are 2 missing values in the "host_has_profile_pic" column.


# Neighborhood

In [79]:
updated_df['neighbourhood'].unique()

array([nan, 'Rancho Palos Verdes, California, United States',
       'Los Angeles, California, United States',
       'Torrance, California, United States',
       'Lomita, California, United States',
       'Whittier, California, United States',
       'Norwalk, California, United States',
       'Lancaster, California, United States',
       'Malibu, California, United States',
       'Lynwood, California, United States',
       'Downey, California, United States',
       'Diamond Bar, California, United States',
       'Walnut, California, United States',
       'Bellflower, California, United States',
       'Santa Clarita, California, United States',
       'Hawaiian Gardens, California, United States',
       'Redondo Beach, California, United States',
       'Montebello, California, United States',
       'Monterey Park, California, United States',
       'Agoura Hills, California, United States',
       'Hawthorne, California, United States',
       'Gardena, California, United

In [80]:
updated_df['neighbourhood'].isnull().sum()

17479

In [81]:
updated_df['neighbourhood_cleansed'].unique()

array(['Castaic Canyons', 'Pomona', 'Rancho Palos Verdes', 'Malibu',
       'San Pedro', 'Diamond Bar', 'West Compton',
       'Northeast Antelope Valley', 'West Carson',
       'Unincorporated Santa Susana Mountains', 'Lomita', 'Whittier',
       'Norwalk', 'Rolling Hills Estates', 'Lancaster', 'Artesia',
       'Bellflower', 'La Puente', 'Gardena', 'Lynwood', 'Downey',
       'Palmdale', 'Southeast Antelope Valley',
       'Unincorporated Santa Monica Mountains', 'San Fernando',
       'Santa Clarita', 'Walnut', 'Bell Gardens', 'Carson',
       'Stevenson Ranch', 'Palos Verdes Estates', 'Acton', 'Quartz Hill',
       'Porter Ranch', 'Granada Hills', 'Hawaiian Gardens',
       'Vermont Knolls', 'Northridge', 'Hacienda Heights', 'Torrance',
       'Rowland Heights', 'South San Gabriel', 'Montebello',
       'West Covina', 'Monterey Park', 'Vermont Square',
       'Florence-Firestone', 'Agoura Hills', 'Vernon', 'Brentwood',
       'Pacific Palisades', 'Hawthorne', 'La Verne', 'Claremont

### We are going to drop neighbourhood column for LA Dataset

In [82]:
updated_df.drop('neighbourhood', axis=1, inplace=True)

# Final look into missing values

In [83]:
updated_df.isnull().sum()

id                                    0
host_id                               0
host_name                             2
host_since                            0
host_location                         0
host_response_time                 9368
host_response_rate                 9368
host_acceptance_rate               8337
host_is_superhost                     0
host_listings_count                   0
host_total_listings_count             0
host_verifications                    0
host_has_profile_pic                  0
host_identity_verified                0
neighbourhood_cleansed                0
latitude                              0
longitude                             0
room_type                             0
accommodates                          0
bathrooms_text                        0
bedrooms                              0
beds                                  0
amenities                             0
price                                 0
number_of_reviews                     0


# Save the Final Dataframe

In [84]:
updated_df.to_csv('data/listings_detailed_after_na_la.csv', index=False)