# Portland

In [1]:
import pandas as pd
import numpy as np

In [2]:
listings_detailed = pd.read_csv('usa/Portland/listings_detailed.csv')

In [3]:
listings_detailed.shape

(4587, 75)

# Delete Unnecessary Columns

In [4]:
# Columns you want to keep
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location', 
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Dropping the columns that are not in 'columns_to_keep'
listings_detailed = listings_detailed[columns_to_keep]

In [5]:
listings_detailed.shape

(4587, 29)

In [6]:
# Save as a new csv file
listings_detailed.to_csv('listings_detailed_clean_portland.csv', index=False)

# Remove Entries where 'price' and 'accomodates' is zero / NA

## Pre-processing on 'price' column

In [7]:
# Remove the dollar sign from the price column and change the type to float
listings_detailed['price'] = listings_detailed['price'].replace('[\$,]', '', regex=True).astype(float)

In [8]:
# Test: Correct filtering expression
filtered_df = listings_detailed[listings_detailed['price'] == 0]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
2202,42384560,267899000,Hotel Lucia,2019-06-11,,within an hour,100%,66%,,10,...,4,,,,,"[""Carbon monoxide alarm"", ""Pack \u2019n play/T...",0.0,292,4.72,1
2589,49089774,267675881,The Heathman Hotel,2019-06-10,,within an hour,100%,100%,,12,...,0,,,,,"[""Carbon monoxide alarm"", ""First aid kit"", ""TV...",0.0,62,4.55,1
2668,49725603,290375303,Woodlark,2019-08-30,,within an hour,100%,84%,,15,...,0,,,,,"[""Carbon monoxide alarm"", ""Bed sheets and pill...",0.0,62,4.52,1
2732,50342231,268592756,Hotel DeLuxe,2019-06-14,,,,89%,,12,...,0,,,,,"[""Carbon monoxide alarm"", ""Pack \u2019n play/T...",0.0,18,3.94,1
2746,50481448,268188940,Sentinel Hotel,2019-06-12,,,,80%,,12,...,0,,,,,"[""Carbon monoxide alarm"", ""Bed sheets and pill...",0.0,6,4.33,1
2957,52566619,290348655,Dossier,2019-08-30,,,,,,2,...,0,,,,,"[""Carbon monoxide alarm"", ""Bed sheets and pill...",0.0,16,4.81,1


In [9]:
price_zero_or_na_count = listings_detailed['price'].isnull().sum() + (listings_detailed['price'] == 0).sum()
print(price_zero_or_na_count)


6


In [10]:
# Filter listings where price is either null or equals 0
listings_detailed.loc[listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
2202,42384560,267899000,Hotel Lucia,2019-06-11,,within an hour,100%,66%,,10,...,4,,,,,"[""Carbon monoxide alarm"", ""Pack \u2019n play/T...",0.0,292,4.72,1
2589,49089774,267675881,The Heathman Hotel,2019-06-10,,within an hour,100%,100%,,12,...,0,,,,,"[""Carbon monoxide alarm"", ""First aid kit"", ""TV...",0.0,62,4.55,1
2668,49725603,290375303,Woodlark,2019-08-30,,within an hour,100%,84%,,15,...,0,,,,,"[""Carbon monoxide alarm"", ""Bed sheets and pill...",0.0,62,4.52,1
2732,50342231,268592756,Hotel DeLuxe,2019-06-14,,,,89%,,12,...,0,,,,,"[""Carbon monoxide alarm"", ""Pack \u2019n play/T...",0.0,18,3.94,1
2746,50481448,268188940,Sentinel Hotel,2019-06-12,,,,80%,,12,...,0,,,,,"[""Carbon monoxide alarm"", ""Bed sheets and pill...",0.0,6,4.33,1
2957,52566619,290348655,Dossier,2019-08-30,,,,,,2,...,0,,,,,"[""Carbon monoxide alarm"", ""Bed sheets and pill...",0.0,16,4.81,1


In [11]:
# Filter listings where accommodates is either null or equals 0
listings_detailed.loc[listings_detailed['accommodates'].isnull() | (listings_detailed['accommodates'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
2589,49089774,267675881,The Heathman Hotel,2019-06-10,,within an hour,100%,100%,,12,...,0,,,,,"[""Carbon monoxide alarm"", ""First aid kit"", ""TV...",0.0,62,4.55,1
2668,49725603,290375303,Woodlark,2019-08-30,,within an hour,100%,84%,,15,...,0,,,,,"[""Carbon monoxide alarm"", ""Bed sheets and pill...",0.0,62,4.52,1
2732,50342231,268592756,Hotel DeLuxe,2019-06-14,,,,89%,,12,...,0,,,,,"[""Carbon monoxide alarm"", ""Pack \u2019n play/T...",0.0,18,3.94,1
2746,50481448,268188940,Sentinel Hotel,2019-06-12,,,,80%,,12,...,0,,,,,"[""Carbon monoxide alarm"", ""Bed sheets and pill...",0.0,6,4.33,1
2957,52566619,290348655,Dossier,2019-08-30,,,,,,2,...,0,,,,,"[""Carbon monoxide alarm"", ""Bed sheets and pill...",0.0,16,4.81,1


## Create Function to Identify the Zero/NA values and Remove them

In [12]:
def clean_listings(df):
    """
    Removes listings from the dataframe where price is zero/NA or accommodates is zero/NA.
    Counts the number of listings removed based on these criteria.
    
    Parameters:
    - df: A pandas DataFrame with the listings data.
    
    Returns:
    - cleaned_df: The cleaned DataFrame with the specified entries removed.
    - count_removed: The number of listings removed based on the criteria.
    """
    
    # Convert price from string to float
    df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
    
    # Find entries to remove based on conditions
    to_remove = df[(df['price'] == 0) | (df['price'].isna()) | (df['accommodates'] == 0) | (df['accommodates'].isna())]
    
    # Count the number of listings to remove
    count_removed = len(to_remove)

    print(f"Number of listings removed: {count_removed}")
    
    # Remove the entries from the DataFrame
    cleaned_df = df.drop(to_remove.index)
    
    return cleaned_df


In [13]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 6


In [14]:
cleaned_listings.shape

(4581, 29)

# Count 'amenities'

In [15]:
import ast
def count_amenities(amenities_str):
    try:
        # Convert the string representation of the list back into a list
        amenities_list = ast.literal_eval(amenities_str)
        # Return the count of items in the list
        return len(amenities_list)
    except (ValueError, SyntaxError):
        # In case of any error during conversion, return 0 (or you may choose to return NaN)
        return 0

# Apply the function to each row in the 'amenities' column and create a new column 'amenities_count'
cleaned_listings['amenities_count'] = cleaned_listings['amenities'].apply(count_amenities)

In [16]:
cleaned_listings['amenities_count']

0       35
1       27
2       44
3       36
4       44
        ..
4582    67
4583    60
4584    28
4585    43
4586    47
Name: amenities_count, Length: 4581, dtype: int64

# Add 'city' column

In [17]:
# Add in a city column to each dataframe so that we can use this as part of the primary key/use it to conduct groupby for
# future EDA or additional analysis as final table will contain all of our listings data
cleaned_listings['city'] = 'Portland'

# Impute 'bathroom_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [18]:
cleaned_listings['bathrooms_text'].unique()

array(['1 bath', '1 shared bath', '1 private bath', '2 baths',
       '1.5 baths', '1.5 shared baths', '3 baths', '2.5 baths',
       '2 shared baths', '3 shared baths', '3.5 baths', '4 shared baths',
       '2.5 shared baths', '4 baths', 'Shared half-bath',
       'Private half-bath', nan, '5 baths', '3.5 shared baths',
       '11 shared baths', '12 shared baths', '12 baths', '0 shared baths',
       'Half-bath', '6 baths', '5 shared baths', '5.5 baths', '18 baths'],
      dtype=object)

In [19]:
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].astype(str)
# Convert 'bathrooms_text' column to strings, treating NaN as empty strings
cleaned_listings['bathrooms_text'] = listings_detailed['bathrooms_text'].fillna('').astype(str)
cleaned_listings
# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [20]:
# Replace specific strings in 'bathrooms_text' column
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace({
    'Half-bath': '0.5 baths',
    'Private half-bath': '0.5 baths',
    'Shared half-bath': '0.5 baths'
})

# Check the unique values to verify the changes
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['1 bath' '1 shared bath' '1 private bath' '2 baths' '1.5 baths'
 '1.5 shared baths' '3 baths' '2.5 baths' '2 shared baths'
 '3 shared baths' '3.5 baths' '4 shared baths' '2.5 shared baths'
 '4 baths' '0.5 baths' '' '5 baths' '3.5 shared baths' '11 shared baths'
 '12 shared baths' '12 baths' '0 shared baths' '6 baths' '5 shared baths'
 '5.5 baths' '18 baths']


In [21]:
count_half_baths = (cleaned_listings['bathrooms_text'] == '0.5 baths').sum()
count_half_baths

5

### Check for the Nan for bathrooms_text

In [22]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city
1131,21422556,2915808,Stephen,2012-07-12,"Portland, OR",within an hour,100%,98%,t,5,...,,1.0,2.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",115.0,92,4.88,5,25,Portland
1217,22599038,2915808,Stephen,2012-07-12,"Portland, OR",within an hour,100%,98%,t,5,...,,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",93.0,128,4.86,5,21,Portland
1220,22614995,2915808,Stephen,2012-07-12,"Portland, OR",within an hour,100%,98%,t,5,...,,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",99.0,125,4.88,5,24,Portland
1603,29996383,224911430,Robin,2018-11-09,"Portland, OR",within an hour,100%,100%,t,6,...,,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",160.0,1,5.0,6,28,Portland


In [23]:
empty_bathrooms_text_ids = cleaned_listings[cleaned_listings['bathrooms_text'] == '']['id']
print(len(empty_bathrooms_text_ids))

4


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms'
2. if the 'bedrooms' column is also NA, fill in NAs with 1/'beds' column

In [24]:
def fill_missing_bathrooms(df):
    """
    Fills missing or empty 'bathrooms_text' values in the DataFrame.
    - Uses the 'bedrooms' value if available.
    - If 'bedrooms' is also missing, uses half the 'beds' value.
    """
    # Loop through each row in DataFrame
    for index, row in df.iterrows():
        # Check if 'bathrooms_text' is NaN or an empty string
        if pd.isnull(row['bathrooms_text']) or row['bathrooms_text'] == '':
            # Use 'bedrooms' if not NaN and not 0
            if not pd.isnull(row['bedrooms']) and row['bedrooms'] != 0:
                df.at[index, 'bathrooms_text'] = str(row['bedrooms']) + ' bath'
            # If 'bedrooms' is NaN or 0, use half the 'beds' value
            elif not pd.isnull(row['beds']) and row['beds'] != 0:
                # Calculate half the beds and format it to match the bathroom text style
                half_beds_as_baths = row['beds'] / 2
                df.at[index, 'bathrooms_text'] = str(half_beds_as_baths) + ' bath'
            # If both 'bedrooms' and 'beds' are not available, set a default value
            else:
                df.at[index, 'bathrooms_text'] = '1 bath' # Default value or consider another logic
                
    return df


In [25]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [26]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
0,12899,2.0,2.0,1 bath
1,1360216,1.0,1.0,1 bath
2,1386231,1.0,1.0,1 shared bath
3,1386922,1.0,1.0,1 bath
4,1408895,1.0,1.0,1 bath
...,...,...,...,...
4582,413956,2.0,3.0,1 bath
4583,448195,4.0,5.0,2 baths
4584,449461,1.0,1.0,1 shared bath
4585,450488,1.0,1.0,1 shared bath


In [27]:
# # check for the previous NA rows in bathrooms text
filled_cleaned_listings[filled_cleaned_listings['id'].isin(empty_bathrooms_text_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city
1131,21422556,2915808,Stephen,2012-07-12,"Portland, OR",within an hour,100%,98%,t,5,...,1.0 bath,1.0,2.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",115.0,92,4.88,5,25,Portland
1217,22599038,2915808,Stephen,2012-07-12,"Portland, OR",within an hour,100%,98%,t,5,...,1.0 bath,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",93.0,128,4.86,5,21,Portland
1220,22614995,2915808,Stephen,2012-07-12,"Portland, OR",within an hour,100%,98%,t,5,...,1.0 bath,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",99.0,125,4.88,5,24,Portland
1603,29996383,224911430,Robin,2018-11-09,"Portland, OR",within an hour,100%,100%,t,6,...,1.0 bath,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",160.0,1,5.0,6,28,Portland


### Function to make numerical values for # of bath

In [28]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process.
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    # Extract numerical part from the specified column and convert to float
    df['num_bath'] = df[column_name].str.extract('([0-9]+\.?[0-9]*)').astype(float)
    return df

In [29]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

     bathrooms_text  num_bath
0            1 bath       1.0
1            1 bath       1.0
2     1 shared bath       1.0
3            1 bath       1.0
4            1 bath       1.0
...             ...       ...
4582         1 bath       1.0
4583        2 baths       2.0
4584  1 shared bath       1.0
4585  1 shared bath       1.0
4586  1 shared bath       1.0

[4581 rows x 2 columns]


In [30]:
updated_df['num_bath'].isnull().sum()

0

In [31]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [32]:
updated_df['num_bath'].unique()

array([ 1. ,  2. ,  1.5,  3. ,  2.5,  3.5,  4. ,  0.5,  5. , 11. , 12. ,
        0. ,  6. ,  5.5, 18. ])

In [33]:
# delete bathrooms column
updated_df.drop('bathrooms', axis=1, inplace=True)

In [34]:
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
0,12899,49682,Ali And David,2009-10-29,"Portland, OR",,,100%,t,1,...,2.0,2.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",80.0,617,4.94,1,35,Portland,1.0
1,1360216,1186353,Fred,2011-09-20,"Palm Springs, CA",within a day,100%,78%,t,6,...,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",56.0,403,4.78,2,27,Portland,1.0
2,1386231,2236909,Shane,2012-04-27,"Portland, OR",within a few hours,100%,95%,f,6,...,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",45.0,68,4.76,6,44,Portland,1.0
3,1386922,3859198,Justin,2012-10-13,"Portland, OR",within an hour,100%,93%,f,2,...,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",75.0,190,4.54,1,36,Portland,1.0
4,1408895,942377,Emily,2011-08-08,"Portland, OR",within a few hours,100%,44%,t,1,...,1.0,1.0,"[""Carbon monoxide alarm"", ""Lockbox"", ""Refriger...",99.0,74,4.74,1,44,Portland,1.0


# Impute Bedrooms and Beds

In [35]:
updated_df['bedrooms'].unique()

array([ 2.,  1., nan,  3.,  5.,  4.,  8.,  7.,  6.,  9., 18.])

In [36]:
updated_df['beds'].unique()

array([ 2.,  1.,  3.,  5.,  6., 12.,  4.,  7., nan,  9.,  8., 16., 19.,
       11., 13., 10., 15., 18.])

In [37]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
8,1525272,8138263,Elizabeth,2013-08-13,"Portland, OR",within an hour,100%,73%,t,1,...,,1.0,"[""Shampoo"", ""Essentials"", ""Carbon monoxide ala...",60.0,15,4.67,1,12,Portland,1.0
18,1646847,8739391,Chris,2013-09-09,"Portland, OR",within an hour,100%,50%,f,2,...,,1.0,"[""Shampoo"", ""Bed linens"", ""Essentials"", ""Carbo...",89.0,116,4.77,2,19,Portland,1.0
19,1650401,1178421,Stephanie,2011-09-19,"Portland, OR",within a day,100%,95%,t,1,...,,3.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Pac...",50.0,460,4.88,1,28,Portland,1.0
24,1703888,8036326,Martin,2013-08-08,"Portland, OR",within an hour,100%,20%,t,1,...,,2.0,"[""Carbon monoxide alarm"", ""Coffee maker: pour-...",103.0,128,4.90,1,38,Portland,1.0
25,1717606,9055739,Andy,2013-09-25,"Portland, OR",within a few hours,100%,100%,t,1,...,,2.0,"[""Carbon monoxide alarm"", ""Whirlpool refrigera...",110.0,39,4.95,1,66,Portland,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4478,637362,869146,Gina And Andy,2011-07-25,"Portland, OR",within an hour,100%,100%,t,2,...,,1.0,"[""Carbon monoxide alarm"", ""Shared outdoor pool...",98.0,305,4.92,1,53,Portland,1.0
4490,760737,2646777,Kyle,2012-06-15,"Portland, OR",within an hour,100%,94%,t,1,...,,2.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",86.0,195,4.93,1,43,Portland,1.0
4509,951387,5161731,Debra & Ludger,2013-02-20,"Portland, OR",within an hour,100%,100%,t,1,...,,1.0,"[""Drying rack for clothing"", ""Carbon monoxide ...",76.0,557,4.83,1,53,Portland,1.0
4522,1103529,6060622,Norah,2013-04-23,"Portland, OR",within an hour,100%,100%,t,3,...,,2.0,"[""Coffee maker: pour-over coffee"", ""Refrigerat...",69.0,732,4.89,2,44,Portland,1.0


In [38]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
111,3562307,17615383,Patrick,2014-07-03,"Portland, OR",within a day,100%,50%,f,1,...,2.0,,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Bea...",36.0,275,4.79,1,30,Portland,1.0
167,4363516,13243736,Deb,2014-03-17,"Portland, OR",within a few hours,100%,100%,f,1,...,,,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",80.0,0,,1,39,Portland,1.0
334,7398659,6465877,Todd & Lea,2013-05-19,"Portland, OR",,,100%,t,1,...,,,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Bar...",100.0,296,4.96,1,54,Portland,1.0
412,8981600,7644885,Joanna,2013-07-22,"Portland, OR",within an hour,100%,100%,f,1,...,,,"[""Carbon monoxide alarm"", ""Trader Joes Tea Tre...",65.0,240,4.92,1,36,Portland,1.0
1065,20458269,48307943,Amy And Derek,2015-11-06,"Portland, OR",,,67%,f,1,...,,,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",49.0,135,4.92,1,52,Portland,1.0
1427,26267199,5029238,Jessica,2013-02-09,"Portland, OR",within an hour,100%,100%,t,1,...,,,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",51.0,271,4.93,1,39,Portland,1.0
1550,29129259,112893366,Sam,2017-01-23,"Portland, OR",within an hour,100%,100%,f,2,...,1.0,,"[""Drying rack for clothing"", ""Refrigerator"", ""...",34.0,63,4.83,2,56,Portland,1.0
1649,31284993,123968362,Arvell,2017-04-02,"Portland, OR",within an hour,100%,83%,f,4,...,,,"[""Carbon monoxide alarm"", ""Smoke alarm"", ""Cook...",250.0,2,4.5,4,24,Portland,1.0
1678,31950837,65277088,Reza,2016-03-31,"Portland, OR",within an hour,100%,100%,t,1,...,,,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",75.0,90,4.84,1,47,Portland,1.0
1905,36701090,231707604,Charles,2018-12-20,"Portland, OR",,,,f,2,...,1.0,,"[""Shampoo"", ""Essentials"", ""Carbon monoxide ala...",87.0,101,4.79,2,17,Portland,1.0


In [39]:
def update_bedrooms_and_beds(df):
    """
    Updates the 'bedrooms' and 'beds' columns based on the conditions:
    1. If 'bedrooms' is null, set it to the rounded up value of 'num_bath'.
    2. If 'beds' is null, copy the value from 'bedrooms'.
    """
    # Rule 1: If bedrooms is NaN, round up 'num_bath' to nearest integer and assign to 'bedrooms'
    df.loc[df['bedrooms'].isnull(), 'bedrooms'] = np.ceil(df['num_bath'])
    
    # Rule 2: If beds is NaN, copy the value from 'bedrooms'
    df.loc[df['beds'].isnull(), 'beds'] = df['bedrooms']
    
    return df


In [40]:
updated_df = update_bedrooms_and_beds(updated_df)


In [41]:
# check for the previous NA rows in bedrooms
updated_df[updated_df['id'].isin(empty_bathrooms_text_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
1131,21422556,2915808,Stephen,2012-07-12,"Portland, OR",within an hour,100%,98%,t,5,...,1.0,2.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",115.0,92,4.88,5,25,Portland,1.0
1217,22599038,2915808,Stephen,2012-07-12,"Portland, OR",within an hour,100%,98%,t,5,...,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",93.0,128,4.86,5,21,Portland,1.0
1220,22614995,2915808,Stephen,2012-07-12,"Portland, OR",within an hour,100%,98%,t,5,...,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",99.0,125,4.88,5,24,Portland,1.0
1603,29996383,224911430,Robin,2018-11-09,"Portland, OR",within an hour,100%,100%,t,6,...,1.0,1.0,"[""Carbon monoxide alarm"", ""Refrigerator"", ""Smo...",160.0,1,5.0,6,28,Portland,1.0


In [42]:
updated_df['beds'].unique()

array([ 2.,  1.,  3.,  5.,  6., 12.,  4.,  7.,  9.,  8., 16., 19., 11.,
       13., 10., 15., 18.])

In [43]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [44]:
updated_df['bedrooms'].unique()

array([ 2.,  1.,  3.,  5.,  4.,  8.,  7.,  6.,  9., 18.])

In [45]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_since

In [46]:
# check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%B %Y")
updated_df['host_since'].fillna(most_recent_date, inplace=True)


There are 0 missing values in the "host_since" column.


In [47]:
updated_df['host_since'].isnull().sum()

0

In [48]:
updated_df['host_since'].unique()

array(['2009-10-29', '2011-09-20', '2012-04-27', ..., '2011-08-12',
       '2009-09-15', '2011-04-24'], dtype=object)

In [49]:
updated_df[updated_df['host_since'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_location

In [50]:
# check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'
updated_df['host_location'].fillna("unknown", inplace=True)

There are 640 missing values in the "host_location" column.


In [51]:
updated_df['host_location'].isnull().sum()

0

In [52]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

640

In [53]:
updated_df['host_location'].unique()

array(['Portland, OR', 'Palm Springs, CA', 'Stevenson, WA', 'Boise, ID',
       'unknown', 'New York, NY', 'Eugene, OR', 'Grants Pass, OR',
       'Hood River, OR', 'United States', 'Cathedral City, CA',
       'Silverton, OR', 'Seattle, WA', 'Honolulu, HI', 'Los Angeles, CA',
       'Beaverton, OR', 'Ashland, OR', 'Vancouver, Canada', 'Corbett, OR',
       'San Francisco, CA', 'Mission Viejo, CA', 'Palm Desert, CA',
       'Rivergrove, OR', 'Bend, OR', 'Berkeley, CA',
       'Maine, United States', 'San Diego, CA', 'Canby, OR',
       'Washington, United States', 'Anaheim, CA', 'Laramie, WY',
       'Oregon, United States', 'Zagreb, Croatia', 'Lake Oswego, OR',
       'Santa Cruz, CA', 'Corvallis, OR', 'Mountain View, CA',
       'San Carlos, CA', 'Camas, WA', 'Redlands, CA', 'Washington, DC',
       'Allentown, PA', 'Houston, TX', 'Keizer, OR', 'Santa Rosa, CA',
       'Hillsboro, OR', 'Baton Rouge, LA', 'Omaha, NE', 'Auburn, WA',
       'Birmingham, AL', 'Georgetown, TX', 'Flinders,

In [54]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [55]:
updated_df[updated_df['host_location'] ==' ']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_is_superhost

In [56]:
# check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

updated_df['host_is_superhost'].fillna("f", inplace=True)

There are 0 missing values in the "host_is_superhost" column.


In [57]:
updated_df['host_is_superhost'].isnull().sum()

0

In [58]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

2015

In [59]:
updated_df['host_is_superhost'].unique()

array(['t', 'f'], dtype=object)

# host_listings_count

In [60]:
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

updated_df['host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "host_listings_count" column.


In [61]:
updated_df['host_listings_count'].isnull().sum()

0

In [62]:
updated_df['host_listings_count'].unique()

array([   1,    6,    2,    3,    8,    4,   34,    7,    5,   42,   12,
          9,   14,   10,   17,   15,   11,   16,  301,  118,   13,  236,
        151,  187,  129,   30,   45, 2642,   96,   27,  148,  201, 3302,
        203,   20, 1046, 1503,   25,   26, 2307,  515,   60,   22,   53,
         21,  659,  372,  385,  337,  287,   89,   19,   61, 2504,  218,
        829,  805])

# host_total_listings_count

In [63]:
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

updated_df['host_total_listings_count'].fillna(1, inplace=True)


There are 0 missing values in the "host_total_listings_count" column.


In [64]:
updated_df['host_total_listings_count'].isnull().sum()

0

In [65]:
updated_df['host_total_listings_count'].unique()

array([   1,    6,   10,    2,    4,    3,   11,    5,   43,    7,    9,
          8,   13,   16,   12,   17,   33,   14,   40,   21,  504,  121,
        264,   19,  286,   42,   22,  292,   35,   49,   36, 8410,   99,
        303,  239, 4532,  236,   20,   15, 1841, 1967,  125,   30, 7813,
        724,  124,   48,   93,  716,  372,  465,  542,  378,  109,  165,
       2776,  277, 1101,  851,   18])

# host_verifications

In [66]:
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

updated_df['host_verifications'].fillna("None", inplace=True)


There are 0 missing values in the "host_verifications" column.


In [67]:
updated_df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['phone', 'work_email']", '[]', "['email']"],
      dtype=object)

In [68]:
updated_df[updated_df['host_verifications'] == '[]']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
2161,41588378,327787591,Katrina,2020-01-15,"Portland, OR",within an hour,88%,100%,f,1,...,1.0,1.0,"[""Shared pool"", ""Carbon monoxide alarm"", ""Refr...",60.0,5,4.4,1,33,Portland,1.0
2692,50000480,420965581,Nan,2021-08-31,unknown,,,,f,1,...,1.0,2.0,"[""Drying rack for clothing"", ""Carbon monoxide ...",96.0,53,5.0,1,64,Portland,1.0


In [69]:
# Replace '[]' with None in 'host_verifications' column
updated_df['host_verifications'] = updated_df['host_verifications'].replace('[]', "None")


In [70]:
updated_df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['phone', 'work_email']", 'None', "['email']"],
      dtype=object)

# host_identity_verified

In [71]:
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

updated_df['host_identity_verified'].fillna("f", inplace=True)


There are 0 missing values in the "host_identity_verified" column.


In [72]:
updated_df['host_identity_verified'].unique()

array(['f', 't'], dtype=object)

# calculated_host_listings_count

In [73]:
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

updated_df['calculated_host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "calculated_host_listings_count" column.


In [74]:
updated_df['calculated_host_listings_count'].unique()

array([ 1,  2,  6,  3,  8,  4, 28,  5, 26, 12,  7,  9, 13, 11, 16, 33, 15,
       18, 40, 60, 56, 19, 24, 14, 20, 10])

# host_name

In [75]:
updated_df[updated_df['host_name'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_has_profile_pic

In [76]:
updated_df[updated_df['host_has_profile_pic'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [77]:
updated_df['host_has_profile_pic'].unique()

array(['t', 'f'], dtype=object)

In [78]:
na = updated_df['host_has_profile_pic'].isnull().sum()
print(f'There are {na} missing values in the "host_has_profile_pic" column.')

updated_df['host_has_profile_pic'].fillna('f', inplace=True)

There are 0 missing values in the "host_has_profile_pic" column.


# Neighborhood

In [79]:
updated_df['neighbourhood'].unique()

array(['Portland, Oregon, United States', nan,
       'Lake Oswego, Oregon, United States', 'United States',
       'Portland , Oregon, United States',
       'Happy Valley, Oregon, United States'], dtype=object)

In [80]:
updated_df['neighbourhood'].isnull().sum()

1108

In [81]:
updated_df['neighbourhood_cleansed'].unique()

array(['Concordia', 'Northwest District', 'King', 'Humboldt', 'Roseway',
       'Richmond', 'South Tabor', 'Cully', 'Irvington', 'Piedmont',
       'Portland Downtown', 'Eliot', 'Pearl',
       'Sellwood-Moreland Improvement League', 'Hillsdale', 'Montavilla',
       'Mt. Tabor', 'Woodlawn', 'Rose City Park', 'Laurelhurst',
       'Sunnyside', 'Eastmoreland', 'North Tabor', 'Hosford-Abernethy',
       'Overlook', 'Old Town/Chinatown', 'Vernon', 'Kerns', 'Boise',
       'Arbor Lodge', 'Buckman', 'West Portland Park', 'Sabin',
       'Forest Park', 'Southwest Hills', 'Kenton',
       'Brooklyn Action Corps', 'Grant Park', 'University Park',
       'St. Johns', "Sullivan's Gulch", 'Beaumont-Wilshire', 'Portsmouth',
       'Foster-Powell', 'Parkrose', 'Reed', 'Alameda', 'Multnomah',
       'Hazelwood', 'Cathedral Park', 'Homestead', 'Woodstock',
       'Arlington Heights', 'South Burlingame', 'Goose Hollow',
       'Mt. Scott-Arleta', 'South Portland', 'Argay', 'Hillside',
       'Creston-

### We are going to drop neighbourhood column for Portland Dataset

In [82]:
updated_df.drop('neighbourhood', axis=1, inplace=True)

# Final look into missing values

In [83]:
updated_df.isnull().sum()

id                                  0
host_id                             0
host_name                           0
host_since                          0
host_location                       0
host_response_time                681
host_response_rate                681
host_acceptance_rate              388
host_is_superhost                   0
host_listings_count                 0
host_total_listings_count           0
host_verifications                  0
host_has_profile_pic                0
host_identity_verified              0
neighbourhood_cleansed              0
latitude                            0
longitude                           0
room_type                           0
accommodates                        0
bathrooms_text                      0
bedrooms                            0
beds                                0
amenities                           0
price                               0
number_of_reviews                   0
review_scores_value               625
calculated_h

# Save the Final Dataframe

In [84]:
updated_df.to_csv('data/listings_detailed_after_na_portland.csv', index=False)