# New Orleans

In [1]:
import pandas as pd
import numpy as np

In [2]:
listings_detailed = pd.read_csv('usa/New Orleans/listings_detailed-nola.csv')

In [3]:
listings_detailed.shape

(7056, 75)

# Delete Unnecessary Columns

In [4]:
# Columns you want to keep
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location', 
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Dropping the columns that are not in 'columns_to_keep'
listings_detailed = listings_detailed[columns_to_keep]

In [5]:
listings_detailed.shape

(7056, 29)

# Remove Entries where 'price' and 'accomodates' is zero / NA

## Pre-processing on 'price' column

In [6]:
# Remove the dollar sign from the price column and change the type to float
listings_detailed['price'] = listings_detailed['price'].replace('[\$,]', '', regex=True).astype(float)

In [8]:
# Test: Correct filtering expression
filtered_df = listings_detailed[listings_detailed['price'] == 120]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
4,35649853,127726714,Scott,2017-04-26,"New Orleans, LA",,,,f,4,...,4,,1 bath,2.0,2.0,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",100.0,1,5.00,2
9,52655969,49568908,Robert,2015-11-21,"New York, NY",within a day,100%,50%,t,4,...,4,,1 bath,,1.0,"[""Air conditioning"", ""Smoke alarm"", ""Essential...",100.0,2,5.00,3
53,46541621,266619399,Tim And Kaelyn,2019-06-05,"Lafayette, LA",within an hour,100%,100%,f,7,...,6,,1 bath,2.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",100.0,107,4.39,7
112,719124317899114856,140311305,Charles,2017-07-13,"New Orleans, LA",within an hour,100%,,f,1,...,3,,1 bath,1.0,1.0,"[""Air conditioning"", ""Exercise equipment"", ""Sm...",100.0,0,,1
157,14264317,86915355,Jessica,2016-07-30,,within an hour,100%,94%,t,1,...,4,,1 bath,1.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",100.0,112,4.92,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6921,21477155,16783088,Nicole,2014-06-14,"New Orleans, LA",,,,f,2,...,5,,1 bath,2.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Game console""...",100.0,34,4.94,2
6949,19458946,124312955,Matt,2017-04-04,"New Orleans, LA",within an hour,100%,100%,t,1,...,2,,1 bath,1.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",100.0,323,4.91,1
6985,24312359,29120500,Kendall,2015-03-10,"Esmont, VA",,,,f,1,...,2,,1.5 baths,1.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",100.0,5,5.00,1
7018,25511034,1918545,Henry,2012-03-13,"New Orleans, LA",,,,f,1,...,2,,1 bath,,1.0,"[""Smoke alarm"", ""Refrigerator"", ""Patio or balc...",100.0,28,4.93,1


In [9]:
price_zero_or_na_count = listings_detailed['price'].isnull().sum() + (listings_detailed['price'] == 0).sum()
print(price_zero_or_na_count)


8


In [10]:
# Filter listings where price is either null or equals 0
listings_detailed.loc[listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
0,49751833,267642269,The Old No. 77 Hotel & Chandlery,2019-06-10,,within an hour,100%,95%,,11,...,0,,,,,"[""Smoke alarm"", ""Free wifi"", ""Safe"", ""Bed shee...",0.0,284,4.45,1
16,43309170,311900534,Melrose Suites,2019-11-24,,,,,,2,...,0,,,,,"[""Smoke alarm"", ""Coffee maker"", ""Free wifi"", ""...",0.0,0,,1
103,43309178,311388965,Streetcar Inn,2019-11-22,,,,,,4,...,0,,,,,"[""Air conditioning"", ""Toiletries"", ""Hair dryer...",0.0,0,,1
105,42535097,315008938,The Drifter Hotel,2019-12-06,,,,,,5,...,8,,,,,"[""Air conditioning"", ""Toiletries"", ""Hair dryer...",0.0,0,,1
226,43309185,311388674,Chateau Hotel,2019-11-22,,,,,,4,...,0,,,,,"[""Toiletries"", ""Hair dryer"", ""Smoke alarm"", ""C...",0.0,0,,1
227,43309194,311388025,Audubon Cottages,2019-11-22,,,,,,7,...,0,,,,,"[""Air conditioning"", ""42\"" TV"", ""Toiletries"", ...",0.0,0,,1
3404,43309157,311387788,Hotel Royal,2019-11-22,,,,,,5,...,0,,,,,"[""Air conditioning"", ""Toiletries"", ""Hair dryer...",0.0,0,,1
5674,42875637,266877580,Federal City,2019-06-07,,,,20%,,2,...,0,,,,,"[""Air conditioning"", ""Toiletries"", ""Hair dryer...",0.0,3,1.67,1


In [11]:
# Filter listings where accommodates is either null or equals 0
listings_detailed.loc[listings_detailed['accommodates'].isnull() | (listings_detailed['accommodates'] == 0)]




Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
0,49751833,267642269,The Old No. 77 Hotel & Chandlery,2019-06-10,,within an hour,100%,95%,,11,...,0,,,,,"[""Smoke alarm"", ""Free wifi"", ""Safe"", ""Bed shee...",0.0,284,4.45,1
16,43309170,311900534,Melrose Suites,2019-11-24,,,,,,2,...,0,,,,,"[""Smoke alarm"", ""Coffee maker"", ""Free wifi"", ""...",0.0,0,,1
103,43309178,311388965,Streetcar Inn,2019-11-22,,,,,,4,...,0,,,,,"[""Air conditioning"", ""Toiletries"", ""Hair dryer...",0.0,0,,1
226,43309185,311388674,Chateau Hotel,2019-11-22,,,,,,4,...,0,,,,,"[""Toiletries"", ""Hair dryer"", ""Smoke alarm"", ""C...",0.0,0,,1
227,43309194,311388025,Audubon Cottages,2019-11-22,,,,,,7,...,0,,,,,"[""Air conditioning"", ""42\"" TV"", ""Toiletries"", ...",0.0,0,,1
3404,43309157,311387788,Hotel Royal,2019-11-22,,,,,,5,...,0,,,,,"[""Air conditioning"", ""Toiletries"", ""Hair dryer...",0.0,0,,1
5674,42875637,266877580,Federal City,2019-06-07,,,,20%,,2,...,0,,,,,"[""Air conditioning"", ""Toiletries"", ""Hair dryer...",0.0,3,1.67,1


## Create Function to Identify the Zero/NA values and Remove them

In [12]:
def clean_listings(df):
    """
    Removes listings from the dataframe where price is zero/NA or accommodates is zero/NA.
    Counts the number of listings removed based on these criteria.
    
    Parameters:
    - df: A pandas DataFrame with the listings data.
    
    Returns:
    - cleaned_df: The cleaned DataFrame with the specified entries removed.
    - count_removed: The number of listings removed based on the criteria.
    """
    
    # Convert price from string to float
    df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
    
    # Find entries to remove based on conditions
    to_remove = df[(df['price'] == 0) | (df['price'].isna()) | (df['accommodates'] == 0) | (df['accommodates'].isna())]
    
    # Count the number of listings to remove
    count_removed = len(to_remove)

    print(f"Number of listings removed: {count_removed}")
    
    # Remove the entries from the DataFrame
    cleaned_df = df.drop(to_remove.index)
    
    return cleaned_df


In [13]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 8


In [14]:
cleaned_listings.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
1,54052921,227505133,Dan,2018-11-25,"New Orleans, LA",,,100%,t,5,...,1,,1 shared bath,1.0,1.0,"[""Smoke alarm"", ""Clothing storage: closet"", ""D...",42.0,2,5.0,5
2,611584572696788093,244418201,Jabari,2019-02-20,"Nola, Italy",within an hour,100%,100%,t,2,...,10,,2 baths,3.0,3.0,"[""Smoke alarm"", ""Refrigerator"", ""Shampoo"", ""Cl...",263.0,24,4.71,2
3,806067969324909110,4423079,Guy,2012-12-17,"New Orleans, LA",within an hour,100%,,f,1,...,6,,2 baths,3.0,5.0,"[""Air conditioning"", ""TV"", ""Indoor fireplace"",...",255.0,0,,1
4,35649853,127726714,Scott,2017-04-26,"New Orleans, LA",,,,f,4,...,4,,1 bath,2.0,2.0,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",100.0,1,5.0,2
5,557502553052386952,12305030,Jason,2014-02-16,"New Orleans, LA",within an hour,100%,94%,t,1,...,2,,1 private bath,1.0,1.0,"[""Hair dryer"", ""Air conditioning"", ""Smoke alar...",90.0,27,4.89,1


In [15]:
cleaned_listings.shape

(7048, 29)

# Impute 'bathroom_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [16]:
cleaned_listings['bathrooms_text'].unique()

array(['1 shared bath', '2 baths', '1 bath', '1 private bath',
       '2.5 baths', '3 baths', '4.5 baths', '1.5 baths', '4 baths',
       '3.5 baths', '1.5 shared baths', '5 baths', '2 shared baths',
       '5.5 baths', '9 baths', nan, '2.5 shared baths', '0 shared baths',
       '8.5 baths', '6 baths', '0 baths', '7 baths', '9.5 baths',
       '10.5 baths', '13.5 baths', '8 baths', '3 shared baths',
       '18 baths', '6.5 baths', '11 baths', '25 baths', '11.5 baths',
       '6.5 shared baths'], dtype=object)

In [17]:
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].astype(str)
# Convert 'bathrooms_text' column to strings, treating NaN as empty strings
cleaned_listings['bathrooms_text'] = listings_detailed['bathrooms_text'].fillna('').astype(str)
cleaned_listings
# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [18]:
# Replace specific strings in 'bathrooms_text' column
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace({
    'Half-bath': '0.5 baths',
    'Private half-bath': '0.5 baths',
    'Shared half-bath': '0.5 baths'
})

# Check the unique values to verify the changes
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['1 shared bath' '2 baths' '1 bath' '1 private bath' '2.5 baths' '3 baths'
 '4.5 baths' '1.5 baths' '4 baths' '3.5 baths' '1.5 shared baths'
 '5 baths' '2 shared baths' '5.5 baths' '9 baths' '' '2.5 shared baths'
 '0 shared baths' '8.5 baths' '6 baths' '0 baths' '7 baths' '9.5 baths'
 '10.5 baths' '13.5 baths' '8 baths' '3 shared baths' '18 baths'
 '6.5 baths' '11 baths' '25 baths' '11.5 baths' '6.5 shared baths']


### Check for the Nan for bathrooms_text

In [19]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
967,24247731,121931799,Monica,2017-03-21,"New Orleans, LA",,,33%,f,1,...,16,,,11.0,9.0,"[""Baby monitor"", ""Smoke alarm"", ""Refrigerator""...",5400.0,0,,1
1728,665259,1451330,Leron And Dennis,2011-11-28,"New Orleans, LA",within an hour,100%,100%,t,2,...,2,,,1.0,1.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",89.0,227,4.69,2
3181,18019499,104792142,Glen,2016-11-22,"New Orleans, LA",within a day,100%,82%,f,4,...,4,,,2.0,2.0,"[""Smoke alarm"", ""Patio or balcony"", ""Pool"", ""H...",300.0,5,4.8,4
5400,35807105,268586331,Vic,2019-06-14,"New Orleans, LA",within an hour,100%,99%,t,7,...,4,,,2.0,3.0,"[""Smoke alarm"", ""Essentials"", ""Fire extinguish...",339.0,48,4.68,7
5731,35807260,268586331,Vic,2019-06-14,"New Orleans, LA",within an hour,100%,99%,t,7,...,4,,,2.0,4.0,"[""Smoke alarm"", ""Essentials"", ""Fire extinguish...",359.0,25,4.88,7


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms'
2. if the 'bedrooms' column is also NA, fill in NAs with 1/'beds' column

In [20]:
def fill_missing_bathrooms(df):
    """
    Fills missing or empty 'bathrooms_text' values in the DataFrame.
    - Uses the 'bedrooms' value if available.
    - If 'bedrooms' is also missing, uses half the 'beds' value.
    """
    # Loop through each row in DataFrame
    for index, row in df.iterrows():
        # Check if 'bathrooms_text' is NaN or an empty string
        if pd.isnull(row['bathrooms_text']) or row['bathrooms_text'] == '':
            # Use 'bedrooms' if not NaN and not 0
            if not pd.isnull(row['bedrooms']) and row['bedrooms'] != 0:
                df.at[index, 'bathrooms_text'] = str(row['bedrooms']) + ' bath'
            # If 'bedrooms' is NaN or 0, use half the 'beds' value
            elif not pd.isnull(row['beds']) and row['beds'] != 0:
                # Calculate half the beds and format it to match the bathroom text style
                half_beds_as_baths = row['beds'] / 2
                df.at[index, 'bathrooms_text'] = str(half_beds_as_baths) + ' bath'
            # If both 'bedrooms' and 'beds' are not available, set a default value
            else:
                df.at[index, 'bathrooms_text'] = '1 bath' # Default value or consider another logic
                
    return df



In [21]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [22]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
1,54052921,1.0,1.0,1 shared bath
2,611584572696788093,3.0,3.0,2 baths
3,806067969324909110,3.0,5.0,2 baths
4,35649853,2.0,2.0,1 bath
5,557502553052386952,1.0,1.0,1 private bath
...,...,...,...,...
7051,29336328,2.0,4.0,2 baths
7052,29516680,3.0,3.0,2 baths
7053,29779203,2.0,2.0,1 bath
7054,29799160,2.0,2.0,2 baths


In [23]:
# check for the previous NA rows in bathrooms text
filled_cleaned_listings[
    (filled_cleaned_listings['id'] == 35807260) | 
    (filled_cleaned_listings['id'] == 35807105) | 
    (filled_cleaned_listings['id'] == 18019499) | 
    (filled_cleaned_listings['id'] == 665259) ]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count
1728,665259,1451330,Leron And Dennis,2011-11-28,"New Orleans, LA",within an hour,100%,100%,t,2,...,2,,1.0 bath,1.0,1.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",89.0,227,4.69,2
3181,18019499,104792142,Glen,2016-11-22,"New Orleans, LA",within a day,100%,82%,f,4,...,4,,2.0 bath,2.0,2.0,"[""Smoke alarm"", ""Patio or balcony"", ""Pool"", ""H...",300.0,5,4.8,4
5400,35807105,268586331,Vic,2019-06-14,"New Orleans, LA",within an hour,100%,99%,t,7,...,4,,2.0 bath,2.0,3.0,"[""Smoke alarm"", ""Essentials"", ""Fire extinguish...",339.0,48,4.68,7
5731,35807260,268586331,Vic,2019-06-14,"New Orleans, LA",within an hour,100%,99%,t,7,...,4,,2.0 bath,2.0,4.0,"[""Smoke alarm"", ""Essentials"", ""Fire extinguish...",359.0,25,4.88,7


### Function to make numerical values for # of bath

In [24]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process.
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    # Extract numerical part from the specified column and convert to float
    df['num_bath'] = df[column_name].str.extract('([0-9]+\.?[0-9]*)').astype(float)
    return df

In [25]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

      bathrooms_text  num_bath
1      1 shared bath       1.0
2            2 baths       2.0
3            2 baths       2.0
4             1 bath       1.0
5     1 private bath       1.0
...              ...       ...
7051         2 baths       2.0
7052         2 baths       2.0
7053          1 bath       1.0
7054         2 baths       2.0
7055       1.5 baths       1.5

[7048 rows x 2 columns]


In [26]:
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
1,54052921,227505133,Dan,2018-11-25,"New Orleans, LA",,,100%,t,5,...,,1 shared bath,1.0,1.0,"[""Smoke alarm"", ""Clothing storage: closet"", ""D...",42.0,2,5.0,5,1.0
2,611584572696788093,244418201,Jabari,2019-02-20,"Nola, Italy",within an hour,100%,100%,t,2,...,,2 baths,3.0,3.0,"[""Smoke alarm"", ""Refrigerator"", ""Shampoo"", ""Cl...",263.0,24,4.71,2,2.0
3,806067969324909110,4423079,Guy,2012-12-17,"New Orleans, LA",within an hour,100%,,f,1,...,,2 baths,3.0,5.0,"[""Air conditioning"", ""TV"", ""Indoor fireplace"",...",255.0,0,,1,2.0
4,35649853,127726714,Scott,2017-04-26,"New Orleans, LA",,,,f,4,...,,1 bath,2.0,2.0,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",100.0,1,5.0,2,1.0
5,557502553052386952,12305030,Jason,2014-02-16,"New Orleans, LA",within an hour,100%,94%,t,1,...,,1 private bath,1.0,1.0,"[""Hair dryer"", ""Air conditioning"", ""Smoke alar...",90.0,27,4.89,1,1.0


In [27]:
updated_df['num_bath'].isnull().sum()

0

In [28]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [29]:
updated_df['num_bath'].unique()

array([ 1. ,  2. ,  2.5,  3. ,  4.5,  1.5,  4. ,  3.5,  5. ,  5.5,  9. ,
       11. ,  0. ,  8.5,  6. ,  7. ,  9.5, 10.5, 13.5,  8. , 18. ,  6.5,
       25. , 11.5])

In [30]:
# delete bathrooms column
updated_df.drop('bathrooms', axis=1, inplace=True)

In [31]:
updated_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
1,54052921,227505133,Dan,2018-11-25,"New Orleans, LA",,,100%,t,5,...,1,1 shared bath,1.0,1.0,"[""Smoke alarm"", ""Clothing storage: closet"", ""D...",42.0,2,5.00,5,1.0
2,611584572696788093,244418201,Jabari,2019-02-20,"Nola, Italy",within an hour,100%,100%,t,2,...,10,2 baths,3.0,3.0,"[""Smoke alarm"", ""Refrigerator"", ""Shampoo"", ""Cl...",263.0,24,4.71,2,2.0
3,806067969324909110,4423079,Guy,2012-12-17,"New Orleans, LA",within an hour,100%,,f,1,...,6,2 baths,3.0,5.0,"[""Air conditioning"", ""TV"", ""Indoor fireplace"",...",255.0,0,,1,2.0
4,35649853,127726714,Scott,2017-04-26,"New Orleans, LA",,,,f,4,...,4,1 bath,2.0,2.0,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",100.0,1,5.00,2,1.0
5,557502553052386952,12305030,Jason,2014-02-16,"New Orleans, LA",within an hour,100%,94%,t,1,...,2,1 private bath,1.0,1.0,"[""Hair dryer"", ""Air conditioning"", ""Smoke alar...",90.0,27,4.89,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7051,29336328,220963373,Marlee,2018-10-16,"Hammond, LA",,,,f,1,...,7,2 baths,2.0,4.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",149.0,20,4.60,1,2.0
7052,29516680,2591964,Celeste,2012-06-09,"New Orleans, LA",,,,f,1,...,8,2 baths,3.0,3.0,"[""Smoke alarm"", ""Babysitter recommendations"", ...",140.0,1,4.00,1,2.0
7053,29779203,103153653,Robert,2016-11-09,"New Orleans, LA",,,,f,3,...,4,1 bath,2.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Patio or balc...",69.0,19,4.89,1,1.0
7054,29799160,15524853,Meggan,2014-05-14,"New Orleans, LA",,,,f,3,...,3,2 baths,2.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Patio or balc...",71.0,39,4.92,2,2.0


# Impute Bedrooms and Beds

In [32]:
updated_df['bedrooms'].unique()

array([ 1.,  3.,  2., nan,  4.,  5.,  6.,  9.,  7., 11.,  8., 10., 15.,
       17., 18., 25., 23.])

In [33]:
updated_df['beds'].unique()

array([ 1.,  3.,  5.,  2.,  4., nan,  8.,  7.,  6., 10.,  9., 13., 17.,
       12., 16., 11., 14., 15., 20., 21., 24., 19., 31.])

In [34]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
9,52655969,49568908,Robert,2015-11-21,"New York, NY",within a day,100%,50%,t,4,...,4,1 bath,,1.0,"[""Air conditioning"", ""Smoke alarm"", ""Essential...",100.0,2,5.00,3,1.0
25,39487985,120132877,David,2017-03-10,"Setauket- East Setauket, NY",within an hour,90%,97%,t,45,...,4,1 shared bath,,2.0,"[""Hair dryer"", ""Air conditioning"", ""Smoke alar...",150.0,0,,3,1.0
50,579636870207674260,442922842,Canal Express Studios,2022-01-31,,within an hour,100%,99%,f,14,...,4,1 bath,,2.0,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",115.0,8,4.13,14,1.0
69,580258457203568391,442922842,Canal Express Studios,2022-01-31,,within an hour,100%,99%,f,14,...,3,1 bath,,2.0,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",94.0,7,4.14,14,1.0
90,19472402,12613718,Matthew & Miranda,2014-02-26,"Chicago, IL",within an hour,100%,100%,f,343,...,2,1 bath,,1.0,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",106.0,0,,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6978,23397681,157921349,James,2017-11-08,"New Orleans, LA",,,,f,10,...,2,1 bath,,1.0,"[""Smoke alarm"", ""Coffee maker"", ""Shampoo"", ""Se...",65.0,73,4.67,6,1.0
7018,25511034,1918545,Henry,2012-03-13,"New Orleans, LA",,,,f,1,...,2,1 bath,,1.0,"[""Smoke alarm"", ""Refrigerator"", ""Patio or balc...",100.0,28,4.93,1,1.0
7027,26888889,140393442,Dorise,2017-07-14,"New Orleans, LA",,,,f,2,...,2,1 bath,,1.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",95.0,56,4.98,2,1.0
7034,28158610,98693456,James,2016-10-07,,,,,f,5,...,2,1 bath,,1.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",75.0,52,4.83,4,1.0


In [35]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
15,42670637,93924717,Diana,2016-09-06,"New York, NY",,,,f,1,...,6,2 baths,2.0,,"[""Smoke alarm"", ""Hot tub"", ""Pool"", ""Shampoo"", ...",140.0,0,,1,2.0
44,589710011371284658,342910048,LiivUP,2020-03-31,"Baton Rouge, LA",within an hour,100%,100%,t,16,...,4,2 baths,2.0,,"[""Hair dryer"", ""Air conditioning"", ""Shampoo"", ...",283.0,7,4.57,16,2.0
145,9979215,48187766,Olive,2015-11-04,"New Orleans, LA",within an hour,100%,86%,f,24,...,3,1 shared bath,1.0,,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",49.0,88,4.59,24,1.0
363,42538906,338825631,Elliott,2020-02-25,"New Orleans, LA",within a few hours,100%,94%,t,1,...,4,1 bath,2.0,,"[""Smoke alarm"", ""Refrigerator"", ""Patio or balc...",155.0,18,4.89,1,1.0
846,563629325142763733,64823547,Brenna And Nick,2016-03-28,"New Orleans, LA",within an hour,100%,100%,t,78,...,2,1 bath,1.0,,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",43.0,3,5.0,74,1.0
959,42692683,325629344,Dorothy,2020-01-08,"New Orleans, LA",within an hour,100%,100%,t,2,...,2,1 shared bath,1.0,,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",99.0,22,4.77,2,1.0
1079,52533707,185244265,One Stop Property Shop,2018-04-19,"Harvey, LA",within an hour,100%,100%,t,28,...,2,1 private bath,1.0,,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",97.0,1,5.0,22,1.0
1299,635419755040311462,64823547,Brenna And Nick,2016-03-28,"New Orleans, LA",within an hour,100%,100%,t,78,...,2,1 bath,1.0,,"[""Smoke alarm"", ""Refrigerator"", ""Shampoo"", ""Se...",70.0,2,5.0,74,1.0
1336,41804195,65665251,Jeanette,2016-04-03,"Slidell, LA",,,100%,f,5,...,2,1.5 shared baths,1.0,,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",257.0,0,,5,1.5
1750,823866116424585046,243202376,Terrene,2019-02-14,,within a day,100%,77%,f,2,...,6,2 baths,2.0,,"[""Air conditioning"", ""Smoke alarm"", ""Fire exti...",209.0,0,,2,2.0


In [36]:
def update_bedrooms_and_beds(df):
    """
    Updates the 'bedrooms' and 'beds' columns based on the conditions:
    1. If 'bedrooms' is null, set it to the rounded up value of 'num_bath'.
    2. If 'beds' is null, copy the value from 'bedrooms'.
    """
    # Rule 1: If bedrooms is NaN, round up 'num_bath' to nearest integer and assign to 'bedrooms'
    df.loc[df['bedrooms'].isnull(), 'bedrooms'] = np.ceil(df['num_bath'])
    
    # Rule 2: If beds is NaN, copy the value from 'bedrooms'
    df.loc[df['beds'].isnull(), 'beds'] = df['bedrooms']
    
    return df


In [37]:
updated_df = update_bedrooms_and_beds(updated_df)


In [38]:
# check for the previous NA rows in bedrooms
updated_df[
    (updated_df['id'] == 22882224) | 
    (updated_df['id'] == 54316788) | 
    (updated_df['id'] == 562957805140895010) 
]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
3818,54316788,64823547,Brenna And Nick,2016-03-28,"New Orleans, LA",within an hour,100%,100%,t,78,...,2,1 bath,1.0,1.0,"[""Smoke alarm"", ""Babysitter recommendations"", ...",63.0,2,5.0,74,1.0
4572,22882224,118004286,Colleen,2017-02-25,"New Orleans, LA",within an hour,100%,88%,t,3,...,2,1 bath,1.0,1.0,"[""Smoke alarm"", ""Refrigerator"", ""Patio or balc...",113.0,118,4.81,3,1.0
6097,562957805140895010,218602737,Sonder (New Orleans),2018-10-03,"New Orleans, LA",within an hour,99%,99%,f,81,...,4,2 baths,2.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Patio or balc...",362.0,138,4.64,78,2.0


In [39]:
updated_df['beds'].unique()

array([ 1.,  3.,  5.,  2.,  4.,  8.,  7.,  6., 10.,  9., 13., 17., 12.,
        0., 16., 11., 14., 15., 20., 21., 24., 19., 31.])

In [40]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [41]:
updated_df['bedrooms'].unique()

array([ 1.,  3.,  2.,  4.,  5.,  6.,  9.,  7., 11.,  8.,  0., 10., 15.,
       17., 18., 25., 23.])

In [42]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_since

In [43]:
# check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%B %Y")
updated_df['host_since'].fillna(most_recent_date, inplace=True)


There are 0 missing values in the "host_since" column.


In [44]:
updated_df['host_since'].isnull().sum()

0

In [45]:
updated_df[updated_df['host_since'] == 'N/A']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [46]:
updated_df['host_since'].unique()

array(['2018-11-25', '2019-02-20', '2012-12-17', ..., '2012-06-09',
       '2016-11-09', '2013-04-16'], dtype=object)

# host_location

In [47]:
# check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'
updated_df['host_location'].fillna("unknown", inplace=True)

There are 1794 missing values in the "host_location" column.


In [48]:
updated_df['host_location'].isnull().sum()

0

In [49]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

1794

In [50]:
updated_df['host_location'].unique()

array(['New Orleans, LA', 'Nola, Italy', 'unknown', 'New York, NY',
       'Metairie, LA', 'Violet, LA', 'Setauket- East Setauket, NY',
       'Baton Rouge, LA', 'Orlando, FL', 'Oak Park, IL', 'Chicago, IL',
       'Ontario, CA', 'Lafayette, LA', 'Slidell, LA', 'Gretna, LA',
       'Austin, TX', 'Plantation, FL', 'Pennsylvania, United States',
       'United States', 'London, United Kingdom', 'Asheville, NC',
       'Tampa, FL', 'Picayune, MS', 'Harvey, LA', 'Albuquerque, NM',
       'Cedar Lake, IN', 'Bluffton, SC', 'Houston, TX', 'Semmes, AL',
       'Union City, GA', 'Norfolk, VA', 'Pegram, TN', 'Tustin, CA',
       'Portland, OR', 'Baltimore, MD', 'Emeryville, CA', 'Denver, CO',
       'Natchitoches, LA', 'Illinois City, IL', 'Atlanta, GA',
       'San Francisco, CA', 'Lake Charles, LA', 'Erie, CO', 'Dallas, TX',
       'Mandeville, LA', 'New York, United States', 'Las Vegas, NV',
       'Oakland, CA', 'Miramar, FL', 'Louisiana, United States',
       'Harbor, OR', 'Pensacola, FL',

In [51]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_is_superhost

In [52]:
# check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

updated_df['host_is_superhost'].fillna("f", inplace=True)

There are 0 missing values in the "host_is_superhost" column.


In [53]:
updated_df['host_is_superhost'].isnull().sum()

0

In [54]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

4059

In [55]:
updated_df['host_is_superhost'].unique()

array(['t', 'f'], dtype=object)

# host_listings_count

In [56]:
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

updated_df['host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "host_listings_count" column.


In [57]:
updated_df['host_listings_count'].isnull().sum()

0

In [58]:
updated_df['host_listings_count'].unique()

array([   5,    2,    1,    4,    3,  212,    7,   45,  419,   12,   16,
        343,   14,   27,   63,   11,   24,   17,   20,    9,  904,   15,
          8,   10,   18,  119,  201,   30,   75,   31,    6,   61,   28,
         41,   68,  536,   78,   21,   46,   37,  202,   84,   19,  147,
         53,  735,  365, 3291,   39,   43,  117,  167,   81,   13, 2642,
         22,   92,   29,   23,  449,   25,   97,  166,  112, 2508,  226,
        330,   44,  187,   70,  374,  225,  630,   36,  132,   58,   50,
        355,  559,   51,   52,   40, 2072,   38,   42,  152,   62])

# host_total_listings_count

In [59]:
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

updated_df['host_total_listings_count'].fillna(1, inplace=True)


There are 0 missing values in the "host_total_listings_count" column.


In [60]:
updated_df['host_total_listings_count'].isnull().sum()

0

In [61]:
updated_df['host_total_listings_count'].unique()

array([   8,    2,    1,    6,    3,    4,  265,   23,    7,   57,    5,
        570,   13,   20,   18,  471,   15,   33,   21,    9,   75,   25,
         35,   12,   24,   56,   11, 1622,   10,   19,   62,  122,  239,
         30,   48,   81,   42,   65,   28,  198,  782,   14,  181,   52,
         38,  364,   86,  351,   73,   16,   34,   93, 1417,  838, 4527,
         54,  196,   17,   41,  417,   31,  153,   26,  536,   29,   27,
         51, 8331,   32,   22,  147,   53,   39,  491,   37,   78,  110,
         43,  167,  161, 2750,   79,  252, 1115,   61,  286,  298,  375,
        238,  244,  736,  190,  150,   64,  106,  392,  662,  102,   46,
       2268,   47,   55,  323,   77,   66,   90,   84,  205])

# host_verifications

In [62]:
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

updated_df['host_verifications'].fillna("None", inplace=True)


There are 0 missing values in the "host_verifications" column.


In [63]:
updated_df['host_verifications'].unique()

array(["['email', 'phone', 'work_email']", "['email', 'phone']",
       "['phone']", "['phone', 'work_email']", '[]', "['email']"],
      dtype=object)

In [64]:
updated_df[updated_df['host_verifications'] == '[]']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath
1677,26556038,16694390,Beth,2016-05-20,"New Orleans, LA",within an hour,100%,93%,t,2,...,2,1 shared bath,1.0,1.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",109.0,56,4.96,2,1.0
4344,26448282,16694390,Beth,2016-05-20,"New Orleans, LA",within an hour,100%,93%,t,2,...,2,1 bath,1.0,1.0,"[""Smoke alarm"", ""Refrigerator"", ""Patio or balc...",129.0,76,4.95,2,1.0
4660,806434116599513282,496567001,Edward,2023-01-17,unknown,a few days or more,0%,0%,f,1,...,6,1 bath,2.0,3.0,"[""Smoke alarm"", ""Refrigerator"", ""Shampoo"", ""Cl...",269.0,0,,1,1.0
5942,40611713,314702788,Hospitality NOLA,2019-12-05,unknown,,,,f,3,...,8,2 baths,3.0,3.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",242.0,0,,3,2.0
5945,40633979,314702788,Hospitality NOLA,2019-12-05,unknown,,,,f,3,...,4,1.5 baths,1.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Patio or balc...",187.0,0,,3,1.5
5968,40704451,314702788,Hospitality NOLA,2019-12-05,unknown,,,,f,3,...,6,2 baths,2.0,2.0,"[""Smoke alarm"", ""Refrigerator"", ""Coffee maker""...",295.0,0,,3,2.0


In [65]:
# Replace '[]' with None in 'host_verifications' column
updated_df['host_verifications'] = updated_df['host_verifications'].replace('[]', "None")


In [66]:
updated_df['host_verifications'].unique()

array(["['email', 'phone', 'work_email']", "['email', 'phone']",
       "['phone']", "['phone', 'work_email']", 'None', "['email']"],
      dtype=object)

# host_identity_verified

In [67]:
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

updated_df['host_identity_verified'].fillna("f", inplace=True)


There are 0 missing values in the "host_identity_verified" column.


In [68]:
updated_df['host_identity_verified'].unique()

array(['t', 'f'], dtype=object)

# calculated_host_listings_count

In [69]:
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

updated_df['calculated_host_listings_count'].fillna(1, inplace=True)

There are 0 missing values in the "calculated_host_listings_count" column.


In [70]:
updated_df['calculated_host_listings_count'].unique()

array([  5,   2,   1,   3, 100,   7,  12,   4,  16,  14,  27,  58,   9,
        24,  26,  15,   8,  10,   6,  29,  28,  67,  13,  38,  20,  37,
        83,  22,  74,  11,  19,  50,  17,  78,  18,  41,  40,  34])

# host_name

In [71]:
updated_df[updated_df['host_name'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


# host_has_profile_pic

In [72]:
updated_df[updated_df['host_has_profile_pic'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath


In [73]:
updated_df['host_has_profile_pic'].unique()

array(['t', 'f'], dtype=object)

# Neighborhood

In [74]:
updated_df['neighbourhood'].unique()

array(['New Orleans, Louisiana, United States', nan,
       'New Orleans , Louisiana, United States',
       'ByWater-Marigny, New Orleans , Louisiana, United States',
       'Arabi, Louisiana, United States', 'New Orleans, United States',
       'New Orleans , La, United States',
       'ByWater-Marigny, New Orleans, Louisiana, United States',
       'New Orleans, Louisiana, US, Louisiana, United States',
       'New orleans, Louisiana, United States',
       'New Orleans,, La, United States'], dtype=object)

In [75]:
updated_df['neighbourhood'].isnull().sum()

2029

In [76]:
updated_df['neighbourhood_cleansed'].unique()

array(['Little Woods', 'Hollygrove', 'Broadmoor',
       'Marlyville - Fontainbleau', 'Bywater', 'Mid-City', 'Lakeview',
       'Treme - Lafitte', 'Dillard', 'Seventh Ward', 'Mcdonogh',
       'St. Roch', 'Central Business District', 'Central City',
       'French Quarter', 'Audubon', 'Bayou St. John', 'St. Claude',
       'East Riverside', 'Fairgrounds', 'Uptown', 'Fillmore',
       'Village De Lest', 'St. Thomas Dev', 'Gentilly Terrace',
       'Tulane - Gravier', 'Irish Channel', 'Black Pearl', 'Plum Orchard',
       'Touro', 'Lower Garden District', 'Garden District',
       'St.  Anthony', 'Holy Cross', 'Leonidas', 'Marigny', 'Behrman',
       'Milneburg', 'West Riverside', 'City Park', 'Pines Village',
       'Algiers Point', 'East Carrollton', 'U.S. Naval Base',
       'Lake Catherine', 'West End', 'West Lake Forest', 'Milan',
       'Desire Area', 'Freret', 'Old Aurora', 'Read Blvd East',
       'Lakeshore - Lake Vista', 'Gert Town', 'Tall Timbers - Brechtel',
       'Pontchart

In [77]:
updated_df['neighbourhood_cleansed'].isnull().sum()

0

### We are going to drop neighbourhood column for NOLA Dataset

In [78]:
updated_df.drop('neighbourhood', axis=1, inplace=True)

# host_response_time

In [79]:
updated_df['host_response_time'].unique()

array([nan, 'within an hour', 'within a few hours', 'within a day',
       'a few days or more'], dtype=object)

# host_response_rate

In [80]:
updated_df['host_response_rate'].unique()

array([nan, '100%', '90%', '74%', '80%', '92%', '78%', '97%', '0%', '95%',
       '99%', '38%', '50%', '33%', '96%', '64%', '69%', '98%', '83%',
       '89%', '82%', '91%', '88%', '60%', '94%', '86%', '85%', '70%',
       '93%', '75%', '67%', '71%', '79%', '25%', '72%', '87%', '61%',
       '43%', '22%', '30%', '13%'], dtype=object)

# host_acceptance_rate

In [81]:
updated_df['host_acceptance_rate'].unique()

array(['100%', nan, '94%', '96%', '82%', '50%', '99%', '98%', '81%', '0%',
       '97%', '89%', '46%', '93%', '88%', '86%', '80%', '71%', '92%',
       '33%', '85%', '15%', '1%', '83%', '91%', '77%', '73%', '64%',
       '62%', '75%', '90%', '58%', '70%', '31%', '95%', '28%', '63%',
       '84%', '42%', '79%', '72%', '67%', '87%', '65%', '57%', '49%',
       '40%', '23%', '56%', '78%', '20%', '44%', '25%', '69%', '68%',
       '43%', '17%', '60%', '76%', '74%', '9%', '35%', '66%', '4%', '45%',
       '55%', '48%', '38%', '53%', '13%', '47%'], dtype=object)

# Adding new columns for city

Add new column and put 'New Orleans' for all rows for later joining purpose



In [83]:
updated_df['city'] = 'New Orleans'
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city
1,54052921,227505133,Dan,2018-11-25,"New Orleans, LA",,,100%,t,5,...,1 shared bath,1.0,1.0,"[""Smoke alarm"", ""Clothing storage: closet"", ""D...",42.0,2,5.0,5,1.0,New Orleans
2,611584572696788093,244418201,Jabari,2019-02-20,"Nola, Italy",within an hour,100%,100%,t,2,...,2 baths,3.0,3.0,"[""Smoke alarm"", ""Refrigerator"", ""Shampoo"", ""Cl...",263.0,24,4.71,2,2.0,New Orleans
3,806067969324909110,4423079,Guy,2012-12-17,"New Orleans, LA",within an hour,100%,,f,1,...,2 baths,3.0,5.0,"[""Air conditioning"", ""TV"", ""Indoor fireplace"",...",255.0,0,,1,2.0,New Orleans
4,35649853,127726714,Scott,2017-04-26,"New Orleans, LA",,,,f,4,...,1 bath,2.0,2.0,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",100.0,1,5.0,2,1.0,New Orleans
5,557502553052386952,12305030,Jason,2014-02-16,"New Orleans, LA",within an hour,100%,94%,t,1,...,1 private bath,1.0,1.0,"[""Hair dryer"", ""Air conditioning"", ""Smoke alar...",90.0,27,4.89,1,1.0,New Orleans


# Count number of amenities

In [84]:
import ast

# Convert the 'amenities' string to a list using ast.literal_eval
updated_df['amenities_list'] = updated_df['amenities'].apply(lambda x: ast.literal_eval(x))

# Count the number of amenities in each list and create a new column with these counts
updated_df['amenities_count'] = updated_df['amenities_list'].apply(lambda x: len(x))

updated_df.drop('amenities_list', axis=1, inplace=True)


In [85]:
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,num_bath,city,amenities_count
1,54052921,227505133,Dan,2018-11-25,"New Orleans, LA",,,100%,t,5,...,1.0,1.0,"[""Smoke alarm"", ""Clothing storage: closet"", ""D...",42.0,2,5.0,5,1.0,New Orleans,67
2,611584572696788093,244418201,Jabari,2019-02-20,"Nola, Italy",within an hour,100%,100%,t,2,...,3.0,3.0,"[""Smoke alarm"", ""Refrigerator"", ""Shampoo"", ""Cl...",263.0,24,4.71,2,2.0,New Orleans,54
3,806067969324909110,4423079,Guy,2012-12-17,"New Orleans, LA",within an hour,100%,,f,1,...,3.0,5.0,"[""Air conditioning"", ""TV"", ""Indoor fireplace"",...",255.0,0,,1,2.0,New Orleans,7
4,35649853,127726714,Scott,2017-04-26,"New Orleans, LA",,,,f,4,...,2.0,2.0,"[""Hair dryer"", ""Air conditioning"", ""Essentials...",100.0,1,5.0,2,1.0,New Orleans,18
5,557502553052386952,12305030,Jason,2014-02-16,"New Orleans, LA",within an hour,100%,94%,t,1,...,1.0,1.0,"[""Hair dryer"", ""Air conditioning"", ""Smoke alar...",90.0,27,4.89,1,1.0,New Orleans,14


In [86]:
# sanity check

see = updated_df.loc[updated_df['id'] == 806067969324909110]


amenities_text = see['amenities'].iloc[0]

# Print the extracted 'amenities' list
print(amenities_text)

["Air conditioning", "TV", "Indoor fireplace", "Wifi", "Washer", "Free parking on premises", "Kitchen"]


# Final look into missing values

In [88]:
updated_df.isnull().sum()

id                                   0
host_id                              0
host_name                            0
host_since                           0
host_location                        0
host_response_time                1154
host_response_rate                1154
host_acceptance_rate               976
host_is_superhost                    0
host_listings_count                  0
host_total_listings_count            0
host_verifications                   0
host_has_profile_pic                 0
host_identity_verified               0
neighbourhood_cleansed               0
latitude                             0
longitude                            0
room_type                            0
accommodates                         0
bathrooms_text                       0
bedrooms                             0
beds                                 0
amenities                            0
price                                0
number_of_reviews                    0
review_scores_value      

In [89]:
# Save as a new csv file
updated_df.to_csv('clean-data/listings_detailed_clean_neworleans.csv', index=False)