# Clark County

In [1]:
import pandas as pd
import numpy as np

In [2]:
listings_detailed = pd.read_csv('usa/Clark County NV/listings_detailed.csv')

In [3]:
listings_detailed.shape

(13933, 75)

# Delete Unnecessary Columns

In [4]:
# Columns to keep for the analysis; every other column of the raw export is discarded
columns_to_keep = [
    'id', 'host_id', 'host_name', 'host_since', 'host_location', 
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    'price', 'number_of_reviews', 'review_scores_value', 'calculated_host_listings_count'
]

# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so later column assignments (e.g. on 'price') do not
# trigger SettingWithCopyWarning or write into a temporary object.
listings_detailed = listings_detailed[columns_to_keep].copy()

In [5]:
listings_detailed.shape

(13933, 29)

In [6]:
# Save as a new csv file
listings_detailed.to_csv('listings_detailed_clean_clark.csv', index=False)

# Remove Entries where 'price' or 'accommodates' is zero / NA

## Pre-processing on 'price' column

In [7]:
# Remove the dollar sign and thousands separators from the price column and change the type to float.
# The pattern is a raw string so that '\$' reaches the regex engine verbatim instead of
# being treated as an (invalid) Python string escape.
listings_detailed['price'] = listings_detailed['price'].replace(r'[\$,]', '', regex=True).astype(float)

In [8]:
# Test: Correct filtering expression
filtered_df = listings_detailed[listings_detailed['price'] == 0]
filtered_df

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count


In [9]:
price_zero_or_na_count = listings_detailed['price'].isnull().sum() + (listings_detailed['price'] == 0).sum()
print(price_zero_or_na_count)


0


In [10]:
# Filter listings where price is either null or equals 0
listings_detailed.loc[listings_detailed['price'].isnull() | (listings_detailed['price'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count


In [11]:
# Filter listings where accommodates is either null or equals 0
listings_detailed.loc[listings_detailed['accommodates'].isnull() | (listings_detailed['accommodates'] == 0)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count


## Create Function to Identify the Zero/NA values and Remove them

In [12]:
def clean_listings(df):
    """
    Removes listings from the dataframe where price is zero/NA or accommodates is zero/NA,
    and prints how many listings were removed.

    The 'price' column is converted to float first ('$' and ',' are stripped), so it may
    be passed either as currency strings (e.g. '$1,250.00') or as already-numeric values.
    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters:
    - df: A pandas DataFrame with the listings data. Must contain the columns
      'price' and 'accommodates'.

    Returns:
    - cleaned_df: A new DataFrame with the specified entries removed and 'price' as float.
      (The number of removed listings is printed, not returned.)
    """

    # Convert price from string to float without touching the caller's frame.
    # Raw string: '\$' must reach the regex engine as written.
    price = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # Rows to remove: missing or zero price, or missing or zero capacity
    invalid = (
        price.isna()
        | (price == 0)
        | df['accommodates'].isna()
        | (df['accommodates'] == 0)
    )

    # Count the number of listings to remove
    count_removed = int(invalid.sum())

    print(f"Number of listings removed: {count_removed}")

    # Build the result on a copy so the input keeps its original 'price' values
    cleaned_df = df.copy()
    cleaned_df['price'] = price
    cleaned_df = cleaned_df.loc[~invalid].copy()

    return cleaned_df


In [13]:
# Test the clean_listings function
cleaned_listings = clean_listings(listings_detailed)

Number of listings removed: 0


In [14]:
cleaned_listings.shape

(13933, 29)

# Count 'amenities'

In [15]:
import ast
def count_amenities(amenities_str):
    """
    Count the amenities in the string representation of a list.

    Parameters:
    - amenities_str: A string such as '["Wifi", "Kitchen"]'. Any non-string value
      (e.g. NaN for a missing entry) is treated as "no amenities".

    Returns:
    - int: The number of items in the parsed list; 0 when the value is missing,
      cannot be parsed, or does not parse to a list/tuple/set.
    """
    # Missing values arrive as float NaN rather than str
    if not isinstance(amenities_str, str):
        return 0
    try:
        # Convert the string representation of the list back into a list
        amenities_list = ast.literal_eval(amenities_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed input
        return 0
    # A bare literal such as "5" parses fine but has no len(); a quoted string
    # would otherwise be counted character by character
    if not isinstance(amenities_list, (list, tuple, set)):
        return 0
    return len(amenities_list)

# Apply the function to each row in the 'amenities' column and create a new column 'amenities_count'
cleaned_listings['amenities_count'] = cleaned_listings['amenities'].apply(count_amenities)

In [16]:
cleaned_listings['amenities_count']

0        56
1        46
2        37
3        25
4        21
         ..
13928    32
13929    55
13930    34
13931    10
13932     8
Name: amenities_count, Length: 13933, dtype: int64

# Add 'city' column

In [17]:
# Add in a city column to each dataframe so that we can use this as part of the primary key/use it to conduct groupby for
# future EDA or additional analysis as final table will contain all of our listings data
cleaned_listings['city'] = 'Clark County NV'

# Impute 'bathrooms_text'

## Pre-processing

- 0 shared bath -> 0
- half bath -> 0.5
- private half bath -> 0.5
- shared half bath -> 0.5

In [18]:
cleaned_listings['bathrooms_text'].unique()

array(['2.5 baths', '2 baths', '1.5 baths', '1 bath', '1 private bath',
       '3 baths', '1 shared bath', '3.5 baths', '4.5 baths',
       '1.5 shared baths', '4 baths', '2 shared baths', nan, '5 baths',
       '0 baths', 'Shared half-bath', '0 shared baths', '3 shared baths',
       '5.5 baths', '6 baths', '4 shared baths', '2.5 shared baths',
       '8 baths', '6.5 baths', '9.5 baths', '7 baths', '8.5 baths',
       '7.5 baths', '12 baths', '13.5 baths', 'Private half-bath',
       '11 baths', '14 baths', 'Half-bath', '5.5 shared baths',
       '10 baths', '9 baths', '3.5 shared baths'], dtype=object)

In [19]:
# Convert 'bathrooms_text' column to strings, treating NaN as empty strings.
# fillna('') has to run before astype(str); otherwise NaN becomes the literal text 'nan'.
# The values are read from cleaned_listings itself so this cell does not depend on
# the pre-cleaning frame.
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].fillna('').astype(str)

# Verifying the conversion
print(cleaned_listings['bathrooms_text'].dtype)

object


#### Changing non-numeric bathrooms to numeric values

In [20]:
# Replace specific strings in 'bathrooms_text' column
cleaned_listings['bathrooms_text'] = cleaned_listings['bathrooms_text'].replace({
    'Half-bath': '0.5 baths',
    'Private half-bath': '0.5 baths',
    'Shared half-bath': '0.5 baths'
})

# Check the unique values to verify the changes
unique_bathrooms = cleaned_listings['bathrooms_text'].unique()
print(unique_bathrooms)


['2.5 baths' '2 baths' '1.5 baths' '1 bath' '1 private bath' '3 baths'
 '1 shared bath' '3.5 baths' '4.5 baths' '1.5 shared baths' '4 baths'
 '2 shared baths' '' '5 baths' '0 baths' '0.5 baths' '0 shared baths'
 '3 shared baths' '5.5 baths' '6 baths' '4 shared baths'
 '2.5 shared baths' '8 baths' '6.5 baths' '9.5 baths' '7 baths'
 '8.5 baths' '7.5 baths' '12 baths' '13.5 baths' '11 baths' '14 baths'
 '5.5 shared baths' '10 baths' '9 baths' '3.5 shared baths']


In [21]:
count_half_baths = (cleaned_listings['bathrooms_text'] == '0.5 baths').sum()
count_half_baths

8

### Check for NaN values in bathrooms_text

In [22]:
cleaned_listings[cleaned_listings['bathrooms_text']=='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city
236,6923653,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,,1.0,2.0,"[""Toaster"", ""Essentials"", ""Fire extinguisher"",...",1663.0,55,4.72,262,49,Clark County NV
308,10237778,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,,1.0,2.0,"[""Toaster"", ""Essentials"", ""Fire extinguisher"",...",1663.0,8,4.13,262,48,Clark County NV
343,10426936,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,,2.0,4.0,"[""Toaster"", ""Essentials"", ""UN conditioner"", ""F...",2136.0,9,4.33,262,54,Clark County NV
349,11457043,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,,2.0,4.0,"[""Toaster"", ""Essentials"", ""UN conditioner"", ""F...",2136.0,13,4.75,262,54,Clark County NV
385,11981837,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,,2.0,4.0,"[""Toaster"", ""Essentials"", ""UN conditioner"", ""F...",2136.0,11,4.64,262,54,Clark County NV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4674,49075800,231157127,Steven,2018-12-17,"Las Vegas, NV",within an hour,90%,56%,f,17,...,,,1.0,"[""Essentials"", ""Carbon monoxide alarm"", ""Shamp...",350.0,2,5.00,15,19,Clark County NV
13347,840112485953424789,23020599,Ramzi,2014-10-27,"Jeddah, Saudi Arabia",,,,f,1,...,,1.0,1.0,[],245.0,0,,1,0,Clark County NV
13537,846251343784208947,3696404,Mike,2012-09-27,"New Jersey, United States",within an hour,99%,97%,t,282,...,,1.0,2.0,"[""Essentials"", ""Fire extinguisher"", ""Pool"", ""I...",76.0,0,,39,38,Clark County NV
13733,851483198327421322,3696404,Mike,2012-09-27,"New Jersey, United States",within an hour,99%,97%,t,282,...,,1.0,2.0,"[""Essentials"", ""Fire extinguisher"", ""Pool"", ""I...",75.0,0,,39,30,Clark County NV


In [23]:
empty_bathrooms_text_ids = cleaned_listings[cleaned_listings['bathrooms_text'] == '']['id']
print(len(empty_bathrooms_text_ids))

194


### function to fill in NULL values in bathrooms_text

1. Use the 'bedrooms' value (when it is present and non-zero)
2. If 'bedrooms' is NA or 0, use half of the 'beds' value (beds / 2)
3. If 'beds' is also NA or 0, default to '1 bath'

In [24]:
def fill_missing_bathrooms(df):
    """
    Fills missing or empty 'bathrooms_text' values in the DataFrame.
    - Uses the 'bedrooms' value if it is available and non-zero.
    - Otherwise uses half the 'beds' value if that is available and non-zero.
    - Otherwise falls back to '1 bath'.

    Filled values are written as '<number> bath' (e.g. '2.0 bath', '0.5 bath').

    Parameters:
    - df: pd.DataFrame with the columns 'bathrooms_text', 'bedrooms' and 'beds'.

    Returns:
    - pd.DataFrame: the same DataFrame object, modified in place.
    """
    text = df['bathrooms_text']
    # A value counts as missing when it is NaN/None or an empty string
    missing = text.isna() | (text == '')

    has_bedrooms = df['bedrooms'].notna() & (df['bedrooms'] != 0)
    has_beds = df['beds'].notna() & (df['beds'] != 0)

    # Mutually exclusive groups, in priority order: bedrooms, then beds / 2, then default
    use_bedrooms = missing & has_bedrooms
    use_half_beds = missing & ~has_bedrooms & has_beds
    use_default = missing & ~has_bedrooms & ~has_beds

    if use_bedrooms.any():
        df.loc[use_bedrooms, 'bathrooms_text'] = (
            df.loc[use_bedrooms, 'bedrooms'].map(lambda n: str(n) + ' bath')
        )
    if use_half_beds.any():
        # Half the number of beds, formatted to match the bathroom text style
        df.loc[use_half_beds, 'bathrooms_text'] = (
            (df.loc[use_half_beds, 'beds'] / 2).map(lambda n: str(n) + ' bath')
        )
    if use_default.any():
        # Neither 'bedrooms' nor 'beds' is usable
        df.loc[use_default, 'bathrooms_text'] = '1 bath'

    return df


In [25]:
filled_cleaned_listings = fill_missing_bathrooms(cleaned_listings)


In [26]:
filled_cleaned_listings[['id', 'bedrooms', 'beds', 'bathrooms_text']]

Unnamed: 0,id,bedrooms,beds,bathrooms_text
0,875406,4.0,4.0,2.5 baths
1,4024858,4.0,5.0,2.5 baths
2,883590,2.0,2.0,2 baths
3,890332,3.0,5.0,2 baths
4,4066692,2.0,1.0,1.5 baths
...,...,...,...,...
13928,855016574002133491,1.0,2.0,1 bath
13929,855024616816114216,3.0,4.0,3 baths
13930,855048492607950318,1.0,1.0,1 shared bath
13931,855207553051003465,3.0,4.0,4 baths


In [27]:
# # check for the previous NA rows in bathrooms text
filled_cleaned_listings[filled_cleaned_listings['id'].isin(empty_bathrooms_text_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city
236,6923653,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,1.0 bath,1.0,2.0,"[""Toaster"", ""Essentials"", ""Fire extinguisher"",...",1663.0,55,4.72,262,49,Clark County NV
308,10237778,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,1.0 bath,1.0,2.0,"[""Toaster"", ""Essentials"", ""Fire extinguisher"",...",1663.0,8,4.13,262,48,Clark County NV
343,10426936,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,2.0 bath,2.0,4.0,"[""Toaster"", ""Essentials"", ""UN conditioner"", ""F...",2136.0,9,4.33,262,54,Clark County NV
349,11457043,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,2.0 bath,2.0,4.0,"[""Toaster"", ""Essentials"", ""UN conditioner"", ""F...",2136.0,13,4.75,262,54,Clark County NV
385,11981837,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,2.0 bath,2.0,4.0,"[""Toaster"", ""Essentials"", ""UN conditioner"", ""F...",2136.0,11,4.64,262,54,Clark County NV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4674,49075800,231157127,Steven,2018-12-17,"Las Vegas, NV",within an hour,90%,56%,f,17,...,0.5 bath,,1.0,"[""Essentials"", ""Carbon monoxide alarm"", ""Shamp...",350.0,2,5.00,15,19,Clark County NV
13347,840112485953424789,23020599,Ramzi,2014-10-27,"Jeddah, Saudi Arabia",,,,f,1,...,1.0 bath,1.0,1.0,[],245.0,0,,1,0,Clark County NV
13537,846251343784208947,3696404,Mike,2012-09-27,"New Jersey, United States",within an hour,99%,97%,t,282,...,1.0 bath,1.0,2.0,"[""Essentials"", ""Fire extinguisher"", ""Pool"", ""I...",76.0,0,,39,38,Clark County NV
13733,851483198327421322,3696404,Mike,2012-09-27,"New Jersey, United States",within an hour,99%,97%,t,282,...,1.0 bath,1.0,2.0,"[""Essentials"", ""Fire extinguisher"", ""Pool"", ""I...",75.0,0,,39,30,Clark County NV


### Function to make numerical values for # of bath

In [28]:
def add_numerical_bathrooms_column(df, column_name):
    """
    Adds a new column 'num_bath' to the DataFrame based on the numerical values extracted from the specified column.

    The first number found in each text (e.g. '2.5' in '2.5 baths') becomes the value.
    Texts without a number that mention a half bath ('Half-bath', 'Private half-bath',
    'Shared half-bath') are mapped to 0.5. Anything else without a number becomes NaN.

    Parameters:
    - df: pd.DataFrame - The DataFrame to process (modified in place).
    - column_name: str - The name of the column containing bathroom text descriptions.

    Returns:
    - pd.DataFrame - The original DataFrame with an additional 'num_bath' column.
    """
    text = df[column_name]
    # Raw string so '\.' is a regex escape, not a Python string escape.
    # expand=False yields a Series instead of a one-column DataFrame.
    numbers = text.str.extract(r'([0-9]+\.?[0-9]*)', expand=False).astype(float)
    # Half-bath labels carry no digits; treat them as 0.5 rather than losing them as NaN
    is_half_bath = numbers.isna() & text.str.contains('half', case=False, na=False)
    df['num_bath'] = numbers.mask(is_half_bath, 0.5)
    return df

In [29]:
updated_df = add_numerical_bathrooms_column(filled_cleaned_listings, 'bathrooms_text')
print(updated_df[['bathrooms_text', 'num_bath']])

      bathrooms_text  num_bath
0          2.5 baths       2.5
1          2.5 baths       2.5
2            2 baths       2.0
3            2 baths       2.0
4          1.5 baths       1.5
...              ...       ...
13928         1 bath       1.0
13929        3 baths       3.0
13930  1 shared bath       1.0
13931        4 baths       4.0
13932        3 baths       3.0

[13933 rows x 2 columns]


In [30]:
updated_df['num_bath'].isnull().sum()

0

In [31]:
updated_df[updated_df['num_bath'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [32]:
updated_df['num_bath'].unique()

array([ 2.5,  2. ,  1.5,  1. ,  3. ,  3.5,  4.5,  4. ,  5. ,  0. ,  0.5,
        5.5,  6. ,  8. ,  6.5,  9.5,  7. ,  8.5,  7.5, 12. , 13.5, 11. ,
       14. , 10. ,  9. ])

In [33]:
# delete bathrooms column
updated_df.drop('bathrooms', axis=1, inplace=True)

In [34]:
updated_df.head()

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
0,875406,4621737,Jordan,2013-01-07,"Las Vegas, NV",within a day,50%,59%,f,1,...,4.0,4.0,"[""Toaster"", ""Essentials"", ""Free washer \u2013 ...",518.0,24,4.62,1,56,Clark County NV,2.5
1,4024858,20866230,Shaun,2014-09-02,"Las Vegas, NV",within an hour,100%,100%,t,1,...,4.0,5.0,"[""Essentials"", ""Free washer \u2013 In unit"", ""...",292.0,533,4.91,1,46,Clark County NV,2.5
2,883590,4677994,Markus,2013-01-12,"Los Angeles, CA",within an hour,100%,63%,f,5,...,2.0,2.0,"[""Essentials"", ""Free washer \u2013 In unit"", ""...",130.0,61,4.7,3,37,Clark County NV,2.0
3,890332,981582,Jason & Kevin,2011-08-17,"Las Vegas, NV",within a few hours,84%,27%,f,9,...,3.0,5.0,"[""Washer"", ""First aid kit"", ""Backyard"", ""Kitch...",220.0,63,4.38,9,25,Clark County NV,2.0
4,4066692,21090523,Maria,2014-09-08,"Las Vegas, NV",within a few hours,100%,100%,f,1,...,2.0,1.0,"[""Essentials"", ""Fire extinguisher"", ""Washer"", ...",114.0,184,4.77,1,21,Clark County NV,1.5


# Impute Bedrooms and Beds

In [35]:
updated_df['bedrooms'].unique()

array([ 4.,  2.,  3., nan,  1.,  5.,  6.,  7.,  8., 10.,  9., 11., 16.,
       12.])

In [36]:
updated_df['beds'].unique()

array([ 4.,  5.,  2.,  1.,  3.,  7.,  6.,  9.,  8., 14., 11., 10., nan,
       15., 12., 13., 16., 17., 18., 27., 19., 26., 25., 21., 20., 24.])

In [37]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
5,4068810,20488262,Harry,2014-08-24,"Las Vegas, NV",within an hour,95%,100%,f,1,...,,2.0,"[""Essentials"", ""Shared pool"", ""Fire extinguish...",138.0,355,4.68,1,47,Clark County NV,1.0
32,4349688,22554257,Mark,2014-10-15,"Irvine, CA",,,100%,t,1,...,,2.0,"[""Essentials"", ""Pool"", ""Iron"", ""Gym"", ""Kitchen...",150.0,627,4.86,1,23,Clark County NV,1.0
60,4527410,23471743,Vimal,2014-11-08,"Dallas, TX",within an hour,100%,100%,t,1,...,,2.0,"[""Essentials"", ""Shared pool"", ""Iron"", ""Gym"", ""...",126.0,127,4.90,1,32,Clark County NV,1.0
63,658643,3321660,Mike And Penny,2012-08-20,"Las Vegas, NV",within a few hours,100%,82%,f,2,...,,2.0,"[""Essentials"", ""Gym"", ""Kitchen"", ""Free parking...",115.0,382,4.81,2,25,Clark County NV,1.0
117,4997188,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,,2.0,"[""Essentials"", ""Fire extinguisher"", ""Iron"", ""O...",1663.0,37,4.71,262,42,Clark County NV,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13890,853718190488833928,411236970,Mari,2021-07-06,,within a few hours,96%,38%,f,58,...,,1.0,"[""Outdoor dining area"", ""Hot tub"", ""Carbon mon...",236.0,0,,30,13,Clark County NV,1.0
13893,853818133226101261,411236970,Mari,2021-07-06,,within a few hours,96%,38%,f,58,...,,2.0,"[""Outdoor dining area"", ""Hot tub"", ""Carbon mon...",308.0,0,,30,13,Clark County NV,1.0
13908,854172538995763143,411236970,Mari,2021-07-06,,within a few hours,96%,38%,f,58,...,,1.0,"[""Outdoor dining area"", ""Mountain view"", ""Carb...",324.0,0,,30,16,Clark County NV,1.0
13912,854236101053187823,411236970,Mari,2021-07-06,,within a few hours,96%,38%,f,58,...,,2.0,"[""Carbon monoxide alarm"", ""Exercise equipment""...",216.0,0,,30,10,Clark County NV,1.0


In [38]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
238,9755691,48171296,LuckyStar,2015-11-04,"Las Vegas, NV",within an hour,100%,100%,f,7,...,,,"[""Toaster"", ""Essentials"", ""Fire extinguisher"",...",142.0,359,4.80,7,56,Clark County NV,1.5
256,9862496,50760426,Mark,2015-12-07,"Charlotte, NC",within an hour,100%,99%,t,4,...,,,"[""Toaster"", ""Essentials"", ""Coffee maker: drip ...",125.0,76,4.86,3,38,Clark County NV,0.0
336,9132839,47568376,Ronnie,2015-10-27,"Las Vegas, NV",within an hour,94%,96%,t,4,...,,,"[""Essentials"", ""Coffee maker: Keurig coffee ma...",88.0,272,4.91,2,37,Clark County NV,1.0
341,10370600,34412723,Crystal,2015-05-28,"Las Vegas, NV",within a few hours,100%,99%,t,2,...,,,"[""Essentials"", ""Fire extinguisher"", ""Iron"", ""F...",92.0,43,4.93,2,22,Clark County NV,1.0
373,9664608,19139775,Lorie,2014-07-29,"Cathedral City, CA",a few days or more,48%,22%,f,249,...,1.0,,"[""Essentials"", ""Fire extinguisher"", ""Pool"", ""I...",439.0,1,5.00,17,27,Clark County NV,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10432,734022480115189255,57287561,Eileen,2016-02-03,"West Covina, CA",within an hour,100%,100%,f,2,...,1.0,,"[""Refrigerator"", ""Essentials"", ""Carbon monoxid...",56.0,3,4.00,2,20,Clark County NV,1.0
11658,789380805989716365,426975381,Sky702,2021-10-11,,within an hour,100%,100%,f,51,...,,,"[""Essentials"", ""Free washer \u2013 In unit"", ""...",94.0,6,4.67,51,42,Clark County NV,1.0
13019,830016870731731653,501737618,Bnb Green,2023-02-18,,within an hour,100%,100%,f,1,...,,,"[""Essentials"", ""Fire extinguisher"", ""Room-dark...",81.0,0,,1,37,Clark County NV,1.0
13338,837251708903341283,2981129,Mamie,2012-07-18,"Los Angeles, CA",within an hour,75%,0%,f,3,...,1.0,,"[""Host greets you"", ""Washer""]",70.0,0,,2,2,Clark County NV,0.0


In [39]:
def update_bedrooms_and_beds(df):
    """
    Impute missing 'bedrooms' and 'beds' values, in that order:
    1. A missing 'bedrooms' becomes 'num_bath' rounded up to the next whole number.
    2. A missing 'beds' becomes the (possibly just imputed) 'bedrooms' value.

    The DataFrame is modified in place and also returned.
    """
    # Step 1: bedrooms falls back to ceil(num_bath) wherever it is NaN
    baths_rounded_up = np.ceil(df['num_bath'])
    df['bedrooms'] = df['bedrooms'].fillna(baths_rounded_up)

    # Step 2: beds falls back to bedrooms wherever it is NaN
    df['beds'] = df['beds'].fillna(df['bedrooms'])

    return df


In [40]:
updated_df = update_bedrooms_and_beds(updated_df)


In [41]:
# Re-inspect the rows whose 'bathrooms_text' was originally empty (selected via the ids
# saved in empty_bathrooms_text_ids) to see their 'bedrooms'/'beds' values after imputation
updated_df[updated_df['id'].isin(empty_bathrooms_text_ids)]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
236,6923653,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,1.0,2.0,"[""Toaster"", ""Essentials"", ""Fire extinguisher"",...",1663.0,55,4.72,262,49,Clark County NV,1.0
308,10237778,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,1.0,2.0,"[""Toaster"", ""Essentials"", ""Fire extinguisher"",...",1663.0,8,4.13,262,48,Clark County NV,1.0
343,10426936,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,2.0,4.0,"[""Toaster"", ""Essentials"", ""UN conditioner"", ""F...",2136.0,9,4.33,262,54,Clark County NV,2.0
349,11457043,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,2.0,4.0,"[""Toaster"", ""Essentials"", ""UN conditioner"", ""F...",2136.0,13,4.75,262,54,Clark County NV,2.0
385,11981837,12018637,888,2014-02-07,"Las Vegas, NV",within an hour,100%,73%,f,262,...,2.0,4.0,"[""Toaster"", ""Essentials"", ""UN conditioner"", ""F...",2136.0,11,4.64,262,54,Clark County NV,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4674,49075800,231157127,Steven,2018-12-17,"Las Vegas, NV",within an hour,90%,56%,f,17,...,1.0,1.0,"[""Essentials"", ""Carbon monoxide alarm"", ""Shamp...",350.0,2,5.00,15,19,Clark County NV,0.5
13347,840112485953424789,23020599,Ramzi,2014-10-27,"Jeddah, Saudi Arabia",,,,f,1,...,1.0,1.0,[],245.0,0,,1,0,Clark County NV,1.0
13537,846251343784208947,3696404,Mike,2012-09-27,"New Jersey, United States",within an hour,99%,97%,t,282,...,1.0,2.0,"[""Essentials"", ""Fire extinguisher"", ""Pool"", ""I...",76.0,0,,39,38,Clark County NV,1.0
13733,851483198327421322,3696404,Mike,2012-09-27,"New Jersey, United States",within an hour,99%,97%,t,282,...,1.0,2.0,"[""Essentials"", ""Fire extinguisher"", ""Pool"", ""I...",75.0,0,,39,30,Clark County NV,1.0


In [42]:
updated_df['beds'].unique()

array([ 4.,  5.,  2.,  1.,  3.,  7.,  6.,  9.,  8., 14., 11., 10.,  0.,
       15., 12., 13., 16., 17., 18., 27., 19., 26., 25., 21., 20., 24.])

In [43]:
updated_df[updated_df['beds'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [44]:
updated_df['bedrooms'].unique()

array([ 4.,  2.,  3.,  1.,  5.,  6.,  7.,  0.,  8., 10.,  9., 11., 16.,
       12.])

In [45]:
updated_df[updated_df['bedrooms'].isnull()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_since

In [46]:
# Check the number of missing values
na = updated_df['host_since'].isnull().sum()
print(f'There are {na} missing values in the "host_since" column.')

# Fill missing values with the most recent date. The fill value is formatted as
# 'YYYY-MM-DD' so that it matches the existing strings in the column.
most_recent_date = pd.to_datetime(updated_df['host_since']).max().strftime("%Y-%m-%d")
# Assign the result back: fillna(inplace=True) on a selected column acts on a
# temporary object and does not reliably update the DataFrame.
updated_df['host_since'] = updated_df['host_since'].fillna(most_recent_date)


There are 0 missing values in the "host_since" column.


In [47]:
updated_df['host_since'].isnull().sum()

0

In [48]:
updated_df['host_since'].unique()

array(['2013-01-07', '2014-09-02', '2013-01-12', ..., '2023-03-24',
       '2023-01-16', '2015-03-25'], dtype=object)

In [49]:
updated_df[updated_df['host_since'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_location

In [50]:
# Check the number of missing values
na = updated_df['host_location'].isnull().sum()
print(f'There are {na} missing values in the "host_location" column.')

# Fill missing values with 'unknown'. Assign the result back: fillna(inplace=True)
# on a selected column acts on a temporary object and does not reliably update the DataFrame.
updated_df['host_location'] = updated_df['host_location'].fillna("unknown")

There are 4028 missing values in the "host_location" column.


In [51]:
updated_df['host_location'].isnull().sum()

0

In [52]:
count_unknown_location = (updated_df['host_location'] == 'unknown').sum()
count_unknown_location

4028

In [53]:
updated_df['host_location'].unique()

array(['Las Vegas, NV', 'Los Angeles, CA', 'unknown', 'New York, NY',
       'United States', 'New Orleans, LA', 'Prague, Czechia',
       'San Diego, CA', 'New Jersey, United States', 'San Francisco, CA',
       'Chicago, IL', 'Irvine, CA', 'Rancho Santa Fe, CA', 'Phoenix, AZ',
       'Sacramento, CA', 'Pennsylvania, United States', 'San Jose, CA',
       'Dallas, TX', 'Huntington Beach, CA', 'Henderson, NV',
       'Fontana, CA', 'Reno, NV', 'North Las Vegas, NV',
       'Fort Pierce, FL', 'Arizona, United States', 'Paradise Valley, AZ',
       'Oakland, CA', 'Rowland Heights, CA', 'Mukilteo, WA', 'Miami, FL',
       'La Mirada, CA', 'Fort Lauderdale, FL', 'Vancouver, Canada',
       'California, United States', 'Ciudad Obregón, Mexico',
       'Hermosa Beach, CA', 'Cathedral City, CA', 'Valley Stream, NY',
       'Nevada, United States', 'Maple Grove, MN', 'Little Silver, NJ',
       'Toronto, Canada', 'Seattle, WA', 'Rome, Italy', 'Mt. Juliet, TN',
       'Lake Charles, LA', 'Franc

In [54]:
updated_df[updated_df['host_location'] =='']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [55]:
updated_df[updated_df['host_location'] ==' ']

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_is_superhost

In [56]:
# Check the number of missing values
na = updated_df['host_is_superhost'].isnull().sum()
print(f'There are {na} missing values in the "host_is_superhost" column.')

# Treat a missing superhost flag as "f". Assign the result back: fillna(inplace=True)
# on a selected column acts on a temporary object and does not reliably update the DataFrame.
updated_df['host_is_superhost'] = updated_df['host_is_superhost'].fillna("f")

There are 0 missing values in the "host_is_superhost" column.


In [57]:
updated_df['host_is_superhost'].isnull().sum()

0

In [58]:
count_unknown_super = (updated_df['host_is_superhost'] == 'f').sum()
count_unknown_super

8631

In [59]:
updated_df['host_is_superhost'].unique()

array(['f', 't'], dtype=object)

# host_listings_count

In [60]:
# Check the number of missing values
na = updated_df['host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_listings_count" column.')

# Fill a missing count with 1. Assign the result back: fillna(inplace=True) on a
# selected column acts on a temporary object and does not reliably update the DataFrame.
updated_df['host_listings_count'] = updated_df['host_listings_count'].fillna(1)

There are 0 missing values in the "host_listings_count" column.


In [61]:
updated_df['host_listings_count'].isnull().sum()

0

In [62]:
updated_df['host_listings_count'].unique()

array([   1,    5,    9,    3,   12,    2,  119,   15,   28,  186,   36,
         18,   20,   79,    8,   26,  382,    4,  262,   14,   69,   13,
          6,   10,  256,  249,  205,   11,   16,   87,  171,   33,   75,
          7,   93,   37,   42,  524,   29,  318, 1613,   31,  301,  284,
        161,   57,  293,   21,  350,   59,  461,   27,   67,   17,   58,
         35,   22,  296,   19,  326,   25,  165,  388,   56,  424,  157,
         34,  120,   24,   89,   32,  314,   49,  132,   85,   78,   23,
         77,  136,  194,  188,  130,  415,   61,  176,  126,  325,  657,
        497,   63,   30,   43,  478,  101,   38,   51,  144,   65,  199,
        145,  158,   52, 3302,  505,  114,  617,  308,   46,  173,  116,
       2321,  108,  677,  792,  645,  282,   83,  203,   40,  154,  611,
        320,  213,  191,   97,  287,  361,   41,  231, 2492,   62,  184,
        192,   47,   86,  220,  228, 2647,   68,   82,   76,   60,  348,
        671, 2067,   48,  148,   54,   39,  104,  8

# host_total_listings_count

In [63]:
# Check the number of missing values
na = updated_df['host_total_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "host_total_listings_count" column.')

# Fill a missing count with 1. Assign the result back: fillna(inplace=True) on a
# selected column acts on a temporary object and does not reliably update the DataFrame.
updated_df['host_total_listings_count'] = updated_df['host_total_listings_count'].fillna(1)


There are 0 missing values in the "host_total_listings_count" column.


In [64]:
updated_df['host_total_listings_count'].isnull().sum()

0

In [65]:
updated_df['host_total_listings_count'].unique()

array([   4,    1,    8,   14,    2,   11,   54,   12,  122,    3,   23,
          6,   46,    5,   16,   24,  286,  190,   21,   56,    7,   87,
         10,   32,  465,   13,  296,   25,  101,   18,    9,   28,   39,
        851,  198,   27,   15,  403,  238,  155,  355,   44,  165,  147,
         37,   57,  662,   43,   62,  712,  366, 1930,   49,   60,  375,
         29,  335,  290,   64,  363,   40,   19,  391,  129,  550,   36,
         71,   45,  106,   83,   66,   41,   33,   34,  390,   22,  399,
         30,  301,  512,   17,   70,  898,  179,   48,  126,  145,   20,
         42,   90,   85,  393,   65,   35,   58,   38,   80,   59,   69,
        351,  220,  257,   31,  169,  569,   26,  312,  151,  382,  716,
        566,   55,  105, 1022,  138,   93,  204,  158,  239,  310,  299,
         99, 4535, 1049,  167,  690,  524,   47,  241,  156,  427,  121,
       7825,  117, 1772,  107,  672, 1714,  163,   52,  236,  231,  326,
        640, 1155,  214,  150,  206,  112,  378,  3

# host_verifications

In [66]:
# Report how many values are missing before imputing, so the printed count
# describes the column as it was loaded.
na = updated_df['host_verifications'].isnull().sum()
print(f'There are {na} missing values in the "host_verifications" column.')

# Fill missing entries with the text label "None" (a string, not Python's None)
# and assign the result back to the frame.
# Calling fillna(..., inplace=True) on a column selection is chained in-place
# assignment: it is deprecated in pandas 2.x and leaves updated_df unchanged
# once Copy-on-Write is enabled.
updated_df['host_verifications'] = updated_df['host_verifications'].fillna("None")


There are 0 missing values in the "host_verifications" column.


In [67]:
# Distinct values of host_verifications, in order of first appearance.
pd.unique(updated_df['host_verifications'])

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone', 'work_email']", "['phone']", "['email']", '[]'],
      dtype=object)

In [68]:
# Show the rows whose host_verifications value is the literal string '[]'.
updated_df.loc[updated_df['host_verifications'].eq('[]')]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath
12326,808187260428649948,496997931,Lynn,2023-01-19,unknown,within an hour,100%,100%,f,1,...,3.0,3.0,"[""Toaster"", ""Essentials"", ""Cleaning products"",...",191.0,2,5.0,1,43,Clark County NV,2.5


In [69]:
# Map the empty-list marker '[]' to the text label 'None'.
# Note: this is the string 'None', not Python's None / NaN, so the column
# stays free of missing values.
updated_df['host_verifications'] = updated_df['host_verifications'].replace({'[]': 'None'})


In [70]:
# Distinct values of host_verifications after the '[]' replacement.
pd.unique(updated_df['host_verifications'])

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone', 'work_email']", "['phone']", "['email']", 'None'],
      dtype=object)

# host_identity_verified

In [71]:
# Report how many values are missing before imputing, so the printed count
# describes the column as it was loaded.
na = updated_df['host_identity_verified'].isnull().sum()
print(f'There are {na} missing values in the "host_identity_verified" column.')

# Fill missing flags with "f" and assign the result back to the frame.
# Calling fillna(..., inplace=True) on a column selection is chained in-place
# assignment: it is deprecated in pandas 2.x and leaves updated_df unchanged
# once Copy-on-Write is enabled.
updated_df['host_identity_verified'] = updated_df['host_identity_verified'].fillna("f")


There are 0 missing values in the "host_identity_verified" column.


In [72]:
# Distinct values of host_identity_verified, in order of first appearance.
pd.unique(updated_df['host_identity_verified'])

array(['t', 'f'], dtype=object)

# calculated_host_listings_count

In [73]:
# Report how many values are missing before imputing, so the printed count
# describes the column as it was loaded.
na = updated_df['calculated_host_listings_count'].isnull().sum()
print(f'There are {na} missing values in the "calculated_host_listings_count" column.')

# Fill missing counts with 1 and assign the result back to the frame.
# Calling fillna(..., inplace=True) on a column selection is chained in-place
# assignment: it is deprecated in pandas 2.x and leaves updated_df unchanged
# once Copy-on-Write is enabled.
updated_df['calculated_host_listings_count'] = updated_df['calculated_host_listings_count'].fillna(1)

There are 0 missing values in the "calculated_host_listings_count" column.


In [74]:
# Distinct values of calculated_host_listings_count, in order of first appearance.
pd.unique(updated_df['calculated_host_listings_count'])

array([  1,   3,   9,  12,   2,  11,  14,   4,   6,   5,  18,  19,  66,
       262,   7,  10,  17, 202,   8,  22,  24,  67,  21,  29,  13,  28,
        34,  38,  35,  16,  72,  27,  54,  33,  15,  89,  30,  85, 193,
        23,  26,  43,  53, 327, 101,  69,  57,  39,  51, 106, 178,  65,
        44,  79,  52,  25, 118,  73])

# host_name

In [75]:
# Show any rows where host_name is missing.
updated_df.loc[updated_df['host_name'].isna()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


# host_has_profile_pic

In [76]:
# Show any rows where host_has_profile_pic is missing.
updated_df.loc[updated_df['host_has_profile_pic'].isna()]

Unnamed: 0,id,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,bedrooms,beds,amenities,price,number_of_reviews,review_scores_value,calculated_host_listings_count,amenities_count,city,num_bath


In [77]:
# Distinct values of host_has_profile_pic, in order of first appearance.
pd.unique(updated_df['host_has_profile_pic'])

array(['t', 'f'], dtype=object)

In [78]:
# Report how many values are missing before imputing, so the printed count
# describes the column as it was loaded.
na = updated_df['host_has_profile_pic'].isnull().sum()
print(f'There are {na} missing values in the "host_has_profile_pic" column.')

# Fill missing flags with 'f' and assign the result back to the frame.
# Calling fillna(..., inplace=True) on a column selection is chained in-place
# assignment: it is deprecated in pandas 2.x and leaves updated_df unchanged
# once Copy-on-Write is enabled.
updated_df['host_has_profile_pic'] = updated_df['host_has_profile_pic'].fillna('f')

There are 0 missing values in the "host_has_profile_pic" column.


# neighbourhood

In [79]:
# Distinct free-text values of neighbourhood (NaN included), in order of first appearance.
pd.unique(updated_df['neighbourhood'])

array(['Las Vegas, Nevada, United States', nan,
       'Henderson, Nevada, United States',
       'North Las Vegas, Nevada, United States',
       'Las Vegas , Nevada, United States',
       'Laughlin, Nevada, United States',
       'Sandy Valley, Nevada, United States',
       'lake las vegas, Nevada, United States',
       'Paradise, Nevada, United States',
       'Mesquite, Nevada, United States', 'Las Vegas, United States',
       'Lake Las Vegas, Henderson, Nevada, United States',
       'Boulder City, Nevada, United States',
       'Enterprise, Nevada, United States',
       'Las vegas, Nevada, United States',
       'Las Vegas , Nv, United States',
       'St Henderson, Nevada, United States',
       '拉斯维加斯, Nevada, United States',
       'Spring Valley, Nevada, United States',
       'las vegas, Nevada, United States',
       'Sunrise Manor, Nevada, United States',
       'Mount Charleston, Nevada, United States',
       'Bullhead City, Arizona, United States',
       'Goodspri

In [80]:
# Number of rows with no neighbourhood value.
updated_df['neighbourhood'].isna().sum()

6151

In [81]:
# Distinct values of neighbourhood_cleansed, in order of first appearance.
pd.unique(updated_df['neighbourhood_cleansed'])

array(['Unincorporated Areas', 'City of Las Vegas', 'City of Henderson',
       'City of North Las Vegas', 'City of Mesquite', 'Nellis AFB',
       'Boulder City'], dtype=object)

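### Drop the `neighbourhood` column for the Clark County dataset: it has 6,151 missing values and inconsistently written free-text entries, while `neighbourhood_cleansed` has no missing values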

In [82]:
# Drop the free-text 'neighbourhood' column.
# Reassigning the result with errors='ignore' keeps this cell safe to re-run:
# an in-place drop raises KeyError on a second run, once the column is gone.
updated_df = updated_df.drop(columns='neighbourhood', errors='ignore')

# Final look at missing values

In [83]:
# Per-column count of missing values remaining in the frame.
updated_df.isna().sum()

id                                   0
host_id                              0
host_name                            0
host_since                           0
host_location                        0
host_response_time                 683
host_response_rate                 683
host_acceptance_rate               464
host_is_superhost                    0
host_listings_count                  0
host_total_listings_count            0
host_verifications                   0
host_has_profile_pic                 0
host_identity_verified               0
neighbourhood_cleansed               0
latitude                             0
longitude                            0
room_type                            0
accommodates                         0
bathrooms_text                       0
bedrooms                             0
beds                                 0
amenities                            0
price                                0
number_of_reviews                    0
review_scores_value      

# Save the Final Dataframe

In [84]:
# Write the cleaned frame to CSV without the index column.
# Create the target directory first: to_csv raises OSError when the parent
# directory does not exist.
from pathlib import Path

output_path = Path('data/listings_detailed_after_na_clark.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
updated_df.to_csv(output_path, index=False)