In [221]:
import pandas as pd
import os
import re

In [222]:
# Load both raw CSV exports from ./Raw_Data into DataFrames.
# NOTE(review): 'startbucks.csv' looks like a misspelling of "starbucks" --
# confirm it matches the actual file name on disk before changing it.
airbnb_df, starbucks_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Raw_Data/denver_listings.csv',
        './Raw_Data/startbucks.csv',
    )
)

# Airbnb Data Cleaning

In [223]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
airbnb_df.info()
display(airbnb_df.shape)
# Snapshot of the raw frame's column names as a plain Python list.
airbnb_columns = airbnb_df.columns.to_list()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5388 entries, 0 to 5387
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            5388 non-null   int64  
 1   listing_url                                   5388 non-null   object 
 2   scrape_id                                     5388 non-null   int64  
 3   last_scraped                                  5388 non-null   object 
 4   source                                        5388 non-null   object 
 5   name                                          5388 non-null   object 
 6   description                                   5369 non-null   object 
 7   neighborhood_overview                         3813 non-null   object 
 8   picture_url                                   5388 non-null   object 
 9   host_id                                       5388 non-null   i

None

(5388, 75)

In [224]:
# Names of the columns retained from the raw listings frame, in the order
# they will appear after selection.
airbnb_columns_tokeep = [
    # listing text and neighbourhood
    'description', 'neighborhood_overview',
    'host_neighbourhood', 'neighbourhood_cleansed',
    # coordinates
    'latitude', 'longitude',
    # property characteristics
    'property_type', 'room_type', 'accommodates',
    'bathrooms_text', 'bedrooms', 'beds',
    # pricing
    'price',
    # stay-length limits
    'minimum_nights', 'maximum_nights',
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    # availability
    'has_availability',
    # review counts and dates
    'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'first_review', 'last_review',
    # review scores
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    # booking / host
    'instant_bookable', 'calculated_host_listings_count',
    'reviews_per_month',
]


In [225]:
# Reviewed all 75 original columns and kept only those listed in
# airbnb_columns_tokeep, dropping unnecessary columns and columns with
# excessive null data.
# Nulls are filled with -1 for easy identification - no organic data would
# contain -1. Note the sentinel is written into every column, text columns
# included, so those hold a mix of strings and the integer -1 afterwards.
# Selection and fill are chained (no inplace=True) so the result is a new,
# independent frame: filling a column-subset in place can raise
# SettingWithCopyWarning and makes the cell harder to re-run safely.
airbnb_df = airbnb_df[airbnb_columns_tokeep].fillna(-1)

In [226]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
airbnb_df.info()
# Reviewed for columns with inappropriate data types; identified:
# bathrooms_text, price, first_review, last_review

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5388 entries, 0 to 5387
Data columns (total 37 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   description                     5388 non-null   object 
 1   neighborhood_overview           5388 non-null   object 
 2   host_neighbourhood              5388 non-null   object 
 3   neighbourhood_cleansed          5388 non-null   object 
 4   latitude                        5388 non-null   float64
 5   longitude                       5388 non-null   float64
 6   property_type                   5388 non-null   object 
 7   room_type                       5388 non-null   object 
 8   accommodates                    5388 non-null   int64  
 9   bathrooms_text                  5388 non-null   object 
 10  bedrooms                        5388 non-null   float64
 11  beds                            5388 non-null   float64
 12  price                           53

None

In [227]:
# Use a regular expression to extract the number of bathrooms.
# The pattern lazily captures everything before the first space (group 1)
# and the remainder of the text, starting at that space (group 2).
bathroom_re_format = r'(.*?)( .*)'
def extract_bathroom_count(string):
    """Return the bathroom count described by a ``bathrooms_text`` value.

    Parameters
    ----------
    string : object
        Raw cell value. It is coerced with ``str()``, so non-string values
        (for example the integer -1 null sentinel) are accepted.

    Returns
    -------
    float or None
        * ``0.5`` when the text contains "half-bath" (case-insensitive).
        * The token before the first space converted to ``float`` when that
          token is numeric, e.g. ``"1.5 shared baths"`` -> ``1.5``.
        * ``None`` when the text contains no space (e.g. ``"-1"``) or the
          leading token is not a number.
    """
    text = str(string).strip()
    # Half-bath labels have no leading number, so float() on their first token
    # would raise ValueError (or the regex would not match at all).
    # NOTE(review): assumes such labels are spelled like "Half-bath" /
    # "Shared half-bath" -- confirm against the values in this dataset.
    if 'half-bath' in text.lower():
        return 0.5
    re_return = re.match(bathroom_re_format, text)
    if re_return is None:
        return None
    try:
        return float(re_return.group(1))
    except ValueError:
        # Leading token is not numeric; treat as unknown instead of crashing
        # the whole .apply().
        return None
# Derive a numeric bathroom count from the free-text bathrooms column.
airbnb_df['bathroom_count'] = (
    airbnb_df['bathrooms_text'].apply(extract_bathroom_count)
)

# Turn the price text into a number: drop thousands separators, trim the
# dollar sign, then parse.
# NOTE(review): the .str accessor yields NaN for non-string cells, so any
# -1 null sentinel in this column presumably ends up as NaN here -- confirm.
airbnb_df['price'] = pd.to_numeric(
    airbnb_df['price'].str.replace(',', '').str.strip('$')
)

In [231]:
# Remove price outliers (found to be likely glitches / faulty data).
# Rows whose price is NaN also fail the comparison and are dropped with them.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not trigger SettingWithCopyWarning.
airbnb_df = airbnb_df.loc[airbnb_df['price'] < 2001].copy()

# Change first and last review to datetimes. Nulls in these columns were
# earlier replaced by the integer -1 sentinel; blank it out before parsing so
# it becomes NaT instead of risking interpretation as an epoch offset.
for review_col in ('first_review', 'last_review'):
    review_values = airbnb_df[review_col]
    airbnb_df[review_col] = pd.to_datetime(
        review_values.mask(review_values == -1), errors='coerce'
    )

In [234]:
# Print the column/dtype summary again after the conversions above.
airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5367 entries, 0 to 5387
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   description                     5367 non-null   object        
 1   neighborhood_overview           5367 non-null   object        
 2   host_neighbourhood              5367 non-null   object        
 3   neighbourhood_cleansed          5367 non-null   object        
 4   latitude                        5367 non-null   float64       
 5   longitude                       5367 non-null   float64       
 6   property_type                   5367 non-null   object        
 7   room_type                       5367 non-null   object        
 8   accommodates                    5367 non-null   int64         
 9   bathrooms_text                  5367 non-null   object        
 10  bedrooms                        5367 non-null   float64       
 11  beds     

# Starbucks Data Cleaning