In [1]:
import pandas as pd
import os
import re
from collections import Counter
import folium
import webbrowser
import csv

In [2]:
airbnb_df = pd.read_csv('./Raw_Data/denver_listings.csv')
starbucks_df = pd.read_csv('./Raw_Data/startbucks.csv')

# AirBNB Data Cleaning

In [3]:
display(airbnb_df.info())
display(airbnb_df.shape)
airbnb_columns = airbnb_df.columns.to_list()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5388 entries, 0 to 5387
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            5388 non-null   int64  
 1   listing_url                                   5388 non-null   object 
 2   scrape_id                                     5388 non-null   int64  
 3   last_scraped                                  5388 non-null   object 
 4   source                                        5388 non-null   object 
 5   name                                          5388 non-null   object 
 6   description                                   5369 non-null   object 
 7   neighborhood_overview                         3813 non-null   object 
 8   picture_url                                   5388 non-null   object 
 9   host_id                                       5388 non-null   i

None

(5388, 75)

In [7]:
airbnb_columns_tokeep = \
['description',
 'neighborhood_overview',
 'host_neighbourhood',
 'neighbourhood_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'has_availability',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'number_of_reviews_l30d',
 'first_review',
 'last_review',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'instant_bookable',
 'calculated_host_listings_count',
 'reviews_per_month']


In [8]:
#Reviewed total of 75 original columns, eliminated unnecessary columns or columns 
#with excessively null data
airbnb_df = airbnb_df[airbnb_columns_tokeep]
# filled null values with -1 for easy identification - no organic data would contain -1
airbnb_df.fillna(-1,inplace=True)
# airbnb_df

In [10]:
display(airbnb_df.info())
# reviewed for columns with inappropriate data types, identified:
# bathrooms_text, price, first_review, last_review

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5388 entries, 0 to 5387
Data columns (total 37 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   description                     5388 non-null   object 
 1   neighborhood_overview           5388 non-null   object 
 2   host_neighbourhood              5388 non-null   object 
 3   neighbourhood_cleansed          5388 non-null   object 
 4   latitude                        5388 non-null   float64
 5   longitude                       5388 non-null   float64
 6   property_type                   5388 non-null   object 
 7   room_type                       5388 non-null   object 
 8   accommodates                    5388 non-null   int64  
 9   bathrooms_text                  5388 non-null   object 
 10  bedrooms                        5388 non-null   float64
 11  beds                            5388 non-null   float64
 12  price                           53

None

In [11]:
#using regular expressions to extract the number of bathrooms
bathroom_re_format = r'(.*?)( .*)'
def extract_bathroom_count(string):
    string = str(string)
    re_return = re.match(bathroom_re_format, string)
    return float(re_return.group(1)) if re_return else None
airbnb_df['bathroom_count'] = airbnb_df['bathrooms_text'].apply(extract_bathroom_count)

#Using string methods to convert price to a numerical value
airbnb_df['price'] = airbnb_df['price'].str.replace(',','').str.strip('$')
airbnb_df['price'] = pd.to_numeric(airbnb_df['price'])

In [12]:
#Removing Price outliers (found to be likely glitches / faulty data)
airbnb_df = airbnb_df.loc[airbnb_df['price'] < 2001]
#Changing first and last reviews to datetimes
airbnb_df['first_review'] = pd.to_datetime(airbnb_df['first_review'], errors='coerce')
airbnb_df['last_review'] = pd.to_datetime(airbnb_df['last_review'], errors='coerce')

In [13]:
airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5367 entries, 0 to 5387
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   description                     5367 non-null   object        
 1   neighborhood_overview           5367 non-null   object        
 2   host_neighbourhood              5367 non-null   object        
 3   neighbourhood_cleansed          5367 non-null   object        
 4   latitude                        5367 non-null   float64       
 5   longitude                       5367 non-null   float64       
 6   property_type                   5367 non-null   object        
 7   room_type                       5367 non-null   object        
 8   accommodates                    5367 non-null   int64         
 9   bathrooms_text                  5367 non-null   object        
 10  bedrooms                        5367 non-null   float64       
 11  beds     

In [14]:
#by Cindy calculate how many airbnb are ther in each host neiborhood
neib=airbnb_df['host_neighbourhood']
neib.tolist()
print(f"There are {len(neib.unique())} Neighborhood included in denver area")
freq_count=Counter(neib)
for element,count in freq_count.items():
    print(f"Neiborhood {element} occurs {count} times")
most_five=freq_count.most_common(6)
print("The most popular Neiborhood is: ",most_five)
less_five=freq_count.most_common()[:-6:-1]
print("The least popular Neiborhood is: ",less_five)


There are 197 Neighborhood included in denver area
Neiborhood Highland occurs 254 times
Neiborhood Five Points occurs 320 times
Neiborhood North Park Hill occurs 23 times
Neiborhood North Capitol Hill occurs 40 times
Neiborhood Baker occurs 114 times
Neiborhood West Highland occurs 45 times
Neiborhood Ballpark occurs 20 times
Neiborhood South Park Hill occurs 10 times
Neiborhood Washington Park West occurs 18 times
Neiborhood City Park West occurs 121 times
Neiborhood Clayton occurs 38 times
Neiborhood City Park occurs 19 times
Neiborhood Stapleton occurs 24 times
Neiborhood -1 occurs 284 times
Neiborhood Lowry Field occurs 5 times
Neiborhood Platt Park occurs 30 times
Neiborhood Sunnyside occurs 54 times
Neiborhood Capitol Hill occurs 94 times
Neiborhood Congress Park occurs 91 times
Neiborhood CBD occurs 50 times
Neiborhood Berkeley occurs 51 times
Neiborhood Lincoln Park occurs 68 times
Neiborhood Speer occurs 192 times
Neiborhood Cheesman Park occurs 67 times
Neiborhood West Colfax

In [15]:
airbnb_df.columns

Index(['description', 'neighborhood_overview', 'host_neighbourhood',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
       'room_type', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds',
       'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'has_availability', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
       'last_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable',
       'calculated_host_listings_count', 'reviews_per_month',
       'bathroom_count'],
      dtype='object')

In [16]:

most_five=freq_count.most_common(6)

desired_region1=most_five[0][0]
print(desired_region1)
region_data=airbnb_df[airbnb_df['host_neighbourhood']==desired_region1]
if not region_data.empty:
    latitude1=region_data['latitude'].values[0]
    longitude1=region_data['longitude'].values[0]
    print(f"The latitude of {desired_region1} is:{latitude1},The longitude of {desired_region1} is:{longitude1}")
print("The most popular Neiborhood is: ",most_five)

desired_region2=most_five[1][0]
print(desired_region2)
region_data2=airbnb_df[airbnb_df['host_neighbourhood']==desired_region2]
if not region_data2.empty:
    latitude2=region_data2['latitude'].values[0]
    longitude2=region_data2['longitude'].values[0]
    print(f"The latitude of {desired_region2} is:{latitude2},The longitude of {desired_region2} is:{longitude2}")

desired_region3=most_five[3][0]
print(desired_region3)
region_data3=airbnb_df[airbnb_df['host_neighbourhood']==desired_region3]
if not region_data3.empty:
    latitude3=region_data3['latitude'].values[0]
    longitude3=region_data3['longitude'].values[0]
    print(f"The latitude of {desired_region3} is:{latitude3},The longitude of {desired_region3} is:{longitude3}")

desired_region4=most_five[4][0]
print(desired_region4)
region_data4=airbnb_df[airbnb_df['host_neighbourhood']==desired_region4]
if not region_data4.empty:
    latitude4=region_data4['latitude'].values[0]
    longitude4=region_data4['longitude'].values[0]
    print(f"The latitude of {desired_region4} is:{latitude4},The longitude of {desired_region4} is:{longitude4}")

desired_region5=most_five[5][0]
print(desired_region5)
region_data5=airbnb_df[airbnb_df['host_neighbourhood']==desired_region5]
if not region_data5.empty:
    latitude5=region_data5['latitude'].values[0]
    longitude5=region_data5['longitude'].values[0]
    print(f"The latitude of {desired_region5} is:{latitude5},The longitude of {desired_region5} is:{longitude5}")

Five Points
The latitude of Five Points is:39.76672,The longitude of Five Points is:-104.97906
The most popular Neiborhood is:  [('Five Points', 320), ('Northwest', 296), (-1, 284), ('West', 277), ('Northeast', 257), ('Highland', 254)]
Northwest
The latitude of Northwest is:39.76502,The longitude of Northwest is:-105.03364
West
The latitude of West is:39.70719,The longitude of West is:-105.01878
Northeast
The latitude of Northeast is:39.75684,The longitude of Northeast is:-104.85525
Highland
The latitude of Highland is:39.766414642333984,The longitude of Highland is:-105.0020980834961


In [17]:
locations={
    desired_region1:(latitude1,longitude1),
    desired_region2:(latitude2,longitude2),
    desired_region3:(latitude3,longitude3),
    desired_region4:(latitude4,longitude4),
    desired_region5:(latitude5,longitude5)

}
#,popup=folium.Popup('Top2 Popular AirbnbLocation',parse_html=True)
mymap=folium.Map(location=[latitude2,longitude2],zoom_start=16)
icon_airbnb = folium.Icon(color='purple')

for location,coords in locations.items():
    sorted_items = sorted(locations.items(), key=lambda x: x[1], reverse=True)

# Find the rank of the item based on its value
    rank = next((index for index, (key, value) in enumerate(sorted_items) if key == location), None)

    if rank is not None:
        rank += 1  # Adjust rank to start from 1 instead of 0
        print(f"The rank of '{location}' is {rank}.")
    else:
        print(f"'{location}' is not in the dictionary.")

    folium.Marker(location=coords,radius=500,
                    popup=f"Top {rank} -- {location}",
                    color="#3186cc",
                    fill_color="#3186cc").add_to(mymap)
    
mymap.save("map2.html")

The rank of 'Five Points' is 1.
The rank of 'Northwest' is 3.
The rank of 'West' is 5.
The rank of 'Northeast' is 4.
The rank of 'Highland' is 2.


In [18]:
mymap

In [19]:


#html_path="map2.html"
#webbrowser.open_new_tab(html_path)

# Starbucks Data Cleaning

In [20]:
starbucks_df.columns

Index(['Unnamed: 0', 'storeNumber', 'countryCode', 'ownershipTypeCode',
       'schedule', 'slug', 'latitude', 'longitude', 'streetAddressLine1',
       'streetAddressLine2', 'streetAddressLine3', 'city',
       'countrySubdivisionCode', 'postalCode', 'currentTimeOffset',
       'windowsTimeZoneId', 'olsonTimeZoneId', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22',
       'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26'],
      dtype='object')

In [21]:
starbucks_columns_tokeep = \
['Unnamed: 0', 'storeNumber', 'latitude', 'longitude',  'postalCode']


In [22]:
starbucks_df_location = starbucks_df[starbucks_columns_tokeep]
starbucks_df_location
starbucks_df_location.to_csv("Star_Location.csv")

In [23]:

"""locations={
    desired_region1:(latitude1,longitude1),
    desired_region2:(latitude2,longitude2),
    desired_region3:(latitude3,longitude3),
    desired_region4:(latitude4,longitude4),
    desired_region5:(latitude5,longitude5)

}
'./Raw_Data/startbucks.csv'
"""

filename= 'Star_Location.csv'
keys=('storeNumber','latitude', 'longitude')
records=[]
with open(filename,'r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        records.append({key:row[key] for key in keys})
print(records[0])

{'storeNumber': '75988-107245', 'latitude': '61.174006', 'longitude': '-149.981584'}


In [24]:
type(records[0])

dict

In [81]:
"""for record in records:
    star_longitude=record['longitude']
    star_latitude=record['latitude']
    records['longitude']=float(star_longitude)
   # records['latitude']=float(star_latitude)
"""

"for record in records:\n    star_longitude=record['longitude']\n    star_latitude=record['latitude']\n    records['longitude']=float(star_longitude)\n   # records['latitude']=float(star_latitude)\n"

In [26]:
from folium.plugins import FastMarkerCluster

In [30]:
starbucks_map=folium.Map(location=(locations[desired_region1]),zoom_start=16)
star_longitude=[a['longitude'] for a in records]
star_latitude=[a['latitude'] for a in records]
FastMarkerCluster(data=list(zip(star_latitude,star_longitude))).add_to(starbucks_map)


starbucks_map
