#### 1.1 Import Cost of living Data Set

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


zri= pd.read_csv('http://files.zillowstatic.com/research/public/Zip/Zip_Zri_AllHomesPlusMultifamily_Summary.csv')
print(zri.shape)
print(zri.head(10))


(11881, 12)
         Date  RegionName State                             Metro  \
0  2020-01-31       60657    IL          Chicago-Naperville-Elgin   
1  2020-01-31       77494    TX  Houston-The Woodlands-Sugar Land   
2  2020-01-31       77449    TX  Houston-The Woodlands-Sugar Land   
3  2020-01-31       10002    NY       New York-Newark-Jersey City   
4  2020-01-31       77084    TX  Houston-The Woodlands-Sugar Land   
5  2020-01-31       79936    TX                           El Paso   
6  2020-01-31       60640    IL          Chicago-Naperville-Elgin   
7  2020-01-31       11226    NY       New York-Newark-Jersey City   
8  2020-01-31       10467    NY       New York-Newark-Jersey City   
9  2020-01-31       78660    TX                 Austin-Round Rock   

            County          City  SizeRank   Zri     MoM     QoQ     YoY  \
0      Cook County       Chicago         0  1996 -0.0278 -0.0610 -0.0146   
1    Harris County          Katy         1  1778  0.0013  0.0079  0.0022   


#### 1.2 Import Postal code Data Sets for city "Y' & city "X"

In [2]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Get the boroughs from ZIP's table on www.geonames.org › postal-codes › san-francisco
sfo_pages = pd.read_html('https://www.geonames.org/postalcode-search.html?q=California%2C+San+Francisco&country=US')
nyc_pages = pd.read_html('https://www.geonames.org/postalcode-search.html?q=New+York%2C+Manhattan&country=US')


#### 1.3 Data wrangling

##### 1.3.1 Neighborhoods in city "Y" & city "X"

In [3]:

sfo_df = pd.DataFrame()
for xn in range(0, 500, 2):
    try:
        sfo_df=sfo_df.append(sfo_pages[2].loc[xn])
    except:
        print('# sfo neighborhoods: ', xn)                     
        break

nyc_df = pd.DataFrame()
for xy in range(0, 500, 2):
    try:
        nyc_df=nyc_df.append(nyc_pages[2].loc[xy])
    except:
        print('# nyc neighborhoods: ', xy)
        break
        
sfo_df.dropna(subset=['Admin1'], inplace=True)
nyc_df.dropna(subset=['Admin1'], inplace=True)
print('City "Y" shape: ', sfo_df.shape)
print('City "X" shape: ', nyc_df.shape) 

sfo1_df = sfo_df.reset_index(drop=True)
sfo1_df = sfo1_df.rename(columns={'Admin1':'State', 'Admin2':'County', 'Place':'City', 'Code':'PostalCode'})
sfo1_df = sfo1_df.drop(["Admin3",  "Unnamed: 0"], axis=1)
print(sfo1_df.shape[0])
print(sfo1_df.head(10))

nyc1_df = nyc_df.reset_index(drop=True)
nyc1_df = nyc1_df.rename(columns={'Admin1':'State', 'Admin2':'County', 'Place':'City', 'Code':'PostalCode'})
nyc1_df = nyc1_df.drop(["Admin3",  "Unnamed: 0"], axis=1)
print(nyc1_df.shape[0])
print(nyc1_df.head(10))

# sfo neighborhoods:  108
# nyc neighborhoods:  292
City "Y" shape:  (53, 7)
City "X" shape:  (145, 7)
53
        State                            County PostalCode        Country  \
0  California  City and County of San Francisco      94102  United States   
1  California  City and County of San Francisco      94103  United States   
2  California  City and County of San Francisco      94107  United States   
3  California  City and County of San Francisco      94108  United States   
4  California  City and County of San Francisco      94109  United States   
5  California  City and County of San Francisco      94105  United States   
6  California  City and County of San Francisco      94111  United States   
7  California                         San Mateo      94080  United States   
8  California                         San Mateo      94128  United States   
9  California  City and County of San Francisco      94104  United States   

                  City  
0        San Francisc

##### 1.3.2 Cost of Living (Rental Average)

In [4]:
## Cost of Living (Rental Average)
zri.drop(['Date', 'Metro', 'County', 'City', 'SizeRank', 'MoM', 'QoQ', 'YoY', 'State', 'ZriRecordCnt'], axis=1, inplace = True)
zri.rename(columns={'RegionName': 'PostalCode', 'Zri': 'RentAmount'}, inplace = True)
zri.set_index('PostalCode', inplace = True)
print (zri.head())


            RentAmount
PostalCode            
60657             1996
77494             1778
77449             1443
10002             3605
77084             1438


#### 1.4 Get Latitude & Longitude for City "Y"

##### 1.4.1 City "Y"

In [5]:

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
geolocator = Nominatim(user_agent="on_explorer")
nexc = 0
for xn in range (0, sfo1_df.shape[0]):
    postalc = sfo1_df.loc[xn]['PostalCode']
    city = sfo1_df.loc[xn]['City']
    state = sfo1_df.loc[xn]['State']
    country = sfo1_df.loc[xn]['Country']
    address =  'USA, ' + city + ', ' + state + ', ' + postalc  
    try:
        location = geolocator.geocode(address)
        if location != None:
            latto = location.latitude
            lonto = location.longitude

            sfo1_df.loc[xn, 'latitude'] = latto
            sfo1_df.loc[xn, 'longitude'] = lonto
        else:
            nexc = nexc + 1
            sfo1_df.loc[xn, 'latitude'] = 0.0
            sfo1_df.loc[xn, 'longitude'] = 0.0
    except:
        nexc = nexc + 1
        sfo1_df.loc[xn, 'latitude'] = 0.0
        sfo1_df.loc[xn, 'longitude'] = 0.0
print('# of exceptions = ', nexc)

# delete all rows with no coordinates city Y
sfo1_df.dropna(subset=['latitude'], inplace=True)
indexCodes = sfo1_df[ sfo1_df['latitude'] == 0.0 ].index
sfo2_df = sfo1_df.drop(indexCodes , inplace=False)

sfo2_df= sfo2_df.reset_index(drop=True)
sfo2_df.drop(['Country'], axis=1, inplace = True)
print(sfo2_df.shape)
sfo2_df.head(10)


# of exceptions =  1
(52, 6)


Unnamed: 0,State,County,PostalCode,City,latitude,longitude
0,California,City and County of San Francisco,94102,San Francisco,37.779418,-122.418279
1,California,City and County of San Francisco,94103,San Francisco,37.775364,-122.408251
2,California,City and County of San Francisco,94107,San Francisco,37.78274,-122.392789
3,California,City and County of San Francisco,94108,San Francisco,37.792072,-122.41228
4,California,City and County of San Francisco,94109,San Francisco,37.798012,-122.422964
5,California,City and County of San Francisco,94105,San Francisco,37.788566,-122.39716
6,California,City and County of San Francisco,94111,San Francisco,37.794788,-122.399664
7,California,San Mateo,94080,South San Francisco,37.654476,-122.413768
8,California,San Mateo,94128,San Francisco,37.771837,-122.416422
9,California,City and County of San Francisco,94104,San Francisco,37.784953,-122.404903


##### 1.4.2 City "X"


In [6]:
geolocator = Nominatim(user_agent="cx_explorer")
nexx = 0
for xn in range (0, nyc1_df.shape[0]):
    postalc = nyc1_df.loc[xn]['PostalCode']
    city = nyc1_df.loc[xn]['City']
    state = nyc1_df.loc[xn]['State']
    country = nyc1_df.loc[xn]['Country']
    address =  'USA, ' + city + ', ' + state + ', ' + postalc  
    try:
        location = geolocator.geocode(address)
        if location != None:
            latto = location.latitude
            lonto = location.longitude
            nyc1_df.loc[xn, 'latitude'] = latto
            nyc1_df.loc[xn, 'longitude'] = lonto
        else:
            nexx = nexx + 1
            nyc1_df.loc[xn, 'latitude'] = 0.0
            nyc1_df.loc[xn, 'longitude'] = 0.0
    except:
        nexx = nexx + 1
        nyc1_df.loc[xn, 'latitude'] = 0.0
        nyc1_df.loc[xn, 'longitude'] = 0.0

# delete all rows with no coordinates city x
print('# of exceptions = ', nexx)
nyc1_df.dropna(subset=['latitude'], inplace=True)
indexCodes = nyc1_df[ nyc1_df['latitude'] == 0.0 ].index
nyc2_df = nyc1_df.drop(indexCodes , inplace=False)

nyc2_df= nyc2_df.reset_index(drop=True)
nyc2_df.drop(['Country'], axis=1, inplace = True)
print(nyc2_df.shape)
nyc2_df.head(10)


# of exceptions =  4
(141, 6)


Unnamed: 0,State,County,PostalCode,City,latitude,longitude
0,New York,New York,10001,New York,40.729825,-73.960752
1,New York,New York,10011,New York,40.740847,-73.999433
2,New York,New York,10016,New York,40.748112,-73.984384
3,New York,New York,10017,New York,40.750983,-73.993832
4,New York,New York,10019,New York,40.761413,-73.983541
5,New York,New York,10021,New York,40.770237,-73.95973
6,New York,New York,10022,New York,40.758263,-73.967889
7,New York,New York,10036,New York,40.755948,-73.980014
8,New York,New York,10065,New York,40.766119,-73.964665
9,New York,New York,10002,New York,40.722313,-73.987709


#### 2.1 Map the city "Y" Get Latitude & Longitude for City "Y"

In [7]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'San Francisco, California, US'

geolocator = Nominatim(user_agent="sfo_explorer")
location = geolocator.geocode(address)
latsfo = location.latitude
lonsfo = location.longitude
print('The geograpical coordinates of SFO are {}, {}.'.format(latsfo, lonsfo))

! conda install -c conda-forge folium
import folium

# create map of Toronto using latitude and longitude values
map_sfo = folium.Map(location=[latsfo, lonsfo], zoom_start=10)

# add markers to map
for lat, lng, cityx, pcode in zip(sfo2_df['latitude'], sfo2_df['longitude'], sfo2_df['City'], sfo2_df['PostalCode']):
    label = '{}, {}'.format(pcode, cityx)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sfo)  
    
map_sfo

The geograpical coordinates of SFO are 37.7790262, -122.4199061.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.4.0               |             py_0          26 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.10.1              |             py_0          59 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    openssl-1.1.1f             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    ------------------------------------------------------------
                                          

#### 2.2 Map the city "X"

In [8]:
## Get Latitude & Longitude for City "X"
address = 'New York City, New York, US'

geolocator = Nominatim(user_agent="nyc_explorer")
location = geolocator.geocode(address)
latnyc = location.latitude
lonnyc = location.longitude
print('The geograpical coordinates of city X are {}, {}.'.format(latnyc, lonnyc))

# create map of city X using latitude and longitude values
map_nyc = folium.Map(location=[latnyc, lonnyc], zoom_start=10)

# add markers to map
for lat, lng, cityx, pcode in zip(nyc2_df['latitude'], nyc2_df['longitude'], nyc2_df['City'], nyc2_df['PostalCode']):
    label = '{}, {}'.format(pcode, cityx)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='Green',
        fill=True,
        fill_color='Green',
        fill_opacity=0.7,
        parse_html=False).add_to(map_nyc)  
    
map_nyc

The geograpical coordinates of city X are 40.7127281, -74.0060152.


In [9]:
#### Define Foursquare Credentials and Version
CLIENT_ID = 'TMIKXOISICLWJJQODBQEWYOC44TQ50DL223HISL2NP0BGO14' # your Foursquare ID
CLIENT_SECRET = 'G1IDMYTRAKU1OP4RW1S35YJJ4OEDYX2UIR44UI3QEJZHNLQY' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TMIKXOISICLWJJQODBQEWYOC44TQ50DL223HISL2NP0BGO14
CLIENT_SECRET:G1IDMYTRAKU1OP4RW1S35YJJ4OEDYX2UIR44UI3QEJZHNLQY


In [10]:
print(sfo2_df.head())
print(nyc2_df.head())


        State                            County PostalCode           City  \
0  California  City and County of San Francisco      94102  San Francisco   
1  California  City and County of San Francisco      94103  San Francisco   
2  California  City and County of San Francisco      94107  San Francisco   
3  California  City and County of San Francisco      94108  San Francisco   
4  California  City and County of San Francisco      94109  San Francisco   

    latitude   longitude  
0  37.779418 -122.418279  
1  37.775364 -122.408251  
2  37.782740 -122.392789  
3  37.792072 -122.412280  
4  37.798012 -122.422964  
      State    County PostalCode      City   latitude  longitude
0  New York  New York      10001  New York  40.729825 -73.960752
1  New York  New York      10011  New York  40.740847 -73.999433
2  New York  New York      10016  New York  40.748112 -73.984384
3  New York  New York      10017  New York  40.750983 -73.993832
4  New York  New York      10019  New York  40.761

In [11]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        cat_list = row['categories']
    except:
        cat_list = row['venue.categories']
        
    if len(cat_list) == 0:
        return None
    else:
        return cat_list[0]['name']

### 3.1 Explore Neighborhoods in City "Y"

#### 3.1.1 Set up parmeters to call Foursquare APIs

In [12]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files
#### Define Foursquare Credentials and Version
CLIENT_ID = 'TMIKXOISICLWJJQODBQEWYOC44TQ50DL223HISL2NP0BGO14' # your Foursquare ID
CLIENT_SECRET = 'G1IDMYTRAKU1OP4RW1S35YJJ4OEDYX2UIR44UI3QEJZHNLQY' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TMIKXOISICLWJJQODBQEWYOC44TQ50DL223HISL2NP0BGO14
CLIENT_SECRET:G1IDMYTRAKU1OP4RW1S35YJJ4OEDYX2UIR44UI3QEJZHNLQY


#### 3.1.2 Function to get all nearby venues of a neighbourhoods in city X, Y

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    LIMIT = 100 # limit of number of venues returned by Foursquare API
    radius = 500 # define radius    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### 3.2 Analyze Each Neighborhood of City "Y"

#### 3.2.1 Get all nearby venues of all neighbourhoods in city "Y"

In [14]:
## Code cy run the above function on each neighborhood and create a new dataframe called cy_venues*.
cy_venues = getNearbyVenues(names=sfo2_df['PostalCode'],
                                   latitudes=sfo2_df['latitude'],
                                   longitudes=sfo2_df['longitude']
                                  )
# Let's check the size of the resulting dataframe
print(cy_venues.shape)
print(cy_venues.head())
# Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(cy_venues['Venue Category'].unique())))
# Let's check how many venues were returned for each neighborhood
print(cy_venues[['PostalCode', 'Venue']].groupby('PostalCode').count())

(3156, 7)
  PostalCode  Neighborhood Latitude  Neighborhood Longitude  \
0      94102              37.779418             -122.418279   
1      94102              37.779418             -122.418279   
2      94102              37.779418             -122.418279   
3      94102              37.779418             -122.418279   
4      94102              37.779418             -122.418279   

                            Venue  Venue Latitude  Venue Longitude  \
0                Asian Art Museum       37.780178      -122.416505   
1  Louise M. Davies Symphony Hall       37.777976      -122.420157   
2                  Herbst Theater       37.779548      -122.420953   
3        War Memorial Opera House       37.778601      -122.420816   
4            San Francisco Ballet       37.778580      -122.420798   

  Venue Category  
0     Art Museum  
1   Concert Hall  
2   Concert Hall  
3    Opera House  
4   Dance Studio  
There are 284 uniques categories.
            Venue
PostalCode       
94080 

#### 3.2.2  Convert into a matrix of Venue categories vs Postal codes & group by PostalCode

In [15]:
# Convert Categorical variables in city "Y" dummy indicator (o / 1 / NaN)
cy_venind = pd.get_dummies(cy_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back cy dataframe
cy_venind['PostalCode'] = cy_venues['PostalCode'] 

# move neighborhood column cy the first column
fixed_col = [cy_venind.columns[-1]] + list(cy_venind.columns[:-1])
cy_venind  = cy_venind[fixed_col]
cy_venind.head(10)

# let's group rows by Postal Code and by taking the mean of the frequency of occurrence of each category
cy_venue_gr = cy_venind.groupby('PostalCode').mean().reset_index()

print('Postal codes: ', cy_venue_gr.shape[0], '  Venue Categories: ', cy_venue_gr.shape[1])
print(cy_venue_gr.head(2))


Postal codes:  52   Venue Categories:  285
  PostalCode       ATM  Acai House  Accessories Store  Acupuncturist  \
0      94080  0.017241         0.0                0.0            0.0   
1      94083  0.000000         0.0                0.0            0.0   

   Adult Boutique  Afghan Restaurant  African Restaurant  Alternative Healer  \
0             0.0                0.0                 0.0                 0.0   
1             0.0                0.0                 0.0                 0.0   

   American Restaurant  Antique Shop  Arcade  Arepa Restaurant  \
0                  0.0           0.0     0.0               0.0   
1                  0.0           0.0     0.0               0.0   

   Argentinian Restaurant  Art Gallery  Art Museum  Arts & Crafts Store  \
0                     0.0          0.0         0.0             0.000000   
1                     0.0          0.0         0.0             0.166667   

   Asian Restaurant  Athletics & Sports  Austrian Restaurant  Auto Garage 

#### 3.2.3 Let's print each neighborhood along with the top 5 most common venues

In [16]:
num_cyp_venues = 6

for hood in cy_venue_gr['PostalCode']:
    print("----"+hood+"----")
    temp = cy_venue_gr[cy_venue_gr['PostalCode'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 4})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_cyp_venues))
    print('\n')

----94080----
                 venue    freq
0   Mexican Restaurant  0.1034
1   Chinese Restaurant  0.0517
2          Coffee Shop  0.0517
3   Italian Restaurant  0.0345
4                Diner  0.0345
5  Japanese Restaurant  0.0345


----94083----
                 venue    freq
0  Rental Car Location  0.3333
1            Cafeteria  0.1667
2           Food Truck  0.1667
3  Arts & Crafts Store  0.1667
4          Coffee Shop  0.1667
5           Non-Profit  0.0000


----94102----
                   venue  freq
0            Coffee Shop  0.05
1           Cocktail Bar  0.05
2  Vietnamese Restaurant  0.04
3                Theater  0.04
4                   Café  0.04
5               Beer Bar  0.03


----94103----
                   venue    freq
0              Nightclub  0.0946
1                Gay Bar  0.0541
2               Wine Bar  0.0541
3             Restaurant  0.0405
4            Coffee Shop  0.0405
5  Vietnamese Restaurant  0.0405


----94104----
            venue  freq
0        Boutiqu

#### 3.2.4 Convert the grouped venues into a DF (check if this needed)

In [17]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


top10y = 10
# create columns according cy number of cyp venues
prefixy = ['st', 'nd', 'rd']
columns = ['PostalCode']
for ind in np.arange(top10y):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, prefixy[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
top10vy = pd.DataFrame(columns=columns)
top10vy['PostalCode'] = cy_venue_gr['PostalCode']

for ind in np.arange(cy_venue_gr.shape[0]):
    top10vy.iloc[ind, 1:] = return_most_common_venues(cy_venue_gr.iloc[ind, :], top10y)

top10vy.head(15)

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,94080,Mexican Restaurant,Coffee Shop,Chinese Restaurant,Brewery,Thai Restaurant,Diner,Vietnamese Restaurant,Liquor Store,Sandwich Place,Italian Restaurant
1,94083,Rental Car Location,Food Truck,Coffee Shop,Arts & Crafts Store,Cafeteria,Flower Shop,Flea Market,Filipino Restaurant,Field,Fast Food Restaurant
2,94102,Coffee Shop,Cocktail Bar,Vietnamese Restaurant,Theater,Café,Bakery,Vegetarian / Vegan Restaurant,Beer Bar,Performing Arts Venue,Music Venue
3,94103,Nightclub,Wine Bar,Gay Bar,Restaurant,Vietnamese Restaurant,Coffee Shop,Art Gallery,Clothing Store,Motorcycle Shop,Café
4,94104,Boutique,Women's Store,Coffee Shop,Clothing Store,Hotel,Spa,Furniture / Home Store,Café,Cosmetics Shop,Men's Store
5,94105,Coffee Shop,Sandwich Place,Food Truck,Salad Place,Bar,Café,American Restaurant,Yoga Studio,Cocktail Bar,Sushi Restaurant
6,94107,Coffee Shop,Café,Bar,Park,Thai Restaurant,New American Restaurant,Baseball Stadium,Scenic Lookout,Sandwich Place,Brewery
7,94108,Hotel,American Restaurant,Cocktail Bar,Café,Italian Restaurant,Breakfast Spot,Art Gallery,Coffee Shop,Bar,Grocery Store
8,94109,Coffee Shop,Gym / Fitness Center,Italian Restaurant,Hotel,Bar,Sushi Restaurant,Cosmetics Shop,Deli / Bodega,Park,Wine Bar
9,94110,Mexican Restaurant,Pizza Place,Bar,Furniture / Home Store,Café,Bakery,Cocktail Bar,Juice Bar,Boutique,Coffee Shop


#### 3.2.5 Extracts only the venues which match the  5 desired characteristics in city "Y"

In [18]:
cy_venue_t5 = []

vn_cols = [col for col in cy_venue_gr.columns]
for cx in range(0,len(vn_cols)):
    if 'Martial' in vn_cols[cx]:
        cy_venue_t5.append(cy_venue_gr[vn_cols[cx]])
    if 'Tennis' in vn_cols[cx]:
        cy_venue_t5.append(cy_venue_gr[vn_cols[cx]])
    if 'Yoga' in vn_cols[cx]:
        cy_venue_t5.append(cy_venue_gr[vn_cols[cx]])
    if 'Park' in vn_cols[cx]:
        if vn_cols[cx] != 'Parking':
            cy_venue_t5.append(cy_venue_gr[vn_cols[cx]])
    if 'Mall' in vn_cols[cx]:
        cy_venue_t5.append(cy_venue_gr[vn_cols[cx]])


cy_venue_t5df = pd.DataFrame(cy_venue_t5)
print (cy_venue_t5df.shape)
print(cy_venue_t5df.head())



(5, 52)
                    0    1     2         3     4     5         6     7   \
Martial Arts Dojo  0.0  0.0  0.00  0.000000  0.00  0.00  0.000000  0.01   
Park               0.0  0.0  0.01  0.013514  0.00  0.01  0.037975  0.01   
Shopping Mall      0.0  0.0  0.00  0.000000  0.01  0.00  0.000000  0.00   
Tennis Court       0.0  0.0  0.00  0.000000  0.00  0.00  0.000000  0.00   
Yoga Studio        0.0  0.0  0.00  0.000000  0.00  0.02  0.000000  0.02   

                         8     9     10        11        12       13   14  \
Martial Arts Dojo  0.000000  0.00  0.00  0.000000  0.000000  0.00000  0.0   
Park               0.031250  0.01  0.02  0.066667  0.056604  0.03125  0.0   
Shopping Mall      0.000000  0.00  0.00  0.000000  0.000000  0.00000  0.0   
Tennis Court       0.010417  0.01  0.00  0.000000  0.018868  0.03125  0.0   
Yoga Studio        0.010417  0.02  0.00  0.000000  0.018868  0.03125  0.0   

                         15        16   17       18        19       20  \
Mart

#### 3.2.6 Preparing & Transforming data on the 5 desired characteristics in city "Y"

In [19]:
## Transpose matrix
cy_venue_t5dft = cy_venue_t5df.transpose()
cy_venue_t5dft['PostalCode'] = cy_venue_gr['PostalCode']
# Move postal code to the first column of the DF
cols = list(cy_venue_t5dft.columns)
cols = [cols[-1]] + cols[:-1]
cy_venue_t5dft = cy_venue_t5dft[cols]
## Add TotRanking Column
cy_venue_t5dft['TotRanking'] = cy_venue_t5dft.sum(axis=1)

## Delete rows with TotRanking == 0 & Sort
rowstod = cy_venue_t5dft[ cy_venue_t5dft['TotRanking'] == 0.0 ].index
cy_venue_t5dft.drop(rowstod , inplace=True)
cy_venue_t5dft.sort_values(by=['TotRanking'],ascending = False, inplace = True)
print('Shape before add Rental Amount: ',cy_venue_t5dft.shape)

## Add Rental amount & Latitude / longitude for the neighborhood
zri.sort_values(by=['PostalCode'],ascending = False, inplace = True)
print('zri shape: ', zri.shape)
cy_venue_t5dft2 = cy_venue_t5dft.set_index('PostalCode')
cy_venue_t5dft2['RentAmount'] = np.nan
cy_venue_t5dft2['latitude'] = np.nan
cy_venue_t5dft2['longitude'] = np.nan
sfo2_dfix = sfo2_df.set_index('PostalCode')
for pcx in cy_venue_t5dft['PostalCode']:
    pcxn = int(pcx)
    try:
        cy_venue_t5dft2.loc[pcx]['RentAmount'] = zri.loc[pcxn]['RentAmount']
        cy_venue_t5dft2.loc[pcx]['latitude'] = sfo2_dfix.loc[pcx]['latitude']
        cy_venue_t5dft2.loc[pcx]['longitude'] = sfo2_dfix.loc[pcx]['longitude']
    except:
        print ('Except: ',pcx)

cy_venue_t5dft2 = cy_venue_t5dft2.dropna()
print('Shape after add Rental Amount: ',cy_venue_t5dft2.shape)
print(cy_venue_t5dft2.head(10))


Shape before add Rental Amount:  (26, 7)
zri shape:  (11881, 1)
Except:  94129
Except:  94143
Except:  94164
Except:  94128
Except:  94104
Shape after add Rental Amount:  (21, 9)
            Martial Arts Dojo      Park  Shopping Mall  Tennis Court  \
PostalCode                                                             
94118                     0.0  0.217391            0.0      0.000000   
94124                     0.0  0.166667            0.0      0.000000   
94123                     0.0  0.048780            0.0      0.024390   
94114                     0.0  0.056604            0.0      0.018868   
94115                     0.0  0.031250            0.0      0.031250   
94122                     0.0  0.076923            0.0      0.000000   
94132                     0.0  0.000000            0.0      0.000000   
94117                     0.0  0.061538            0.0      0.000000   
94112                     0.0  0.066667            0.0      0.000000   
94158                     0.0

### 3.3 Clustering the 5 desired characteristics in city "Y"

In [20]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 4

cy_neigh_clust  = cy_venue_t5dft2.reset_index()
cy_neigh_clusta = cy_neigh_clust[['PostalCode', 'latitude', 'longitude']]
cy_neigh_clustb = cy_neigh_clust.drop(['latitude', 'longitude'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cy_neigh_clustb)

# check cluster labels generated for each row in the dataframe
print (len(kmeans.labels_))
print (kmeans.labels_[0:] )

# add clustering labels
cy_neigh_clustb.insert(0, 'Cluster Labels', kmeans.labels_)

# merge neighborhood Y_grouped with city Y_data to add latitude/longitude for each neighborhood
cy_neigh_clustb = cy_neigh_clustb.join(cy_neigh_clusta.set_index('PostalCode'), on='PostalCode')
cy_neigh_clustb.head(10) # check the last columns!

21
[3 2 0 0 0 1 2 3 2 0 3 1 0 1 3 1 3 3 0 1 2]


Unnamed: 0,Cluster Labels,PostalCode,Martial Arts Dojo,Park,Shopping Mall,Tennis Court,Yoga Studio,TotRanking,RentAmount,latitude,longitude
0,3,94118,0.0,0.217391,0.0,0.0,0.0,0.217391,4423.0,37.775515,-122.457818
1,2,94124,0.0,0.166667,0.0,0.0,0.0,0.166667,3810.0,37.7163,-122.394562
2,0,94123,0.0,0.04878,0.0,0.02439,0.02439,0.097561,4924.0,37.801901,-122.430807
3,0,94114,0.0,0.056604,0.0,0.018868,0.018868,0.09434,4713.0,37.763689,-122.439791
4,0,94115,0.0,0.03125,0.0,0.03125,0.03125,0.09375,4644.0,37.782757,-122.440178
5,1,94122,0.0,0.076923,0.0,0.0,0.0,0.076923,4009.0,37.759897,-122.47365
6,2,94132,0.0,0.0,0.0,0.0,0.076923,0.076923,3767.0,37.718021,-122.47425
7,3,94117,0.0,0.061538,0.0,0.0,0.015385,0.076923,4417.0,37.773044,-122.451545
8,2,94112,0.0,0.066667,0.0,0.0,0.0,0.066667,3742.0,37.721952,-122.445043
9,0,94158,0.0,0.043478,0.0,0.0,0.021739,0.065217,4703.0,37.770242,-122.386794


### 3.4 Create map of the 5 desired characteristics in city "Y"

In [21]:
# create map
map_neighborhoods = folium.Map(location=[latsfo, lonsfo], zoom_start=11)

# set color scheme for the clusters
colorsx = ['Red', 'Blue', 'Yellow', 'Brown', 'Green']

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cy_neigh_clustb['latitude'], cy_neigh_clustb['longitude'], cy_neigh_clustb['PostalCode'], cy_neigh_clustb['Cluster Labels']):
        rentAm = cy_venue_t5dft2.loc[poi]['RentAmount']
        label = folium.Popup(str(poi) + ' Cluster ' + str(int(cluster) + 1) + ' Rent Amt ' + str(rentAm), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=3,
            popup=label,
            color=colorsx[int(cluster)],
            fill=True,
            fill_color=colorsx[int(cluster)],
            fill_opacity=0.7).add_to(map_neighborhoods)
       
map_neighborhoods

### 3.5 Examine clusters of the 5 desired characteristics in city "Y"

In [22]:
# Clusters
for clx in range (0, kclusters):
    cls = cy_neigh_clustb.loc[cy_neigh_clustb['Cluster Labels'] == clx, cy_neigh_clustb.columns[[1] + list(range(5, cy_neigh_clustb.shape[1]))]]
    print ('# of Neiborhoods in cluster ', clx + 1, '(', colorsx[int(clx)], ') : ' , cls.shape[0])   
    
for clx in range (0, kclusters):
    cls = cy_neigh_clustb.loc[cy_neigh_clustb['Cluster Labels'] == clx, cy_neigh_clustb.columns[[1] + list(range(7, cy_neigh_clustb.shape[1]))]]
    print(cls)

# of Neiborhoods in cluster  1 ( Red ) :  6
# of Neiborhoods in cluster  2 ( Blue ) :  5
# of Neiborhoods in cluster  3 ( Yellow ) :  4
# of Neiborhoods in cluster  4 ( Brown ) :  6
   PostalCode  TotRanking  RentAmount   latitude   longitude
2       94123    0.097561      4924.0  37.801901 -122.430807
3       94114    0.094340      4713.0  37.763689 -122.439791
4       94115    0.093750      4644.0  37.782757 -122.440178
9       94158    0.065217      4703.0  37.770242 -122.386794
12      94133    0.050000      4805.0  37.799946 -122.408747
18      94111    0.020000      4684.0  37.794788 -122.399664
   PostalCode  TotRanking  RentAmount   latitude   longitude
5       94122    0.076923      4009.0  37.759897 -122.473650
11      94127    0.051724      4145.0  37.739616 -122.465307
13      94121    0.047619      4000.0  37.778591 -122.492289
15      94108    0.040000      4187.0  37.792072 -122.412280
19      94103    0.013514      4059.0  37.775364 -122.408251
   PostalCode  TotRanking

#### 4.2 Analyze Each Neighborhood of City "X"

#### 4.2.1 Get all nearby venues of all neighbourhoods in city X

In [23]:
cx_venues = getNearbyVenues(names=nyc2_df['PostalCode'],
                                   latitudes=nyc2_df['latitude'],
                                   longitudes=nyc2_df['longitude']
                                  )
# Let's check the size of the resulting dataframe
print(cx_venues.shape)
print(cx_venues.head())
# Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(cx_venues['Venue Category'].unique())))
# Let's check how many venues were returned for each neighborhood
print(cx_venues[['PostalCode', 'Venue']].groupby('PostalCode').count())

(8225, 7)
  PostalCode  Neighborhood Latitude  Neighborhood Longitude  \
0      10001              40.729825              -73.960752   
1      10001              40.729825              -73.960752   
2      10001              40.729825              -73.960752   
3      10001              40.729825              -73.960752   
4      10001              40.729825              -73.960752   

                   Venue  Venue Latitude  Venue Longitude Venue Category  
0  WNYC Transmitter Park       40.729745       -73.960823           Park  
1           Paulie Gee’s       40.729801       -73.958520    Pizza Place  
2                Bellocq       40.730372       -73.959213       Tea Room  
3                 Ovenly       40.729708       -73.959544         Bakery  
4          New Love City       40.729760       -73.958247    Yoga Studio  
There are 350 uniques categories.
            Venue
PostalCode       
10001          68
10002         100
10003         100
10004          52
10005         100
1

#### 4.2.2 Convert into a matrix of Venue categories vs Postal codes & group by PostalCode

In [24]:
# Convert Categorical variables in city "Y" dummy indicator (o / 1 / NaN)
cx_venind = pd.get_dummies(cx_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back cy dataframe
cx_venind['PostalCode'] = cx_venues['PostalCode'] 

# move neighborhood column cy the first column
fixed_col = [cx_venind.columns[-1]] + list(cx_venind.columns[:-1])
cx_venind  = cx_venind[fixed_col]
cx_venind.head(10)

# let's group rows by Postal Code and by taking the mean of the frequency of occurrence of each category
cx_venue_gr = cx_venind.groupby('PostalCode').mean().reset_index()

print('Postal codes: ', cx_venue_gr.shape[0], '  Venue Categories: ', cx_venue_gr.shape[1])
print(cx_venue_gr.head(2))

Postal codes:  128   Venue Categories:  351
  PostalCode  Accessories Store  Adult Boutique  Afghan Restaurant  \
0      10001                0.0             0.0                0.0   
1      10002                0.0             0.0                0.0   

   African Restaurant  American Restaurant  Antique Shop  Arcade  \
0                 0.0             0.014706      0.014706     0.0   
1                 0.0             0.020000      0.000000     0.0   

   Arepa Restaurant  Argentinian Restaurant  Art Gallery  Art Museum  \
0               0.0                    0.00     0.029412         0.0   
1               0.0                    0.01     0.030000         0.0   

   Arts & Crafts Store  Asian Restaurant  Athletics & Sports  Auditorium  \
0             0.014706              0.00            0.014706         0.0   
1             0.000000              0.02            0.000000         0.0   

   Australian Restaurant  Austrian Restaurant  BBQ Joint  Bagel Shop  \
0                   0.

#### 4.2.3 Let's print each neighborhood along with the top 5 most common venues

In [25]:
num_cyp_venues = 6
for hood in cx_venue_gr['PostalCode']:
    print("----"+hood+"----")
    temp = cx_venue_gr[cx_venue_gr['PostalCode'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 4})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_cyp_venues))
    print('\n')

----10001----
               venue    freq
0                Bar  0.1176
1       Cocktail Bar  0.0735
2        Yoga Studio  0.0441
3           Boutique  0.0441
4  French Restaurant  0.0294
5        Coffee Shop  0.0294


----10002----
                venue  freq
0   French Restaurant  0.05
1  Italian Restaurant  0.05
2         Coffee Shop  0.05
3         Pizza Place  0.04
4        Cocktail Bar  0.04
5      Sandwich Place  0.03


----10003----
                           venue  freq
0            Japanese Restaurant  0.06
1                    Yoga Studio  0.04
2                   Dessert Shop  0.04
3  Vegetarian / Vegan Restaurant  0.03
4       Mediterranean Restaurant  0.03
5                    Coffee Shop  0.03


----10004----
                venue    freq
0  Mexican Restaurant  0.0962
1       Boat or Ferry  0.0769
2                 Bar  0.0769
3        Cocktail Bar  0.0577
4         Coffee Shop  0.0577
5      Sandwich Place  0.0385


----10005----
                venue  freq
0        Coc

#### 4.2.5 Extracts only the 5 desired characteristics in city "X"

In [26]:
cx_venue_t5 = []
vn_cols = [col for col in cx_venue_gr.columns]

for cx in range(0,len(vn_cols)):
    if 'Martial' in vn_cols[cx]:
        cx_venue_t5.append(cx_venue_gr[vn_cols[cx]])      
    if 'Tennis' in vn_cols[cx]:
        cx_venue_t5.append(cx_venue_gr[vn_cols[cx]])
    if 'Yoga' in vn_cols[cx]:
        cx_venue_t5.append(cx_venue_gr[vn_cols[cx]])
    if 'Park' in vn_cols[cx]:
        if vn_cols[cx] != 'Parking':
            cx_venue_t5.append(cx_venue_gr[vn_cols[cx]])
    if 'Mall' in vn_cols[cx]:
        cx_venue_t5.append(cx_venue_gr[vn_cols[cx]])

cx_venue_t5df = pd.DataFrame(cx_venue_t5)
print (cx_venue_t5df.shape)
print(cx_venue_t5df.head())


(9, 128)
                              0    1     2         3     4     5     6    \
Martial Arts Dojo        0.000000  0.0  0.00  0.000000  0.00  0.00  0.01   
Park                     0.014706  0.0  0.01  0.038462  0.01  0.07  0.02   
Shopping Mall            0.000000  0.0  0.00  0.000000  0.00  0.00  0.01   
Skate Park               0.000000  0.0  0.00  0.000000  0.00  0.00  0.00   
State / Provincial Park  0.000000  0.0  0.00  0.000000  0.00  0.00  0.00   

                          7         8     9     10    11    12   13    14   \
Martial Arts Dojo        0.00  0.000000  0.00  0.00  0.01  0.00  0.0  0.01   
Park                     0.02  0.066667  0.01  0.02  0.00  0.02  0.0  0.00   
Shopping Mall            0.02  0.000000  0.00  0.00  0.00  0.00  0.0  0.00   
Skate Park               0.00  0.000000  0.00  0.00  0.00  0.00  0.0  0.00   
State / Provincial Park  0.00  0.000000  0.00  0.00  0.00  0.00  0.0  0.00   

                         15        16   17   18   19   20    21  

#### 4.2.6 Preparing & Transforming data on the 5 desired characteristics in city "X"

In [27]:
## Transpose matrix
cx_venue_t5dft = cx_venue_t5df.transpose()
cx_venue_t5dft['PostalCode'] = cx_venue_gr['PostalCode']
# Move postal code to the first column of the DF
cols = list(cx_venue_t5dft.columns)
cols = [cols[-1]] + cols[:-1]
cx_venue_t5dft = cx_venue_t5dft[cols]
## Add TotRanking Column
cx_venue_t5dft['TotRanking'] = cx_venue_t5dft.sum(axis=1)

## Delete rows with TotRanking == 0 & Sort
rowstod = cx_venue_t5dft[ cx_venue_t5dft['TotRanking'] == 0.0 ].index
cx_venue_t5dft.drop(rowstod , inplace=True)
cx_venue_t5dft.sort_values(by=['TotRanking'],ascending = False, inplace = True)
print('cx_venue shape before add Rental Amount: ',cx_venue_t5dft.shape)

## Add Rental amount & Latitude / longitude for the neighborhood
zri.sort_values(by=['PostalCode'],ascending = False, inplace = True)
print('zri shape: ', zri.shape)
cx_venue_t5dft2 = cx_venue_t5dft.set_index('PostalCode')
cx_venue_t5dft2['RentAmount'] = np.nan
cx_venue_t5dft2['latitude'] = np.nan
cx_venue_t5dft2['longitude'] = np.nan
nyc2_dfix = nyc2_df.set_index('PostalCode')
for pcx in cx_venue_t5dft['PostalCode']:
    pcxn = int(pcx)
    try:
        cx_venue_t5dft2.loc[pcx]['RentAmount'] = zri.loc[pcxn]['RentAmount']
        cx_venue_t5dft2.loc[pcx]['latitude'] = nyc2_dfix.loc[pcx]['latitude']
        cx_venue_t5dft2.loc[pcx]['longitude'] = nyc2_dfix.loc[pcx]['longitude']
    except:
        print ('Except: ',pcx)

cx_venue_t5dft2 = cx_venue_t5dft2.dropna()
print('cx_venue shape after add Rental Amount: ',cx_venue_t5dft2.shape)
print(cx_venue_t5dft2.head(10))

## Transpose matrix
cx_venue_t5dft = cx_venue_t5df.transpose()
cx_venue_t5dft['PostalCode'] = cx_venue_gr['PostalCode']
# Move postal code to the first column of the DF
cols = list(cx_venue_t5dft.columns)
cols = [cols[-1]] + cols[:-1]
cx_venue_t5dft = cx_venue_t5dft[cols]
## Add TotRanking Column
cx_venue_t5dft['TotRanking'] = cx_venue_t5dft.sum(axis=1)

## Delete rows with TotRanking == 0 & Sort
rowstod = cx_venue_t5dft[ cx_venue_t5dft['TotRanking'] == 0.0 ].index
cx_venue_t5dft.drop(rowstod , inplace=True)
cx_venue_t5dft.sort_values(by=['TotRanking'],ascending = False, inplace = True)
print('Shape before add Rental Amount: ',cx_venue_t5dft.shape)

## Add Rental amount & Latitude / longitude for the neighborhood
zri.sort_values(by=['PostalCode'],ascending = False, inplace = True)
print('zri shape: ', zri.shape)
cx_venue_t5dft2 = cx_venue_t5dft.set_index('PostalCode')
cx_venue_t5dft2['RentAmount'] = np.nan
cx_venue_t5dft2['latitude'] = np.nan
cx_venue_t5dft2['longitude'] = np.nan
nyc2_dfix = nyc2_df.set_index('PostalCode')
for pcx in cx_venue_t5dft['PostalCode']:
    pcxn = int(pcx)
    try:
        cx_venue_t5dft2.loc[pcx]['RentAmount'] = zri.loc[pcxn]['RentAmount']
        cx_venue_t5dft2.loc[pcx]['latitude'] = nyc2_dfix.loc[pcx]['latitude']
        cx_venue_t5dft2.loc[pcx]['longitude'] = nyc2_dfix.loc[pcx]['longitude']
    except:
        print ('Except: ',pcx)

cx_venue_t5dft2 = cx_venue_t5dft2.dropna()
print('Shape after add Rental Amount: ',cx_venue_t5dft2.shape)
print(cx_venue_t5dft2.head(10))

cx_venue shape before add Rental Amount:  (70, 11)
zri shape:  (11881, 1)
Except:  10039
Except:  10280
Except:  10285
Except:  10281
Except:  10115
Except:  10028
Except:  10282
Except:  10044
Except:  10011
Except:  10008
Except:  10003
Except:  10023
Except:  10162
Except:  10025
Except:  10279
Except:  10013
Except:  10007
Except:  10271
Except:  10110
Except:  10158
Except:  10031
Except:  10034
Except:  10176
Except:  10033
Except:  10038
Except:  10275
Except:  10024
Except:  10045
Except:  10153
Except:  10155
Except:  10165
Except:  10173
Except:  10123
Except:  10178
Except:  10107
Except:  10012
Except:  10199
Except:  10170
Except:  10174
Except:  10172
Except:  10014
Except:  10169
Except:  10168
Except:  10167
Except:  10166
Except:  10122
Except:  10121
Except:  10120
Except:  10106
Except:  10041
Except:  10021
Except:  10118
cx_venue shape after add Rental Amount:  (18, 13)
            Martial Arts Dojo      Park  Shopping Mall  Skate Park  \
PostalCode                

### 4.3 Clustering the 5 desired characteristics in city "X"

In [31]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 4

cx_neigh_clust  = cx_venue_t5dft2.reset_index()
cx_neigh_clusta = cx_neigh_clust[['PostalCode', 'latitude', 'longitude']]
cx_neigh_clustb = cx_neigh_clust.drop(['latitude', 'longitude'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cx_neigh_clustb)

# check cluster labels generated for each row in the dataframe
print (len(kmeans.labels_))
print (kmeans.labels_[0:] )

# add clustering labels
cx_neigh_clustb.insert(0, 'Cluster Labels', kmeans.labels_)

# merge neighborhood Y_grouped with city Y_data to add latitude/longitude for each neighborhood
cx_neigh_clustb = cx_neigh_clustb.join(cx_neigh_clusta.set_index('PostalCode'), on='PostalCode')
cx_neigh_clustb # check the last columns!

18
[2 3 0 3 3 2 1 1 2 0 3 3 1 1 1 0 3 0]


Unnamed: 0,Cluster Labels,PostalCode,Martial Arts Dojo,Park,Shopping Mall,Skate Park,State / Provincial Park,Tennis Court,Tennis Stadium,Theme Park Ride / Attraction,Yoga Studio,TotRanking,RentAmount,latitude,longitude
0,2,10069,0.0,0.074074,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.092593,3899.0,40.776977,-73.988202
1,3,10006,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,3821.0,40.706513,-74.014417
2,0,10009,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,3416.0,40.726752,-73.973799
3,3,10001,0.0,0.014706,0.0,0.0,0.0,0.0,0.0,0.0,0.044118,0.058824,3633.0,40.729825,-73.960752
4,3,10018,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,3525.0,40.760244,-74.002875
5,2,10004,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,4077.0,40.700732,-74.013475
6,1,10026,0.0,0.036364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036364,2984.0,40.803047,-73.952798
7,1,10032,0.0,0.033898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033898,2817.0,40.837412,-73.94103
8,2,10005,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.03,4060.0,40.720757,-74.00667
9,0,10029,0.0,0.025641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025641,3150.0,40.783622,-73.943041


### 4.4 Create map of the 5 desired characteristics in city "X"

In [29]:
# create map
map_neighborhoods = folium.Map(location=[latnyc, lonnyc], zoom_start=11)

# set color scheme for the clusters
colorsx = ['Red', 'Blue', 'Yellow', 'Brown', 'Green']

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cx_neigh_clustb['latitude'], cx_neigh_clustb['longitude'], cx_neigh_clustb['PostalCode'], cx_neigh_clustb['Cluster Labels']):
        rentAm = cx_venue_t5dft2.loc[poi]['RentAmount']
        label = folium.Popup(str(poi) + ' Cluster ' + str(int(cluster) + 1) + ' Rent Amt ' + str(rentAm), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=3,
            popup=label,
            color=colorsx[int(cluster)],
            fill=True,
            fill_color=colorsx[int(cluster)],
            fill_opacity=0.7).add_to(map_neighborhoods)
       
map_neighborhoods
# create map
map_neighborhoods = folium.Map(location=[latnyc, lonnyc], zoom_start=11)

# set color scheme for the clusters
colorsx = ['Red', 'Blue', 'Yellow', 'Brown', 'Green']

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cx_neigh_clustb['latitude'], cx_neigh_clustb['longitude'], cx_neigh_clustb['PostalCode'], cx_neigh_clustb['Cluster Labels']):
        rentAm = cx_venue_t5dft2.loc[poi]['RentAmount']
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster) + ' Rent Amt ' + str(rentAm), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=3,
            popup=label,
            color=colorsx[int(cluster)],
            fill=True,
            fill_color=colorsx[int(cluster)],
            fill_opacity=0.7).add_to(map_neighborhoods)
       
map_neighborhoods

### 4.5 Examine clusters of the 5 desired characteristics in city "X"

In [32]:
# Clusters
for clx in range (0, kclusters):
    cls = cx_neigh_clustb.loc[cx_neigh_clustb['Cluster Labels'] == clx, cx_neigh_clustb.columns[[1] + list(range(5, cx_neigh_clustb.shape[1]))]]
    print ('# of Neiborhoods in cluster ', clx + 1, '(', colorsx[int(clx)], ') : ' , cls.shape[0])   
    
for clx in range (0, kclusters):
    cls = cx_neigh_clustb.loc[cx_neigh_clustb['Cluster Labels'] == clx, cx_neigh_clustb.columns[[1] + list(range(5, cx_neigh_clustb.shape[1]))]]
    print(cls)

# of Neiborhoods in cluster  1 ( Red ) :  4
# of Neiborhoods in cluster  2 ( Blue ) :  5
# of Neiborhoods in cluster  3 ( Yellow ) :  3
# of Neiborhoods in cluster  4 ( Brown ) :  6
   PostalCode  Skate Park  State / Provincial Park  Tennis Court  \
2       10009         0.0                      0.0           0.0   
9       10029         0.0                      0.0           0.0   
15      10016         0.0                      0.0           0.0   
17      10036         0.0                      0.0           0.0   

    Tennis Stadium  Theme Park Ride / Attraction  Yoga Studio  TotRanking  \
2              0.0                           0.0          0.0    0.066667   
9              0.0                           0.0          0.0    0.025641   
15             0.0                           0.0          0.0    0.010000   
17             0.0                           0.0          0.0    0.010000   

    RentAmount   latitude  longitude  
2       3416.0  40.726752 -73.973799  
9       3150.

#### ------------ END of NOTEBOOK ------------