# Capstone for Coursera's IBM professional certificate in data science
## Final Project - Location Similarities
*If I like where I'm living now, but need to move, where should I look in the new city?*

In [3]:
import os 
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for ``warnings.warn``."""
    return None


# route every call to warnings.warn through the no-op above
warnings.warn = warn

### Get source zipcode latitude & longitude 

In [4]:
# source zipcode: the location whose venue profile is compared against the
# candidate destination zipcodes later in the notebook
zipcode = '19096'

In [5]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Nominatim's usage policy requires an identifying user agent, and recent
# geopy releases refuse to build the geocoder without one.
geolocator = Nominatim(user_agent='location_similarities_capstone')
location = geolocator.geocode(zipcode)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode zipcode {}'.format(zipcode))

# coordinates of the source zipcode, reused by the venue query and the map
s_lat = location.latitude
s_lng = location.longitude
print('The geograpical coordinate of zipcode' , zipcode, 'are ({}, {}).'.format(s_lat, s_lng))

The geograpical coordinate of zipcode 19096 are (39.9972289747623, -75.2737817252265).


### Get source zipcode features

[![Zillow](https://www.zillowstatic.com/vstatic/7de9b24/static/logos/Zillow_Logo_HoodsProvided_RightAligned.gif "Zillow")](https://www.zillow.com/howto/api/neighborhood-boundaries.htm)

**Set up FourSquare API**

In [6]:
import requests
import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [7]:
# Credentials are issued at https://foursquare.com/developers/apps
# They are read from the environment so the secret is never saved in the
# notebook; set FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET before running.
# NOTE(review): the key pair previously committed here should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'your_ID_here') # Foursquare client ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'your_secret_here') # Foursquare client secret
VERSION = '20180605' # Foursquare API version
RADIUS = 1600 # radius in meters
LIMIT = 100 # maximum number of venues requested per call

***
**Pull Data**  
FourSquare data comes in JSON format
***

In [8]:
# get the venues around the source zipcode
# (one "explore" request: up to LIMIT venues within RADIUS meters of the centroid)
url = 'https://api.foursquare.com/v2/venues/explore'
params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'll': '{},{}'.format(s_lat, s_lng),
    'radius': RADIUS,
    'limit': LIMIT,
}

# make the GET request; a timeout keeps the cell from hanging and
# raise_for_status surfaces auth/quota errors instead of a KeyError below
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
r = response.json()["response"]['groups'][0]['items']

# keep only the relevant information for each nearby venue;
# venues without any category are skipped because the category is the feature
source_rows = [(
    zipcode,
    s_lat,
    s_lng,
    v['venue']['name'],
    v['venue']['location']['distance'],
    v['venue']['location']['lat'],
    v['venue']['location']['lng'],
    v['venue']['categories'][0]['name']) for v in r if v['venue']['categories']]

# naming the columns at construction keeps an empty result well-formed
source_venues = pd.DataFrame(source_rows,
                             columns=['Zipcode',
                                      'Zipcode Latitude',
                                      'Zipcode Longitude',
                                      'Venue',
                                      'Distance',
                                      'Venue Latitude',
                                      'Venue Longitude',
                                      'Venue Category'])

source_venues.head()

Unnamed: 0,Zipcode,Zipcode Latitude,Zipcode Longitude,Venue,Distance,Venue Latitude,Venue Longitude,Venue Category
0,19096,39.997229,-75.273782,First Watch,569,40.002315,-75.274486,Breakfast Spot
1,19096,39.997229,-75.273782,Whole Foods Market,551,40.001985,-75.275576,Grocery Store
2,19096,39.997229,-75.273782,Sabrina's Cafe,498,40.001682,-75.273213,Café
3,19096,39.997229,-75.273782,DSW Designer Shoe Warehouse,538,40.001706,-75.271398,Shoe Store
4,19096,39.997229,-75.273782,Anthony's Coal Fired Pizza,410,40.000839,-75.272799,Pizza Place


In [22]:
import folium # map rendering library

# base map centred on the source zipcode
map_venues = folium.Map(location=[s_lat, s_lng], zoom_start=13)

# one purple circle per venue, with a "category,name" popup
marker_fields = source_venues[['Venue Latitude',
                               'Venue Longitude',
                               'Venue',
                               'Venue Category']].values.tolist()
for venue_lat, venue_lng, venue_name, venue_category in marker_fields:
    popup = folium.Popup('{},{}'.format(venue_category, venue_name), parse_html=True)
    marker = folium.CircleMarker(
        [venue_lat, venue_lng],
        radius=5,
        popup=popup,
        color='purple',
        fill=True,
        fill_color='purple',
        fill_opacity=0.7)
    marker.add_to(map_venues)

# show map
map_venues

In [10]:
# distance-weighted categories:
# 'Distance' is re-expressed as distance from the 'edge' (1600m radius from centroid),
# so lower numbers are 'worse' because the venue is farther away;
# this means that filling NaN with 0 is valid (an absent category scores like the edge)
#
# work on a copy: aliasing source_venues would flip its 'Distance' column in place,
# and flip it again every time this cell is re-run
source_features = source_venues.copy()
source_features['Distance'] = 1600-source_features['Distance']
source_features['Zipcode'] = zipcode
# one row for the source zipcode, one column per venue category (mean weighted distance)
source_features = pd.pivot_table(source_features,
                                 values = 'Distance',
                                 index = 'Zipcode',
                                 columns= 'Venue Category',
                                 aggfunc = np.mean
                                )
source_features.head()

Venue Category,American Restaurant,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,Beer Garden,Breakfast Spot,Bus Stop,Café,...,Pizza Place,Playground,Plaza,Pub,Salon / Barbershop,Shoe Store,Thai Restaurant,Train Station,Video Game Store,Vietnamese Restaurant
Zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19096,75.0,962.0,342.0,239.0,284.0,123.0,133.0,1031.0,261.0,1102.0,...,472.0,203.0,220.0,120.0,526.0,1062.0,148.0,231.0,1072.0,267.0


***
### Identify potential destination zip codes from Zillow

NOTE TO SELF:
We do require that you provide attribution to Zillow by including the Zillow logo provided here on any page on which the Zillow data itself is used. The logo must link back to this Zillow page. 
https://www.zillowstatic.com/vstatic/7de9b24/static/logos/Zillow_Logo_HoodsProvided_RightAligned.gif 
https://www.zillow.com/howto/api/neighborhood-boundaries.htm

[![Zillow](https://www.zillowstatic.com/vstatic/7de9b24/static/logos/Zillow_Logo_HoodsProvided_RightAligned.gif "Zillow")](https://www.zillow.com/howto/api/neighborhood-boundaries.htm)

***
**Pull Data**  
Zillow data comes in XML format
***

In [11]:
import requests
import bs4 
#from xml.etree import ElementTree as et

In [12]:
# Zillow web-service ID, issued at https://www.zillow.com/webservice/Registration.htm
# Read from the environment (ZWSID) so the key is not stored in the notebook;
# the placeholder is only a reminder and will not authenticate.
ZWSID = os.environ.get('ZWSID', 'your_id_here')
CHILDTYPE = 'zipcode' # granularity of the regions requested from GetRegionChildren

In [13]:
def get_county_data(state, county):
    """
    Query Zillow's GetRegionChildren API for one county and return its child regions.

    Parameters
    ----------
    state : str
        State passed to the API (e.g. 'PA').
    county : str
        County name passed to the API; also stored in the 'County' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'ZillowID', 'Zipcode', 'Latitude', 'Longitude', 'County',
        one row per child region. Values are the text of the XML elements
        (strings, not numbers).

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    # let requests build the query string so names with spaces or
    # punctuation are escaped correctly
    response = requests.get(
        'http://www.zillow.com/webservice/GetRegionChildren.htm',
        params={'zws-id': ZWSID,
                'state': state,
                'county': county,
                'childtype': CHILDTYPE},
        timeout=30)
    response.raise_for_status()

    # parse xml with beautifulsoup
    soup = bs4.BeautifulSoup(response.content, 'lxml-xml')

    # extract into a list of "regions"
    regions = soup.find_all('region')

    # the first <region> is skipped -- presumably it describes the county
    # itself rather than a child zipcode; verify against a live response
    rows = [{'ZillowID': region.find('id').string,
             'Zipcode': region.find('name').string,
             'Latitude': region.find('latitude').string,
             'Longitude': region.find('longitude').string,
             'County': county}
            for region in regions[1:]]

    # explicit columns keep the frame well-formed even with no child regions
    return pd.DataFrame(rows, columns=['ZillowID',
                                       'Zipcode',
                                       'Latitude',
                                       'Longitude',
                                       'County'])

In [14]:
# pull the zipcode-level regions for the two Pennsylvania counties under consideration
montgomery_df = get_county_data('PA', 'Montgomery')
delaware_df = get_county_data('PA', 'Delaware')

In [15]:
# stack both counties and keep the first occurrence of every zipcode
df = pd.concat([montgomery_df, delaware_df], axis=0).drop_duplicates(subset=['Zipcode'])

# quick sanity check: size, first rows, last rows
print(df.shape, df.head(), df.tail(), sep='\n')

(142, 5)
  ZillowID Zipcode   Latitude   Longitude      County
0    65933   19446   40.22151    -75.2959  Montgomery
1    65943   19464  40.260818   -75.61634  Montgomery
2    65905   19403  40.153779   -75.38402  Montgomery
3    65941   19460  40.123997  -75.521969  Montgomery
4    65904   19401  40.131831  -75.331942  Montgomery
   ZillowID Zipcode   Latitude   Longitude    County
44    65761   19091  39.872803   -75.43204  Delaware
45    65767   19098       39.9      -75.31  Delaware
46    65781   19113   39.87224  -75.281976  Delaware
47    65864   19339  39.882787  -75.545897  Delaware
48    65865   19340  39.882787  -75.545897  Delaware


In [16]:
# rich preview of the combined destination zipcode table
df.head()

Unnamed: 0,ZillowID,Zipcode,Latitude,Longitude,County
0,65933,19446,40.22151,-75.2959,Montgomery
1,65943,19464,40.260818,-75.61634,Montgomery
2,65905,19403,40.153779,-75.38402,Montgomery
3,65941,19460,40.123997,-75.521969,Montgomery
4,65904,19401,40.131831,-75.331942,Montgomery


### Get Destination Zipcode Features

In [17]:
# Credentials are issued at https://foursquare.com/developers/apps
# Read from the environment so a top-to-bottom run uses real keys rather than
# hitting the API with the literal placeholders, and so no secret is stored here.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'your_ID_here') # Foursquare client ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'your_secret_here') # Foursquare client secret
VERSION = '20180605' # Foursquare API version
RADIUS = 1600 # radius in meters
LIMIT = 100 # maximum number of venues requested per call

In [18]:
# create a function to get all venues for each neighborhood
def get_nearby_venues(zipcodes, latitudes, longitudes, RADIUS=1600):
    """
    Fetch Foursquare "explore" venues around each zipcode centroid.

    Makes one API request per (zipcode, latitude, longitude) triple, asking
    for up to LIMIT venues within RADIUS meters of the given coordinate.

    Parameters
    ----------
    zipcodes, latitudes, longitudes : iterables of equal length
        Zipcode label and centroid coordinate for each location.
    RADIUS : int, optional
        Search radius in meters (default 1600).

    Returns
    -------
    pandas.DataFrame
        One row per venue with columns 'Zipcode', 'Zipcode Latitude',
        'Zipcode Longitude', 'Venue', 'Distance', 'Venue Latitude',
        'Venue Longitude', 'Venue Category'. Empty (but with these columns)
        when no venues are returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Zipcode',
               'Zipcode Latitude',
               'Zipcode Longitude',
               'Venue',
               'Distance',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for zcode, lat, lng in zip(zipcodes, latitudes, longitudes):
        # make the GET request; the timeout stops one slow call from hanging
        # the whole batch and raise_for_status surfaces auth/quota errors
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={'client_id': CLIENT_ID,
                    'client_secret': CLIENT_SECRET,
                    'v': VERSION,
                    'll': '{},{}'.format(lat, lng),
                    'radius': RADIUS,
                    'limit': LIMIT},
            timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue;
        # venues without any category are skipped because the category is the feature
        rows.extend((
            zcode,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['distance'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in items if v['venue']['categories'])

    # naming the columns at construction keeps an empty result well-formed
    # (assigning .columns afterwards fails when there are no rows)
    return pd.DataFrame(rows, columns=columns)

***
#### WARNING: POTENTIALLY LARGE CALL VOLUME
**Running the next cells makes one FourSquare API call for every zip code listed**  
*Please make sure this will not cause a rate limit or call limit error!*
***

In [19]:
# just so you know, the full run makes one FourSquare API call per row of df
# (each call can return up to LIMIT venues)
print('API will make', df.shape[0], 'calls to the FourSquare API.')

API will make 142 calls to the FourSquare API.


In [21]:
# fetch venues around every candidate zipcode (one API call per row of df)
dest_venues = get_nearby_venues(df['Zipcode'], df['Latitude'], df['Longitude'])

# quick sanity check: size, first rows, last rows
print(dest_venues.shape, dest_venues.head(), dest_venues.tail(), sep='\n')

(5095, 8)
  Zipcode Zipcode Latitude Zipcode Longitude                     Venue  \
0   19446         40.22151          -75.2959              CVS pharmacy   
1   19446         40.22151          -75.2959  West Point Gulf and Deli   
2   19446         40.22151          -75.2959             Primo Hoagies   
3   19446         40.22151          -75.2959             Parkside park   
4   19446         40.22151          -75.2959                   Dunkin'   

   Distance  Venue Latitude  Venue Longitude  Venue Category  
0       536       40.221558       -75.302210        Pharmacy  
1       551       40.220893       -75.302343   Deli / Bodega  
2      1344       40.227633       -75.282259  Sandwich Place  
3      1399       40.211075       -75.286724            Park  
4       461       40.221310       -75.301319      Donut Shop  
     Zipcode Zipcode Latitude Zipcode Longitude                   Venue  \
5090   19340        39.882787        -75.545897           Pearle Vision   
5091   19340     

**Analyze Destination Zipcodes**

In [23]:
# preview the venues with 'Distance' re-expressed as distance from the 1600m edge
dest_features = dest_venues.assign(Distance=1600 - dest_venues['Distance'])
dest_features.head()

Unnamed: 0,Zipcode,Zipcode Latitude,Zipcode Longitude,Venue,Distance,Venue Latitude,Venue Longitude,Venue Category
0,19446,40.22151,-75.2959,CVS pharmacy,1064,40.221558,-75.30221,Pharmacy
1,19446,40.22151,-75.2959,West Point Gulf and Deli,1049,40.220893,-75.302343,Deli / Bodega
2,19446,40.22151,-75.2959,Primo Hoagies,256,40.227633,-75.282259,Sandwich Place
3,19446,40.22151,-75.2959,Parkside park,201,40.211075,-75.286724,Park
4,19446,40.22151,-75.2959,Dunkin',1139,40.22131,-75.301319,Donut Shop


In [24]:
# distance-weighted categories:
# 'Distance' becomes the distance from the 'edge' (1600m radius from centroid),
# so a small value means the venue is far from the centroid;
# a category with no venue is therefore scored 0, the same as the edge
dest_features = (
    dest_venues
    .assign(Distance=lambda venues: 1600 - venues['Distance'])
    .pivot_table(values='Distance',
                 index='Zipcode',
                 columns='Venue Category',
                 aggfunc=np.mean,
                 fill_value=0)
)
dest_features.head()

Venue Category,Accessories Store,Airport,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,...,Vietnamese Restaurant,Warehouse Store,Weight Loss Center,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo
Zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18041,0,0,0.0,0,0.0,0,0,0,0,0.0,...,0.0,0,0,0,0,0,1139.0,0.0,0,0
18054,0,0,0.0,0,0.0,0,0,0,0,0.0,...,0.0,0,0,0,0,0,0.0,0.0,0,0
18070,0,0,0.0,0,0.0,604,0,0,0,0.0,...,0.0,0,0,0,0,0,0.0,0.0,0,0
18073,0,0,0.0,0,0.0,0,0,0,0,0.0,...,0.0,0,0,0,0,0,0.0,0.0,0,0
18074,0,0,0.0,0,0.0,0,0,0,0,0.0,...,0.0,0,0,0,0,0,0.0,0.0,0,0


***
### Find Similar Destination Locations

In [25]:
# give the source vector exactly the destination columns, in the same order:
# categories the source lacks become 0, categories unknown to the destinations are dropped
source_features = source_features.reindex(columns=dest_features.columns).fillna(0)

print(source_features.shape, dest_features.shape, sep='\n')

(1, 307)
(142, 307)


In [26]:
from sklearn.neighbors import NearestNeighbors

# number of destination zipcodes to return
k_neighbors = 10

# nearest neighbours under cosine distance: this compares the orientation of
# the feature vectors, i.e. how the categories relate to one another,
# rather than their overall magnitude
cos_nn = NearestNeighbors(n_neighbors=k_neighbors, metric='cosine').fit(dest_features)

# row positions (within dest_features) of the k closest zipcodes
cos_neighbors = cos_nn.kneighbors(source_features, return_distance=False)

In [27]:
# row 0 holds the row positions in dest_features of the nearest neighbors
print(cos_neighbors)

[[ 84  74  34  90  91  72  71  10 123  41]]


In [28]:
# translate neighbour row positions into their zipcode labels
# (dest_features is indexed by 'Zipcode')
cos_zips = [dest_features.index[position] for position in cos_neighbors[0].tolist()]

cos_zips

['19096',
 '19083',
 '19025',
 '19339',
 '19340',
 '19081',
 '19080',
 '18936',
 '19454',
 '19033']

***
### Review Top Zipcodes

In [29]:
# selected zipcodes side by side: one column per zipcode, one row per category,
# keeping only categories that at least one of them has
summary = (
    dest_features
    .reset_index()
    .loc[lambda frame: frame['Zipcode'].isin(cos_zips)]
    .replace(0, np.nan)
    .dropna(axis=1, how="all")
    .transpose()
)
summary

Unnamed: 0_level_0,10,34,41,71,72,74,84,90,91,123
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Zipcode,18936,19025,19033,19080,19081,19083,19096,19339,19340,19454
American Restaurant,909.667,,804.333,1074,,684,286,,,317
Art Gallery,,,,1001,,,,,,
Arts & Crafts Store,331,,,,,,,,,776.5
Asian Restaurant,,1178,,1213.5,,,904,,,3
Athletics & Sports,,,,,,,286,,,
BBQ Joint,516,,,,,,,,,
Bagel Shop,,,,,,752,1205,,,48
Bakery,953,,910,1207.75,,276,568.333,,,
Bank,,1088,885.25,1012.33,1015,627.25,757,,,


In [30]:
# reminder of the zipcode table layout (coordinates are stored as strings)
df.head()

Unnamed: 0,ZillowID,Zipcode,Latitude,Longitude,County
0,65933,19446,40.22151,-75.2959,Montgomery
1,65943,19464,40.260818,-75.61634,Montgomery
2,65905,19403,40.153779,-75.38402,Montgomery
3,65941,19460,40.123997,-75.521969,Montgomery
4,65904,19401,40.131831,-75.331942,Montgomery


In [31]:
# record the folium version used to render the maps
import folium
folium.__version__

'0.8.3'

In [32]:
import folium # map rendering library

# centre the map on the mean coordinate of the selected destination zipcodes
# (df stores coordinates as strings, hence the cast)
center_lat, center_lng = df[df['Zipcode'].isin(cos_zips)][['Latitude','Longitude']].astype('float').mean()

# base map with a semi-transparent tile layer so the markers stand out
map_dest = folium.Map(location=[center_lat, center_lng], zoom_start=10)
folium.TileLayer(opacity=0.5).add_to(map_dest)

# one rainbow colour per selected zipcode
colors_array = cm.rainbow(np.linspace(0, 1, len(cos_zips)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# the loop variable is deliberately NOT called `zipcode`: that name holds the
# source zipcode used by earlier cells and must survive this cell
for idx, dest_zip in enumerate(cos_zips):
    zip_color = rainbow[idx]

    # add venue markers to map (small circles, "name,category" popup)
    zip_venues = dest_venues[dest_venues['Zipcode'] == dest_zip]
    for lat, lng, name, categories in zip(zip_venues['Venue Latitude'].astype('float'),
                                          zip_venues['Venue Longitude'].astype('float'),
                                          zip_venues['Venue'],
                                          zip_venues['Venue Category']):
        label = '{},{}'.format(name, categories)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=3,
            popup=label,
            color=zip_color,
            fill=True,
            fill_color=zip_color,
            fill_opacity=0.7).add_to(map_dest)

    # add destination zip code centers to map (larger circle, black fill)
    zip_centers = df[df['Zipcode'] == dest_zip]
    for lat, lng in zip(zip_centers['Latitude'].astype('float'),
                        zip_centers['Longitude'].astype('float')):
        label = '{}'.format(dest_zip)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=7,
            popup=label,
            color=zip_color,
            fill=True,
            fill_color='black',
            fill_opacity=0.7
        ).add_to(map_dest)

# show map
map_dest