### Import necessary libraries

In [2]:
import pandas as pd
import numpy as np
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you didn't have this library
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be 

### Since the Nashville list below has only 20 neighborhoods, I will just type the neighborhoods by hand into a list

In [3]:
# Hand-typed list of the Nashville neighborhoods used throughout the notebook.
nashville_neibors = [
    'Antioch',
    'Bellevue',
    'Donelson',
    'East Nashville',
    'Germantown',
    'Green Hills',
    'The Gulch',
    'Hermitage',
    'Hillsboro Village',
    'Inglewood',
    'Joelton',
    'Lakewood',
    'Lockeland Springs',
    'Madison',
    'Old Hickory',
    'Pasquo',
    'Tusculum',
    'Woodbine',
    'Whites Creek',
    'West Nashville',
]

### Use the geopy library to find the latitude and longitude of each neighborhood in Nashville and store them in a dataframe

In [4]:
# Geocode every Nashville neighborhood into parallel latitude/longitude lists.
# The lists are re-created here so re-running the cell does not accumulate duplicates.
latitudes = []
longitudes = []

# Build the geocoder once and reuse it; it does not depend on the loop variable.
geolocator = Nominatim(user_agent='ny_explorer')

for nei in nashville_neibors:
    address = nei + ' ,Nashville, USA'
    location = geolocator.geocode(address)
    if location is None:
        # geocode() returns None when nothing matches; fail with the offending
        # address instead of an opaque AttributeError on `.latitude`.
        raise ValueError('Could not geocode address: {!r}'.format(address))
    latitudes.append(location.latitude)
    longitudes.append(location.longitude)

In [5]:
# Assemble the neighborhood names and their geocoded coordinates into one table.
nashville_data = pd.DataFrame({
    'Neighborhood': nashville_neibors,
    'Latitude': latitudes,
    'Longitude': longitudes,
})

In [6]:
# Preview the first rows of the neighborhood / coordinate table.
nashville_data.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Antioch,36.06006,-86.672219
1,Bellevue,36.064782,-86.939446
2,Donelson,36.162557,-86.669997
3,East Nashville,36.172556,-86.759721
4,Germantown,36.279498,-86.873611


### Foursquare client_id and client_secret (hidden by IBM Cloud)

In [7]:
# The code was removed by Watson Studio for sharing.

### Use the Foursquare API to get the nearby venues in each neighborhood and store them in a dataframe

In [8]:
# Value sent as the `limit` query parameter of every Foursquare explore request
# made by getNearbyVenues.
LIMIT = 100

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint around each neighborhood.

    Relies on the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and
    ``LIMIT`` values and prints each neighborhood name as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences: neighborhood name and the coordinates to search around.
    radius : int, default 500
        Forwarded as the ``radius`` query parameter of the request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        ``Venue Category`` is ``None`` for a venue that has no category.
        The frame is empty (but still has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string, and never wait forever.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface API failures as an HTTP error instead of a KeyError further down.
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # some venues carry no category at all
                categories[0]['name'] if categories else None))

    # Passing the columns up front keeps the schema intact even with zero rows.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [10]:
# Since Nashville is a small place, we will use a large radius to get more venues

# Fetch venues around every Nashville neighborhood with a 3000 radius
# instead of the function's default of 500.
nashville_venues = getNearbyVenues(
    names=nashville_data['Neighborhood'],
    latitudes=nashville_data['Latitude'],
    longitudes=nashville_data['Longitude'],
    radius=3000,
)


Antioch
Bellevue
Donelson
East Nashville
Germantown
Green Hills
The Gulch
Hermitage
Hillsboro Village
Inglewood
Joelton
Lakewood
Lockeland Springs
Madison
Old Hickory
Pasquo
Tusculum
Woodbine
Whites Creek
West Nashville


In [11]:
# (rows, columns) of the Nashville venue table: one row per venue returned.
nashville_venues.shape

(1289, 7)

### Create a set containing all the unique venue categories in Nashville

In [12]:
# Distinct values of the 'Venue Category' column, stored as a set so they can
# later be intersected with the Manhattan categories.
venues_kinds_nashville = set(nashville_venues['Venue Category'].unique())

In [13]:
# Number of distinct venue categories found in Nashville.
len(venues_kinds_nashville)

201

### Get the New York data

In [14]:
# Download the New York neighborhood dataset to 'newyork_data.json'.
# Done in Python rather than `!wget -q` so that a failed download raises
# instead of silently printing the success message, and so the cell does not
# depend on wget being installed.
NEWYORK_DATA_URL = 'https://cocl.us/new_york_dataset'

response = requests.get(NEWYORK_DATA_URL, timeout=60)
response.raise_for_status()  # stop here on any HTTP error status
with open('newyork_data.json', 'wb') as json_file:
    json_file.write(response.content)

print('Data downloaded!')

Data downloaded!


In [15]:
# Parse the downloaded file into a Python object.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open('newyork_data.json', encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

In [16]:
# Column layout of the New York neighborhood table.
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Start from an empty table that already has the target columns.
neighborhoods_ny = pd.DataFrame(columns=column_names)

# Each entry under 'features' describes one neighborhood.
neighborhoods_data = newyork_data['features']

In [17]:
# Collect one record per feature and build the dataframe in a single step.
# Growing a DataFrame with .append() inside the loop copies the whole frame on
# every iteration, is unavailable in pandas >= 2.0, and duplicates the rows
# whenever the cell is re-run.
neighborhood_records = []

for data in neighborhoods_data:
    borough = data['properties']['borough']
    neighborhood_name = data['properties']['name']

    # coordinates are read as [longitude, latitude]
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]

    neighborhood_records.append({'Borough': borough,
                                 'Neighborhood': neighborhood_name,
                                 'Latitude': neighborhood_lat,
                                 'Longitude': neighborhood_lon})

neighborhoods_ny = pd.DataFrame(neighborhood_records, columns=column_names)

In [18]:
# (rows, columns) of the New York table: one row per neighborhood feature.
neighborhoods_ny.shape

(306, 4)

### Since there are too many neighborhoods in New York, we only use part of it: the Manhattan borough

In [19]:
# Keep only the rows whose borough is Manhattan and renumber them from zero.
is_manhattan = neighborhoods_ny['Borough'] == 'Manhattan'
manhattan_data = neighborhoods_ny[is_manhattan].reset_index(drop=True)
manhattan_data.shape

(40, 4)

### Get the manhattan venues

In [20]:
# Fetch venues around every Manhattan neighborhood using the function's
# default search radius.
manhattan_venues = getNearbyVenues(
    names=manhattan_data['Neighborhood'],
    latitudes=manhattan_data['Latitude'],
    longitudes=manhattan_data['Longitude'],
)

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


### Get the venue categories in Manhattan

In [21]:
# Distinct values of the 'Venue Category' column for Manhattan, stored as a set
# so they can be intersected with the Nashville categories.
venues_kinds_manhattan = set(manhattan_venues['Venue Category'].unique())

### Get the venue categories common to Manhattan and Nashville, and remove the rows in the Manhattan and Nashville data whose category is not in that common set

In [22]:
# Venue categories that occur in both cities.
common_venues = venues_kinds_manhattan & venues_kinds_nashville

In [23]:
# Number of venue categories shared by Manhattan and Nashville.
len(common_venues)

166

In [24]:
# Boolean mask (one flag per Manhattan venue row): True when the row's
# category is one of the categories shared by both cities.
a = [category in common_venues for category in manhattan_venues['Venue Category']]

In [25]:
# Boolean mask (one flag per Nashville venue row): True when the row's
# category is one of the categories shared by both cities.
b = [category in common_venues for category in nashville_venues['Venue Category']]

In [26]:
# Keep only the rows flagged by the two masks, then stack Manhattan on top of
# Nashville (each part keeps its original row index).
manhattan_common = manhattan_venues[a]
nashville_common = nashville_venues[b]
all_venues_df = pd.concat([manhattan_common, nashville_common])

In [27]:
# (rows, columns) of the combined table of venues in shared categories.
all_venues_df.shape

(3960, 7)

### Next we will do the clustering using the K-means algorithm, but first we need to do some preprocessing of the dataset

In [29]:
# one hot encoding
all_onehot = pd.get_dummies(all_venues_df[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
all_onehot['Neighborhood'] = all_venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [all_onehot.columns[-1]] + list(all_onehot.columns[:-1])
all_onehot = all_onehot[fixed_columns]

all_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Antique Shop,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Auto Workshop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Average the indicator columns per neighborhood, i.e. the share of each
# neighborhood's venues that falls into every category; 'Neighborhood' stays a
# regular column.
all_grouped = all_onehot.groupby('Neighborhood', as_index=False).mean()
all_grouped

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Antique Shop,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Auto Workshop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Antioch,0.0,0.0,0.0,0.014706,0.0,0.0,0.0,0.029412,0.0,...,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.014706,0.0,0.0
1,Battery Park City,0.0,0.013514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.040541,0.0,0.040541,0.0
2,Bellevue,0.0,0.010526,0.0,0.0,0.0,0.0,0.021053,0.0,0.0,...,0.0,0.010526,0.021053,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Carnegie Hill,0.0,0.010989,0.0,0.0,0.0,0.010989,0.0,0.0,0.0,...,0.010989,0.0,0.0,0.021978,0.0,0.010989,0.032967,0.0,0.010989,0.032967
4,Central Harlem,0.0,0.054054,0.0,0.0,0.027027,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Chelsea,0.0,0.037037,0.0,0.0,0.024691,0.0,0.0,0.012346,0.0,...,0.012346,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,0.012346,0.012346
6,Chinatown,0.0,0.053333,0.0,0.0,0.0,0.0,0.0,0.026667,0.0,...,0.013333,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.013333
7,Civic Center,0.0,0.037975,0.012658,0.0,0.0,0.0,0.0,0.012658,0.0,...,0.0,0.0,0.0,0.0,0.0,0.025316,0.025316,0.0,0.0,0.037975
8,Clinton,0.0,0.04878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.012195,0.0,0.0,0.0,0.0,0.02439,0.036585,0.0,0.0,0.0
9,Donelson,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Print the top 6 venue categories of each neighborhood for comparison after clustering

In [31]:
# How many of the most frequent venue categories to print per neighborhood.
num_top_venues = 6

for hood in all_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Select this neighborhood's row and transpose it so that each former column
    # becomes one row of a two-column (name, value) table.
    temp = all_grouped[all_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Skip the first row of the transposed table; presumably this is the
    # 'Neighborhood' label itself, since that column comes first in all_grouped.
    temp = temp.iloc[1:]
    # Convert to float so the values can be rounded and sorted numerically.
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    # Highest frequencies first, renumbered from 0, limited to num_top_venues rows.
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Antioch----
                  venue  freq
0        Discount Store  0.07
1  Fast Food Restaurant  0.07
2           Video Store  0.06
3     Convenience Store  0.06
4           Pizza Place  0.04
5           Gas Station  0.04


----Battery Park City----
           venue  freq
0           Park  0.09
1          Hotel  0.07
2    Coffee Shop  0.07
3            Gym  0.05
4  Women's Store  0.04
5      Wine Shop  0.04


----Bellevue----
                  venue  freq
0    Mexican Restaurant  0.06
1        Sandwich Place  0.05
2        Ice Cream Shop  0.05
3             Pet Store  0.04
4  Fast Food Restaurant  0.04
5           Pizza Place  0.04


----Carnegie Hill----
                 venue  freq
0          Coffee Shop  0.08
1          Pizza Place  0.05
2       Cosmetics Shop  0.04
3          Yoga Studio  0.03
4        Grocery Store  0.03
5  Japanese Restaurant  0.03


----Central Harlem----
                 venue  freq
0    French Restaurant  0.05
1                  Bar  0.05
2   Chinese Resta