# The Battle of Neighborhoods - Week 2

### 1. Download spatial data for NYC from https://geo.nyu.edu/catalog/nyu_2451_34572

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [2]:
import requests
neighborhood_json = requests.get("https://cocl.us/new_york_dataset").json()

In [3]:
neighborhood_data = neighborhood_json['features']
neighborhoods = pd.DataFrame(columns = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'])
for data in neighborhood_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [4]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


#### make sure the dataset has 5 boroughs and 306 neighborhoods

In [5]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


#### Use geopy library to get the latitude and longitude values of New York City.

In [6]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


#### Create a map of New York with neighborhoods superimposed on top

In [7]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

### 2. Population 

In [8]:
from bs4 import BeautifulSoup

In [9]:
nyc_population_text = requests.get('https://en.wikipedia.org/wiki/New_York_City').content
soup = BeautifulSoup(nyc_population_text,'html.parser')

In [10]:
table = soup.find('table', {'class' : 'wikitable sortable'})

#get last 9 columns for county specific headers
header = ['Borough', 'County', 'Population Estimate (2018)', 'GDP billions(US$)', 
           'GDP per capita(US$)', 'Land Area square miles', 'Land Area square km', 
           'persons/sq. mi', 'persons/km2']

population_df = pd.DataFrame(columns = header)
rows = []
myrows = table.find_all('tr')
for row in myrows:
    td = row.find_all('td')
    myrow = [row.text.strip() for row in td]
    #anything less than 9 columns is not for borough specific data
    if not myrow or len(myrow) < 9:
        continue
        
    rows.append(pd.Series(myrow, index = header))

population_df = population_df.append(rows, ignore_index=True)
population_df.head()
    

Unnamed: 0,Borough,County,Population Estimate (2018),GDP billions(US$),GDP per capita(US$),Land Area square miles,Land Area square km,persons/sq. mi,persons/km2
0,The Bronx,Bronx,1432132,42.695,29200,42.1,109.04,34653,13231
1,Brooklyn,Kings,2582830,91.559,34600,70.82,183.42,37137,14649
2,Manhattan,New York,1628701,600.244,360600,22.83,59.13,72033,27826
3,Queens,Queens,2278906,93.31,39600,108.53,281.09,21460,8354
4,Staten Island,Richmond,476179,14.514,30300,58.37,151.18,8112,3132


In [11]:
#select columns we are interested in
header = [ 'Borough', 'County', 'Population Estimate (2018)', "persons/sq. mi"]
population_df = population_df[header]

In [12]:
population_df.head()

Unnamed: 0,Borough,County,Population Estimate (2018),persons/sq. mi
0,The Bronx,Bronx,1432132,34653
1,Brooklyn,Kings,2582830,37137
2,Manhattan,New York,1628701,72033
3,Queens,Queens,2278906,21460
4,Staten Island,Richmond,476179,8112


### 3. Population Demographics

In [13]:
demographics_text = requests.get('https://en.wikipedia.org/wiki/New_York_City').content
soup = BeautifulSoup(demographics_text, 'html.parser')
table = soup.find(text="Racial composition").find_parent("table")

In [14]:
def clean(x):
    if x:
        index = x.find('[')
        if index > 0:
            x = x[:index]
    return x.strip()

headers = [ 'Racial composition', '2010', '1990', '1970', '1940']

demographics_df = pd.DataFrame(columns = headers)
rows = []
myrows = table.find_all('tr')
for row in myrows:
    td = row.find_all('td')
    myrow = [clean(row.text) for row in td]
    #anything less than 5 columns?
    if not myrow or len(myrow) < 5:
        continue

    rows.append(pd.Series(myrow, index = headers))

demographics_df = demographics_df.append(rows, ignore_index=True)
demographics_df.head()

Unnamed: 0,Racial composition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%
1,—Non-Hispanic,33.3%,43.2%,62.9%,92.0%
2,Black or African American,25.5%,28.7%,21.1%,6.1%
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%,1.6%
4,Asian,12.7%,7.0%,1.2%,−


### 4. Population Age

In [15]:
age_text = requests.get('https://www.baruch.cuny.edu/nycdata/population-geography/age_distribution.htm').content
soup = BeautifulSoup(age_text, 'html.parser')

In [16]:
table = soup.find(text="New York City (NYC)").find_parent("table")

In [17]:
headers = [ 'Age', 'NYC', 'Manhattan', 'The Bronx', 'Brooklyn', 'Queens', 'Staten Island']

age_df = pd.DataFrame(columns = headers)
rows = []
myrows = table.find_all('tr')
for i, row in enumerate(myrows):
    if i < 10: #skip headers
        continue
    td = row.find_all('td')
    myrow = [ clean(row.text) for row in td]
    if myrow[0] == '' and myrow[-1] == '':
        myrow = myrow[1:-1]
    if len(myrow) < 7:
        continue
    
    rows.append(pd.Series(myrow, index=headers))
    
age_df = age_df.append(rows, ignore_index=True)
age_df

Unnamed: 0,Age,NYC,Manhattan,The Bronx,Brooklyn,Queens,Staten Island
0,Under 5 years,6.4%,4.8%,7.2%,7.3%,6.2%,5.7%
1,5 to 9 years,5.5%,3.8%,6.8%,5.9%,5.4%,5.9%
2,10 to 14 years,5.6%,3.7%,6.8%,6.2%,5.4%,6.4%
3,15 to 19 years,5.3%,4.2%,6.8%,5.4%,5.1%,5.8%
4,20 to 24 years,6.7%,7.0%,7.7%,6.4%,6.1%,6.5%
5,25 to 29 years,9.4%,11.8%,8.7%,9.6%,8.5%,7.2%
6,30 to 34 years,8.6%,10.8%,7.4%,8.9%,7.9%,6.2%
7,35 to 39 years,7.3%,8.2%,6.4%,7.5%,7.4%,5.8%
8,40 to 44 years,6.4%,6.1%,6.2%,6.4%,6.4%,7.0%
9,45 to 49 years,6.4%,6.3%,6.3%,6.0%,6.8%,6.9%


### 5. Average income

In [18]:
income_text = requests.get('https://www.baruch.cuny.edu/nycdata/income-taxes/med_hhold_income.htm').content
soup = BeautifulSoup(income_text, 'html.parser')

In [19]:
table = soup.find(text="New York City (NYC)").find_parent("table")

In [20]:
headers = [ 'Borough', 'Median Income']

income_df = pd.DataFrame(columns = headers)
rows = []
myrows = table.find_all('tr')
for i, row in enumerate(myrows):  
    if i < 4:
        continue
        
    td = row.find_all('td')
    myrow = [ clean(row.text) for row in td]
    if myrow[0] == '' and myrow[-1] == '':
        myrow = myrow[1:-1]

    if len(myrow) < 2 or myrow[0].startswith(('New', 'United')):
        continue

    if myrow[0] == 'Bronx':
        myrow[0] = 'The Bronx'
        
    rows.append(pd.Series(myrow, index=headers))
    
income_df = income_df.append(rows, ignore_index=True)
income_df

Unnamed: 0,Borough,Median Income
0,The Bronx,"$37,397"
1,Brooklyn (Kings),"$56,942"
2,Manhattan,"$85,071"
3,Queens,"$64,509"
4,Staten Island (Richmond),"$79,201"


### manhattan/Brooklyn detail information using foursquare api

In [22]:
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [23]:
brooklyn_data = neighborhoods[neighborhoods['Borough'] == 'Brooklyn'].reset_index(drop=True)
brooklyn_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Brooklyn,Bay Ridge,40.625801,-74.030621
1,Brooklyn,Bensonhurst,40.611009,-73.99518
2,Brooklyn,Sunset Park,40.645103,-74.010316
3,Brooklyn,Greenpoint,40.730201,-73.954241
4,Brooklyn,Gravesend,40.59526,-73.973471


In [24]:
# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

In [25]:
# create map of Brooklyn using latitude and longitude values
map_brooklyn = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(brooklyn_data['Latitude'], brooklyn_data['Longitude'], brooklyn_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_brooklyn)  
    
map_brooklyn

### foursquare api

In [28]:
CLIENT_ID = 'TFEVPSAUIPZHQ1IQ110XGOCHEVUEN3TXS0EC3XO0ELI2JZ00' # your Foursquare ID
CLIENT_SECRET = 'CKS2E3PILK2IOOYULBA1RSX0RLH0FGVJFQZ4QGJN0R4RYIUV' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TFEVPSAUIPZHQ1IQ110XGOCHEVUEN3TXS0EC3XO0ELI2JZ00
CLIENT_SECRET:CKS2E3PILK2IOOYULBA1RSX0RLH0FGVJFQZ4QGJN0R4RYIUV


In [29]:
LIMIT = 100
radius = 500

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        response = requests.get(url).json()["response"]
        results = response['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [31]:
# get neighborhood information for manhattan and brooklyn
# find the venues in all Toronto Neighbohoods
combined_df = pd.concat([manhattan_data, brooklyn_data])
nearby_venues = getNearbyVenues(names=combined_df['Neighborhood'],
                                   latitudes=combined_df['Latitude'],
                                   longitudes=combined_df['Longitude']
                                  )

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker Heights
Gerritsen Beach
Marine

In [37]:
print (nearby_venues.shape)
print (nearby_venues.columns)
print (sorted(nearby_venues['Venue Category'].unique()))

(6148, 7)
Index(['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
       'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'],
      dtype='object')
['Accessories Store', 'Adult Boutique', 'Afghan Restaurant', 'African Restaurant', 'Airport Terminal', 'American Restaurant', 'Antique Shop', 'Arcade', 'Arepa Restaurant', 'Argentinian Restaurant', 'Art Gallery', 'Art Museum', 'Arts & Crafts Store', 'Arts & Entertainment', 'Asian Restaurant', 'Athletics & Sports', 'Auditorium', 'Australian Restaurant', 'Austrian Restaurant', 'Auto Garage', 'BBQ Joint', 'Baby Store', 'Bagel Shop', 'Bakery', 'Bank', 'Bar', 'Baseball Field', 'Baseball Stadium', 'Basketball Court', 'Beach', 'Bed & Breakfast', 'Beer Bar', 'Beer Garden', 'Beer Store', 'Big Box Store', 'Bike Rental / Bike Share', 'Bike Shop', 'Bike Trail', 'Bistro', 'Board Shop', 'Boat or Ferry', 'Bookstore', 'Boutique', 'Boxing Gym', 'Brazilian Restaurant', 'Breakfast Spot', 'Brewery', 'Bridal Shop', 'Bridge', 'Bubbl

In [40]:
has_competitor_neighborhood = set(nearby_venues[nearby_venues['Venue Category'].isin(('Japanese Restaurant', 'Sushi Restaurant'))]['Neighborhood'].unique())
all_neighborhood = set(nearby_venues['Neighborhood'].unique())
no_japanese_restaurant_neighhoorhood = all_neighborhood - has_competitor_neighborhood

In [None]:
### find out how many neighborhood that has no japanese restaurant

In [45]:
print (f'number of neighborhoods that has no japanese restaurant: {len(no_japanese_restaurant_neighhoorhood)}')

number of neighborhoods that has no japanese restaurant: 48


#### find out for each of the neighborhood, how many competitors are there, i.e. those categories that belongs to restaurant. This can be done by looking at the category if the category is in the list below or the category contains the word 'restaurant'

In [46]:
extra_category = ['Buffet', 'Burger Joint', 'Burrito Place', 'Cafeteria', 'Café', 'Donut Shop', 
                  'Food', 'Food & Drink Shop', 'Food Court', 'Food Stand', 'Food Truck', 'Soup Place',
                  'Noodle House', 'Salad Place', 
                 ]
competitors_df = nearby_venues[nearby_venues['Venue Category'].isin(extra_category) | nearby_venues['Venue Category'].str.contains('Restaurant')]

In [55]:
groupby = competitors_df['Neighborhood'].value_counts()

In [None]:
#get the top 5 neighbor that has lest of competitors

In [64]:
least_competitor = list(groupby.sort_values()[:5].index)
least_competitor

['Marine Park', 'Bergen Beach', 'Stuyvesant Town', 'New Lots', 'Madison']

In [66]:
#check in the neighborhood how many bus stop available
bus_df = nearby_venues[nearby_venues['Neighborhood'].isin(least_competitor) & nearby_venues['Venue Category'].str.contains('Bus')]

In [67]:
bus_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
6120,Madison,40.609378,-73.948415,Bus Stop Ave R And Nostrand,40.607782,-73.943189,Bus Station


In [68]:
#information for this neighbor
neighborhoods[neighborhoods['Neighborhood'] == 'Madison']

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
296,Brooklyn,Madison,40.609378,-73.948415


### conclusion

Madison which is located in brooklyn, which has relatively young median age, sufficient high median high.
has the least number of competitors, has no japanese/sushi restaurant. It has accessible public transportation.
will be good location for opening a new sushi restaurant.