# The Battle of the Neighborhoods (Week 2)

## Download and Explore New York city geographical coordinates dataset

Neighborhood has a total of 5 boroughs and 306 neighborhoods. In order to segement the neighborhoods and explore them, we will essentially need a dataset that contains the 5 boroughs and the neighborhoods that exist in each borough as well as the the latitude and logitude coordinates of each neighborhood.

Download data from https://geo.nyu.edu/catalog/nyu_2451_34572

In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

from geopy.geocoders import Nominatim

import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

import folium

import csv

print('Libraries imported.')

Libraries imported.


In [2]:
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


In [0]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [0]:
neighborhoods_data = newyork_data['features']

In [5]:
neighborhoods_data[0]

{'geometry': {'coordinates': [-73.84720052054902, 40.89470517661],
  'type': 'Point'},
 'geometry_name': 'geom',
 'id': 'nyu_2451_34572.1',
 'properties': {'annoangle': 0.0,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661],
  'borough': 'Bronx',
  'name': 'Wakefield',
  'stacked': 1},
 'type': 'Feature'}

## Tranform the data into a pandas dataframe

In [0]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [7]:
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


In [8]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [9]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


In [0]:
neighborhoods.to_csv('BON1_NYC_GEO.csv',index=False)

In [11]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


## Create a map of New York with neighborhoods superimposed on top.

In [12]:
# create map of Toronto using latitude and longitude values
map_NewYork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NewYork)  
    
map_NewYork

## Web scrapping of Population and Demographics data of New York city from Wikipedia

In [0]:
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

In [0]:
website_url = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})

headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('BON2_POPULATION1.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

**Load Data from CSV**

In [13]:
Pop_data=pd.read_csv('BON2_POPULATION1.csv')
Pop_data

Unnamed: 0,New York City's five boroughsvte\n,Jurisdiction\n,Population\n,Gross Domestic Product\n,Land area\n,Density\n,Borough,County,Estimate (2018)[12],billions(US$)[13],per capita(US$),square miles,squarekm,persons / sq. mi,persons /km2\n
0,The Bronx\n,\n Bronx\n,"1,432,132\n",42.695\n,"29,200\n",42.10\n,109.04\n,"34,653\n","13,231\n",,,,,,
1,Brooklyn\n,\n Kings\n,"2,582,830\n",91.559\n,"34,600\n",70.82\n,183.42\n,"37,137\n","14,649\n",,,,,,
2,Manhattan\n,\n New York\n,"1,628,701\n",600.244\n,"360,600\n",22.83\n,59.13\n,"72,033\n","27,826\n",,,,,,
3,Queens\n,\n Queens\n,"2,278,906\n",93.310\n,"39,600\n",108.53\n,281.09\n,"21,460\n","8,354\n",,,,,,
4,Staten Island\n,\n Richmond\n,"476,179\n",14.514\n,"30,300\n",58.37\n,151.18\n,"8,112\n","3,132\n",,,,,,
5,City of New York,8398748,842.343,97700,302.64,783.83,28188,"10,947\n",,,,,,,
6,State of New York,19745289,1701.399,85700,47214,122284,416.4,159\n,,,,,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,,,,,,


In [14]:
Pop_data.drop(Pop_data.columns[[3,8,9,10,11,12,13,14]], axis=1,inplace=True)
print('Data downloaded!')

Data downloaded!


**Renaming the column and removing whitespace**

In [15]:
Pop_data

Unnamed: 0,New York City's five boroughsvte\n,Jurisdiction\n,Population\n,Land area\n,Density\n,Borough,County
0,The Bronx\n,\n Bronx\n,"1,432,132\n","29,200\n",42.10\n,109.04\n,"34,653\n"
1,Brooklyn\n,\n Kings\n,"2,582,830\n","34,600\n",70.82\n,183.42\n,"37,137\n"
2,Manhattan\n,\n New York\n,"1,628,701\n","360,600\n",22.83\n,59.13\n,"72,033\n"
3,Queens\n,\n Queens\n,"2,278,906\n","39,600\n",108.53\n,281.09\n,"21,460\n"
4,Staten Island\n,\n Richmond\n,"476,179\n","30,300\n",58.37\n,151.18\n,"8,112\n"
5,City of New York,8398748,842.343,302.64,783.83,28188,"10,947\n"
6,State of New York,19745289,1701.399,47214,122284,416.4,159\n
7,Sources:[14] and see individual borough articl...,,,,,,


In [16]:
Pop_data.columns = Pop_data.columns.str.replace(' ', '')
Pop_data.columns = Pop_data.columns.str.replace('\'','')
Pop_data.rename(columns={'Borough':'persons_sq_mi','County':'persons_sq_km'}, inplace=True)
Pop_data

Unnamed: 0,NewYorkCitysfiveboroughsvte\n,Jurisdiction\n,Population\n,Landarea\n,Density\n,persons_sq_mi,persons_sq_km
0,The Bronx\n,\n Bronx\n,"1,432,132\n","29,200\n",42.10\n,109.04\n,"34,653\n"
1,Brooklyn\n,\n Kings\n,"2,582,830\n","34,600\n",70.82\n,183.42\n,"37,137\n"
2,Manhattan\n,\n New York\n,"1,628,701\n","360,600\n",22.83\n,59.13\n,"72,033\n"
3,Queens\n,\n Queens\n,"2,278,906\n","39,600\n",108.53\n,281.09\n,"21,460\n"
4,Staten Island\n,\n Richmond\n,"476,179\n","30,300\n",58.37\n,151.18\n,"8,112\n"
5,City of New York,8398748,842.343,302.64,783.83,28188,"10,947\n"
6,State of New York,19745289,1701.399,47214,122284,416.4,159\n
7,Sources:[14] and see individual borough articl...,,,,,,


In [17]:
Pop_data.rename(columns = {'NewYorkCitysfiveboroughsvte\n' : 'Borough',
                   'Jurisdiction\n':'County',
                   'Population\n':'Estimate_2017', 
                   'Landarea\n':'square_miles',
                    'Density\n':'square_km'}, inplace=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx\n,\n Bronx\n,"1,432,132\n","29,200\n",42.10\n,109.04\n,"34,653\n"
1,Brooklyn\n,\n Kings\n,"2,582,830\n","34,600\n",70.82\n,183.42\n,"37,137\n"
2,Manhattan\n,\n New York\n,"1,628,701\n","360,600\n",22.83\n,59.13\n,"72,033\n"
3,Queens\n,\n Queens\n,"2,278,906\n","39,600\n",108.53\n,281.09\n,"21,460\n"
4,Staten Island\n,\n Richmond\n,"476,179\n","30,300\n",58.37\n,151.18\n,"8,112\n"
5,City of New York,8398748,842.343,302.64,783.83,28188,"10,947\n"
6,State of New York,19745289,1701.399,47214,122284,416.4,159\n
7,Sources:[14] and see individual borough articl...,,,,,,


In [18]:
Pop_data['Borough']=Pop_data['Borough'].replace(to_replace='\n', value='', regex=True)
Pop_data['County']=Pop_data['County'].replace(to_replace='\n', value='', regex=True)
Pop_data['Estimate_2017']=Pop_data['Estimate_2017'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_miles']=Pop_data['square_miles'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_km']=Pop_data['square_km'].replace(to_replace='\n', value='', regex=True)
Pop_data['persons_sq_mi']=Pop_data['persons_sq_mi'].replace(to_replace='\n', value='', regex=True)
Pop_data['persons_sq_km']=Pop_data['persons_sq_km'].replace(to_replace='\n', value='', regex=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132.0,29200.0,42.1,109.04,34653.0
1,Brooklyn,Kings,2582830.0,34600.0,70.82,183.42,37137.0
2,Manhattan,New York,1628701.0,360600.0,22.83,59.13,72033.0
3,Queens,Queens,2278906.0,39600.0,108.53,281.09,21460.0
4,Staten Island,Richmond,476179.0,30300.0,58.37,151.18,8112.0
5,City of New York,8398748,842.343,302.64,783.83,28188.0,10947.0
6,State of New York,19745289,1701.399,47214.0,122284.0,416.4,159.0
7,Sources:[14] and see individual borough articles,,,,,,


**Remove 'NAN'**

In [19]:
Pop_data = Pop_data.fillna('')
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132.0,29200.0,42.1,109.04,34653.0
1,Brooklyn,Kings,2582830.0,34600.0,70.82,183.42,37137.0
2,Manhattan,New York,1628701.0,360600.0,22.83,59.13,72033.0
3,Queens,Queens,2278906.0,39600.0,108.53,281.09,21460.0
4,Staten Island,Richmond,476179.0,30300.0,58.37,151.18,8112.0
5,City of New York,8398748,842.343,302.64,783.83,28188.0,10947.0
6,State of New York,19745289,1701.399,47214.0,122284.0,416.4,159.0
7,Sources:[14] and see individual borough articles,,,,,,


In [20]:
i = Pop_data[((Pop_data.Borough == 'Sources:[14] and see individual borough articles'))].index
Pop_data.drop(i)

Unnamed: 0,Borough,County,Estimate_2017,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132.0,29200.0,42.1,109.04,34653
1,Brooklyn,Kings,2582830.0,34600.0,70.82,183.42,37137
2,Manhattan,New York,1628701.0,360600.0,22.83,59.13,72033
3,Queens,Queens,2278906.0,39600.0,108.53,281.09,21460
4,Staten Island,Richmond,476179.0,30300.0,58.37,151.18,8112
5,City of New York,8398748,842.343,302.64,783.83,28188.0,10947
6,State of New York,19745289,1701.399,47214.0,122284.0,416.4,159


In [0]:
Pop_data.to_csv('BON2_POPULATION.csv',index=False)

## Web scrapping of Demographics data from wikipedia page using BeautifulSoup.

In [0]:
website_url = requests.get('https://en.wikipedia.org/wiki/New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.findAll('table')[8]
headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('NYC_DEMO.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

In [54]:
Demo_data=pd.read_csv('NYC_DEMO.csv')
print('Data downloaded!')
Demo_data

Data downloaded!


Unnamed: 0,Racial composition,2010[230],1990[232],1970[232],1940[232]\n
0,White,44.0%,52.3%,76.6%,93.6%\n
1,—Non-Hispanic,33.3%,43.2%,62.9%[233],92.0%\n
2,Black or African American,25.5%,28.7%,21.1%,6.1%\n
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[233],1.6%\n
4,Asian,12.7%,7.0%,1.2%,−\n


In [55]:
Demo_data.columns

Index(['Racial composition', '2010[230]', '1990[232]', '1970[232]',
       '1940[232]\n'],
      dtype='object')

In [56]:
Demo_data.rename(columns = {'2010[230]' : '2010',
                   '1990[232]':'1990',
                   '1970[232]':'1970', 
                   '1940[232]\n':'1940',
                    }, inplace=True)
Demo_data

Unnamed: 0,Racial composition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%\n
1,—Non-Hispanic,33.3%,43.2%,62.9%[233],92.0%\n
2,Black or African American,25.5%,28.7%,21.1%,6.1%\n
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[233],1.6%\n
4,Asian,12.7%,7.0%,1.2%,−\n


In [0]:
Demo_data.columns = Demo_data.columns.str.replace(' ', '')

In [58]:
Demo_data= Demo_data.replace('\n',' ', regex=True)
Demo_data

Unnamed: 0,Racialcomposition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%
1,—Non-Hispanic,33.3%,43.2%,62.9%[233],92.0%
2,Black or African American,25.5%,28.7%,21.1%,6.1%
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[233],1.6%
4,Asian,12.7%,7.0%,1.2%,−


In [59]:
Demo_data['1970'] = Demo_data['1970'].str.rstrip('[233]')
Demo_data

Unnamed: 0,Racialcomposition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%
1,—Non-Hispanic,33.3%,43.2%,62.9%,92.0%
2,Black or African American,25.5%,28.7%,21.1%,6.1%
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%,1.6%
4,Asian,12.7%,7.0%,1.2%,−


In [0]:
Demo_data.to_csv('BON2_DEMOGRAPHICS.csv',index=False)