# Web scrapping of Population and Demographics data of New York city from Wikipedia

### Population Data: 
Web scrapping of Population data from wikipedia page - https://en.wikipedia.org/wiki/New_York_City

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

#!conda install -c anaconda beautiful-soup --yes
from bs4 import BeautifulSoup # package for parsing HTML and XML documents

import csv # implements classes to read and write tabular data in CSV form


usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: # uncomment this line if you haven't completed the Foursquare API lab


Web scrapping of Population data from wikipedia page using BeautifulSoup.

Beautiful Soup is a Python package for parsing HTML and XML documents (including having malformed markup, i.e. non-closed tags, so named after tag soup). It creates a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping.


In [3]:
website_url = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})
#print(soup.prettify())

headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('NYC_POPULATION1.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

In [4]:
Pop_data=pd.read_csv('NYC_POPULATION1.csv', usecols=[0,1,2,3,4,5,6,7,8], header=None)
Pop_data.columns = ["Borough", "County", "Population", "GDP", "Per Capita Income", "Sq Miles","Sq Km","person per sq mi","person per sq km"]

In [5]:
Pop_data.drop(Pop_data.index[0],inplace=True)
Pop_data

Unnamed: 0,Borough,County,Population,GDP,Per Capita Income,Sq Miles,Sq Km,person per sq mi,person per sq km
1,The Bronx\r\n,\r\n Bronx\r\n,"1,471,160\r\n",28.787\r\n,"19,570\r\n",42.10\r\n,109.04\r\n,"34,653\r\n","13,231\r\n"
2,Brooklyn\r\n,\r\n Kings\r\n,"2,648,771\r\n",63.303\r\n,"23,900\r\n",70.82\r\n,183.42\r\n,"37,137\r\n","14,649\r\n"
3,Manhattan\r\n,\r\n New York\r\n,"1,664,727\r\n",629.682\r\n,"378,250\r\n",22.83\r\n,59.13\r\n,"72,033\r\n","27,826\r\n"
4,Queens\r\n,\r\n Queens\r\n,"2,358,582\r\n",73.842\r\n,"31,310\r\n",108.53\r\n,281.09\r\n,"21,460\r\n","8,354\r\n"
5,Staten Island\r\n,\r\n Richmond\r\n,"479,458\r\n",11.249\r\n,"23,460\r\n",58.37\r\n,151.18\r\n,"8,112\r\n","3,132\r\n"
6,City of New York,8622698,806.863,93574,302.64,783.83,28188,"10,947\r\n",
7,State of New York,19849399,1547.116,78354,47214,122284,416.4,159\r\n,
8,Sources:[14] and see individual borough articl...,,,,,,,,


In [6]:
Pop_data['Borough']=Pop_data['Borough'].replace(to_replace='\n', value='', regex=True)
Pop_data['County']=Pop_data['County'].replace(to_replace='\n', value='', regex=True)
Pop_data['Sq Miles']=Pop_data['Sq Miles'].replace(to_replace='\n', value='', regex=True)
Pop_data['Sq Km']=Pop_data['Sq Km'].replace(to_replace='\n', value='', regex=True)
Pop_data['person per sq mi']=Pop_data['person per sq mi'].replace(to_replace='\n', value='', regex=True)
Pop_data['person per sq km']=Pop_data['person per sq km'].replace(to_replace='\n', value='', regex=True)
Pop_data['Population']=Pop_data['Population'].replace(to_replace='\n', value='', regex=True)
Pop_data['GDP']=Pop_data['GDP'].replace(to_replace='\n', value='', regex=True)
Pop_data['Per Capita Income']=Pop_data['Per Capita Income'].replace(to_replace='\n', value='', regex=True)
Pop_data.drop(Pop_data.index[5:8],inplace=True)
Pop_data


Unnamed: 0,Borough,County,Population,GDP,Per Capita Income,Sq Miles,Sq Km,person per sq mi,person per sq km
1,The Bronx\r,\r Bronx\r,"1,471,160\r",28.787\r,"19,570\r",42.10\r,109.04\r,"34,653\r","13,231\r"
2,Brooklyn\r,\r Kings\r,"2,648,771\r",63.303\r,"23,900\r",70.82\r,183.42\r,"37,137\r","14,649\r"
3,Manhattan\r,\r New York\r,"1,664,727\r",629.682\r,"378,250\r",22.83\r,59.13\r,"72,033\r","27,826\r"
4,Queens\r,\r Queens\r,"2,358,582\r",73.842\r,"31,310\r",108.53\r,281.09\r,"21,460\r","8,354\r"
5,Staten Island\r,\r Richmond\r,"479,458\r",11.249\r,"23,460\r",58.37\r,151.18\r,"8,112\r","3,132\r"


Save DataFrame 

In [7]:
Pop_data.to_csv('NYC_POPULATION.csv',index=False)

### DEMOGRAPHICS DATA

We will web scrap Demographics data from wikipedia page - https://en.wikipedia.org/wiki/New_York_City

In [8]:
website_url = requests.get('https://en.wikipedia.org/wiki/New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable collapsible'})
#print(soup.prettify())

headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('NYC_DEMO.csv', 'w',encoding='utf-8') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

In [9]:
Demo_data=pd.read_csv('NYC_DEMO.csv')

Demo_data

Unnamed: 0,Racial composition,2010[249],1990[251],1970[251],1940[251]
0,White,44.0%,52.3%,76.6%,93.6%\r\n
1,—Non-Hispanic,33.3%,43.2%,62.9%[252],92.0%\r\n
2,Black or African American,25.5%,28.7%,21.1%,6.1%\r\n
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[252],1.6%\r\n
4,Asian,12.7%,7.0%,1.2%,−\r\n


In [10]:
Demo_data.rename(columns = {'2010[249]' : '2010',
                   '1990[251]':'1990',
                   '1970[251]':'1970', 
                   '1940[251]':'1940',
                    }, inplace=True)

In [11]:
Demo_data.columns = Demo_data.columns.str.replace(' ', '')

In [12]:
Demo_data= Demo_data.replace('\n',' ', regex=True)
Demo_data

Unnamed: 0,Racialcomposition,2010,1990,1970,1940[251]
0,White,44.0%,52.3%,76.6%,93.6%\r
1,—Non-Hispanic,33.3%,43.2%,62.9%[252],92.0%\r
2,Black or African American,25.5%,28.7%,21.1%,6.1%\r
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[252],1.6%\r
4,Asian,12.7%,7.0%,1.2%,−\r


In [13]:
Demo_data['1970'] = Demo_data['1970'].str.rstrip('[252]')
Demo_data

Unnamed: 0,Racialcomposition,2010,1990,1970,1940[251]
0,White,44.0%,52.3%,76.6%,93.6%\r
1,—Non-Hispanic,33.3%,43.2%,62.9%,92.0%\r
2,Black or African American,25.5%,28.7%,21.1%,6.1%\r
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%,1.6%\r
4,Asian,12.7%,7.0%,1.2%,−\r


In [14]:
Demo_data.to_csv('NYC_DEMOGRAPHICS.csv',index=False)