# Webscraping Income and Charity Contribution Data In NY By Zipcode

 This Notebook shows the code used in the MTA data analysis.
 
 Group: Danny, Navina, John, Kevin 

## Pt 1: Code To Scrape "Income Per Capita" Data

Import Libraries

In [None]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup

Fetches the income per capita given a zipcode

In [None]:
def zipcode_to_capita(zipcode, timeout=30):
    """Scrape the income-per-capita figure for a New York zipcode.

    Downloads the zipcode's page from incomebyzipcode.com and reads the first
    cell of the third ``table table-responsive`` table on that page.

    Args:
        zipcode: Zipcode to look up; it is formatted directly into the URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The figure as an ``int``, with the ``$`` sign and thousands
        separators removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page does not contain the expected table/cell.
        ValueError: If the cell text is not a dollar amount.
    """
    url = "https://www.incomebyzipcode.com/newyork/{}".format(zipcode)
    # Without a timeout a single unresponsive request would stall the whole
    # scraping loop indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html5lib")
    tables = soup.find_all('table', {'class': 'table table-responsive'})
    # NOTE(review): positions [2] and [0] mirror the page layout at the time
    # the notebook was written - confirm if the site changes.
    income_per_capita = tables[2].find_all('td')[0].text
    # Trim surrounding whitespace first so the '$' is actually at the edge.
    return int(income_per_capita.strip().strip('$').replace(',', ''))

Load the dataset that contains the zipcodes; zipcodes with no data on the webpage are dropped.

In [None]:
# Read the zipcode table and remove the three rows (labels 208, 255, 323)
# whose zipcode has no data on the income webpage. reset_index() is called
# without drop=True, so the old row labels are kept in a new "index" column.
data = pd.read_csv("station_zips.csv").drop(index=[208, 255, 323]).reset_index()
# Preview the first rows (notebook display).
data.head()

Populates a column in the data with incomes

In [None]:
# Scrape the income per capita for every zipcode in the dataset. The list must
# hold exactly one value per row: assigning a shorter list (e.g. a leftover
# debugging slice of the first few zipcodes) to the column raises a
# length-mismatch ValueError.
zips = data['zip']
incomes = []
for index, zipcode in enumerate(zips):
    # Progress output, since each iteration performs a web request.
    print("index: ", index,"zipcode: ", zipcode)
    incomes.append(zipcode_to_capita(zipcode))
data['incomes'] = incomes
# Alias used by the following cells (same object, not a copy).
data_incomes = data

Option 1: Code to store the data in a CSV file

In [None]:
#data_incomes.to_csv('data_income_data.csv', sep=',', encoding='utf-8')
#data_incomes.head(10)

Option 2: Create a dictionary for fast access (the better option). The same approach will be used for the charity data as well.


In [None]:
# Optional: start from a previously saved CSV instead of the in-memory frame.
#data_incomes = pd.read_csv("zip_income_data.csv")
# Build a {zipcode: income} lookup from the 'incomes' column; when a zipcode
# appears in several rows, the last row's value is the one kept.
zipcode_to_income_dict = data_incomes.set_index('zip')['incomes'].to_dict()

Using neighboring zipcodes for those with empty entries


In [None]:
# Manually entered incomes for two zipcodes (values taken from neighboring
# zipcodes, per the note above this cell).
zipcode_to_income_dict.update({10119: 86347, 10020: 90151})

Store dictionary into a numpy file for easy importing in other files

In [None]:
# np.save stores a dict as a pickled 0-d object array, so it has to be read
# back with np.load('zipcode_to_income_dict.npy', allow_pickle=True).item().
np.save(file='zipcode_to_income_dict.npy', arr=zipcode_to_income_dict)

## Pt 2: Code To Scrape "Charity Contributions" Data

In [None]:
def zipcode_to_charity(zipcode, timeout=30):
    """Scrape the charity-contribution figure for a zipcode from city-data.com.

    Downloads the zipcode's page, takes the sixteenth ``div`` with class
    ``hgraph`` and extracts the first dollar amount found in its HTML.

    Args:
        zipcode: Zipcode to look up; it is formatted directly into the URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The figure as an ``int``, with the ``$`` sign and thousands
        separators removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page has fewer than sixteen ``hgraph`` divs or the
            selected div contains no dollar amount.
        ValueError: If the extracted text is not a plain dollar amount.
    """
    url = "http://www.city-data.com/zips/{}.html".format(zipcode)
    # Without a timeout a single unresponsive request would stall the whole
    # scraping loop indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html5lib")
    graphs = soup.find_all('div', {'class': 'hgraph'})
    # NOTE(review): index 15 mirrors the page layout at the time the notebook
    # was written - confirm if the site changes.
    graph_html = str(graphs[15])
    # First "$..." token in the raw HTML, cut off at the next tag opening.
    charity_contribution = re.findall(r"\$[^ ]+", graph_html)[0].split('<')[0]
    # Trim surrounding whitespace first so the '$' is actually at the edge.
    return int(charity_contribution.strip().strip('$').replace(',', ''))
   

Load the dataset that contains the data and zipcodes; rows whose zipcode has no data on the webpage are dropped in a later cell, after scraping.

In [None]:
# Start again from the full zipcode table; no rows are removed at this point.
data = pd.read_csv(filepath_or_buffer="station_zips.csv")

Populates a column in the data with charity contributions


In [None]:
# Scrape the charity contribution for every zipcode in the dataset. The list
# must hold exactly one value per row: assigning a shorter list (e.g. one built
# from a slice that skips the first few zipcodes) to the column raises a
# length-mismatch ValueError.
# NOTE(review): rows without charity data are only dropped in a later cell;
# presumably zipcode_to_charity raises for those zipcodes - confirm.
zips = data['zip']
charity_num = []
for index, zipcode in enumerate(zips):
    # Progress output, since each iteration performs a web request.
    print("index: ", index,"zipcode: ", zipcode)
    charity_num.append(zipcode_to_charity(zipcode))
    print(charity_num[index])
data['charity'] = charity_num

Drop data that does not have an entry, fill in if needed

In [None]:
# Remove, by row label, the eight rows that have no charity entry, then
# renumber the rows. reset_index() is called without drop=True, so the old
# labels are kept in a new "index" column.
data = data.drop(index=[134, 167, 208, 233, 236, 255, 323, 418]).reset_index()

In [None]:
# Build a {zipcode: charity contribution} lookup from the 'charity' column;
# when a zipcode appears in several rows, the last row's value is the one kept.
zipcode_to_charity_dict = data.set_index('zip')['charity'].to_dict()

# Store the dictionary in a numpy file for easy importing in other files.
# np.save pickles a dict as a 0-d object array, so it has to be read back with
# np.load('zipcode_to_charity_dict.npy', allow_pickle=True).item().
np.save(file='zipcode_to_charity_dict.npy', arr=zipcode_to_charity_dict)