# Code To Scrape Income Per Capita Data

In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:
#Fetches the income per capita given a zipcode
def zipcode_to_capita(zipcode, timeout=30):
    """Scrape the income per capita for a New York zipcode.

    Downloads the zipcode's page from incomebyzipcode.com and reads the first
    cell of the third ``table table-responsive`` table found on that page.

    Args:
        zipcode: Zipcode to look up; it is formatted into the page URL.
        timeout: Seconds to wait for the server before giving up, so a single
            stalled request cannot hang a whole scraping run.

    Returns:
        The cell's dollar amount as an int, e.g. ``"$25,068"`` -> ``25068``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        IndexError: If the page lacks the expected table or table cell.
        ValueError: If the cell text is not a dollar amount.
    """
    url = "https://www.incomebyzipcode.com/newyork/{}".format(zipcode)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features="html5lib")
    tables = soup.find_all('table', {'class': 'table table-responsive'})
    if len(tables) < 3:
        raise IndexError(
            "expected at least 3 income tables for zipcode {}, found {}".format(
                zipcode, len(tables)))

    cells = tables[2].find_all('td')
    if not cells:
        raise IndexError(
            "income table for zipcode {} has no cells".format(zipcode))

    # Drop surrounding whitespace first so the '$' can actually be stripped.
    income_text = cells[0].text.strip().strip('$').replace(',', '')
    return int(income_text)


In [None]:
# Load the dataset that contains data and zipcodes, then drop the rows whose
# zipcode has no page on the website.
data = pd.read_csv("station_zips.csv")
for missing_row in (208, 255, 323):
    # Report the zipcode of the row before removing it.
    print("Page doesn't exist for zipcode:", data.loc[missing_row, '15'])
    data = data.drop(index=missing_row)
# Zipcode 10120 is filtered out by value rather than by row label
# (presumably it also has no page on the website -- TODO confirm).
keep_rows = data['15'] != 10120
data = data.loc[keep_rows].reset_index()

In [None]:
# Scrape an income for every zipcode in column '15' and store the results,
# in the same row order, in a new column '16'.
zips = data['15']
incomes = []
for index, zipcode in enumerate(zips):
    # Progress trace, printed before each request is made.
    print("index: ", index,"zipcode: ", zipcode)
    income = zipcode_to_capita(zipcode)
    incomes.append(income)
data['16'] = incomes

In [None]:
# Stores the data in a csv file so we don't have to rescrape.
# `data` is the frame that received the income column '16'; bind it to
# `data_with_incomes`, which this cell used without it ever being defined
# (NameError on a fresh kernel).
data_with_incomes = data
# NOTE(review): this writes 'data.csv', while the cell that builds the
# dictionary reads "zip_income_data.csv" -- confirm which file name is intended.
data_with_incomes.to_csv('data.csv', sep=',', encoding='utf-8')
data_with_incomes.head(10)

In [2]:
# Read the saved zipcode/income table back into a dataframe.
data_incomes = pd.read_csv("zip_income_data.csv")
# Index the rows by zipcode (column '15'), convert every column to a
# {zipcode: value} dict, and keep the one for the income column '16'.
columns_by_zipcode = data_incomes.set_index('15').to_dict()
zipcode_to_income_dict = columns_by_zipcode['16']
# Store the dictionary in a numpy file for easy importing in other files.
np.save('zipcode_to_income_dict.npy', zipcode_to_income_dict)


{11230: 25068, 11691: 18914, 10462: 23628, 10004: 95707, 11385: 26235, 11375: 45616, 11231: 56806, 10003: 85199, 11208: 17235, 10023: 111473, 11201: 71656, 10029: 27030, 11421: 24385, 11101: 39670, 11221: 22084, 10007: 166343, 11377: 24382, 11220: 17132, 11207: 18401, 10027: 28396, 11232: 22028, 10039: 21880, 10013: 108003, 10036: 77208, 11225: 27054, 11233: 21478, 10017: 103346, 11226: 23110, 11209: 39005, 10455: 12897, 11212: 17064, 11235: 29449, 11215: 64040, 10452: 14136, 10454: 12325, 11211: 35470, 11206: 19071, 11374: 34187, 11216: 32291, 10467: 17484, 10022: 133504, 10469: 23628, 10461: 28443, 10012: 79481, 11223: 23928, 11219: 15183, 11435: 24100, 11418: 22888, 11102: 33318, 11238: 45762, 10044: 49076, 10459: 13167, 11229: 27997, 10032: 20843, 11106: 33951, 11204: 21242, 10453: 12702, 10019: 90151, 10466: 21252, 10128: 92067, 11694: 41019, 10020: 0, 10001: 86347, 10026: 36542, 11217: 61845, 10463: 31875, 11214: 24952, 10031: 24556, 10065: 122657, 10457: 13626, 11222: 43603, 113

In [11]:
# Testing the import of the dictionary.
# np.save stores a dict as a pickled 0-d object array, and np.load refuses to
# unpickle by default (allow_pickle=False since NumPy 1.16.3), so it must be
# enabled explicitly. Only do this for files you created yourself: unpickling
# untrusted files can execute arbitrary code.
read_dictionary = np.load('zipcode_to_income_dict.npy', allow_pickle=True).item()
print("There are",len(read_dictionary), "entries.", "Example: read_dictionary[10003] =",read_dictionary[10003]) # displays an income for testing

There are 122 entries. Example: read_dictionary[10003] = 85199
