# Data Cleaning

## Light Rail Link Stations

In [1]:
# standard library
import os
from getpass import getpass

# importing pandas for the notebook
import pandas as pd

# Wikipedia page that lists the Link light rail stations
linkLink = "https://en.wikipedia.org/wiki/List_of_Link_light_rail_stations"

# Parse every <table class="wikitable"> on that page into a list of DataFrames,
# taking the first row of each table as its header
stationsData = pd.read_html(
    linkLink,
    attrs={'class': "wikitable"},
    flavor="bs4",
    header=0,
)

In [2]:
# work on a copy of the second parsed table (index 1) so stationsData stays untouched
# NOTE(review): the index is positional — presumably the stations table; confirm if the page layout changes
stationVars=stationsData[1].copy()

In [3]:
stationVars.head()

Unnamed: 0,Station,Line[1],Location[2],Opened,Weekday ridership[n 1]
0,Angle Lake †,Line 1,SeaTac,"September 24, 2016[12]",3194
1,Beacon Hill,Line 1,"Beacon Hill, Seattle","July 18, 2009[8]",2675
2,Capitol Hill,Line 1,"Capitol Hill, Seattle","March 19, 2016[41]",7116
3,Columbia City,Line 1,"Columbia City, Seattle","July 18, 2009[8]",2358
4,Commerce Street/S 11th St,Line T,Downtown Tacoma,"September 15, 2011[10]",1051


In [4]:
stationVars

Unnamed: 0,Station,Line[1],Location[2],Opened,Weekday ridership[n 1]
0,Angle Lake †,Line 1,SeaTac,"September 24, 2016[12]",3194
1,Beacon Hill,Line 1,"Beacon Hill, Seattle","July 18, 2009[8]",2675
2,Capitol Hill,Line 1,"Capitol Hill, Seattle","March 19, 2016[41]",7116
3,Columbia City,Line 1,"Columbia City, Seattle","July 18, 2009[8]",2358
4,Commerce Street/S 11th St,Line T,Downtown Tacoma,"September 15, 2011[10]",1051
5,Convention Center/S 15th St,Line T,Downtown Tacoma,"August 23, 2003[7]",564
6,International District/Chinatown[n 2],Line 1,"Chinatown-International District, Seattle","July 18, 2009[8][n 3]",5233
7,Mount Baker,Line 1,"Mount Baker, Seattle","July 18, 2009[8]",2237
8,Northgate †,Line 1,"Northgate, Seattle","October 2, 2021[13]",—
9,Othello,Line 1,"NewHolly, Seattle","July 18, 2009[8]",2307


In [5]:
# positions of the "line" and "location" columns, which are not needed
columnsToDrop = [1, 2]

# remove those columns from stationVars in place
# (positional: running this cell a second time would remove two more columns)
unwanted = stationVars.columns[columnsToDrop]
stationVars.drop(columns=unwanted, inplace=True)

In [6]:
stationVars.columns

Index(['Station', 'Opened', 'Weekday ridership[n 1]'], dtype='object')

In [7]:
# remove the footnote markers "[1]" and "[n 1]" from the column names
import re
# raw string: "\[" in a normal string literal is an invalid escape sequence
# (a warning today, an error in a future Python); the pattern itself is unchanged
stationVars.columns=stationVars.columns.str.replace(r"\[(n )?1\]","",regex=True)

In [8]:
# see current columns
stationVars.columns

Index(['Station', 'Opened', 'Weekday ridership'], dtype='object')

In [9]:
# check the frequency of each station name (a count above 1 would mean a duplicated row)
stationVars['Station'].value_counts()

Angle Lake †                             1
SeaTac/Airport                           1
University Street                        1
University of Washington                 1
Union Station/S 19th St                  1
U District                               1
Tukwila International Boulevard          1
Theater District/S 9th St †              1
Tacoma Dome †                            1
Stadium                                  1
S 25th St                                1
SODO                                     1
Roosevelt                                1
Beacon Hill                              1
Rainier Beach                            1
Pioneer Square                           1
Othello                                  1
Northgate †                              1
Mount Baker                              1
International District/Chinatown[n 2]    1
Convention Center/S 15th St              1
Commerce Street/S 11th St                1
Columbia City                            1
Capitol Hil

In [10]:
stationVars

Unnamed: 0,Station,Opened,Weekday ridership
0,Angle Lake †,"September 24, 2016[12]",3194
1,Beacon Hill,"July 18, 2009[8]",2675
2,Capitol Hill,"March 19, 2016[41]",7116
3,Columbia City,"July 18, 2009[8]",2358
4,Commerce Street/S 11th St,"September 15, 2011[10]",1051
5,Convention Center/S 15th St,"August 23, 2003[7]",564
6,International District/Chinatown[n 2],"July 18, 2009[8][n 3]",5233
7,Mount Baker,"July 18, 2009[8]",2237
8,Northgate †,"October 2, 2021[13]",—
9,Othello,"July 18, 2009[8]",2307


In [11]:
# Strip the " †" marker and bracketed footnote references such as "[8]", "[41]"
# or "[n 3]" from every string cell; the result is a new frame, stationVars is
# left as it was.
# Raw string: "\[" in a normal string literal is an invalid escape sequence.
stationVars_updated = stationVars.replace(to_replace =r'( †)|\[(n )?[0-9]?[0-9]\]', value = '', regex = True)

# Show the updated dataframe (rich display instead of print)
stationVars_updated

                             Station              Opened Weekday ridership
0                         Angle Lake  September 24, 2016              3194
1                        Beacon Hill       July 18, 2009              2675
2                       Capitol Hill      March 19, 2016              7116
3                      Columbia City       July 18, 2009              2358
4          Commerce Street/S 11th St  September 15, 2011              1051
5        Convention Center/S 15th St     August 23, 2003               564
6   International District/Chinatown       July 18, 2009              5233
7                        Mount Baker       July 18, 2009              2237
8                          Northgate     October 2, 2021                 —
9                            Othello       July 18, 2009              2307
10                    Pioneer Square       July 18, 2009              4015
11                     Rainier Beach       July 18, 2009              1858
12                       

In [12]:
# parse the cleaned "Opened" strings as dates, then keep only the calendar year
opened_dates = pd.DatetimeIndex(stationVars_updated['Opened'])
stationVars_updated['Opened Year'] = opened_dates.year
stationVars_updated

Unnamed: 0,Station,Opened,Weekday ridership,Opened Year
0,Angle Lake,"September 24, 2016",3194,2016
1,Beacon Hill,"July 18, 2009",2675,2009
2,Capitol Hill,"March 19, 2016",7116,2016
3,Columbia City,"July 18, 2009",2358,2009
4,Commerce Street/S 11th St,"September 15, 2011",1051,2011
5,Convention Center/S 15th St,"August 23, 2003",564,2003
6,International District/Chinatown,"July 18, 2009",5233,2009
7,Mount Baker,"July 18, 2009",2237,2009
8,Northgate,"October 2, 2021",—,2021
9,Othello,"July 18, 2009",2307,2009


In [13]:
# save the cleaned station table to csv format (the row index is not written)
stationVars_updated.to_csv("stationVars.csv",index=False)

## Station Zip Codes

In [14]:
# extract address from site
import requests
from bs4 import BeautifulSoup

def getdata(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        Decoded body of the HTTP response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    r = requests.get(url, timeout=timeout)
    # fail loudly instead of handing an error page to the HTML parser
    r.raise_for_status()
    return r.text

# exploratory fetch of a single station page: download it, parse it, and keep
# the text of the first <p> element
htmldata = getdata("https://www.soundtransit.org/ride-with-us/stops-stations/northgate-station")
soup = BeautifulSoup(htmldata, 'html.parser')
# NOTE(review): `data` and `soup` are both rebound by later cells before anything reads these values
data = soup.find_all('p')[0].text

In [15]:
# set pandas display option
pd.set_option('display.max_colwidth', None)

# For the scrape:
from bs4 import BeautifulSoup as BShtml
import urllib.request as ur

# Sound Transit page of every station to scrape
urls = [
    'https://www.soundtransit.org/ride-with-us/stops-stations/northgate-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/roosevelt-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/u-district-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/university-washington-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/capitol-hill-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/westlake-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/university-street-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/pioneer-square-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/international-district-chinatown-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/stadium-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/sodo-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/beacon-hill-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/mount-baker-station-transit-center',
    'https://www.soundtransit.org/ride-with-us/stops-stations/columbia-city-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/othello-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/rainier-beach-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/tukwila-international-boulevard-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/seatac-airport-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/angle-lake-station',
]

# For every station card on every page keep two raw BeautifulSoup objects: the
# station-name span(s) and the first <p> of the card (cleaned by later cells).
# One single-row frame per card is collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration.
card_frames = []
for url in urls:
    # the context manager closes the HTTP response once the body has been read
    with ur.urlopen(url, timeout=30) as response:
        r = response.read()
    soup = BShtml(r, "html.parser")
    for post in soup.find_all('div', class_="station-map-card-content sidebar-container__content"):
        card_frames.append(pd.DataFrame({
            "station":[post.find_all("span", class_="station-map-card-name")],
            "zip":[post.find_all('p')[0]]
        }))

if card_frames:
    # no ignore_index: every row keeps index 0, exactly as repeated append produced
    station_zips = pd.concat(card_frames)
else:
    # nothing scraped: fall back to the empty two-column frame
    station_zips = pd.DataFrame({"station":[], "zip":[]})
station_zips

Unnamed: 0,station,zip
0,[[Northgate Station]],"[\n, [Northgate Station], \n, [], \n 10200 1st Ave NE\n , [], \n Seattle, WA 98125\n ]"
0,[[Roosevelt Station]],"[\n, [Roosevelt Station], \n, [], \n 6501 12th Ave NE\n , [], \n Seattle, WA 98115\n ]"
0,[[U District Station]],"[\n, [U District Station], \n, [], \n 4300 Brooklyn Ave NE\n , [], \n Seattle, WA 98105\n ]"
0,[[University of Washington Station]],"[\n, [University of Washington Station], \n, [], \n 3720 Montlake Blvd NE\n , [], \n Seattle, WA 98195\n ]"
0,[[Capitol Hill Station]],"[\n, [Capitol Hill Station], \n, [], \n 140 Broadway E\n , [], \n Seattle, WA 98102\n ]"
0,[[Westlake Station]],"[\n, [Westlake Station], \n, [], \n 4th Ave & Pine St\n , [], \n Seattle, WA 98101\n ]"
0,[[University Street Station]],"[\n, [University Street Station], \n, [], \n 3rd Ave & Seneca St\n , [], \n Seattle, WA 98101\n ]"
0,[[Pioneer Square Station]],"[\n, [Pioneer Square Station], \n, [], \n 3rd Ave & James St\n , [], \n Seattle, WA 98104\n ]"
0,[[International District/Chinatown Station]],"[\n, [International District/Chinatown Station], \n, [], \n 5th Ave and S Jackson St\n , [], \n Seattle, WA 98104\n ]"
0,[[Stadium Station]],"[\n, [Stadium Station], \n, [], \n 501 S Royal Brougham Way\n , [], \n Seattle, WA 98134\n ]"


In [16]:
# turn the scraped objects in both columns into their string form
for column in ('station', 'zip'):
    station_zips[column] = station_zips[column].astype('str')

In [17]:
# remove html tags from strings
def remove_tags(string):
    """Return ``string`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. Character entities such as ``&amp;`` are not decoded.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', string)
# strip the markup from both scraped columns
for column in ('station', 'zip'):
    station_zips[column] = station_zips[column].apply(remove_tags)
station_zips

Unnamed: 0,station,zip
0,[Northgate Station],"\nNorthgate Station\n\n 10200 1st Ave NE\n \n Seattle, WA 98125\n"
0,[Roosevelt Station],"\nRoosevelt Station\n\n 6501 12th Ave NE\n \n Seattle, WA 98115\n"
0,[U District Station],"\nU District Station\n\n 4300 Brooklyn Ave NE\n \n Seattle, WA 98105\n"
0,[University of Washington Station],"\nUniversity of Washington Station\n\n 3720 Montlake Blvd NE\n \n Seattle, WA 98195\n"
0,[Capitol Hill Station],"\nCapitol Hill Station\n\n 140 Broadway E\n \n Seattle, WA 98102\n"
0,[Westlake Station],"\nWestlake Station\n\n 4th Ave &amp; Pine St\n \n Seattle, WA 98101\n"
0,[University Street Station],"\nUniversity Street Station\n\n 3rd Ave &amp; Seneca St\n \n Seattle, WA 98101\n"
0,[Pioneer Square Station],"\nPioneer Square Station\n\n 3rd Ave &amp; James St\n \n Seattle, WA 98104\n"
0,[International District/Chinatown Station],"\nInternational District/Chinatown Station\n\n 5th Ave and S Jackson St\n \n Seattle, WA 98104\n"
0,[Stadium Station],"\nStadium Station\n\n 501 S Royal Brougham Way\n \n Seattle, WA 98134\n"


In [18]:
# clean up row values
# drop the square brackets left around the station name by the string conversion
# (raw string: "\[" in a normal string literal is an invalid escape sequence)
station_zips['station'] = station_zips['station'].replace(r'\[|\]','', regex = True)
# keep the five digits that sit directly before a newline (the zip code closing
# the address); bracket access avoids relying on attribute lookup of a column
# named like the builtin zip, and expand=False returns a Series, not a one-column frame
station_zips['zip'] = station_zips['zip'].str.extract(r'([0-9]{5})\n', expand=False)
station_zips

Unnamed: 0,station,zip
0,Northgate Station,98125
0,Roosevelt Station,98115
0,U District Station,98105
0,University of Washington Station,98195
0,Capitol Hill Station,98102
0,Westlake Station,98101
0,University Street Station,98101
0,Pioneer Square Station,98104
0,International District/Chinatown Station,98104
0,Stadium Station,98134


In [19]:
# renumber the rows 0..n-1, discarding the old index
station_zips = station_zips.reset_index(drop=True)
station_zips

Unnamed: 0,station,zip
0,Northgate Station,98125
1,Roosevelt Station,98115
2,U District Station,98105
3,University of Washington Station,98195
4,Capitol Hill Station,98102
5,Westlake Station,98101
6,University Street Station,98101
7,Pioneer Square Station,98104
8,International District/Chinatown Station,98104
9,Stadium Station,98134


In [20]:
# save the station / zip code table to csv format (the row index is not written)
station_zips.to_csv("station_zips.csv",index=False)

## ACS Data Using an API

In [21]:
# install censusdata
# NOTE(review): no visible cell calls censusdata after importing it — the ACS pulls
# below go through requests directly; confirm whether this dependency is still needed.
# If it is, prefer `%pip install` with a pinned version so the package is installed
# into the kernel's own environment.
!pip install CensusData



In [22]:
import censusdata

In [23]:
# set variables for census data
# The Census API key is a credential: read it from the CENSUS_API_KEY environment
# variable, or prompt for it, instead of committing it to the notebook.
# NOTE(review): the key that used to be hard-coded here is in version history and
# should be revoked.
api_key=os.environ.get('CENSUS_API_KEY') or getpass('Census API key: ')
# NOTE(review): dsource and state are not read by any visible cell
dsource='zbp'
state='53'
# comma-separated ZCTA list for the request URLs (no stray space after the first
# comma, so every code is sent exactly as written)
zipcode='98125,98115,98105,98195,98102,98101,98104,98134,98144,98108,98118,98188'

In [24]:
# 2019: pull transportation by car, truck, or van alone (ACS 5-year subject table
# group S0802) for the zip code tabulation areas listed in `zipcode`
# the key comes from the api_key variable rather than being repeated in the URL
data_url = f'https://api.census.gov/data/2019/acs/acs5/subject?get=NAME,group(S0802)&for=zip%20code%20tabulation%20area:{zipcode}&key={api_key}'
# a timeout keeps a stalled request from hanging the notebook
transportation_response=requests.get(data_url, timeout=60)
# stop here on an HTTP error instead of failing later while decoding JSON
transportation_response.raise_for_status()
# preview only: the full payload has hundreds of columns per row
print(transportation_response.text[:500])

[["NAME","GEO_ID","NAME","S0802_C01_001E","S0802_C01_001EA","S0802_C01_001M","S0802_C01_001MA","S0802_C01_002E","S0802_C01_002EA","S0802_C01_002M","S0802_C01_002MA","S0802_C01_003E","S0802_C01_003EA","S0802_C01_003M","S0802_C01_003MA","S0802_C01_004E","S0802_C01_004EA","S0802_C01_004M","S0802_C01_004MA","S0802_C01_005E","S0802_C01_005EA","S0802_C01_005M","S0802_C01_005MA","S0802_C01_006E","S0802_C01_006EA","S0802_C01_006M","S0802_C01_006MA","S0802_C01_007E","S0802_C01_007EA","S0802_C01_007M","S0802_C01_007MA","S0802_C01_008E","S0802_C01_008EA","S0802_C01_008M","S0802_C01_008MA","S0802_C01_009E","S0802_C01_009EA","S0802_C01_009M","S0802_C01_009MA","S0802_C01_010E","S0802_C01_010EA","S0802_C01_010M","S0802_C01_010MA","S0802_C01_011E","S0802_C01_011EA","S0802_C01_011M","S0802_C01_011MA","S0802_C01_012E","S0802_C01_012EA","S0802_C01_012M","S0802_C01_012MA","S0802_C01_013E","S0802_C01_013EA","S0802_C01_013M","S0802_C01_013MA","S0802_C01_014E","S0802_C01_014EA","S0802_C01_014M","S0802_C01_01

In [25]:
# turn the 2019 JSON payload into a dataframe:
# the first row of the response is the header, every following row is a record
data = transportation_response.json()
header, rows = data[0], data[1:]
transportation = pd.DataFrame(rows, columns=header)
transportation.head()

Unnamed: 0,NAME,GEO_ID,NAME.1,S0802_C01_001E,S0802_C01_001EA,S0802_C01_001M,S0802_C01_001MA,S0802_C01_002E,S0802_C01_002EA,S0802_C01_002M,...,S0802_C04_099MA,S0802_C04_100E,S0802_C04_100EA,S0802_C04_100M,S0802_C04_100MA,S0802_C04_101E,S0802_C04_101EA,S0802_C04_101M,S0802_C04_101MA,zip code tabulation area
0,ZCTA5 98104,8600000US98104,ZCTA5 98104,7621,,586,,0.9,,1.3,...,(X),-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98104
1,ZCTA5 98101,8600000US98101,ZCTA5 98101,9039,,747,,0.4,,0.4,...,(X),-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98101
2,ZCTA5 98105,8600000US98105,ZCTA5 98105,24841,,1240,,10.9,,1.8,...,(X),-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98105
3,ZCTA5 98118,8600000US98118,ZCTA5 98118,25095,,1241,,2.1,,0.6,...,(X),-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98118
4,ZCTA5 98134,8600000US98134,ZCTA5 98134,497,,186,,3.2,,5.8,...,(X),-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98134


In [26]:
# add year column to 2019 data
# stored as an integer, like the yearly frames built in the loop further down
# (which label their rows with the integer `year`); a string here would leave the
# combined Year column with mixed types
transportation['Year'] = 2019
transportation

Unnamed: 0,NAME,GEO_ID,NAME.1,S0802_C01_001E,S0802_C01_001EA,S0802_C01_001M,S0802_C01_001MA,S0802_C01_002E,S0802_C01_002EA,S0802_C01_002M,...,S0802_C04_100E,S0802_C04_100EA,S0802_C04_100M,S0802_C04_100MA,S0802_C04_101E,S0802_C04_101EA,S0802_C04_101M,S0802_C04_101MA,zip code tabulation area,Year
0,ZCTA5 98104,8600000US98104,ZCTA5 98104,7621,,586,,0.9,,1.3,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98104,2019
1,ZCTA5 98101,8600000US98101,ZCTA5 98101,9039,,747,,0.4,,0.4,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98101,2019
2,ZCTA5 98105,8600000US98105,ZCTA5 98105,24841,,1240,,10.9,,1.8,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98105,2019
3,ZCTA5 98118,8600000US98118,ZCTA5 98118,25095,,1241,,2.1,,0.6,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98118,2019
4,ZCTA5 98134,8600000US98134,ZCTA5 98134,497,,186,,3.2,,5.8,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98134,2019
5,ZCTA5 98125,8600000US98125,ZCTA5 98125,23742,,881,,2.1,,0.9,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98125,2019
6,ZCTA5 98102,8600000US98102,ZCTA5 98102,19929,,879,,0.2,,0.2,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98102,2019
7,ZCTA5 98108,8600000US98108,ZCTA5 98108,12161,,695,,2.1,,0.8,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98108,2019
8,ZCTA5 98188,8600000US98188,ZCTA5 98188,13440,,889,,2.4,,0.9,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98188,2019
9,ZCTA5 98195,8600000US98195,ZCTA5 98195,0,,12,,-666666666.0,-,-222222222.0,...,-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),98195,2019


In [27]:
# 2011-2018: pull median income in the past 12 months (in inflation-adjusted dollars),
# subject table group S1903, one request URL per year for every ZCTA in state 53
# range() excludes its stop value, so `years` is 2011..2018
years=list(range(2011,2019))
# a single f-string per URL; the key comes from api_key rather than being repeated here
allUrls = [f'https://api.census.gov/data/{y}/acs/acs5/subject?get=NAME,group(S1903)&for=zip%20code%20tabulation%20area:*&in=state:53&key={api_key}' for y in years]

In [28]:
# pull 2011-2018 median income into one dataframe per year
# NOTE(review): allUrls requests group(S1903), yet the results are stored under
# "transportation" names — confirm which table this list is meant to hold
zipcodes=['98125', '98115','98105','98195','98102','98101','98104','98134','98144','98108','98118','98188']
allDataFramesTransportation=[]
# the URLs are ordered by year starting at 2011, so enumerate() supplies the label
for year, url in enumerate(allUrls, start=2011):
    # a timeout keeps a stalled request from hanging the notebook
    transportation_response=requests.get(url, timeout=120)
    transportation_response.raise_for_status()
    data=transportation_response.json()
    # separate name on purpose: rebinding `transportation` here would throw away
    # the 2019 frame that a later cell appends these yearly frames to
    yearly=pd.DataFrame(data[1:], columns=data[0])
    yearly['Year']=year
    # the request covers every ZCTA in the state; keep only the station zip codes
    yearly=yearly[yearly['zip code tabulation area'].isin(zipcodes)]
    allDataFramesTransportation.append(yearly)

KeyboardInterrupt: 

In [None]:
# NOTE(review): the names in this cell say "race", but every request is for
# group(S1903); a later cell rebinds race_data from group(B03002). Confirm whether
# this cell was meant to build the median-income frame instead.

# 2019: pull group(S1903) for the station zip codes
zipcode='98125,98115,98105,98195,98102,98101,98104,98134,98144,98108,98118,98188'
# the key comes from api_key rather than being repeated in the URL
data_url = f'https://api.census.gov/data/2019/acs/acs5/subject?get=NAME,group(S1903)&for=zip%20code%20tabulation%20area:{zipcode}&in=state:53&key={api_key}'
race_response=requests.get(data_url, timeout=60)
race_response.raise_for_status()
# preview only
print(race_response.text[:500])

# pull 2019 data into dataframe (first response row is the header)
data=race_response.json()
race_2019=pd.DataFrame(data[1:], columns=data[0])

# add year column to 2019 data
# NOTE(review): kept as the string '2019' as before, while the yearly frames below
# use integers — confirm which type downstream code expects
race_2019['Year']='2019'

# 2011-2018: one request URL per year (years excludes 2019)
allUrlsRace = [f'https://api.census.gov/data/{y}/acs/acs5/subject?get=NAME,group(S1903)&for=zip%20code%20tabulation%20area:*&in=state:53&key={api_key}' for y in years]

# pull 2011-2018 data into one dataframe per year
zipcode=['98125', '98115','98105','98195','98102','98101','98104','98134','98144','98108','98118','98188']
allDataFrames=[]
for year, url in enumerate(allUrlsRace, start=2011):
    race_response=requests.get(url, timeout=120)
    race_response.raise_for_status()
    data=race_response.json()
    # separate name: reusing race_data here used to overwrite the 2019 frame, so
    # the final table held the last year twice and 2019 not at all
    race_year=pd.DataFrame(data[1:], columns=data[0])
    race_year['Year']=year
    # the request covers every ZCTA in the state; keep only the station zip codes
    race_year=race_year[race_year['zip code tabulation area'].isin(zipcode)]
    allDataFrames.append(race_year)

# stack 2019 and 2011-2018 into one frame
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
race_data=pd.concat([race_2019, *allDataFrames], ignore_index=True)
race_data

In [None]:
# NOTE(review): allDataFramesIncome is not assigned in any visible cell, so this raises
# NameError on a fresh kernel; confirm which frame was meant before relying on it
allDataFramesIncome.Year.unique()

In [None]:
transportation

In [None]:
# append all years of data
# DataFrame.append was removed in pandas 2.0; pd.concat over the same frames in
# the same order gives the same stacked result
transportation=pd.concat([transportation, *allDataFramesTransportation], ignore_index=True)
transportation

In [None]:
# keep wanted columns: the zip code, the Year label and the S0802_C02 items 012, 019, 020
# ("Year" has to be listed explicitly — filter() drops every column the pattern does
# not match, and a later cell reads transportation.Year)
transportation=transportation.filter(regex='(zip code tabulation area)|^Year$|S0802_C02_0(20|12|19)E')
transportation

In [None]:
# remove every column whose name ends in "EA"
unwanted = transportation.filter(regex='EA$').columns
transportation.drop(columns=unwanted, inplace=True)
transportation

In [None]:
transportation.Year.unique()

In [None]:
# clean up median income data frame
# NOTE(review): medianIncome is not assigned in any visible cell before this line, so
# the cell fails on a fresh kernel — the cell that built it appears to be missing
# give the S1903_C01_001E and zip-code columns readable names
medianIncome=medianIncome.rename({'S1903_C01_001E':'Household Median Income','zip code tabulation area':'Zip Code'}, axis=1)
# drop the columns at positions 0 and 2 (positional, so it depends on the current column order)
medianIncome.drop(medianIncome.columns[[0,2]], axis=1, inplace = True)
medianIncome

In [None]:
# take the "Zip Code" column out and put it back as the first column
zip_code_column = medianIncome.pop('Zip Code')
medianIncome.insert(0, 'Zip Code', zip_code_column)
medianIncome

In [None]:
# renumber the rows 0..n-1, discarding the old index
medianIncome = medianIncome.reset_index(drop=True)
medianIncome

In [None]:
# save the median income table to csv format (the row index is not written)
medianIncome.to_csv("median_income.csv",index=False)

In [None]:
# 2019: pull hispanic or latino origin by race (ACS 5-year group B03002)
# comma-separated ZCTA list (no stray space after the first comma)
zipcode='98125,98115,98105,98195,98102,98101,98104,98134,98144,98108,98118,98188'
# the key comes from api_key rather than being repeated in the URL
data_url = f'https://api.census.gov/data/2019/acs/acs5?get=NAME,group(B03002)&for=zip%20code%20tabulation%20area:{zipcode}&in=state:53&key={api_key}'
# a timeout keeps a stalled request from hanging the notebook
race_response=requests.get(data_url, timeout=60)
# stop here on an HTTP error instead of failing later while decoding JSON
race_response.raise_for_status()
# preview only
print(race_response.text[:500])

In [None]:
# turn the 2019 JSON payload into a dataframe:
# the first row of the response is the header, every following row is a record
data = race_response.json()
header, rows = data[0], data[1:]
race_data = pd.DataFrame(rows, columns=header)
race_data.head()

In [None]:
# add year column to 2019 data
# NOTE(review): this label is the string '2019', while the yearly loop below labels
# rows with the integer `year`; the Year column is later used as a merge key, so
# confirm which type the other side of that merge carries before changing it
race_data['Year']='2019'
race_data

In [None]:
# 2011-2018: build one B03002 request URL per year in `years` for every ZCTA in state 53
# (2019 is requested separately)
# a single f-string per URL; the key comes from api_key rather than being repeated here
allUrlsRace = [f'https://api.census.gov/data/{y}/acs/acs5?get=NAME,group(B03002)&for=zip%20code%20tabulation%20area:*&in=state:53&key={api_key}' for y in years]

In [None]:
# pull 2011-2018 race data into one dataframe per year
zipcode=['98125', '98115','98105','98195','98102','98101','98104','98134','98144','98108','98118','98188']
allDataFrames=[]
# the URLs are labelled in order starting at 2011, so enumerate() supplies the year
for year, url in enumerate(allUrlsRace, start=2011):
    # a timeout keeps a stalled request from hanging the notebook
    race_response=requests.get(url, timeout=120)
    race_response.raise_for_status()
    data=race_response.json()
    # separate name on purpose: rebinding race_data here would throw away the 2019
    # frame, and the later append would then stack the last year twice
    race_year=pd.DataFrame(data[1:], columns=data[0])
    race_year['Year']=year
    # the request covers every ZCTA in the state; keep only the station zip codes
    race_year=race_year[race_year['zip code tabulation area'].isin(zipcode)]
    allDataFrames.append(race_year)

In [None]:
# preview of the yearly frames stacked together; the result is only displayed, not stored
pd.concat(allDataFrames, ignore_index=True)

In [None]:
# append all years of data
# DataFrame.append was removed in pandas 2.0; pd.concat over the same frames in
# the same order gives the same stacked result
race_data=pd.concat([race_data, *allDataFrames], ignore_index=True)
race_data

In [None]:
race_data.Year.unique()

In [None]:
# collect every column whose name matches one of the unwanted patterns, then remove them
unwanted = race_data.filter(regex='EA|M|NAME|state|GEO_ID|_002|01[01]|01[3-9]|02[0-1]').columns
race_data.drop(columns=unwanted, inplace=True)
race_data

In [None]:
# readable names for the zip-code column and the B03002 estimate columns
column_names = {
    'zip code tabulation area': 'Zip Code',
    'B03002_001E': 'TotalPopulation',
    'B03002_003E': 'White alone',
    'B03002_004E': 'Black or African American alone',
    'B03002_005E': 'American Indian and Alaska Native alone',
    'B03002_006E': 'Asian alone',
    'B03002_007E': 'Native Hawaiian and Other Pacific Islander alone',
    'B03002_008E': 'Some other race alone',
    'B03002_009E': 'Two or more races alone',
    'B03002_012E': 'Hispanic or Latino',
}
race = race_data.rename(column_names, axis=1)
race

In [None]:
race['TotalPopulation'].apply(pd.to_numeric)

In [None]:
# columns holding counts, in the order they should appear in the rebuilt frame
count_columns = [
    'TotalPopulation',
    'White alone',
    'Hispanic or Latino',
    'Black or African American alone',
    'American Indian and Alaska Native alone',
    'Asian alone',
    'Native Hawaiian and Other Pacific Islander alone',
    'Some other race alone',
    'Two or more races alone',
]

# rebuild the frame: Year and Zip Code are carried over as they are, every count
# column is converted value by value with pd.to_numeric
race = pd.DataFrame({
    'Year': race['Year'],
    'Zip Code': race['Zip Code'],
    **{column: race[column].apply(pd.to_numeric) for column in count_columns},
})

In [None]:
# group columns to express as a share of TotalPopulation, in output order
share_columns = [
    'White alone',
    'Hispanic or Latino',
    'Black or African American alone',
    'American Indian and Alaska Native alone',
    'Asian alone',
    'Native Hawaiian and Other Pacific Islander alone',
    'Some other race alone',
    'Two or more races alone',
]

# rebuild the frame: Year, Zip Code and TotalPopulation are carried over as they
# are, every group count is divided by the row's TotalPopulation
# (running this cell twice would divide the shares again)
race = pd.DataFrame({
    'Year': race['Year'],
    'Zip Code': race['Zip Code'],
    'TotalPopulation': race['TotalPopulation'],
    **{column: race[column].div(race.TotalPopulation, axis=0) for column in share_columns},
})
race

In [None]:
# take the "Zip Code" column out and put it back as the first column
zip_code_column = race.pop('Zip Code')
race.insert(0, 'Zip Code', zip_code_column)
race

In [None]:
# renumber the rows 0..n-1, discarding the old index
race = race.reset_index(drop=True)
race

In [None]:
# save the race share table to csv format (the row index is not written)
race.to_csv("race.csv",index=False)

# Data Integration

## Merge ACS Data

In [None]:
medianIncome.columns

In [None]:
# merge race & median income data
# default (inner) join on the pair (Year, Zip Code): only combinations present in both frames are kept
# NOTE(review): the keys match only when Year and Zip Code have the same type on both
# sides; race['Year'] mixes the string '2019' with integer years — confirm medianIncome does too
acsData=race.merge(medianIncome,left_on=['Year','Zip Code'],right_on=['Year','Zip Code'])
acsData

In [None]:
# save the merged ACS table to csv format (the row index is not written)
acsData.to_csv("acsData.csv",index=False)

## Merge Light Rail Link Stations

In [32]:
# link to station wiki data
linkStationWiki='https://raw.githubusercontent.com/oeuyown/computational-thinking/main/deliverable_1/stationVars.csv'
stationWiki=pd.read_csv(linkStationWiki)

# link to station zips
linkStationZips='https://raw.githubusercontent.com/oeuyown/computational-thinking/main/deliverable_1/station_zips.csv'
stationZips=pd.read_csv(linkStationZips)

# link to acs data
linkACS='https://raw.githubusercontent.com/oeuyown/computational-thinking/main/deliverable_1/acsData.csv'
acsMerge=pd.read_csv(linkACS)

In [33]:
# check current columns
stationWiki.columns

Index(['Station', 'Opened', 'Weekday ridership', 'Opened Year'], dtype='object')

In [34]:
# add the " Station" suffix so the names line up with the `station` column of stationZips
# (running this cell twice would add the suffix twice)
station_names = stationWiki['Station'].astype(str)
stationWiki['Station'] = station_names + ' Station'
stationWiki

Unnamed: 0,Station,Opened,Weekday ridership,Opened Year
0,Angle Lake Station,"September 24, 2016",3194,2016
1,Beacon Hill Station,"July 18, 2009",2675,2009
2,Capitol Hill Station,"March 19, 2016",7116,2016
3,Columbia City Station,"July 18, 2009",2358,2009
4,Commerce Street/S 11th St Station,"September 15, 2011",1051,2011
5,Convention Center/S 15th St Station,"August 23, 2003",564,2003
6,International District/Chinatown Station,"July 18, 2009",5233,2009
7,Mount Baker Station,"July 18, 2009",2237,2009
8,Northgate Station,"October 2, 2021",—,2021
9,Othello Station,"July 18, 2009",2307,2009


In [35]:
# shorten the scraped Mount Baker name (still carrying the "&amp;" entity) to the
# form used in stationWiki
stationZips = stationZips.replace(
    'Mount Baker Station &amp; Transit Center',
    'Mount Baker Station',
    regex=True,
)
stationZips

Unnamed: 0,station,zip
0,Northgate Station,98125
1,Roosevelt Station,98115
2,U District Station,98105
3,University of Washington Station,98195
4,Capitol Hill Station,98102
5,Westlake Station,98101
6,University Street Station,98101
7,Pioneer Square Station,98104
8,International District/Chinatown Station,98104
9,Stadium Station,98134


In [36]:
# merge station zips to station wiki data
stationMerge=stationWiki.merge(stationZips,left_on="Station",right_on="station",how='outer',indicator='True')
stationMerge

Unnamed: 0,Station,Opened,Weekday ridership,Opened Year,station,zip,True
0,Angle Lake Station,"September 24, 2016",3194,2016,Angle Lake Station,98188.0,both
1,Beacon Hill Station,"July 18, 2009",2675,2009,Beacon Hill Station,98144.0,both
2,Capitol Hill Station,"March 19, 2016",7116,2016,Capitol Hill Station,98102.0,both
3,Columbia City Station,"July 18, 2009",2358,2009,Columbia City Station,98108.0,both
4,Commerce Street/S 11th St Station,"September 15, 2011",1051,2011,,,left_only
5,Convention Center/S 15th St Station,"August 23, 2003",564,2003,,,left_only
6,International District/Chinatown Station,"July 18, 2009",5233,2009,International District/Chinatown Station,98104.0,both
7,Mount Baker Station,"July 18, 2009",2237,2009,Mount Baker Station,98144.0,both
8,Northgate Station,"October 2, 2021",—,2021,Northgate Station,98125.0,both
9,Othello Station,"July 18, 2009",2307,2009,Othello Station,98118.0,both


In [37]:
# drop left_only from merge (not line 1 stops)
stationMerge=stationWiki.merge(stationZips,left_on="Station",right_on="station")
stationMerge

Unnamed: 0,Station,Opened,Weekday ridership,Opened Year,station,zip
0,Angle Lake Station,"September 24, 2016",3194,2016,Angle Lake Station,98188
1,Beacon Hill Station,"July 18, 2009",2675,2009,Beacon Hill Station,98144
2,Capitol Hill Station,"March 19, 2016",7116,2016,Capitol Hill Station,98102
3,Columbia City Station,"July 18, 2009",2358,2009,Columbia City Station,98108
4,International District/Chinatown Station,"July 18, 2009",5233,2009,International District/Chinatown Station,98104
5,Mount Baker Station,"July 18, 2009",2237,2009,Mount Baker Station,98144
6,Northgate Station,"October 2, 2021",—,2021,Northgate Station,98125
7,Othello Station,"July 18, 2009",2307,2009,Othello Station,98118
8,Pioneer Square Station,"July 18, 2009",4015,2009,Pioneer Square Station,98104
9,Rainier Beach Station,"July 18, 2009",1858,2009,Rainier Beach Station,98118


In [38]:
# move zip code to first column
stationMerge.insert(0, 'zip', stationMerge.pop('zip'))
stationMerge

Unnamed: 0,zip,Station,Opened,Weekday ridership,Opened Year,station
0,98188,Angle Lake Station,"September 24, 2016",3194,2016,Angle Lake Station
1,98144,Beacon Hill Station,"July 18, 2009",2675,2009,Beacon Hill Station
2,98102,Capitol Hill Station,"March 19, 2016",7116,2016,Capitol Hill Station
3,98108,Columbia City Station,"July 18, 2009",2358,2009,Columbia City Station
4,98104,International District/Chinatown Station,"July 18, 2009",5233,2009,International District/Chinatown Station
5,98144,Mount Baker Station,"July 18, 2009",2237,2009,Mount Baker Station
6,98125,Northgate Station,"October 2, 2021",—,2021,Northgate Station
7,98118,Othello Station,"July 18, 2009",2307,2009,Othello Station
8,98104,Pioneer Square Station,"July 18, 2009",4015,2009,Pioneer Square Station
9,98118,Rainier Beach Station,"July 18, 2009",1858,2009,Rainier Beach Station


## Merge ACS Data to Link Data

In [None]:
acsMerge

In [None]:
# merge acs data with link data
finalMerge=acsMerge.merge(stationMerge,left_on="Zip Code",right_on="zip",how='outer',indicator=True)
finalMerge.head()

In [None]:
# check merge outputs
finalMerge['_merge'].value_counts()

In [None]:
# drop and update the data frame
columnsToDrop = [16,17,18]
finalMerge.drop(labels=finalMerge.columns[columnsToDrop],axis=1,inplace=True)
finalMerge.head()

In [None]:
# reorder columns
finalMerge = finalMerge[["Zip Code", "Station", "Opened", "Opened Year","Year","Weekday ridership", "Household Median Income", "Hispanic or Latino", "White alone", "Black or African American alone", "Asian alone", "American Indian and Alaska Native alone", "Native Hawaiian and Other Pacific Islander alone", "Some other race alone", "Two or more races alone"]]
finalMerge

In [None]:
# sort by station name and renumber the rows
# sort_values returns a new frame; its result used to be discarded, so the frame
# was never actually sorted
finalMerge = finalMerge.sort_values('Station').reset_index(drop=True)
finalMerge

In [None]:
# save the final merged table to csv format (the row index is not written)
finalMerge.to_csv("finalMerge.csv",index=False)

In [None]:
# for future use in Python: pickle keeps the frame together with its column dtypes
finalMerge.to_pickle("finalMerge_OK.pkl")

In [None]:
# read the pickle back and list its columns and dtypes to check the round trip
# (only unpickle files this notebook wrote itself: loading a pickle can execute code)
finalMerge_OK=pd.read_pickle("finalMerge_OK.pkl")
finalMerge_OK.info()

In [None]:
# save the merged frame as an RDS file so it can be loaded from R
# NOTE(review): the install below is unpinned; consider `%pip install` with a version
!pip install rpy2

from rpy2.robjects import pandas2ri
# turn on rpy2's pandas conversion so finalMerge can be handed to an R function
pandas2ri.activate()

from rpy2.robjects.packages import importr

# handle on R's `base` package, used here for its saveRDS function
base = importr('base')
base.saveRDS(finalMerge,file="finalMerge_OK.RDS")

## Merge Equity Data to Link Data

In [29]:
# raw-file URLs: station wiki table, station zip codes, Seattle equity data
linkStationWiki='https://raw.githubusercontent.com/oeuyown/computational-thinking/main/deliverable_1/stationVars.csv'
linkStationZips='https://raw.githubusercontent.com/oeuyown/computational-thinking/main/deliverable_1/station_zips.csv'
linkEquity='https://raw.githubusercontent.com/oeuyown/computational-thinking/main/deliverable_1/data/equityData.csv'

# read each CSV into its own data frame
stationWiki=pd.read_csv(linkStationWiki)
stationZips=pd.read_csv(linkStationZips)
equityData=pd.read_csv(linkEquity)

In [30]:
# check the column types of the equity data (ZipCode is the merge key used below)
equityData.dtypes

ZipCode                     object
CompositePercentile        float64
RacePercentile             float64
SocioeconomicPercentile    float64
dtype: object

In [39]:
# check the column types of the station table (zip is the merge key used below)
stationMerge.dtypes

zip                   int64
Station              object
Opened               object
Weekday ridership    object
Opened Year           int64
station              object
dtype: object

In [40]:
# equityData's ZipCode is an object column while zip here is int64,
# so convert zip to strings to give both merge keys the same type
stationMerge['zip']=stationMerge['zip'].astype(str).astype(object)

In [41]:
# outer-join the equity percentiles to the station table on zip code;
# indicator=True adds a `_merge` column recording each row's origin
finalMerge=pd.merge(
    equityData,
    stationMerge,
    how='outer',
    left_on="ZipCode",
    right_on="zip",
    indicator=True,
)
finalMerge.head()

Unnamed: 0,ZipCode,CompositePercentile,RacePercentile,SocioeconomicPercentile,zip,Station,Opened,Weekday ridership,Opened Year,station,_merge
0,98101,0.610571,0.602,0.597714,98101,University Street Station,"July 18, 2009",5284,2009.0,University Street Station,both
1,98101,0.610571,0.602,0.597714,98101,Westlake Station,"July 18, 2009",10096,2009.0,Westlake Station,both
2,98102,0.281,0.3865,0.333833,98102,Capitol Hill Station,"March 19, 2016",7116,2016.0,Capitol Hill Station,both
3,98104,0.89525,0.854,0.9045,98104,International District/Chinatown Station,"July 18, 2009",5233,2009.0,International District/Chinatown Station,both
4,98104,0.89525,0.854,0.9045,98104,Pioneer Square Station,"July 18, 2009",4015,2009.0,Pioneer Square Station,both


In [42]:
# display the full merged frame to inspect which rows matched
finalMerge

Unnamed: 0,ZipCode,CompositePercentile,RacePercentile,SocioeconomicPercentile,zip,Station,Opened,Weekday ridership,Opened Year,station,_merge
0,98101,0.610571,0.602,0.597714,98101.0,University Street Station,"July 18, 2009",5284,2009.0,University Street Station,both
1,98101,0.610571,0.602,0.597714,98101.0,Westlake Station,"July 18, 2009",10096,2009.0,Westlake Station,both
2,98102,0.281,0.3865,0.333833,98102.0,Capitol Hill Station,"March 19, 2016",7116,2016.0,Capitol Hill Station,both
3,98104,0.89525,0.854,0.9045,98104.0,International District/Chinatown Station,"July 18, 2009",5233,2009.0,International District/Chinatown Station,both
4,98104,0.89525,0.854,0.9045,98104.0,Pioneer Square Station,"July 18, 2009",4015,2009.0,Pioneer Square Station,both
5,98105,0.4652,0.5828,0.5042,98105.0,U District Station,"October 2, 2021",—,2021.0,U District Station,both
6,98108,0.922,0.952,0.926286,98108.0,Columbia City Station,"July 18, 2009",2358,2009.0,Columbia City Station,both
7,98115,0.305846,0.387462,0.299692,98115.0,Roosevelt Station,"October 2, 2021",—,2021.0,Roosevelt Station,both
8,98118,0.8725,0.870167,0.8125,98118.0,Othello Station,"July 18, 2009",2307,2009.0,Othello Station,both
9,98118,0.8725,0.870167,0.8125,98118.0,Rainier Beach Station,"July 18, 2009",1858,2009.0,Rainier Beach Station,both


In [54]:
# keep only rows that matched in both tables, using the `_merge` indicator
# instead of hard-coded row labels (which depend on the merge's row order and
# raise a KeyError if the cell is re-run).
# NOTE(review): presumably rows 16-19 were the unmatched rows -- the saved
# frame has 16 fully non-null rows; confirm against the merge output.
# .copy() gives an independent frame so later cells can modify it freely
finalMerge=finalMerge.loc[finalMerge['_merge'] == 'both'].copy()

In [44]:
# drop the duplicate merge key (`zip`) and the merge indicator (`_merge`);
# these are the columns at positions 4 and 10 of the merged frame, named
# explicitly so a change in column order or a re-run cannot remove the
# wrong columns
columnsToDrop = ['zip', '_merge']
finalMerge = finalMerge.drop(columns=columnsToDrop)
finalMerge.head()

Unnamed: 0,ZipCode,CompositePercentile,RacePercentile,SocioeconomicPercentile,Station,Opened,Weekday ridership,Opened Year,station
0,98101,0.610571,0.602,0.597714,University Street Station,"July 18, 2009",5284,2009.0,University Street Station
1,98101,0.610571,0.602,0.597714,Westlake Station,"July 18, 2009",10096,2009.0,Westlake Station
2,98102,0.281,0.3865,0.333833,Capitol Hill Station,"March 19, 2016",7116,2016.0,Capitol Hill Station
3,98104,0.89525,0.854,0.9045,International District/Chinatown Station,"July 18, 2009",5233,2009.0,International District/Chinatown Station
4,98104,0.89525,0.854,0.9045,Pioneer Square Station,"July 18, 2009",4015,2009.0,Pioneer Square Station


In [49]:
# convert the opening year to numeric, downcast to a smaller float dtype
finalMerge['Opened Year'] = finalMerge['Opened Year'].pipe(pd.to_numeric, downcast='float')
finalMerge

Unnamed: 0,ZipCode,CompositePercentile,RacePercentile,SocioeconomicPercentile,Station,Opened,Weekday ridership,Opened Year,station
0,98101,0.610571,0.602,0.597714,University Street Station,"July 18, 2009",5284,2009.0,University Street Station
1,98101,0.610571,0.602,0.597714,Westlake Station,"July 18, 2009",10096,2009.0,Westlake Station
2,98102,0.281,0.3865,0.333833,Capitol Hill Station,"March 19, 2016",7116,2016.0,Capitol Hill Station
3,98104,0.89525,0.854,0.9045,International District/Chinatown Station,"July 18, 2009",5233,2009.0,International District/Chinatown Station
4,98104,0.89525,0.854,0.9045,Pioneer Square Station,"July 18, 2009",4015,2009.0,Pioneer Square Station
5,98105,0.4652,0.5828,0.5042,U District Station,"October 2, 2021",—,2021.0,U District Station
6,98108,0.922,0.952,0.926286,Columbia City Station,"July 18, 2009",2358,2009.0,Columbia City Station
7,98115,0.305846,0.387462,0.299692,Roosevelt Station,"October 2, 2021",—,2021.0,Roosevelt Station
8,98118,0.8725,0.870167,0.8125,Othello Station,"July 18, 2009",2307,2009.0,Othello Station
9,98118,0.8725,0.870167,0.8125,Rainier Beach Station,"July 18, 2009",1858,2009.0,Rainier Beach Station


In [55]:
# keep only the listed columns, in this order: station details first,
# then ridership and the three equity percentiles
finalMerge = finalMerge[
    [
        "ZipCode",
        "Station",
        "Opened",
        "Opened Year",
        "Weekday ridership",
        "CompositePercentile",
        "RacePercentile",
        "SocioeconomicPercentile",
    ]
]
finalMerge

Unnamed: 0,ZipCode,Station,Opened,Opened Year,Weekday ridership,CompositePercentile,RacePercentile,SocioeconomicPercentile
0,98101,University Street Station,"July 18, 2009",2009.0,5284,0.610571,0.602,0.597714
1,98101,Westlake Station,"July 18, 2009",2009.0,10096,0.610571,0.602,0.597714
2,98102,Capitol Hill Station,"March 19, 2016",2016.0,7116,0.281,0.3865,0.333833
3,98104,International District/Chinatown Station,"July 18, 2009",2009.0,5233,0.89525,0.854,0.9045
4,98104,Pioneer Square Station,"July 18, 2009",2009.0,4015,0.89525,0.854,0.9045
5,98105,U District Station,"October 2, 2021",2021.0,—,0.4652,0.5828,0.5042
6,98108,Columbia City Station,"July 18, 2009",2009.0,2358,0.922,0.952,0.926286
7,98115,Roosevelt Station,"October 2, 2021",2021.0,—,0.305846,0.387462,0.299692
8,98118,Othello Station,"July 18, 2009",2009.0,2307,0.8725,0.870167,0.8125
9,98118,Rainier Beach Station,"July 18, 2009",2009.0,1858,0.8725,0.870167,0.8125


In [57]:
# write the merged table to a CSV file, leaving out the row index
finalMerge.to_csv(path_or_buf="finalMerge.csv", index=False)

In [58]:
# pickle the merged table so later Python sessions can reload it with dtypes intact
finalMerge.to_pickle(path="finalMerge_OK.pkl")

In [59]:
# reload the pickle just written and list its columns, non-null counts and dtypes
finalMerge_OK = pd.read_pickle(filepath_or_buffer="finalMerge_OK.pkl")
finalMerge_OK.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ZipCode                  16 non-null     object 
 1   Station                  16 non-null     object 
 2   Opened                   16 non-null     object 
 3   Opened Year              16 non-null     float32
 4   Weekday ridership        16 non-null     object 
 5   CompositePercentile      16 non-null     float64
 6   RacePercentile           16 non-null     float64
 7   SocioeconomicPercentile  16 non-null     float64
dtypes: float32(1), float64(3), object(4)
memory usage: 1.1+ KB


In [60]:
## Merge ACS Data to Link Data# save to r
!pip install rpy2

from rpy2.robjects import pandas2ri
pandas2ri.activate()

from rpy2.robjects.packages import importr

base = importr('base')
base.saveRDS(finalMerge,file="finalMerge_OK.RDS")



<rpy2.rinterface_lib.sexp.NULLType object at 0x0000022312221300> [RTYPES.NILSXP]