# Data Cleaning

## Light Rail Link Stations

In [1]:
# importing pandas for the notebook
import pandas as pd

# Wikipedia page that lists the Link light rail stations
linkLink = "https://en.wikipedia.org/wiki/List_of_Link_light_rail_stations"

# Parse every <table class="wikitable"> on the page into a list of DataFrames,
# taking the first row of each table as its header.
stationsData = pd.read_html(
    linkLink,
    header=0,
    flavor="bs4",
    attrs={'class': "wikitable"},
)

In [2]:
# work on a copy of the second parsed table so the fetched list itself stays untouched
stationVars=stationsData[1].copy()

In [3]:
# preview the first rows of the station table
stationVars.head()

Unnamed: 0,Station,Line[1],Location[2],Opened,Weekday ridership[n 1]
0,Angle Lake †,Line 1,SeaTac,"September 24, 2016[12]",3194
1,Beacon Hill,Line 1,"Beacon Hill, Seattle","July 18, 2009[8]",2675
2,Capitol Hill,Line 1,"Capitol Hill, Seattle","March 19, 2016[41]",7116
3,Columbia City,Line 1,"Columbia City, Seattle","July 18, 2009[8]",2358
4,Commerce Street/S 11th St,Line T,Downtown Tacoma,"September 15, 2011[10]",1051


In [4]:
# display the station table for a broader look than head() gives
stationVars

Unnamed: 0,Station,Line[1],Location[2],Opened,Weekday ridership[n 1]
0,Angle Lake †,Line 1,SeaTac,"September 24, 2016[12]",3194
1,Beacon Hill,Line 1,"Beacon Hill, Seattle","July 18, 2009[8]",2675
2,Capitol Hill,Line 1,"Capitol Hill, Seattle","March 19, 2016[41]",7116
3,Columbia City,Line 1,"Columbia City, Seattle","July 18, 2009[8]",2358
4,Commerce Street/S 11th St,Line T,Downtown Tacoma,"September 15, 2011[10]",1051
5,Convention Center/S 15th St,Line T,Downtown Tacoma,"August 23, 2003[7]",564
6,International District/Chinatown[n 2],Line 1,"Chinatown-International District, Seattle","July 18, 2009[8][n 3]",5233
7,Mount Baker,Line 1,"Mount Baker, Seattle","July 18, 2009[8]",2237
8,Northgate †,Line 1,"Northgate, Seattle","October 2, 2021[13]",—
9,Othello,Line 1,"NewHolly, Seattle","July 18, 2009[8]",2307


In [5]:
# Drop the "Line" and "Location" columns.
# They are selected by name prefix instead of by position: the scraped headers
# carry footnote markers (e.g. "Line[1]"), and a positional in-place drop would
# remove two more columns every time this cell is re-run.
columnsToDrop = [
    col for col in stationVars.columns
    if str(col).startswith(("Line", "Location"))
]

# drop and update the data frame (a no-op once the columns are gone)
stationVars.drop(columns=columnsToDrop, inplace=True)

In [6]:
# confirm which columns remain after the drop
stationVars.columns

Index(['Station', 'Opened', 'Weekday ridership[n 1]'], dtype='object')

In [7]:
# Remove footnote markers such as "[1]" or "[n 1]" from the column names.
# The pattern is a raw string because "\[" inside a normal string literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning from Python 3.12).
# Any footnote number is matched, so the cell still works if the page renumbers its notes.
import re
stationVars.columns = stationVars.columns.str.replace(r"\[(?:n )?\d+\]", "", regex=True)

In [8]:
# see current columns (the bracketed footnote markers should no longer appear)
stationVars.columns

Index(['Station', 'Opened', 'Weekday ridership'], dtype='object')

In [9]:
# check how often each station name occurs (a count above 1 would reveal a duplicate row)
stationVars['Station'].value_counts()

Angle Lake †                             1
SeaTac/Airport                           1
University Street                        1
University of Washington                 1
Union Station/S 19th St                  1
U District                               1
Tukwila International Boulevard          1
Theater District/S 9th St †              1
Tacoma Dome †                            1
Stadium                                  1
S 25th St                                1
SODO                                     1
Roosevelt                                1
Beacon Hill                              1
Rainier Beach                            1
Pioneer Square                           1
Othello                                  1
Northgate †                              1
Mount Baker                              1
International District/Chinatown[n 2]    1
Convention Center/S 15th St              1
Commerce Street/S 11th St                1
Columbia City                            1
Capitol Hil

In [10]:
# Remove the " †" marker and bracketed footnotes such as "[8]" or "[n 3]" from
# every string cell. The pattern is a raw string: "\[" in a normal string
# literal is an invalid escape sequence and triggers a warning on current Pythons.
stationVars_updated = stationVars.replace(
    to_replace=r'( †)|\[(n )?[0-9]?[0-9]\]',
    value='',
    regex=True,
)

# Print the updated dataframe
print(stationVars_updated)

                             Station              Opened Weekday ridership
0                         Angle Lake  September 24, 2016              3194
1                        Beacon Hill       July 18, 2009              2675
2                       Capitol Hill      March 19, 2016              7116
3                      Columbia City       July 18, 2009              2358
4          Commerce Street/S 11th St  September 15, 2011              1051
5        Convention Center/S 15th St     August 23, 2003               564
6   International District/Chinatown       July 18, 2009              5233
7                        Mount Baker       July 18, 2009              2237
8                          Northgate     October 2, 2021                 —
9                            Othello       July 18, 2009              2307
10                    Pioneer Square       July 18, 2009              4015
11                     Rainier Beach       July 18, 2009              1858
12                       

In [11]:
# write the cleaned station table to disk without the row index
stationVars_updated.to_csv(
    "stationVars.csv",
    index=False,
)

## Station Zip Codes

In [12]:
# extract address from site
import requests
from bs4 import BeautifulSoup

def getdata(url, timeout=30):
    """Fetch ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    str
        Decoded body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never parsed as if it were station data.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

# Download and parse the Northgate station page.
northgate_url = "https://www.soundtransit.org/ride-with-us/stops-stations/northgate-station"
htmldata = getdata(northgate_url)
soup = BeautifulSoup(htmldata, 'html.parser')

# Keep the text of the first <p> element found on the page.
paragraphs = soup.find_all('p')
data = paragraphs[0].text

In [13]:
# set pandas display option
pd.set_option('display.max_colwidth', None)

# For the scrape:
from bs4 import BeautifulSoup as BShtml
import urllib.request as ur

# Sound Transit station pages to scrape
urls = [
    'https://www.soundtransit.org/ride-with-us/stops-stations/northgate-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/roosevelt-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/u-district-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/university-washington-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/capitol-hill-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/westlake-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/university-street-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/pioneer-square-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/international-district-chinatown-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/stadium-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/sodo-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/beacon-hill-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/mount-baker-station-transit-center',
    'https://www.soundtransit.org/ride-with-us/stops-stations/columbia-city-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/othello-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/rainier-beach-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/tukwila-international-boulevard-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/seatac-airport-station',
    'https://www.soundtransit.org/ride-with-us/stops-stations/angle-lake-station',
]

# Collect one single-row frame per station card: the name <span> elements go in
# "station" and the first <p> of the card (raw, still containing markup) in "zip".
station_frames = []
for url in urls:
    # the context manager closes the HTTP response once the page has been read;
    # the timeout keeps a stalled server from hanging the notebook
    with ur.urlopen(url, timeout=30) as response:
        r = response.read()
    soup = BShtml(r, "html.parser")
    for post in soup.find_all('div', class_="station-map-card-content sidebar-container__content"):
        station_frames.append(pd.DataFrame({
            "station":[post.find_all("span", class_="station-map-card-name")],
            "zip":[post.find_all('p')[0]]
        }))

# DataFrame.append was removed in pandas 2.0, and growing a frame inside the loop
# is quadratic; concatenate once instead. Every row keeps index 0, exactly as the
# per-row append produced.
if station_frames:
    station_zips = pd.concat(station_frames)
else:
    # nothing scraped: fall back to the empty two-column frame
    station_zips = pd.DataFrame({"station":[], "zip":[]})
station_zips

Unnamed: 0,station,zip
0,[[Northgate Station]],"[\n, [Northgate Station], \n, [], \n 10200 1st Ave NE\n , [], \n Seattle, WA 98125\n ]"
0,[[Roosevelt Station]],"[\n, [Roosevelt Station], \n, [], \n 6501 12th Ave NE\n , [], \n Seattle, WA 98115\n ]"
0,[[U District Station]],"[\n, [U District Station], \n, [], \n 4300 Brooklyn Ave NE\n , [], \n Seattle, WA 98105\n ]"
0,[[University of Washington Station]],"[\n, [University of Washington Station], \n, [], \n 3720 Montlake Blvd NE\n , [], \n Seattle, WA 98195\n ]"
0,[[Capitol Hill Station]],"[\n, [Capitol Hill Station], \n, [], \n 140 Broadway E\n , [], \n Seattle, WA 98102\n ]"
0,[[Westlake Station]],"[\n, [Westlake Station], \n, [], \n 4th Ave & Pine St\n , [], \n Seattle, WA 98101\n ]"
0,[[University Street Station]],"[\n, [University Street Station], \n, [], \n 3rd Ave & Seneca St\n , [], \n Seattle, WA 98101\n ]"
0,[[Pioneer Square Station]],"[\n, [Pioneer Square Station], \n, [], \n 3rd Ave & James St\n , [], \n Seattle, WA 98104\n ]"
0,[[International District/Chinatown Station]],"[\n, [International District/Chinatown Station], \n, [], \n 5th Ave and S Jackson St\n , [], \n Seattle, WA 98104\n ]"
0,[[Stadium Station]],"[\n, [Stadium Station], \n, [], \n 501 S Royal Brougham Way\n , [], \n Seattle, WA 98134\n ]"


In [14]:
# cast both scraped columns to plain strings
for column in ('station', 'zip'):
    station_zips[column] = station_zips[column].astype('str')

In [15]:
# remove html tags from strings
def remove_tags(string):
    """Return ``string`` with every ``<...>`` tag removed.

    Matching is non-greedy and does not cross line breaks, so a "<" whose
    closing ">" sits on a later line is left in place. HTML entities such
    as ``&amp;`` are not decoded.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', string)
# strip the markup from both columns, one cell at a time
station_zips['station'] = station_zips['station'].apply(remove_tags)
station_zips['zip'] = station_zips['zip'].apply(remove_tags)
station_zips

Unnamed: 0,station,zip
0,[Northgate Station],"\nNorthgate Station\n\n 10200 1st Ave NE\n \n Seattle, WA 98125\n"
0,[Roosevelt Station],"\nRoosevelt Station\n\n 6501 12th Ave NE\n \n Seattle, WA 98115\n"
0,[U District Station],"\nU District Station\n\n 4300 Brooklyn Ave NE\n \n Seattle, WA 98105\n"
0,[University of Washington Station],"\nUniversity of Washington Station\n\n 3720 Montlake Blvd NE\n \n Seattle, WA 98195\n"
0,[Capitol Hill Station],"\nCapitol Hill Station\n\n 140 Broadway E\n \n Seattle, WA 98102\n"
0,[Westlake Station],"\nWestlake Station\n\n 4th Ave &amp; Pine St\n \n Seattle, WA 98101\n"
0,[University Street Station],"\nUniversity Street Station\n\n 3rd Ave &amp; Seneca St\n \n Seattle, WA 98101\n"
0,[Pioneer Square Station],"\nPioneer Square Station\n\n 3rd Ave &amp; James St\n \n Seattle, WA 98104\n"
0,[International District/Chinatown Station],"\nInternational District/Chinatown Station\n\n 5th Ave and S Jackson St\n \n Seattle, WA 98104\n"
0,[Stadium Station],"\nStadium Station\n\n 501 S Royal Brougham Way\n \n Seattle, WA 98134\n"


In [16]:
# clean up row values
# Patterns are raw strings: "\[" in a normal string literal is an invalid escape sequence.
# strip the literal "[" and "]" characters from the station names
station_zips['station'] = station_zips['station'].replace(r'\[|\]', '', regex=True)
# keep only the five digits that directly precede a line break;
# bracket access is used because attribute access (.zip) breaks as soon as a
# column name collides with a DataFrame attribute
station_zips['zip'] = station_zips['zip'].str.extract(r'([0-9]{5})\n')
station_zips

Unnamed: 0,station,zip
0,Northgate Station,98125
0,Roosevelt Station,98115
0,U District Station,98105
0,University of Washington Station,98195
0,Capitol Hill Station,98102
0,Westlake Station,98101
0,University Street Station,98101
0,Pioneer Square Station,98104
0,International District/Chinatown Station,98104
0,Stadium Station,98134


In [17]:
# replace the existing row labels with a fresh 0..n-1 index
station_zips = station_zips.reset_index(drop=True)
station_zips

Unnamed: 0,station,zip
0,Northgate Station,98125
1,Roosevelt Station,98115
2,U District Station,98105
3,University of Washington Station,98195
4,Capitol Hill Station,98102
5,Westlake Station,98101
6,University Street Station,98101
7,Pioneer Square Station,98104
8,International District/Chinatown Station,98104
9,Stadium Station,98134


In [18]:
# write the station / zip table to disk without the row index
station_zips.to_csv(
    "station_zips.csv",
    index=False,
)

## ACS Data Using an API

In [19]:
# install censusdata
!pip install CensusData

SyntaxError: invalid syntax (Temp/ipykernel_31076/2727252223.py, line 2)

In [None]:
import censusdata

In [None]:
# set variables for census data
import os

# The API key is read from the CENSUS_API_KEY environment variable so that it is
# not committed with the notebook; an empty string means "no key available".
# The key that was previously hardcoded here should be treated as leaked and revoked.
api_key = os.environ.get('CENSUS_API_KEY', '')
dsource='zbp'
state='53'
# comma-separated zip code list, without embedded spaces so it can be placed in a URL as-is
zipcode='98125,98115,98105,98195,98102,98101,98104,98134,98144,98108,98118,98188'

In [None]:
# 2019: pull median income in the past 12 months (in inflation-adjusted dollars)
# from ACS 5-year subject table S1903 for the zip codes listed in `zipcode`.
# NOTE(review): verify that S1903_C01_001E is the median-income estimate in the
# 2019 vintage; the C01 columns may hold household counts, with the median under
# another column group. Check the variable list before trusting this column.
# The key comes from `api_key` instead of being written into the URL; the
# parameter is left out entirely when no key is set.
key_param = f'&key={api_key}' if api_key else ''
data_url = f'https://api.census.gov/data/2019/acs/acs5/subject?get=NAME,S1903_C01_001E&for=zip%20code%20tabulation%20area:{zipcode}{key_param}'
income_response=requests.get(data_url, timeout=30)
print(income_response.text)
# fail here, after showing the API's message, rather than later when parsing JSON
income_response.raise_for_status()

In [None]:
# 2018: pull median income in the past 12 months (in inflation-adjusted dollars) by place of birth

# TODO: find a way to pull this data for 2010-2018, when zip codes are requested with a different URL format
#data_url = f'https://api.census.gov/data/2010/acs/acs5/subject?get=NAME,S1903_C01_001E&for=zip%20code%20tabulation%20area:*&in=state:53&key={api_key}'
#income_response=requests.get(data_url)
#print(income_response.text)

In [None]:
# pull median income into dataframe:
# the first element of the JSON payload supplies the column names, the rest the rows
data = income_response.json()
income_rows = data[1:]
income_header = data[0]
medianIncome = pd.DataFrame(income_rows, columns=income_header)
medianIncome.head()

In [None]:
# clean up median income data frame

# give the estimate and zip code columns readable names
medianIncome = medianIncome.rename(
    columns={
        'S1903_C01_001E': 'Household Median Income',
        'zip code tabulation area': 'Zip Code',
    }
)
# Drop the NAME column (the first field requested from the API) by name rather
# than by position, so that re-running this cell does not remove another column.
medianIncome = medianIncome.drop(columns='NAME', errors='ignore')
medianIncome

In [None]:
# take the 'Zip Code' column out and put it back at position 0
income_zip_column = medianIncome.pop('Zip Code')
medianIncome.insert(0, 'Zip Code', income_zip_column)
medianIncome

In [None]:
# write the median income table to disk without the row index
medianIncome.to_csv(
    "median_income.csv",
    index=False,
)

In [None]:
# pull hispanic or latino origin by race (ACS 2019 5-year, variable group B03002)
# for the zip codes listed in `zipcode`, within state 53.
# The key comes from `api_key` instead of being written into the URL; the
# parameter is left out entirely when no key is set.
key_param = f'&key={api_key}' if api_key else ''
data_url = f'https://api.census.gov/data/2019/acs/acs5?get=NAME,group(B03002)&for=zip%20code%20tabulation%20area:{zipcode}&in=state:53{key_param}'
race_response=requests.get(data_url, timeout=30)
print(race_response.text)
# fail here, after showing the API's message, rather than later when parsing JSON
race_response.raise_for_status()

In [None]:
# pull race data into dataframe:
# the first element of the JSON payload supplies the column names, the rest the rows
data = race_response.json()
race_rows = data[1:]
race_header = data[0]
race_data = pd.DataFrame(race_rows, columns=race_header)
race_data.head()

In [None]:
# drop unwanted columns:
# a column goes when its name contains "EA", "M" (which also covers "NAME"),
# "state", "GEO_ID", "_001"/"_002", "010"/"011", "013"-"019" or "020"/"021"
unwanted_columns = race_data.filter(
    regex='EA|M|NAME|state|GEO_ID|_00[1-2]|01[01]|01[3-9]|02[0-1]'
).columns
race_data.drop(columns=unwanted_columns, inplace=True)
race_data

In [None]:
# map the API column codes to readable labels
race_column_names = {
    'zip code tabulation area': 'Zip Code',
    'B03002_003E': 'White alone',
    'B03002_004E': 'Black or African American alone',
    'B03002_005E': 'American Indian and Alaska Native alone',
    'B03002_006E': 'Asian alone',
    'B03002_007E': 'Native Hawaiian and Other Pacific Islander alone',
    'B03002_008E': 'Some other race alone',
    'B03002_009E': 'Two or more races alone',
    'B03002_012E': 'Hispanic or Latino',
}
race = race_data.rename(columns=race_column_names)
race

In [None]:
# take the 'Zip Code' column out and put it back at position 0
race_zip_column = race.pop('Zip Code')
race.insert(0, 'Zip Code', race_zip_column)
race

In [None]:
# write the race table to disk without the row index
race.to_csv(
    "race.csv",
    index=False,
)

# Data Integration

## Merge ACS Data

In [None]:
# list the median income columns before merging
medianIncome.columns

In [None]:
# join race and median income on their shared 'Zip Code' column (inner join by default)
acsData = pd.merge(race, medianIncome, on='Zip Code')
acsData

In [None]:
# write the merged ACS table to disk without the row index
acsData.to_csv(
    "acsData.csv",
    index=False,
)

## Merge Light Rail Link Stations

In [None]:
# raw GitHub links to the three CSV files: station wiki data, station zips, ACS data
linkStationWiki = 'https://raw.githubusercontent.com/oeuyown/computational-thinking/main/stationVars.csv'
linkStationZips = 'https://raw.githubusercontent.com/oeuyown/computational-thinking/main/station_zips.csv'
linkACS = 'https://raw.githubusercontent.com/oeuyown/computational-thinking/main/acsData.csv'

# load each file into its own data frame
stationWiki = pd.read_csv(linkStationWiki)
stationZips = pd.read_csv(linkStationZips)
acsMerge = pd.read_csv(linkACS)

In [None]:
# check current columns of the station wiki data
stationWiki.columns

In [None]:
# Append " Station" to each name, as in the `station` column of the zip table.
# Names that already end in " Station" are left alone, so re-running this cell
# does not produce "... Station Station".
station_names = stationWiki['Station'].astype(str)
stationWiki['Station'] = station_names.where(
    station_names.str.endswith(' Station'),
    station_names + ' Station',
)
stationWiki

In [None]:
# shorten the Mount Baker entry to plain 'Mount Baker Station'
stationZips = stationZips.replace(
    to_replace='Mount Baker Station &amp; Transit Center',
    value='Mount Baker Station',
    regex=True,
)
stationZips

In [None]:
# merge station zips to station wiki data
# Outer join with the merge indicator, to see which stations matched.
# indicator must be the boolean True: the string 'True' names the indicator
# column "True" instead of the standard "_merge".
stationMerge=stationWiki.merge(stationZips,left_on="Station",right_on="station",how='outer',indicator=True)
stationMerge

In [None]:
# redo the merge as the default inner join, which keeps only stations present
# in both tables (drops the left_only rows, i.e. not line 1 stops)
stationMerge = pd.merge(
    stationWiki,
    stationZips,
    left_on="Station",
    right_on="station",
)
stationMerge

In [None]:
# take the 'zip' column out and put it back at position 0
station_zip_column = stationMerge.pop('zip')
stationMerge.insert(0, 'zip', station_zip_column)
stationMerge

## Merge ACS Data to Link Data

In [None]:
# outer-join the ACS table to the station table on zip code;
# the indicator adds a '_merge' column recording where each row came from
finalMerge = pd.merge(
    acsMerge,
    stationMerge,
    left_on="Zip Code",
    right_on="zip",
    how='outer',
    indicator=True,
)
finalMerge.head()

In [None]:
# check merge outputs: count rows per '_merge' category (both / left_only / right_only)
finalMerge['_merge'].value_counts()

In [None]:
# drop and update the data frame
# The duplicate key columns and the merge indicator are dropped by name instead
# of by position (10, 14, 15), so the cell no longer depends on column order
# and can be re-run without removing further columns.
# NOTE(review): the names were derived from the column order built up in the
# cells above; confirm against the CSV files actually loaded.
columnsToDrop = ['zip', 'station', '_merge']
finalMerge.drop(columns=columnsToDrop, errors='ignore', inplace=True)
finalMerge.head()

In [None]:
# put the columns in their final order
final_column_order = [
    "Zip Code",
    "Station",
    "Opened",
    "Weekday ridership",
    "Household Median Income",
    "Hispanic or Latino",
    "White alone",
    "Black or African American alone",
    "Asian alone",
    "American Indian and Alaska Native alone",
    "Native Hawaiian and Other Pacific Islander alone",
    "Some other race alone",
    "Two or more races alone",
]
finalMerge = finalMerge[final_column_order]
finalMerge

In [None]:
# Sort by station name and renumber the rows.
# sort_values returns a new frame; the result must be assigned, otherwise the
# frame stays unsorted and only the index is reset.
finalMerge = finalMerge.sort_values('Station').reset_index(drop=True)
finalMerge

In [None]:
# write the final merged table to disk without the row index
finalMerge.to_csv(
    "finalMerge.csv",
    index=False,
)