# Data Cleaning

## Light Rail Link Stations

In [79]:
# importing pandas for the notebook
import pandas as pd

#link for the tables
linkLink="https://en.wikipedia.org/wiki/List_of_Link_light_rail_stations"

# fetching the tables
stationsData=pd.read_html(linkLink,header=0,flavor="bs4",attrs={'class':"wikitable"})

In [80]:
stationVars=stationsData[1].copy()

In [96]:
stationVars.head()

Unnamed: 0,Station,Opened,Weekday ridership
0,Angle Lake †,"September 24, 2016[12]",3194
1,Beacon Hill,"July 18, 2009[8]",2675
2,Capitol Hill,"March 19, 2016[41]",7116
3,Columbia City,"July 18, 2009[8]",2358
4,Commerce Street/S 11th St,"September 15, 2011[10]",1051


In [97]:
stationVars

Unnamed: 0,Station,Opened,Weekday ridership
0,Angle Lake †,"September 24, 2016[12]",3194
1,Beacon Hill,"July 18, 2009[8]",2675
2,Capitol Hill,"March 19, 2016[41]",7116
3,Columbia City,"July 18, 2009[8]",2358
4,Commerce Street/S 11th St,"September 15, 2011[10]",1051
5,Convention Center/S 15th St,"August 23, 2003[7]",564
6,International District/Chinatown[n 2],"July 18, 2009[8][n 3]",5233
7,Mount Baker,"July 18, 2009[8]",2237
8,Northgate †,"October 2, 2021[13]",—
9,Othello,"July 18, 2009[8]",2307


In [87]:
# drop "line" and "location" columns
columnsToDrop=[1,2]

# drop and update the data frame
stationVars.drop(labels=stationVars.columns[columnsToDrop],axis=1,inplace=True)

In [88]:
stationVars.columns

Index(['Station', 'Opened', 'Weekday ridership[n 1]'], dtype='object')

In [89]:
# remove brackets
import re
stationVars.columns=stationVars.columns.str.replace("\[(n )?1\]","",regex=True)

In [90]:
# see current columns
stationVars.columns

Index(['Station', 'Opened', 'Weekday ridership'], dtype='object')

In [91]:
# check frequnecy of table
stationVars['Station'].value_counts()

Angle Lake †                             1
SeaTac/Airport                           1
University Street                        1
University of Washington                 1
Union Station/S 19th St                  1
U District                               1
Tukwila International Boulevard          1
Theater District/S 9th St †              1
Tacoma Dome †                            1
Stadium                                  1
S 25th St                                1
SODO                                     1
Roosevelt                                1
Beacon Hill                              1
Rainier Beach                            1
Pioneer Square                           1
Othello                                  1
Northgate †                              1
Mount Baker                              1
International District/Chinatown[n 2]    1
Convention Center/S 15th St              1
Commerce Street/S 11th St                1
Columbia City                            1
Capitol Hil

In [98]:
# replace the matching strings
stationVars_updated = stationVars.replace(to_replace ='( †)|\[(n )?[0-9]?[0-9]\]', value = '', regex = True)

# Print the updated dataframe
print(stationVars_updated)

                             Station              Opened Weekday ridership
0                         Angle Lake  September 24, 2016              3194
1                        Beacon Hill       July 18, 2009              2675
2                       Capitol Hill      March 19, 2016              7116
3                      Columbia City       July 18, 2009              2358
4          Commerce Street/S 11th St  September 15, 2011              1051
5        Convention Center/S 15th St     August 23, 2003               564
6   International District/Chinatown       July 18, 2009              5233
7                        Mount Baker       July 18, 2009              2237
8                          Northgate     October 2, 2021                 —
9                            Othello       July 18, 2009              2307
10                    Pioneer Square       July 18, 2009              4015
11                     Rainier Beach       July 18, 2009              1858
12                       

In [103]:
# save to csv format
stationVars_updated.to_csv("stationVars.csv",index=False)

## Station Zip Codes

In [220]:
# extract address from site
import requests

def getdata(url):
    r = requests.get(url)
    return r.text

htmldata = getdata("https://www.soundtransit.org/ride-with-us/stops-stations/northgate-station")
soup = BeautifulSoup(htmldata, 'html.parser')
data = soup.find_all('p')[0].text

In [263]:
import requests
from bs4 import BeautifulSoup as bs
URL = ['https://www.soundtransit.org/ride-with-us/stops-stations/northgate-station',
       'https://www.soundtransit.org/ride-with-us/stops-stations/roosevelt-station',
      'https://www.soundtransit.org/ride-with-us/stops-stations/u-district-station']

for url in range(0,3):
    req = requests.get(URL[url])
    soup = bs(req.text, 'html.parser')
    
    for post in soup.find_all('p')[0].text:
        new_result = pd.DataFrame({
            "station":[post.find_all(".*Station", data)],
            "zip":[post.find_all("[0-9]{5}")]
        })

AttributeError: 'str' object has no attribute 'findAll'

In [482]:
# set pandas display option
pd.set_option('display.max_colwidth', None)

# For the scrape:
from bs4 import BeautifulSoup as BShtml
import urllib.request as ur

# Make empty dataframe
station_zips = pd.DataFrame({"station":[], "zip":[]})

urls = ['https://www.soundtransit.org/ride-with-us/stops-stations/northgate-station',
       'https://www.soundtransit.org/ride-with-us/stops-stations/roosevelt-station',
      'https://www.soundtransit.org/ride-with-us/stops-stations/u-district-station',
       'https://www.soundtransit.org/ride-with-us/stops-stations/university-washington-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/capitol-hill-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/westlake-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/university-street-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/pioneer-square-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/international-district-chinatown-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/stadium-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/sodo-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/beacon-hill-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/mount-baker-station-transit-center',
        'https://www.soundtransit.org/ride-with-us/stops-stations/columbia-city-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/othello-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/rainier-beach-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/tukwila-international-boulevard-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/seatac-airport-station',
        'https://www.soundtransit.org/ride-with-us/stops-stations/angle-lake-station']

# Populate dataframe with quotes for first three pages
for url in urls:
    r = ur.urlopen(url).read()
    soup = BShtml(r, "html.parser")
    for post in soup.find_all('div', class_="station-map-card-content sidebar-container__content"):
        new_result = pd.DataFrame({
            "station":[post.find_all("span", class_="station-map-card-name")],
            "zip":[post.find_all('p')[0]]
        })
        station_zips = station_zips.append(new_result)
station_zips

Unnamed: 0,station,zip
0,[[Northgate Station]],"[\n, [Northgate Station], \n, [], \n 10200 1st Ave NE\n , [], \n Seattle, WA 98125\n ]"
0,[[Roosevelt Station]],"[\n, [Roosevelt Station], \n, [], \n 6501 12th Ave NE\n , [], \n Seattle, WA 98115\n ]"
0,[[U District Station]],"[\n, [U District Station], \n, [], \n 4300 Brooklyn Ave NE\n , [], \n Seattle, WA 98105\n ]"
0,[[University of Washington Station]],"[\n, [University of Washington Station], \n, [], \n 3720 Montlake Blvd NE\n , [], \n Seattle, WA 98195\n ]"
0,[[Capitol Hill Station]],"[\n, [Capitol Hill Station], \n, [], \n 140 Broadway E\n , [], \n Seattle, WA 98102\n ]"
0,[[Westlake Station]],"[\n, [Westlake Station], \n, [], \n 4th Ave & Pine St\n , [], \n Seattle, WA 98101\n ]"
0,[[University Street Station]],"[\n, [University Street Station], \n, [], \n 3rd Ave & Seneca St\n , [], \n Seattle, WA 98101\n ]"
0,[[Pioneer Square Station]],"[\n, [Pioneer Square Station], \n, [], \n 3rd Ave & James St\n , [], \n Seattle, WA 98104\n ]"
0,[[International District/Chinatown Station]],"[\n, [International District/Chinatown Station], \n, [], \n 5th Ave and S Jackson St\n , [], \n Seattle, WA 98104\n ]"
0,[[Stadium Station]],"[\n, [Stadium Station], \n, [], \n 501 S Royal Brougham Way\n , [], \n Seattle, WA 98134\n ]"


In [483]:
# convert values to strings
station_zips['station'] = station_zips['station'].astype('str')
station_zips['zip'] = station_zips['zip'].astype('str')

In [484]:
# remove html tags from strings
def remove_tags(string):
    result = re.sub('<.*?>','',string)
    return result
station_zips['station']=station_zips['station'].apply(lambda cw : remove_tags(cw))
station_zips['zip']=station_zips['zip'].apply(lambda cw : remove_tags(cw))
station_zips

Unnamed: 0,station,zip
0,[Northgate Station],"\nNorthgate Station\n\n 10200 1st Ave NE\n \n Seattle, WA 98125\n"
0,[Roosevelt Station],"\nRoosevelt Station\n\n 6501 12th Ave NE\n \n Seattle, WA 98115\n"
0,[U District Station],"\nU District Station\n\n 4300 Brooklyn Ave NE\n \n Seattle, WA 98105\n"
0,[University of Washington Station],"\nUniversity of Washington Station\n\n 3720 Montlake Blvd NE\n \n Seattle, WA 98195\n"
0,[Capitol Hill Station],"\nCapitol Hill Station\n\n 140 Broadway E\n \n Seattle, WA 98102\n"
0,[Westlake Station],"\nWestlake Station\n\n 4th Ave &amp; Pine St\n \n Seattle, WA 98101\n"
0,[University Street Station],"\nUniversity Street Station\n\n 3rd Ave &amp; Seneca St\n \n Seattle, WA 98101\n"
0,[Pioneer Square Station],"\nPioneer Square Station\n\n 3rd Ave &amp; James St\n \n Seattle, WA 98104\n"
0,[International District/Chinatown Station],"\nInternational District/Chinatown Station\n\n 5th Ave and S Jackson St\n \n Seattle, WA 98104\n"
0,[Stadium Station],"\nStadium Station\n\n 501 S Royal Brougham Way\n \n Seattle, WA 98134\n"


In [485]:
# clean up row values
station_zips['station'] = station_zips['station'].replace('\[|\]','', regex = True)
station_zips['zip'] = station_zips.zip.str.extract('([0-9]{5})\n')
station_zips

Unnamed: 0,station,zip
0,Northgate Station,98125
0,Roosevelt Station,98115
0,U District Station,98105
0,University of Washington Station,98195
0,Capitol Hill Station,98102
0,Westlake Station,98101
0,University Street Station,98101
0,Pioneer Square Station,98104
0,International District/Chinatown Station,98104
0,Stadium Station,98134


In [486]:
# reset row numbers
station_zips.reset_index(drop=True,inplace=True)
station_zips

Unnamed: 0,station,zip
0,Northgate Station,98125
1,Roosevelt Station,98115
2,U District Station,98105
3,University of Washington Station,98195
4,Capitol Hill Station,98102
5,Westlake Station,98101
6,University Street Station,98101
7,Pioneer Square Station,98104
8,International District/Chinatown Station,98104
9,Stadium Station,98134


In [487]:
# save to csv format
station_zips.to_csv("station_zips.csv",index=False)

## API Data from Decennial Census

In [452]:
# install censusdata
pip install CensusData

Collecting CensusData
  Downloading CensusData-1.15.tar.gz (26.6 MB)
Building wheels for collected packages: CensusData
  Building wheel for CensusData (setup.py): started
  Building wheel for CensusData (setup.py): finished with status 'done'
  Created wheel for CensusData: filename=CensusData-1.15-py3-none-any.whl size=28205534 sha256=555c600e2c1613f8c293fbaed6a8a2d454ef7aef4427b73fefb426ab5f7e36f8
  Stored in directory: c:\users\kimoe\appdata\local\pip\cache\wheels\f7\28\2d\3308dceb3ca282b1479585690170f0a00821d9b36cbf835159
Successfully built CensusData
Installing collected packages: CensusData
Successfully installed CensusData-1.15
Note: you may need to restart the kernel to use updated packages.


In [454]:
import censusdata

In [574]:
# set variables for census data
api_key='7502bfbf0a9ccaeb30b91a8e2c0e9425e216c8b5'
year='2016'
dsource='zbp'
state='53'
zipcode='98125, 98115,98105,98195,98102,98101,98104,98134,98144,98108,98118,98188'

In [658]:
# pull median income in the past 12 months (in 2019 inflation-adjusted dollars) by place of birth
data_url = f'https://api.census.gov/data/2019/acs/acs5/subject?get=NAME,S1903_C01_001E&for=zip%20code%20tabulation%20area:{zipcode}&key=7502bfbf0a9ccaeb30b91a8e2c0e9425e216c8b5'
income_response=requests.get(data_url)
print(income_response.text)

[["NAME","S1903_C01_001E","zip code tabulation area"],
["ZCTA5 98104","8286","98104"],
["ZCTA5 98101","9199","98101"],
["ZCTA5 98105","17146","98105"],
["ZCTA5 98118","17318","98118"],
["ZCTA5 98134","221","98134"],
["ZCTA5 98125","19283","98125"],
["ZCTA5 98102","15650","98102"],
["ZCTA5 98108","8491","98108"],
["ZCTA5 98188","9217","98188"],
["ZCTA5 98195","0","98195"],
["ZCTA5 98115","22922","98115"],
["ZCTA5 98144","13947","98144"]]


In [680]:
# pull median income into dataframe
data=income_response.json()
medianIncome=pd.DataFrame(data[1:], columns=data[0])
medianIncome.head()

Unnamed: 0,NAME,S1903_C01_001E,zip code tabulation area
0,ZCTA5 98104,8286,98104
1,ZCTA5 98101,9199,98101
2,ZCTA5 98105,17146,98105
3,ZCTA5 98118,17318,98118
4,ZCTA5 98134,221,98134


In [682]:
# clean up median income data frame

medianIncome=medianIncome.rename({'S1903_C01_001E':'Household Median Income','zip code tabulation area':'Zip Code'}, axis=1)
medianIncome.drop(labels=medianIncome.columns[0], axis=1, inplace=True)
medianIncome

Unnamed: 0,Household Median Income,Zip Code
0,8286,98104
1,9199,98101
2,17146,98105
3,17318,98118
4,221,98134
5,19283,98125
6,15650,98102
7,8491,98108
8,9217,98188
9,0,98195


In [703]:
# move zip code to first column
medianIncome.insert(0, 'Zip Code', medianIncome.pop('Zip Code'))
medianIncome

KeyError: 'Zip Code'

In [686]:
# save to csv format
medianIncome.to_csv("median_income.csv",index=False)

In [656]:
# pull hispanic or latino origin by race
data_url = f'https://api.census.gov/data/2019/acs/acs5?get=NAME,group(B03002)&for=zip%20code%20tabulation%20area:{zipcode}&in=state:53&key=7502bfbf0a9ccaeb30b91a8e2c0e9425e216c8b5'
race_response=requests.get(data_url)
print(race_response.text)

[["NAME","B03002_001E","B03002_001EA","B03002_001M","B03002_001MA","B03002_002E","B03002_002EA","B03002_002M","B03002_002MA","B03002_003E","B03002_003EA","B03002_003M","B03002_003MA","B03002_004E","B03002_004EA","B03002_004M","B03002_004MA","B03002_005E","B03002_005EA","B03002_005M","B03002_005MA","B03002_006E","B03002_006EA","B03002_006M","B03002_006MA","B03002_007E","B03002_007EA","B03002_007M","B03002_007MA","B03002_008E","B03002_008EA","B03002_008M","B03002_008MA","B03002_009E","B03002_009EA","B03002_009M","B03002_009MA","B03002_010E","B03002_010EA","B03002_010M","B03002_010MA","B03002_011E","B03002_011EA","B03002_011M","B03002_011MA","B03002_012E","B03002_012EA","B03002_012M","B03002_012MA","B03002_013E","B03002_013EA","B03002_013M","B03002_013MA","B03002_014E","B03002_014EA","B03002_014M","B03002_014MA","B03002_015E","B03002_015EA","B03002_015M","B03002_015MA","B03002_016E","B03002_016EA","B03002_016M","B03002_016MA","B03002_017E","B03002_017EA","B03002_017M","B03002_017MA","B030

In [687]:
# pull race data into dataframe
data=race_response.json()
race_data=pd.DataFrame(data[1:], columns=data[0])
race_data.head()

Unnamed: 0,NAME,B03002_001E,B03002_001EA,B03002_001M,B03002_001MA,B03002_002E,B03002_002EA,B03002_002M,B03002_002MA,B03002_003E,...,B03002_020M,B03002_020MA,B03002_021E,B03002_021EA,B03002_021M,B03002_021MA,GEO_ID,NAME.1,state,zip code tabulation area
0,ZCTA5 98104,14522,,809,,13412,,778,,7024,...,74,,71,,52,,8600000US98104,ZCTA5 98104,53,98104
1,ZCTA5 98101,13492,,875,,12816,,858,,8386,...,39,,74,,57,,8600000US98101,ZCTA5 98101,53,98101
2,ZCTA5 98105,50434,,1698,,47716,,1628,,29875,...,82,,357,,191,,8600000US98105,ZCTA5 98105,53,98105
3,ZCTA5 98118,49181,,1713,,45624,,1829,,15122,...,59,,311,,188,,8600000US98118,ZCTA5 98118,53,98118
4,ZCTA5 98134,833,,272,,746,,274,,468,...,12,,14,,33,,8600000US98134,ZCTA5 98134,53,98134


In [688]:
# drop unwanted columns
race_data.drop(race_data.filter(regex='EA|M|NAME|state|GEO_ID|_00[1-2]|01[01]|01[3-9]|02[0-1]').columns, axis=1, inplace=True)
race_data

Unnamed: 0,B03002_003E,B03002_004E,B03002_005E,B03002_006E,B03002_007E,B03002_008E,B03002_009E,B03002_012E,zip code tabulation area
0,7024,1344,245,3797,31,62,909,1110,98104
1,8386,897,87,2733,0,23,690,676,98101
2,29875,1426,255,12813,97,176,3074,2718,98105
3,15122,11834,114,14316,383,86,3769,3557,98118
4,468,91,16,96,0,10,65,87,98134
5,25866,4012,245,6480,35,111,2588,3644,98125
6,18441,629,26,3683,15,64,1914,1251,98102
7,6203,4852,125,8288,55,155,1455,2506,98108
8,8500,6352,135,3893,1112,19,1604,3560,98188
9,22,0,0,0,0,0,0,26,98195


In [689]:
# rename columns
race=race_data.rename({'zip code tabulation area':'Zip Code','B03002_003E':'White alone', 'B03002_004E':'Black or African American alone', 'B03002_005E':'American Indian and Alaska Native alone', 'B03002_006E':'Asian alone', 'B03002_007E':'Native Hawaiian and Other Pacific Islander alone', 'B03002_008E':'Some other race alone', 'B03002_009E':'Two or more races alone', 'B03002_012E':'Hispanic or Latino'}, axis=1)
race

Unnamed: 0,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races alone,Hispanic or Latino,Zip Code
0,7024,1344,245,3797,31,62,909,1110,98104
1,8386,897,87,2733,0,23,690,676,98101
2,29875,1426,255,12813,97,176,3074,2718,98105
3,15122,11834,114,14316,383,86,3769,3557,98118
4,468,91,16,96,0,10,65,87,98134
5,25866,4012,245,6480,35,111,2588,3644,98125
6,18441,629,26,3683,15,64,1914,1251,98102
7,6203,4852,125,8288,55,155,1455,2506,98108
8,8500,6352,135,3893,1112,19,1604,3560,98188
9,22,0,0,0,0,0,0,26,98195


In [691]:
# move zip code to first column
race.insert(0, 'Zip Code', race.pop('Zip Code'))
race

Unnamed: 0,Zip Code,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races alone,Hispanic or Latino
0,98104,7024,1344,245,3797,31,62,909,1110
1,98101,8386,897,87,2733,0,23,690,676
2,98105,29875,1426,255,12813,97,176,3074,2718
3,98118,15122,11834,114,14316,383,86,3769,3557
4,98134,468,91,16,96,0,10,65,87
5,98125,25866,4012,245,6480,35,111,2588,3644
6,98102,18441,629,26,3683,15,64,1914,1251
7,98108,6203,4852,125,8288,55,155,1455,2506
8,98188,8500,6352,135,3893,1112,19,1604,3560
9,98195,22,0,0,0,0,0,0,26


In [692]:
# save to csv format
race.to_csv("race.csv",index=False)

# Data Integration

## Merge ACS Data

In [702]:
medianIncome.columns

Index(['Household Median Income'], dtype='object')

In [700]:
# merge race & median income data
acsData=race.merge(medianIncome,left_on='Zip Code',right_on='Zip Code')

KeyError: 'Zip Code'

## light rail link stations

In [697]:
# link to station wiki data
linkStationWiki='https://raw.githubusercontent.com/oeuyown/computational-thinking/main/stationVars.csv'
stationWiki=pd.read_csv(linkStationWiki)

# link to 

In [698]:
# check current columns
stationWiki.columns

Index(['Station', 'Opened', 'Weekday ridership'], dtype='object')