# Getting information about cities in Pennsylvania

In this notebook, I use BeautifulSoup to scrape the table on the Wikipedia page "List of cities in Pennsylvania." I then minimally clean the data with pandas before exporting it as a CSV file.

In [1]:
from io import StringIO

import html5lib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [2]:
# Download the Wikipedia article that contains the table of cities.
# - HTTPS avoids sending the request in plaintext and skips the http -> https redirect.
# - The timeout stops the cell from hanging forever if the server stalls.
# - raise_for_status() aborts the notebook on a 4xx/5xx response instead of
#   letting an error page flow into the parsing cells below.
wikiurl = "https://en.wikipedia.org/wiki/List_of_cities_in_Pennsylvania"
response = requests.get(wikiurl, timeout=30)
response.raise_for_status()
print(response.status_code)

200


In [3]:
# Parse the downloaded HTML and grab the first <table> whose class list
# includes "wikitable".
soup = bs(response.text, 'html.parser')
pennTable = soup.find('table', {'class': "wikitable"})

# find() returns None when nothing matches. Fail here with a clear message
# rather than letting a later cell try to parse the string "None".
if pennTable is None:
    raise ValueError("No <table class='wikitable'> element found in the downloaded page")

In [4]:
# Echo the matched table element so its raw HTML appears as the cell output
# (a quick visual check that the right table was selected).
pennTable

<table class="wikitable sortable">
<tbody><tr>
<th><b>Name</b>
</th>
<th>Type</th>
<th><b>County</b><sup class="reference" id="cite_ref-cleargov_2-0"><a href="#cite_note-cleargov-2">[2]</a></sup></th>
<th><b>Class<sup class="reference" id="ref_types↑"><a href="#endnote_types↑">[A]</a></sup></b></th>
<th>Population (2018 Estimates)<sup class="reference" id="cite_ref-cleargov_2-1"><a href="#cite_note-cleargov-2">[2]</a></sup><sup class="reference" id="cite_ref-3"><a href="#cite_note-3">[3]</a></sup></th>
<th>Incorporation<br/>date (as city)
</th>
<th>Sq Miles<sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[4]</a></sup>
</th></tr>
<tr>
<td><a href="/wiki/Aliquippa,_Pennsylvania" title="Aliquippa, Pennsylvania">Aliquippa</a>
</td>
<td>City</td>
<td><a href="/wiki/Beaver_County,_Pennsylvania" title="Beaver County, Pennsylvania">Beaver</a></td>
<td>Third</td>
<td>8,908</td>
<td>1987
</td>
<td>4.19
</td></tr>
<tr>
<td><a href="/wiki/Allentown,_Pennsylvania" title="Allentown, Pen

In [5]:
# Convert the scraped <table> element into a DataFrame.
# read_html returns a list of DataFrames, one per table found; only a single
# table is passed in, so the first entry is the one we want.
# The markup is wrapped in StringIO because recent pandas versions deprecate
# passing a literal HTML string directly to read_html.
df = pd.read_html(StringIO(str(pennTable)), flavor="bs4")[0]
df.head()

Unnamed: 0,Name,Type,County[2],Class[A],Population (2018 Estimates)[2][3],Incorporationdate (as city),Sq Miles[4]
0,Aliquippa,City,Beaver,Third,8908,1987,4.19
1,Allentown†,City,Lehigh,Third,123828,1867,17.55
2,Altoona,City,Blair,Third,43702,1868,9.91
3,Arnold,City,Westmoreland,Third,4980,1939,0.73
4,Beaver Falls,City,Beaver,Third,8387,1928,2.13


###

## Rename columns

In [6]:
# Swap the scraped headers (which carry footnote markers such as "[2]" and
# "[A]") for clean identifiers. The assignment is positional, so the order
# here must match the column order of the scraped table.
clean_column_names = [
    'Name',
    'Type',
    'County',
    'Class',
    'Population_2018_est',
    'Incorporation_Date',
    'Sq_Miles',
]
df.columns = clean_column_names
df.head()

Unnamed: 0,Name,Type,County,Class,Population_2018_est,Incorporation_Date,Sq_Miles
0,Aliquippa,City,Beaver,Third,8908,1987,4.19
1,Allentown†,City,Lehigh,Third,123828,1867,17.55
2,Altoona,City,Blair,Third,43702,1868,9.91
3,Arnold,City,Westmoreland,Third,4980,1939,0.73
4,Beaver Falls,City,Beaver,Third,8387,1928,2.13


## Remove symbol(s) from select city names

In [7]:
# Remove the trailing '†' marker that the scraped table attaches to some city
# names (e.g. 'Allentown†' -> 'Allentown').
# The vectorised .str accessor replaces the row-by-row loop, only touches the
# end of each name, and leaves missing values as NaN instead of raising.
df['Name'] = df['Name'].str.rstrip('†')
df.head()

Unnamed: 0,Name,Type,County,Class,Population_2018_est,Incorporation_Date,Sq_Miles
0,Aliquippa,City,Beaver,Third,8908,1987,4.19
1,Allentown,City,Lehigh,Third,123828,1867,17.55
2,Altoona,City,Blair,Third,43702,1868,9.91
3,Arnold,City,Westmoreland,Third,4980,1939,0.73
4,Beaver Falls,City,Beaver,Third,8387,1928,2.13


## Export the cleaned data to CSV

In [8]:
# Persist the cleaned table next to the notebook: keep the header row, drop
# the positional index so it does not become an extra column in the file.
df.to_csv(
    path_or_buf='penn_cities.csv',
    header=True,
    index=False,
)