In [48]:
import re                           # strip footnote markers from scraped text
import urllib.request               # Open urls

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup       # parse HTML and XML

# homepage url
wiki_home_url = "https://en.wikipedia.org"
# crawler wiki page url
us_capitals_url = "https://en.wikipedia.org/wiki/List_of_capitals_in_the_United_States"

# Open the crawler wiki page and parse it with the lxml parser.
# The `with` block closes the HTTP response as soon as parsing is done,
# instead of leaving the connection open for the life of the kernel.
with urllib.request.urlopen(us_capitals_url) as page:
    soup = BeautifulSoup(page, "lxml")

In [49]:
# verify that it opened the correct page
# Only the beginning of the document is printed: it already contains the
# <title>, and the full page is far too long to be useful as cell output.
PAGE_PREVIEW_CHARS = 2000
print(soup.prettify()[:PAGE_PREVIEW_CHARS])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of capitals in the United States - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"406b6dd6-f34e-4aed-b6ec-aa09dbb43b14","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_capitals_in_the_United_States","wgTitle":"List of capitals in the United States","wgCurRevisionId":958880176,"wgRevisionId":958880176,"wgArticleId":255627,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","CS1 maint: archived copy as title","Use mdy 

In [50]:
# find table containing US capitals
table = soup.find('table', class_='wikitable sortable')

# Fail with a clear message if the page layout changed, rather than with an
# AttributeError on None further down.
if table is None:
    raise ValueError("No 'wikitable sortable' table found at " + us_capitals_url)

# Preview only the start of the table (header rows and the first data row)
TABLE_PREVIEW_CHARS = 2000
print(table.prettify()[:TABLE_PREVIEW_CHARS])

<table class="wikitable sortable">
 <caption>
  State capitals of the United States
 </caption>
 <tbody>
  <tr>
   <th rowspan="2">
    State
   </th>
   <th rowspan="2">
    Abr.
   </th>
   <th rowspan="2">
    State-hood
   </th>
   <th rowspan="2">
    Capital
   </th>
   <th rowspan="2">
    Capital since
   </th>
   <th rowspan="2">
    Area (mi²)
   </th>
   <th colspan="4">
    Population (2018)
   </th>
   <th rowspan="2">
    Notes
   </th>
  </tr>
  <tr>
   <th>
    <a href="/wiki/List_of_United_States_cities_by_population" title="List of United States cities by population">
     City
    </a>
   </th>
   <th>
    <a href="/wiki/List_of_metropolitan_statistical_areas" title="List of metropolitan statistical areas">
     Metropolitan
    </a>
   </th>
   <th>
    Rank in state
   </th>
   <th>
    Rank in US
   </th>
  </tr>
  <tr>
   <td>
    <a href="/wiki/Alabama" title="Alabama">
     Alabama
    </a>
   </td>
   <td>
    AL
   </td>
   <td align="center">
    1819
   </t

In [51]:
# Print wiki extensions for the US capitals from the table
CAPITAL_COLUMN = 3   # zero-based position of the "Capital" cell in a data row

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # The two header rows contain only <th> cells, so they yield no <td>
    # and are skipped by the length check.
    if len(cells) > 4:
        anchor = cells[CAPITAL_COLUMN].find('a')
        # Skip a row whose capital cell carries no link instead of crashing
        if anchor is not None:
            print(anchor.get('href'))

/wiki/Montgomery,_Alabama
/wiki/Juneau,_Alaska
/wiki/Phoenix,_Arizona
/wiki/Little_Rock,_Arkansas
/wiki/Sacramento,_California
/wiki/Denver,_Colorado
/wiki/Hartford,_Connecticut
/wiki/Dover,_Delaware
/wiki/Tallahassee,_Florida
/wiki/Atlanta
/wiki/Honolulu
/wiki/Boise,_Idaho
/wiki/Springfield,_Illinois
/wiki/Indianapolis
/wiki/Des_Moines,_Iowa
/wiki/Topeka,_Kansas
/wiki/Frankfort,_Kentucky
/wiki/Baton_Rouge,_Louisiana
/wiki/Augusta,_Maine
/wiki/Annapolis,_Maryland
/wiki/Boston
/wiki/Lansing,_Michigan
/wiki/Saint_Paul,_Minnesota
/wiki/Jackson,_Mississippi
/wiki/Jefferson_City,_Missouri
/wiki/Helena,_Montana
/wiki/Lincoln,_Nebraska
/wiki/Carson_City,_Nevada
/wiki/Concord,_New_Hampshire
/wiki/Trenton,_New_Jersey
/wiki/Santa_Fe,_New_Mexico
/wiki/Albany,_New_York
/wiki/Raleigh,_North_Carolina
/wiki/Bismarck,_North_Dakota
/wiki/Columbus,_Ohio
/wiki/Oklahoma_City
/wiki/Salem,_Oregon
/wiki/Harrisburg,_Pennsylvania
/wiki/Providence,_Rhode_Island
/wiki/Columbia,_South_Carolina
/wiki/Pierre,_South

In [52]:
# Make a list of urls to the US capitals

# Save list of wiki extensions to the capitals
capitals = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without enough <td> cells (the header rows) are skipped
    if len(cells) > 4:
        anchor = cells[3].find('a')          # the link inside the "Capital" cell
        href = anchor.get('href') if anchor is not None else None
        # Ignore a cell without a usable link rather than failing on None
        if href:
            capitals.append(href)

# Prefix every wiki extension with the wiki home page
us_capitals_links = [wiki_home_url + extension for extension in capitals]

# verify results
for c in us_capitals_links:
    print(c)
    print()

https://en.wikipedia.org/wiki/Montgomery,_Alabama

https://en.wikipedia.org/wiki/Juneau,_Alaska

https://en.wikipedia.org/wiki/Phoenix,_Arizona

https://en.wikipedia.org/wiki/Little_Rock,_Arkansas

https://en.wikipedia.org/wiki/Sacramento,_California

https://en.wikipedia.org/wiki/Denver,_Colorado

https://en.wikipedia.org/wiki/Hartford,_Connecticut

https://en.wikipedia.org/wiki/Dover,_Delaware

https://en.wikipedia.org/wiki/Tallahassee,_Florida

https://en.wikipedia.org/wiki/Atlanta

https://en.wikipedia.org/wiki/Honolulu

https://en.wikipedia.org/wiki/Boise,_Idaho

https://en.wikipedia.org/wiki/Springfield,_Illinois

https://en.wikipedia.org/wiki/Indianapolis

https://en.wikipedia.org/wiki/Des_Moines,_Iowa

https://en.wikipedia.org/wiki/Topeka,_Kansas

https://en.wikipedia.org/wiki/Frankfort,_Kentucky

https://en.wikipedia.org/wiki/Baton_Rouge,_Louisiana

https://en.wikipedia.org/wiki/Augusta,_Maine

https://en.wikipedia.org/wiki/Annapolis,_Maryland

https://en.wikipedia.org/wiki/Bo

In [53]:
# Infobox row labels to collect from every city page.
# scrape_city() stores one value per label, in this order.
headers = [
    'Country',
    'State',
    'Founded',
    'Mayor',
    'Elevation',
    'Population',
    'Demonym(s)',
]

In [54]:
# Create scraping function
def scrape_city(city_link):
    """Scrape the wanted infobox fields from one city's wiki page.

    Parameters
    ----------
    city_link : str
        Full URL of the city's wiki page.

    Returns
    -------
    list
        ``[city_name, value_1, ..., value_n]`` with one value per entry of
        the module-level ``headers`` list, in the same order. A value is
        the raw text of the matching infobox cell, or ``None`` when no
        matching row was found.

    Raises
    ------
    ValueError
        If the page has no ``infobox geography vcard`` table.
    """
    # Open link to city wiki page and parse it; the `with` block closes the
    # HTTP response once BeautifulSoup has consumed it.
    with urllib.request.urlopen(city_link) as page:
        soup = BeautifulSoup(page, "lxml")

    # One slot per wanted header; stays None when the page lacks that row
    data = [None] * len(headers)

    table = soup.find('table', class_='infobox geography vcard')
    if table is None:
        raise ValueError("No geography infobox found at " + city_link)

    # SCRAPE the heck out of the city wiki
    for row in table.find_all('tr'):
        header = row.find('th')
        if header:
            # Sub-rows are labelled with a leading bullet; drop it
            header = header.text.replace('•', '').lstrip()
            if header in headers:
                td = row.find('td')
                if td is not None:
                    data[headers.index(header)] = td.text
            elif 'Population' in header:
                # The population heading row carries no value itself; the
                # figure is read from the first cell of the following row.
                pop = row.find_next('tr')
                td = pop.find('td') if pop is not None else None
                if td is not None:
                    data[headers.index('Population')] = td.text

    # Add city name to the data: the page title up to its first comma.
    # Titles without a comma are kept whole (e.g. 'Denver - Wikipedia').
    city_name = soup.title.string.split(',', 1)
    data.insert(0, city_name[0])

    # Verify data
    print(data)

    # return list
    return data

In [55]:
# Collect one scraped row per capital city.
# extend() consumes the generator item by item, so rows are appended in the
# same order as the links are visited.
capitals_data = []
capitals_data.extend(scrape_city(link) for link in us_capitals_links)

['Montgomery', '\xa0United States', '\xa0Alabama', None, 'Steven Reed (D)', '240\xa0ft (73\xa0m)', '205,764', None]
['Juneau', '\xa0United States', '\xa0Alaska', None, 'Beth Weldon', '56\xa0ft (17\xa0m)', '31,275', None]
['Phoenix', '\xa0United States', '\xa0Arizona', None, 'Kate Gallego', None, '1,445,632', None]
['Little Rock', '\xa0United States', '\xa0Arkansas', 'June 1, 1821', 'Frank Scott Jr. (D)', '335\xa0ft (102\xa0m)', '193,490', 'Little Rocker']
['Sacramento', '\xa0United States', '\xa0California', None, 'Darrell Steinberg (D)[4]', None, '466,488', 'Sacramentan']
['Denver - Wikipedia', '\xa0United States', '\xa0Colorado', 'November 17, 1858, as Denver City, K.T.[6]', 'Michael Hancock (D)[10]', None, '600,158', 'Denverite']
['Hartford', 'United States', 'Connecticut', None, 'Luke Bronin (D)', '59\xa0ft (18\xa0m)', '122,105', 'Hartfordite']
['Dover', '\xa0United States', '\xa0Delaware', '1693', 'Robin Christiansen (D)', '30\xa0ft (9\xa0m)', '36,047', None]
['Tallahassee', 'Unit

In [56]:
# verify data structure: one scraped row per line, followed by a blank line
for scraped_row in capitals_data:
    print(scraped_row, end='\n\n')

['Montgomery', '\xa0United States', '\xa0Alabama', None, 'Steven Reed (D)', '240\xa0ft (73\xa0m)', '205,764', None]

['Juneau', '\xa0United States', '\xa0Alaska', None, 'Beth Weldon', '56\xa0ft (17\xa0m)', '31,275', None]

['Phoenix', '\xa0United States', '\xa0Arizona', None, 'Kate Gallego', None, '1,445,632', None]

['Little Rock', '\xa0United States', '\xa0Arkansas', 'June 1, 1821', 'Frank Scott Jr. (D)', '335\xa0ft (102\xa0m)', '193,490', 'Little Rocker']

['Sacramento', '\xa0United States', '\xa0California', None, 'Darrell Steinberg (D)[4]', None, '466,488', 'Sacramentan']

['Denver - Wikipedia', '\xa0United States', '\xa0Colorado', 'November 17, 1858, as Denver City, K.T.[6]', 'Michael Hancock (D)[10]', None, '600,158', 'Denverite']

['Hartford', 'United States', 'Connecticut', None, 'Luke Bronin (D)', '59\xa0ft (18\xa0m)', '122,105', 'Hartfordite']

['Dover', '\xa0United States', '\xa0Delaware', '1693', 'Robin Christiansen (D)', '30\xa0ft (9\xa0m)', '36,047', None]

['Tallahassee

In [57]:
# Prepend the 'City' column name for the dataframe only.
# `headers` itself is left untouched: scrape_city() uses it to size each row
# and to look up infobox labels, so mutating it here would make this cell
# (and the scraping cell) fail when re-run.
df_columns = ['City'] + headers

# Create pandas dataframe
df = pd.DataFrame(capitals_data, columns=df_columns)
df

Unnamed: 0,City,Country,State,Founded,Mayor,Elevation,Population,Demonym(s)
0,Montgomery,United States,Alabama,,Steven Reed (D),240 ft (73 m),205764,
1,Juneau,United States,Alaska,,Beth Weldon,56 ft (17 m),31275,
2,Phoenix,United States,Arizona,,Kate Gallego,,1445632,
3,Little Rock,United States,Arkansas,"June 1, 1821",Frank Scott Jr. (D),335 ft (102 m),193490,Little Rocker
4,Sacramento,United States,California,,Darrell Steinberg (D)[4],,466488,Sacramentan
5,Denver - Wikipedia,United States,Colorado,"November 17, 1858, as Denver City, K.T.[6]",Michael Hancock (D)[10],,600158,Denverite
6,Hartford,United States,Connecticut,,Luke Bronin (D),59 ft (18 m),122105,Hartfordite
7,Dover,United States,Delaware,1693,Robin Christiansen (D),30 ft (9 m),36047,
8,Tallahassee,United States,Florida,,John Dailey (D),,181376,
9,Atlanta - Wikipedia,United States,Georgia,,Keisha Lance Bottoms (D),"738 to 1,050 ft (225 to 320 m)",420003,


In [47]:
# Write the dataframe to a csv file, leaving out the index column
csv_path = r'~/us_capitals_crawler_data.csv'
df.to_csv(csv_path, index=False)

In [None]:
###############################################
# Everything below this point is a revision of the code above.
# Not every page has all of the headers we wanted, and
# some of the search strings may differ slightly between pages.
# This is where you analyze your data and update your crawler/scraper
# to obtain better results.
###############################################

In [None]:
########################################
##### Updated scraper ##################
#
# Changes to the scraped data compared with the first scrape_city above:
#  - the " - Wikipedia" page-title suffix is removed from the city name
#  - footnote markers such as "[4]" and surrounding whitespace (including
#    non-breaking spaces) are removed from infobox labels and values, so a
#    label such as "Elevation[3]" matches the wanted header "Elevation"
#  - a bare "Population" heading row without a value cell is handled like
#    the other population headings (value read from the following row)
#
# Still to do:
#  - Verify the remaining "None" values to see if there is an error in
#    the scraper or if the url did not contain that header

headers = ['Country', 'State', 'Founded', 'Mayor', 'Elevation', 'Population', 'Demonym(s)']

# Numeric footnote markers as they appear in the scraped text, e.g. "[4]"
_FOOTNOTE_PATTERN = re.compile(r'\[\d+\]')
# Suffix of the page title on pages whose title has no comma
_TITLE_SUFFIX = ' - Wikipedia'


def _clean_text(text):
    """Return `text` without footnote markers, non-breaking spaces or outer whitespace."""
    return _FOOTNOTE_PATTERN.sub('', text).replace('\xa0', ' ').strip()


def scrape_city(city_link):
    """Scrape the wanted infobox fields from one city's wiki page.

    Parameters
    ----------
    city_link : str
        Full URL of the city's wiki page.

    Returns
    -------
    list
        ``[city_name, value_1, ..., value_n]`` with one value per entry of
        the module-level ``headers`` list, in the same order. A value is
        the cleaned text of the matching infobox cell, or ``None`` when no
        matching row was found.

    Raises
    ------
    ValueError
        If the page has no ``infobox geography vcard`` table.
    """
    # Open link to city wiki page and parse it; the `with` block closes the
    # HTTP response once BeautifulSoup has consumed it.
    with urllib.request.urlopen(city_link) as page:
        soup = BeautifulSoup(page, "lxml")

    # One slot per wanted header; stays None when the page lacks that row
    data = [None] * len(headers)

    table = soup.find('table', class_='infobox geography vcard')
    if table is None:
        raise ValueError("No geography infobox found at " + city_link)

    # SCRAPE the heck out of the city wiki
    for row in table.find_all('tr'):
        th = row.find('th')
        if th is None:
            continue

        # Sub-rows are labelled with a leading bullet; drop it before cleaning
        label = _clean_text(th.text.replace('•', ''))
        td = row.find('td')

        if label in headers and td is not None:
            data[headers.index(label)] = _clean_text(td.text)
        elif 'Population' in label:
            # A population heading row carries no value itself; the figure
            # is read from the first cell of the following row.
            next_row = row.find_next('tr')
            td = next_row.find('td') if next_row is not None else None
            if td is not None:
                data[headers.index('Population')] = _clean_text(td.text)

    # Add city name to the data: the page title up to its first comma,
    # minus the " - Wikipedia" suffix left on titles that have no comma.
    city_name = soup.title.string.split(',', 1)[0]
    if city_name.endswith(_TITLE_SUFFIX):
        city_name = city_name[:-len(_TITLE_SUFFIX)]
    data.insert(0, city_name.strip())

    print(data)

    return data

############################
# Rerun program with updated scraper

capitals_data = [scrape_city(link) for link in us_capitals_links]

# Build the column names without mutating `headers`, which scrape_city()
# relies on to size and fill each row.
df = pd.DataFrame(capitals_data, columns=['City'] + headers)

# Save data to csv
df.to_csv(r'~/us_capitals_crawler_data_2.csv', index=False)

# Keep the dataframe as the last expression so the notebook displays it
df