Scraping Exercises

Scrape the following into CSV files. Each one is broken up into multiple tiers – the more you scrape the tougher it is!

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import re
import numpy as np
import time
import os

# Congress members
Scrape https://www.congress.gov/members (Links to an external site.)
<li>Tier 1: Scrape their name and full profile URL, and additional</li>
<li>Tier 2: Separate their state/party/etc into separate columns</li>
<li>Advanced: Scrape each person's actual data from their personal project</li>

In [2]:
# Accumulator for the scraped member records (one dict per member of Congress).
members = []

In [3]:
# Browser-like request headers, passed as `headers=` to the requests.get calls
# that need them (presumably so the sites do not reject the default client — TODO confirm).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/102.0.0.0 Safari/537.36"
    ),
}

## Tier 1: Scrape their name and full profile URL, and additional
## Tier 2: Separate their state/party/etc into separate columns

In [4]:
# Scrape the paginated congress.gov member listing into `members` / `members_df`.
# 10 pages of 250 rows cover the 2,429 members.
# The accumulator is reset here so re-running this cell does not append duplicate rows.
members = []

for page in range(1, 11):
    url = f'https://www.congress.gov/members?pageSize=250&page={page}'

    #get raw data
    response = requests.get(url, headers=headers)
    # Stop on an HTTP error status instead of silently parsing an error page into zero rows.
    response.raise_for_status()
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # pin one once it is known which parser these selectors were written against.
    doc = bs(response.content)

    for tag in doc.select("li.compact"):
        #name, url: both come from the first link inside the list item
        element = tag.find("a", href=True)
        name = element.text
        profile_url = element["href"]
        # Site-relative links get the congress.gov origin prepended.
        if not profile_url.startswith("http"):
            profile_url = 'https://www.congress.gov' + profile_url

        #other items: each value is read two nodes after its label text
        state = tag.find(string=re.compile(".*State:.*")).next.next.text
        party = tag.find(string="Party:").next.next.text
        served = tag.find(string="Served:").next.next.text
        try:
            district = tag.find(string="District:").next.next.text
        except AttributeError:
            # find() returned None: this entry has no "District:" label.
            district = np.nan

        #dictionary of data for a congress person
        data = {
            "name": name,
            "url": profile_url,
            "state": state,
            "district": district,
            "party": party,
            "served": served,
        }

        members.append(data)

    # Pause between listing pages to go easy on the server.
    time.sleep(1)

members_df = pd.DataFrame(members)

In [5]:
# Remove the line breaks embedded in the scraped "served" text.
members_df["served"] = members_df["served"].str.replace("\n", "")

In [6]:
# Copy the year range that immediately follows each chamber label into its own column.
# Raw strings keep the regex escapes (\d, \-) from being read as invalid string escapes;
# expand=False returns a Series, matching the single-column assignment.
# NOTE: only one range is captured per chamber, so "House: 1985-1987, 1991-2011"
# yields just "1985-1987"; rows without the label get NaN.
members_df["senate"] = members_df.served.str.extract(r".*Senate: (\d\d\d\d\-\d\d\d\d)", expand=False)
members_df["house"] = members_df.served.str.extract(r".*House: (\d\d\d\d\-\d\d\d\d)", expand=False)

In [7]:
# Display the members table, now including the senate/house columns.
members_df

Unnamed: 0,name,url,state,district,party,served,senate,house
0,"Senator Abdnor, James",https://www.congress.gov/member/james-abdnor/A...,South Dakota,,Republican,Senate: 1981-1987 House: 1973-1981,1981-1987,1973-1981
1,"Representative Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Hawaii,1,Democratic,"House: 1985-1987, 1991-2011",,1985-1987
2,"Senator Abourezk, James",https://www.congress.gov/member/james-abourezk...,South Dakota,,Democratic,Senate: 1973-1979 House: 1971-1973,1973-1979,1971-1973
3,"Representative Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,Louisiana,5,Republican,House: 2015-2021,,2015-2021
4,"Senator Abraham, Spencer",https://www.congress.gov/member/spencer-abraha...,Michigan,,Republican,Senate: 1995-2001,1995-2001,
...,...,...,...,...,...,...,...,...
2424,"Representative Zinke, Ryan K.",https://www.congress.gov/member/ryan-zinke/Z00...,Montana,At Large,Republican,House: 2015-2017,,2015-2017
2425,"Representative Zion, Roger H.",https://www.congress.gov/member/roger-zion/Z00...,Indiana,8,Republican,House: 1967-1975,,1967-1975
2426,"Senator Zorinsky, Edward",https://www.congress.gov/member/edward-zorinsk...,Nebraska,,Democratic,Senate: 1977-1989,1977-1989,
2427,"Representative Zschau, Edwin V. W.",https://www.congress.gov/member/edwin-zschau/Z...,California,12,Republican,House: 1983-1987,,1983-1987


## Advanced: Scrape each person's actual data from their personal project

In [9]:
# Left commented out: I kept getting banned by the site while running this loop.

#for i in range(len(members_df)):
#    result_list = []
#    get_actual_data = True
#    url = members_df.url[i]
#    page =1
#    
#    while get_actual_data:
#        try:
#            personal_url = f"{url}?pageSize=250&page={page}"
#            response = requests.get(personal_url)
#            doc = bs(response.content)
#            results = doc.select(".result-title")
#            result_list = result_list + [result.text for result in results]
#            
#            page+=1
#        except:
#            get_actual_data = False
#    
#    members_df.loc[i, 'results'] = result_list
#    time.sleep(1)

In [10]:
# Display the members table again.
# NOTE(review): the saved output below has a `results` column, which only the
# commented-out loop in the previous cell assigns — a fresh Run All would not reproduce it.
members_df

Unnamed: 0,name,url,state,district,party,served,senate,house,results
0,"Senator Abdnor, James",https://www.congress.gov/member/james-abdnor/A...,South Dakota,,Republican,Senate: 1981-1987 House: 1973-1981,1981-1987,1973-1981,
1,"Representative Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Hawaii,1,Democratic,"House: 1985-1987, 1991-2011",,1985-1987,
2,"Senator Abourezk, James",https://www.congress.gov/member/james-abourezk...,South Dakota,,Democratic,Senate: 1973-1979 House: 1971-1973,1973-1979,1971-1973,
3,"Representative Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,Louisiana,5,Republican,House: 2015-2021,,2015-2021,
4,"Senator Abraham, Spencer",https://www.congress.gov/member/spencer-abraha...,Michigan,,Republican,Senate: 1995-2001,1995-2001,,
...,...,...,...,...,...,...,...,...,...
2424,"Representative Zinke, Ryan K.",https://www.congress.gov/member/ryan-zinke/Z00...,Montana,At Large,Republican,House: 2015-2017,,2015-2017,
2425,"Representative Zion, Roger H.",https://www.congress.gov/member/roger-zion/Z00...,Indiana,8,Republican,House: 1967-1975,,1967-1975,
2426,"Senator Zorinsky, Edward",https://www.congress.gov/member/edward-zorinsk...,Nebraska,,Democratic,Senate: 1977-1989,1977-1989,,
2427,"Representative Zschau, Edwin V. W.",https://www.congress.gov/member/edwin-zschau/Z...,California,12,Republican,House: 1983-1987,,1983-1987,


# Maryland public schools

Scrape https://www.marylandpublicschools.org/stateboard/Pages/Meetings-2018.aspx (Links to an external site.)
<li>Tier 1: Scrape the date, URL to agenda, URL to board minutes</li>
<li>Tier 2: Download agenda items to an "agendas" folder and board minutes to a "minutes" folder</li>

## Tier 1: Scrape the date, URL to agenda, URL to board minutes

In [11]:
# Listing page for the 2018 Maryland State Board meetings.
url = "https://www.marylandpublicschools.org/stateboard/Pages/Meetings-2018.aspx"

In [12]:
# Fetch the meetings listing and parse it; the next cell selects table rows from `soup`.
response = requests.get(url)
soup = bs(response.content)

In [13]:
# Build one record per meeting row: date text, agenda PDF URL, minutes PDF URL.
meetings_list = []
parent_directory = "https://www.marylandpublicschools.org"

tags = soup.select("tbody > tr")

# The first two rows are skipped (presumably header rows — TODO confirm).
for tag in tags[2:]:
    #date: taken from a <font> element when the row has one, otherwise from the first <span>
    try:
        date = tag.font.text
    except AttributeError:
        date = tag.select_one("span").text

    #agenda
    ##url to agenda page
    agenda_page = tag.select_one("a[href$='.aspx']")['href']
    if not agenda_page.startswith("http"):
        agenda_page = parent_directory + agenda_page
    ##pdf file for agenda
    # Use dedicated names here: overwriting `soup`/`response` would leave `soup`
    # pointing at the last agenda page, so re-running this cell would scrape the wrong page.
    agenda_response = requests.get(agenda_page)
    agenda_soup = bs(agenda_response.content)
    agenda = agenda_soup.select_one("a[href$='.pdf'][title~=Agenda]")['href']
    if not agenda.startswith("http"):
        agenda = parent_directory + agenda

    #minutes: the first PDF link found in the row itself
    minutes = tag.select_one("a[href$='.pdf']")['href']
    if not minutes.startswith("http"):
        minutes = parent_directory + minutes

    #data
    data = {
        "date": date,
        "agenda": agenda,
        "minutes": minutes
    }
    meetings_list.append(data)

    # Pause between agenda-page requests.
    time.sleep(1)

In [14]:
# One row per meeting with the columns date, agenda and minutes.
meetings_df = pd.DataFrame(meetings_list)

In [15]:
# Remove the line breaks embedded in the scraped date text.
meetings_df["date"] = meetings_df["date"].str.replace("\n", "")

In [16]:
# Display the cleaned meetings table.
meetings_df

Unnamed: 0,date,agenda,minutes
0,"January 29, 2018 Monday",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...
1,"January 30, 2018 Tuesday",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...
2,"February 27, 2018",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...
3,"March 20, 2018 Tuesday",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...
4,"April 24, 2018",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...
5,"May 22, 2018",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...
6,"June 20, 2018",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...
7,"July 24, 2018",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...
8,"August 28, 2018 Tuesday",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...
9,"September 25, 2018Tuesday",https://www.marylandpublicschools.org/stateboa...,https://www.marylandpublicschools.org/stateboa...


## Tier 2: Download agenda items to an "agendas" folder and board minutes to a "minutes" folder

In [17]:
# Write the agenda URLs to urls.txt, one per line, as the input list for wget.
file_content = "\n".join(meetings_df["agenda"])

with open("urls.txt", "w") as url_list_file:
    url_list_file.write(file_content)

In [18]:
# Download every URL listed in urls.txt into the "agendas" folder.
!wget -i urls.txt --directory-prefix=agendas

--2022-07-12 16:47:35--  https://www.marylandpublicschools.org/stateboard/Documents/01292018/01292018BoardAgenda.pdf
Resolving www.marylandpublicschools.org (www.marylandpublicschools.org)... 167.102.224.186
Connecting to www.marylandpublicschools.org (www.marylandpublicschools.org)|167.102.224.186|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23062 (23K) [application/pdf]
Saving to: ‘agendas/01292018BoardAgenda.pdf’


2022-07-12 16:47:35 (1.71 MB/s) - ‘agendas/01292018BoardAgenda.pdf’ saved [23062/23062]

--2022-07-12 16:47:35--  https://www.marylandpublicschools.org/stateboard/Documents/01302018/01302018BoardAgenda.pdf
Reusing existing connection to www.marylandpublicschools.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 39305 (38K) [application/pdf]
Saving to: ‘agendas/01302018BoardAgenda.pdf’


2022-07-12 16:48:38 (624 B/s) - ‘agendas/01302018BoardAgenda.pdf’ saved [39305/39305]

--2022-07-12 16:48:38--  https://www.marylandpublicschools

In [45]:
# Overwrite urls.txt with the board-minutes URLs, one per line, for the next wget call.
file_content = "\n".join(meetings_df["minutes"])

with open("urls.txt", "w") as url_list_file:
    url_list_file.write(file_content)

In [46]:
# Download every URL listed in urls.txt into the "minutes" folder.
!wget -i urls.txt --directory-prefix=minutes

--2022-07-12 16:49:08--  https://www.marylandpublicschools.org/stateboard/Documents/minutes/2018/January292018.pdf
Resolving www.marylandpublicschools.org (www.marylandpublicschools.org)... 167.102.224.186
Connecting to www.marylandpublicschools.org (www.marylandpublicschools.org)|167.102.224.186|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75473 (74K) [application/pdf]
Saving to: ‘minutes/January292018.pdf’


2022-07-12 16:49:09 (60.7 KB/s) - ‘minutes/January292018.pdf’ saved [75473/75473]

--2022-07-12 16:49:09--  https://www.marylandpublicschools.org/stateboard/Documents/minutes/2018/January302018.pdf
Reusing existing connection to www.marylandpublicschools.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 793530 (775K) [application/pdf]
Saving to: ‘minutes/January302018.pdf’


2022-07-12 16:49:11 (435 KB/s) - ‘minutes/January302018.pdf’ saved [793530/793530]

--2022-07-12 16:49:11--  https://www.marylandpublicschools.org/stateboard/Documen

# Mineral County School District

Scrape http://www.nvmcsd.org/our-school-board/meetings/agendas (Links to an external site.)
<li>Tier 1: Scrape the name of the link and the URL</li>
<li>Tier 2: Add a column for the date (you'll need to manually edit some, probably [but using pandas!])</li>
<li>Tier 3: Download the PDFs but name them after the date</li>

## Tier 1: Scrape the name of the link and the URL

In [19]:
# Page listing the Mineral County school board meeting agendas.
url = "http://www.nvmcsd.org/our-school-board/meetings/agendas"

In [20]:
# Fetch the page, sending the browser-like `headers` dict with the request.
response = requests.get(url, headers=headers)

In [21]:
# Parse the downloaded HTML.
soup = bs(response.content)

In [22]:
# Select every element with class "kt-accordion-panel-inner"
# (presumably one per accordion panel on the page — TODO confirm).
tags = soup.select(".kt-accordion-panel-inner")

In [23]:
# Collect the link text and target of every PDF link in the first two panels.
names, urls = [], []

for panel in tags[:2]:
    pdf_links = panel.select("a[href$='.pdf']")
    names.extend(link.text for link in pdf_links)
    urls.extend(link['href'] for link in pdf_links)

In [24]:
# One row per PDF link: its link text ("name") and its URL.
df = pd.DataFrame({'name':names, "url": urls})

In [25]:
# Preview the first rows.
df.head()

Unnamed: 0,name,url
0,"July 12, 2022 School Board Meeting",https://nvmcsd.wpengine.com/wp-content/uploads...
1,"June 8, 2022 Special School Board Meeting",https://nvmcsd.wpengine.com/wp-content/uploads...
2,"May 31, 2022 Special School Board Meeting",https://nvmcsd.wpengine.com/wp-content/uploads...
3,Item #2,https://nvmcsd.wpengine.com/wp-content/uploads...
4,Item #3,https://nvmcsd.wpengine.com/wp-content/uploads...


## Tier 2: Add a column for the date (you'll need to manually edit some, probably [but using pandas!])

In [26]:
#extract dates
df["date"] = df.name.str.extract("(.*\d\d\d\d)")

In [27]:
#manually edit some rows
df.loc[df.name=="8-10-21 Board Agenda","date"] = "August 10, 2021"
df.loc[df.name=="anuary 5, 2022 MCSD Safety Committee Meeting","date"] = "January 5, 2022"

In [28]:
#strftime + ffill
df.date = pd.to_datetime(df.date, errors="coerce").ffill()

In [29]:
# Preview the first rows with the new date column.
df.head()

Unnamed: 0,name,url,date
0,"July 12, 2022 School Board Meeting",https://nvmcsd.wpengine.com/wp-content/uploads...,2022-07-12
1,"June 8, 2022 Special School Board Meeting",https://nvmcsd.wpengine.com/wp-content/uploads...,2022-06-08
2,"May 31, 2022 Special School Board Meeting",https://nvmcsd.wpengine.com/wp-content/uploads...,2022-05-31
3,Item #2,https://nvmcsd.wpengine.com/wp-content/uploads...,2022-05-31
4,Item #3,https://nvmcsd.wpengine.com/wp-content/uploads...,2022-05-31


## Tier 3: Download the PDFs but name them after the date

In [30]:
# Write the PDF URLs to urls.txt, one per line, as the input list for wget.
file_content = "\n".join(df["url"])

with open("urls.txt", "w") as url_list_file:
    url_list_file.write(file_content)

In [31]:
#I couldn't figure out how I can download multiple files
#and change the file names
#in one line with wget

#wget -i urls.txt -O new_file_nameslist.txt --directory-prefix=mineral-county

In [32]:
# Downloading and renaming are done as two separate steps.
# Step 1: download every URL listed in urls.txt into the "mineral-county" folder.
!wget -i urls.txt --directory-prefix=mineral-county

--2022-07-12 16:48:57--  https://nvmcsd.wpengine.com/wp-content/uploads/2022/07/7.12.22-backup.pdf
Resolving nvmcsd.wpengine.com (nvmcsd.wpengine.com)... 35.227.4.241
Connecting to nvmcsd.wpengine.com (nvmcsd.wpengine.com)|35.227.4.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3474406 (3.3M) [application/pdf]
Saving to: ‘mineral-county/7.12.22-backup.pdf’


2022-07-12 16:48:57 (9.67 MB/s) - ‘mineral-county/7.12.22-backup.pdf’ saved [3474406/3474406]

--2022-07-12 16:48:57--  https://nvmcsd.wpengine.com/wp-content/uploads/2022/06/6.8.22-agenda.pdf
Reusing existing connection to nvmcsd.wpengine.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 44236 (43K) [application/pdf]
Saving to: ‘mineral-county/6.8.22-agenda.pdf’


2022-07-12 16:48:57 (2.18 MB/s) - ‘mineral-county/6.8.22-agenda.pdf’ saved [44236/44236]

--2022-07-12 16:48:57--  https://nvmcsd.wpengine.com/wp-content/uploads/2022/05/board-backup5.31.22.pdf
Reusing existing connection to n

HTTP request sent, awaiting response... 200 OK
Length: 42168 (41K) [application/pdf]
Saving to: ‘mineral-county/safety-committee-1.5.22.pdf’


2022-07-12 16:49:00 (29.0 MB/s) - ‘mineral-county/safety-committee-1.5.22.pdf’ saved [42168/42168]

--2022-07-12 16:49:00--  https://nvmcsd.wpengine.com/wp-content/uploads/2022/01/1.14.22-board-meeting.pdf
Reusing existing connection to nvmcsd.wpengine.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 36695 (36K) [application/pdf]
Saving to: ‘mineral-county/1.14.22-board-meeting.pdf’


2022-07-12 16:49:00 (26.9 MB/s) - ‘mineral-county/1.14.22-board-meeting.pdf’ saved [36695/36695]

--2022-07-12 16:49:00--  https://nvmcsd.wpengine.com/wp-content/uploads/2022/01/1.14.22-amended-agenda.pdf
Reusing existing connection to nvmcsd.wpengine.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 39416 (38K) [application/pdf]
Saving to: ‘mineral-county/1.14.22-amended-agenda.pdf’


2022-07-12 16:49:00 (37.8 MB/s) - ‘mineral-county/1

HTTP request sent, awaiting response... 200 OK
Length: 3757648 (3.6M) [application/pdf]
Saving to: ‘mineral-county/10.16.21-backup.pdf’


2022-07-12 16:49:03 (6.27 MB/s) - ‘mineral-county/10.16.21-backup.pdf’ saved [3757648/3757648]

--2022-07-12 16:49:03--  https://nvmcsd.wpengine.com/wp-content/uploads/2021/10/10.19.21-insurance-meeting.pdf
Reusing existing connection to nvmcsd.wpengine.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 33220 (32K) [application/pdf]
Saving to: ‘mineral-county/10.19.21-insurance-meeting.pdf’


2022-07-12 16:49:03 (1.52 MB/s) - ‘mineral-county/10.19.21-insurance-meeting.pdf’ saved [33220/33220]

--2022-07-12 16:49:03--  https://nvmcsd.wpengine.com/wp-content/uploads/2021/10/10.19.21-agenda.pdf
Reusing existing connection to nvmcsd.wpengine.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 68650 (67K) [application/pdf]
Saving to: ‘mineral-county/10.19.21-agenda.pdf’


2022-07-12 16:49:04 (133 KB/s) - ‘mineral-county/10.19.21-a

In [33]:
#step 2: rename filename
file_dir = "./mineral-county/"

for i in range(len(df)):
    sourcefile = file_dir + os.path.basename(df.url[i])
    date_count = ("-"+str((df.date[:i+1] == df.date[i]).sum())).replace("-1","")
    filename = file_dir + df.date[i].strftime('%Y-%m-%d') + date_count + ".pdf"
    os.rename(sourcefile, filename)

# RockTumbler
Scrape https://rocktumbler.com/blog/rock-and-mineral-clubs/ (Links to an external site.)
<li>Tier 1: Scrape all of the name and city</li>
<li>Tier 2: Scrape the name, city, and URL</li>
<li>Tier 3: Scrape the name, city, URL, and state name (you'll probably need to learn about "parent" nodes)</li>

## Tier 1: Scrape all of the name and city

## Tier 2: Scrape the name, city, and URL

In [34]:
# Directory page of rock and mineral clubs.
url = "https://rocktumbler.com/blog/rock-and-mineral-clubs/"

In [35]:
# Fetch the directory page and parse it.
response = requests.get(url)
doc = bs(response.content)

In [36]:
# Links that sit directly in a table cell, following the exact chain
# section > table > tr > td > a.
tags = doc.select("section>table>tr>td>a")

In [37]:
# Build one record per club link: name, city and URL.
clubs = []
for tag in tags:
    try:
        data = {"name": tag.text,
                # The city is read from the node two siblings after the link's parent cell.
                "city": tag.parent.next_sibling.next_sibling.text,
                "url" : tag['href']}
    except (AttributeError, KeyError):
        # The link has no such sibling (or no href), so it is not a club row:
        # print it for inspection and move on.
        print(tag.prettify())
    else:
        clubs.append(data)

<a href="https://rocktumbler.com/about-us.shtml#who">
 Hobart M. King
</a>



In [38]:
# One row per club with the columns name, city and url.
df = pd.DataFrame(clubs)

In [39]:
# Preview the first rows.
df.head()

Unnamed: 0,name,city,url
0,Alabama Mineral & Lapidary Society,Birmingham,http://www.lapidaryclub.com/
1,Dothan Gem & Mineral Club,Dothan,http://www.wiregrassrockhounds.com/
2,Huntsville Gem and Mineral Society,Huntsville,http://huntsvillegms.org/
3,Mobile Rock & Gem Society,Mobile,http://www.mobilerockandgem.com/
4,Montgomery Gem & Mineral Society,Montgomery,http://montgomerygemandmineralsociety.com/mgms/


In [40]:
# Preview the last rows.
df.tail()

Unnamed: 0,name,city,url
484,Weis'n'Miners Geology Club,Menasha,http://www.weismuseum.org/geology-club.html
485,Wisconsin Geological Society,West Allis,http://www.wisgeologicalsociety.com/
486,Cody 59ers Rock Club,Cody,http://www.cody59ers.com/
487,Riverton Mineral and Gem Society,Riverton,http://www.rivertonmgs.com/
488,Wyoming State Mineral and Gem Society,Cody,http://wymineralandgemsociety.org


## Tier 3: Scrape the name, city, URL, and state name (you'll probably need to learn about "parent" nodes)

In [41]:
# Rebuild the club records, this time with the state heading attached.
# The last link is left out (presumably the non-club link that the Tier 2 cell
# printed — TODO confirm it is always last).
# The link's great-great-grandparent is the <section>; its first <h3> holds the state title.
clubs = [
    {
        "name": link.text,
        "city": link.parent.next_sibling.next_sibling.text,
        "url": link['href'],
        "state": link.parent.parent.parent.parent.h3.text,
    }
    for link in tags[:-1]
]

In [42]:
# One row per club with the columns name, city, url and state.
df = pd.DataFrame(clubs)

In [43]:
# Keep only the part of the heading before " Rock and Mineral Clubs" (the state name).
df["state"] = df["state"].str.extract("(.*) Rock and Mineral Clubs")

In [44]:
# Display the final clubs table.
df

Unnamed: 0,name,city,url,state
0,Alabama Mineral & Lapidary Society,Birmingham,http://www.lapidaryclub.com/,Alabama
1,Dothan Gem & Mineral Club,Dothan,http://www.wiregrassrockhounds.com/,Alabama
2,Huntsville Gem and Mineral Society,Huntsville,http://huntsvillegms.org/,Alabama
3,Mobile Rock & Gem Society,Mobile,http://www.mobilerockandgem.com/,Alabama
4,Montgomery Gem & Mineral Society,Montgomery,http://montgomerygemandmineralsociety.com/mgms/,Alabama
...,...,...,...,...
484,Weis'n'Miners Geology Club,Menasha,http://www.weismuseum.org/geology-club.html,Wisconsin
485,Wisconsin Geological Society,West Allis,http://www.wisgeologicalsociety.com/,Wisconsin
486,Cody 59ers Rock Club,Cody,http://www.cody59ers.com/,Wyoming
487,Riverton Mineral and Gem Society,Riverton,http://www.rivertonmgs.com/,Wyoming
