In [85]:
# webscraping

In [86]:
from bs4 import BeautifulSoup
import requests

In [88]:
url = ("http://raw.githubusercontent.com/joelgrus/data/master/getting-data.html")
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

In [95]:
# find the first paragraphs <p> tag
first_paragraph = soup.find('p')
first_paragraph

<p id="p1">This is the first paragraph.</p>

In [96]:
# get tex contents of a Tag
first_paragraph_text = soup.p.text
print(first_paragraph_text)
first_paragraph_words = soup.p.text.split()
print(first_paragraph_words)

This is the first paragraph.
['This', 'is', 'the', 'first', 'paragraph.']


In [97]:
# extract a tag's attributes by treating it like a dict
first_paragraph_id = soup.p['id']
print(first_paragraph_id)
first_paragraph_id2 = soup.p.get('id')
first_paragraph_id2

p1


'p1'

In [98]:
# get multiple tags at once
all_paragraphs = soup.find_all('p')
print(all_paragraphs)
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]
print(paragraphs_with_ids)

[<p id="p1">This is the first paragraph.</p>, <p class="important">This is the second paragraph.</p>]
[<p id="p1">This is the first paragraph.</p>]


In [101]:
# find tag with specific classes
important_paragraphs = soup('p', {'class':'important'})
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = [p for p in soup('p') if 'important' in p.get('class', [])]

print(important_paragraphs)
print(important_paragraphs2)
print(important_paragraphs3)

[<p class="important">This is the second paragraph.</p>]
[<p class="important">This is the second paragraph.</p>]
[<p class="important">This is the second paragraph.</p>]


In [92]:
spans_inside_divs = [span for div in soup('div') for span in div('span')]
spans_inside_divs

[<span id="name">Joel</span>,
 <span id="twitter">@joelgrus</span>,
 <span id="email">joelgrus-at-gmail</span>]

In [105]:
url= 'https://www.house.gov/representatives'
text = requests.get(url).text
soup = BeautifulSoup(text, "html5lib")
all_urls = [a['href'] for a in soup('a') if a.has_attr('href')]
print(len(all_urls))

967


In [124]:
import re
regex = r"^https?://.*\.house\.gov/?$"
good_urls = [url for url in all_urls if re.match(regex, url)]

In [126]:
good_urls.sort()
good_urls

['https://adams.house.gov',
 'https://adams.house.gov',
 'https://adamsmith.house.gov/',
 'https://adamsmith.house.gov/',
 'https://aderholt.house.gov/',
 'https://aderholt.house.gov/',
 'https://adriansmith.house.gov/',
 'https://adriansmith.house.gov/',
 'https://aguilar.house.gov/',
 'https://aguilar.house.gov/',
 'https://alceehastings.house.gov/',
 'https://alceehastings.house.gov/',
 'https://algreen.house.gov',
 'https://algreen.house.gov',
 'https://allen.house.gov',
 'https://allen.house.gov',
 'https://allred.house.gov/',
 'https://allred.house.gov/',
 'https://amodei.house.gov',
 'https://amodei.house.gov',
 'https://andylevin.house.gov/',
 'https://andylevin.house.gov/',
 'https://anthonybrown.house.gov',
 'https://anthonybrown.house.gov',
 'https://anthonygonzalez.house.gov',
 'https://anthonygonzalez.house.gov',
 'https://armstrong.house.gov',
 'https://armstrong.house.gov',
 'https://arrington.house.gov',
 'https://arrington.house.gov',
 'https://auchincloss.house.gov',


In [127]:
good_urls = list(set(good_urls))
good_urls

['https://johnrose.house.gov/',
 'https://seanmaloney.house.gov',
 'https://delbene.house.gov',
 'https://hartzler.house.gov/',
 'https://spartz.house.gov',
 'https://bacon.house.gov',
 'https://kahele.house.gov',
 'https://owens.house.gov',
 'https://lynch.house.gov/',
 'https://sarbanes.house.gov/',
 'https://kind.house.gov/',
 'https://issa.house.gov',
 'https://tlaib.house.gov/',
 'https://murphy.house.gov',
 'https://norcross.house.gov',
 'https://waltz.house.gov',
 'https://halrogers.house.gov/',
 'https://valadao.house.gov',
 'https://gosar.house.gov/',
 'https://gaetz.house.gov',
 'https://stefanik.house.gov/',
 'https://connolly.house.gov/',
 'https://sylviagarcia.house.gov/',
 'https://bost.house.gov/',
 'https://radewagen.house.gov/',
 'https://espaillat.house.gov',
 'https://cheney.house.gov',
 'https://salazar.house.gov',
 'https://mann.house.gov',
 'https://benniethompson.house.gov/',
 'https://buchanan.house.gov/',
 'https://bilirakis.house.gov/',
 'https://garbarino.hou

In [128]:
len(good_urls)

437

In [132]:
html = requests.get('https://jayapal.house.gov').text
soup = BeautifulSoup(html, 'html5lib')
links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}
links

{'https://jayapal.house.gov/category/news/',
 'https://jayapal.house.gov/category/press-releases/'}

In [134]:
from typing import Dict, Set

In [137]:
press_releases: Dict[str, Set[str]] = {}

In [138]:
for house_url in good_urls:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html5lib')
    pr_links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://johnrose.house.gov/: {'/media/press-releases'}
https://seanmaloney.house.gov: set()
https://delbene.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://hartzler.house.gov/: {'/media-center/press-releases'}
https://spartz.house.gov: {'/media/press-releases'}
https://bacon.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://kahele.house.gov: {'/media/press-releases'}
https://owens.house.gov: {'/media/press-releases'}
https://lynch.house.gov/: {'/press-releases'}
https://sarbanes.house.gov/: {'/media-center/press-releases'}
https://kind.house.gov/: {'/media-center/press-releases'}
https://issa.house.gov: {'/media/press-releases'}
https://tlaib.house.gov/: {'/media/press-releases'}
https://murphy.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://norcross.house.gov: set()
https://waltz.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://halrogers.house.gov/: {'/press-releases'}


KeyboardInterrupt: 

In [141]:
def paragraph_mentions(text, keyword):
    """
    Returns True if a <p> inside the text mentions {keyword}
    """
    soup = BeautifulSoup(text, 'html5lib')
    paragraphs = [p.get_text() for p in soup('p')]
    return any(keyword.lower() in paragraph.lower() for paragraph in paragraphs)

In [142]:
text = """<body><h1>Facebook</h1><p>Twitter</p>"""
keyword = "twitter"
assert paragraph_mentions(text, keyword)
keyword = "facebook"
assert not paragraph_mentions(text, keyword)

In [144]:
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        url = f"{house_url}/{pr_link}"
        text = requests.get(url).text
        keyword = "data"
        if paragraph_mentions(text, keyword):
            print(f"{house_url}")
            break

https://delbene.house.gov


In [145]:
# API's

In [146]:
import json

In [151]:
serialized = """{ "title":"Data Science Book", "author":"Joel Grus", "publicationYear": 2019, "topics": ["data", "science", "datascience", "data science"] }"""
serialized

'{ "title":"Data Science Book", "author":"Joel Grus", "publicationYear": 2019, "topics": ["data", "science", "datascience", "data science"] }'

In [153]:
deserialized = json.loads(serialized)
assert deserialized["publicationYear"] == 2019
assert "data science" in deserialized["topics"]

In [154]:
# unauthenticated API's

In [160]:
github_user = "mcwaage1"
endpoint = f"https://api.github.com/users/{github_user}/repos"
repos = json.loads(requests.get(endpoint).text)

In [161]:
endpoint

'https://api.github.com/users/mcwaage1/repos'

In [174]:
repos

[{'id': 330612750,
  'node_id': 'MDEwOlJlcG9zaXRvcnkzMzA2MTI3NTA=',
  'name': 'datascience',
  'full_name': 'mcwaage1/datascience',
  'private': False,
  'owner': {'login': 'mcwaage1',
   'id': 49893491,
   'node_id': 'MDQ6VXNlcjQ5ODkzNDkx',
   'avatar_url': 'https://avatars.githubusercontent.com/u/49893491?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/mcwaage1',
   'html_url': 'https://github.com/mcwaage1',
   'followers_url': 'https://api.github.com/users/mcwaage1/followers',
   'following_url': 'https://api.github.com/users/mcwaage1/following{/other_user}',
   'gists_url': 'https://api.github.com/users/mcwaage1/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/mcwaage1/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/mcwaage1/subscriptions',
   'organizations_url': 'https://api.github.com/users/mcwaage1/orgs',
   'repos_url': 'https://api.github.com/users/mcwaage1/repos',
   'events_url': 'https://api.github.com/use

In [175]:
len(repos)

7

In [173]:
for repo in repos:
    print(repo['name'])

datascience
django
google-homepage
my-first-blog
python
qs
tributepage


In [191]:
for repo in repos:
    print(f"Name: {repo['name']} \t Created at: {repo['created_at']}")

Name: datascience 	 Created at: 2021-01-18T09:15:14Z
Name: django 	 Created at: 2020-10-23T04:43:18Z
Name: google-homepage 	 Created at: 2019-10-04T09:51:04Z
Name: my-first-blog 	 Created at: 2020-10-03T06:58:41Z
Name: python 	 Created at: 2020-05-25T06:14:17Z
Name: qs 	 Created at: 2020-10-26T10:36:54Z
Name: tributepage 	 Created at: 2019-10-15T19:37:11Z


In [177]:
from collections import Counter
from dateutil.parser import parse

In [192]:
dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)

In [193]:
dates

[datetime.datetime(2021, 1, 18, 9, 15, 14, tzinfo=tzutc()),
 datetime.datetime(2020, 10, 23, 4, 43, 18, tzinfo=tzutc()),
 datetime.datetime(2019, 10, 4, 9, 51, 4, tzinfo=tzutc()),
 datetime.datetime(2020, 10, 3, 6, 58, 41, tzinfo=tzutc()),
 datetime.datetime(2020, 5, 25, 6, 14, 17, tzinfo=tzutc()),
 datetime.datetime(2020, 10, 26, 10, 36, 54, tzinfo=tzutc()),
 datetime.datetime(2019, 10, 15, 19, 37, 11, tzinfo=tzutc())]

In [194]:
month_counts

Counter({1: 1, 10: 5, 5: 1})

In [195]:
weekday_counts

Counter({0: 3, 4: 2, 5: 1, 1: 1})

In [201]:
last_5_repos = sorted(repos, key=lambda r:r["pushed_at"], reverse=True)[:5]
last_5_languages = [repo["language"] for repo in last_5_repos]
last_5_languages

['Jupyter Notebook',
 'Jupyter Notebook',
 'Jupyter Notebook',
 'Python',
 'Python']

In [202]:
# twitter API