In [None]:
import nltk
# Fetch the NLTK 'book' data collection; quiet=True suppresses the progress log.
nltk.download('book',quiet=True)

True

In [None]:
import requests
from urllib.parse import urlsplit
from bs4 import BeautifulSoup

We want to download the text of a White House press statement from this URL:

In [None]:
# Statement page listing the bills signed on 2021-11-30.
url = 'https://www.whitehouse.gov/briefing-room/statements-releases/2021/11/30/bills-signed-bills-signed-s-769-s-894-s-1031-and-s-1095/'

In [None]:
# Download the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [None]:
# Peek at the first 100 characters to confirm that we received HTML.
data[:100]

'<!doctype html>\n<html class="no-js alert__has-cookie" lang="en-US">\n<head>\n\t<meta charset="utf-8">\n\t'

Convert the HTML to a BeautifulSoup object, from which we can easily extract subparts.

In [None]:
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed (and emits GuessedAtParserWarning), so results can vary by machine.
soup = BeautifulSoup(data, 'html.parser')

Using "Inspect Element" in a browser, we figured out that we want the text of all the paragraphs inside the part that looks like this:
```html
<section class="body-content">
...
</section>
```

In [None]:
# Every <section> element whose CSS class is "body-content".
sections = soup.find_all('section', class_='body-content')

In [None]:
# Sanity check: how many matching sections are there?
len(sections)

1

In [None]:
# find() returns the first match, or None when nothing matches.
section = soup.find('section', class_='body-content')
# Fail here with a clear message rather than with an AttributeError in a later cell.
assert section is not None, 'no <section class="body-content"> found; the page layout may have changed'

In [None]:
# All <p> elements found inside the body-content section.
ps = section.find_all('p')

In [None]:
# Number of paragraphs found.
len(ps)

9

In [None]:
# The first paragraph, displayed as a tag (markup included).
ps[0]

<p>On Tuesday, November 30, 2021, the President signed into law:</p>

In [None]:
# get_text() returns just the text of the element, without the surrounding markup.
ps[2].get_text()

'Thank you to Senators Warnock, Moran, Menendez, Murray, Blumenthal, Murphy, Leahy, Lujan, and Tester and Representatives Takano and Bost for their leadership;'

In [None]:
# Plain text of each paragraph in the body-content section, in document order.
ptexts = [paragraph.get_text() for paragraph in section.find_all('p')]

In [None]:
# A sample paragraph to run through the pipeline below.
ptexts[7]

'S. 1095, the “Colonel John M. McHugh Tuition Fairness for Survivors Act of 2021,” which requires the Department of Veterans Affairs to disapprove any course of education offered by public institutions of higher learning that do not charge the in-State tuition rate to individuals using education benefits under the Survivors’ and Dependents’ Educational Assistance Program,'

Now for the processing pipeline: sentence tokenization, word tokenization, and part-of-speech tagging:

In [None]:
# Split the sample paragraph into sentences, then each sentence into word tokens.
list(map(nltk.word_tokenize, nltk.sent_tokenize(ptexts[7])))

[['S.',
  '1095',
  ',',
  'the',
  '“',
  'Colonel',
  'John',
  'M.',
  'McHugh',
  'Tuition',
  'Fairness',
  'for',
  'Survivors',
  'Act',
  'of',
  '2021',
  ',',
  '”',
  'which',
  'requires',
  'the',
  'Department',
  'of',
  'Veterans',
  'Affairs',
  'to',
  'disapprove',
  'any',
  'course',
  'of',
  'education',
  'offered',
  'by',
  'public',
  'institutions',
  'of',
  'higher',
  'learning',
  'that',
  'do',
  'not',
  'charge',
  'the',
  'in-State',
  'tuition',
  'rate',
  'to',
  'individuals',
  'using',
  'education',
  'benefits',
  'under',
  'the',
  'Survivors',
  '’',
  'and',
  'Dependents',
  '’',
  'Educational',
  'Assistance',
  'Program',
  ',']]

In [None]:
# Part-of-speech tagger; nltk.tag re-exports PerceptronTagger from nltk.tag.perceptron.
tagger = nltk.tag.PerceptronTagger()

In [None]:
# Run the full pipeline over every paragraph. The result is a flat list with
# one entry per sentence, each entry being a list of (token, tag) pairs.
doc = [
    tagger.tag(nltk.word_tokenize(sentence))
    for paragraph in section.find_all('p')
    for sentence in nltk.sent_tokenize(paragraph.get_text())
]

In [None]:
# Show the tagged document: one list of (token, tag) pairs per sentence.
doc

[[('On', 'IN'),
  ('Tuesday', 'NNP'),
  (',', ','),
  ('November', 'NNP'),
  ('30', 'CD'),
  (',', ','),
  ('2021', 'CD'),
  (',', ','),
  ('the', 'DT'),
  ('President', 'NNP'),
  ('signed', 'VBD'),
  ('into', 'IN'),
  ('law', 'NN'),
  (':', ':')],
 [('S.', 'NNP'),
  ('796', 'CD'),
  (',', ','),
  ('the', 'DT'),
  ('“', 'NN'),
  ('Protecting', 'NNP'),
  ('Moms', 'NNP'),
  ('Who', 'NNP'),
  ('Served', 'NNP'),
  ('Act', 'NNP'),
  ('of', 'IN'),
  ('2021', 'CD'),
  (',', ','),
  ('”', 'NNP'),
  ('which', 'WDT'),
  ('codifies', 'VBZ'),
  ('the', 'DT'),
  ('Department', 'NNP'),
  ('of', 'IN'),
  ('Veterans', 'NNP'),
  ('Affairs', 'NNP'),
  ('current', 'JJ'),
  ('maternity', 'NN'),
  ('care', 'NN'),
  ('coordination', 'NN'),
  ('program', 'NN'),
  (',', ',')],
 [('Thank', 'NNP'),
  ('you', 'PRP'),
  ('to', 'TO'),
  ('Senators', 'NNPS'),
  ('Warnock', 'NNP'),
  (',', ','),
  ('Moran', 'NNP'),
  (',', ','),
  ('Menendez', 'NNP'),
  (',', ','),
  ('Murray', 'NNP'),
  (',', ','),
  ('Blumenthal',

In [None]:
# Fetch and parse the briefing-room index page.
# NOTE: this rebinds url, data and soup, which previously held the statement page.
url = 'https://www.whitehouse.gov/briefing-room/'
# The timeout avoids hanging the kernel; raise_for_status() surfaces HTTP errors.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
# Name the parser explicitly so the parse does not depend on which parsers are installed.
soup = BeautifulSoup(data, 'html.parser')

In [None]:
# List every <link> element on the page; the rel="next" entry points at page 2 of the index.
soup.find_all('link')

[<link href="https://gmpg.org/xfn/11" rel="profile"/>,
 <link href="https://www.whitehouse.gov/briefing-room/" rel="canonical"/>,
 <link href="https://www.whitehouse.gov/briefing-room/page/2/" rel="next"/>,
 <link as="font" crossorigin="anonymous" href="https://www.whitehouse.gov/wp-content/themes/whitehouse/assets/fonts/Decimal-Book.woff2" rel="preload" type="font/woff2"/>,
 <link as="font" crossorigin="anonymous" href="https://www.whitehouse.gov/wp-content/themes/whitehouse/assets/fonts/Decimal-Semibold.woff2" rel="preload" type="font/woff2"/>,
 <link as="font" crossorigin="anonymous" href="https://www.whitehouse.gov/wp-content/themes/whitehouse/assets/fonts/Decimal-Medium_Web.woff2" rel="preload" type="font/woff2"/>,
 <link as="font" crossorigin="anonymous" href="https://www.whitehouse.gov/wp-content/themes/whitehouse/assets/fonts/Decimal-Bold.woff2" rel="preload" type="font/woff2"/>,
 <link as="font" crossorigin="anonymous" href="https://www.whitehouse.gov/wp-content/themes/whiteho