In [88]:
import os
from pprint import pprint
from urllib.parse import parse_qs, urlsplit

from requests_html import HTMLSession


# Listing pages on president.yale.edu; each one is passed to get_links() to
# collect the article URLs it links to.
pages = [
    'https://president.yale.edu/speeches-writings/notes-woodbridge-hall',
    'https://president.yale.edu/speeches-writings/speeches',
    'https://president.yale.edu/speeches-writings/statements',
]

def get_pagination(url):
    """Collect the pagination URLs reachable from a listing page.

    Iterating over ``r.html`` yields the fetched page and then, per the
    requests_html documentation, each page reached by following its "next"
    link, so this function issues one request per listing page.

    Args:
        url: URL of the first listing page.

    Returns:
        list[str]: The URLs of the visited pages that contain ``'page'``
        (anywhere but at index 0), in the order they were visited.
    """
    # The context manager closes the session and its pooled connections even
    # if one of the requests fails; the crawl must finish inside the block
    # because following "next" links reuses the same session.
    with HTMLSession() as session:
        r = session.get(url)
        # filter out only if they are pagination URLs
        return [html.url for html in r.html if html.url.find('page') > 0]


def _is_listing_link(link):
    """Return True if ``link`` is a pagination or archive link, not an article."""
    parts = urlsplit(link)
    # Match ``page`` only as a query parameter (``?page=2``) or as a whole path
    # segment, so article slugs that merely contain the word are kept.
    is_paginated = (
        'page' in parse_qs(parts.query, keep_blank_values=True)
        or 'page' in parts.path.split('/')
    )
    return is_paginated or '/archive' in link


def get_links(url, paginate=False):
    """Return the article links found on one listing page.

    Only links located under
    ``https://president.yale.edu/speeches-writings/<section>/`` are kept,
    where ``<section>`` is the last path segment of ``url``; pagination and
    archive links are dropped.

    Args:
        url: Listing page URL. A query string (``?page=N``) or a trailing
            slash is ignored when working out the section.
        paginate: Accepted for backward compatibility. The section is taken
            from the URL path whether or not the URL carries a pagination
            query, so the flag does not affect the result.

    Returns:
        list[str]: Absolute article URLs, in no particular order
        (``absolute_links`` is a set).
    """
    # Use the URL path so that '.../speeches?page=3' and '.../speeches/' both
    # resolve to the 'speeches' section.
    section = urlsplit(url).path.rstrip('/').split('/')[-1]
    prefix = f'https://president.yale.edu/speeches-writings/{section}/'

    # Close the session (and its connections) as soon as the page is parsed.
    with HTMLSession() as session:
        r = session.get(url)
        return [
            link for link in r.html.absolute_links
            if prefix in link and not _is_listing_link(link)
        ]


def get_text(article_url):
    """Download one article and save its main text to ``{site}/{name}.txt``.

    ``name`` and ``site`` are the last and second-to-last ``/``-separated
    parts of ``article_url`` (a trailing slash is ignored). The ``site``
    directory is created relative to the current working directory if it
    does not exist yet.

    Nothing is written when the page has no matching content element or more
    than one; a notice naming the URL is printed instead so the page can be
    checked by hand.

    Args:
        article_url: Absolute URL of the article page.

    Returns:
        None.
    """
    # Ignore a trailing slash so the file name is never empty.
    parts = article_url.rstrip('/').split('/')
    name, site = parts[-1], parts[-2]

    # Close the session (and its connections) once the page is parsed.
    with HTMLSession() as session:
        r = session.get(article_url)
        content_el = r.html.xpath("//div[contains(@class, 'region-inner') and contains(@class, 'region-content-inner')]")

    if not content_el:
        # Report the page and move on instead of failing on content_el[0],
        # so one odd page does not abort a whole scraping loop.
        print("no content found on this site! please double-check:")
        print(article_url)
        return
    if len(content_el) > 1:
        print("there might be more to this site! please double-check:")
        print(article_url)
        return

    os.makedirs(site, exist_ok=True)
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # the punctuation and accented characters found in web text.
    with open(f'{site}/{name}.txt', 'w', encoding='utf-8') as fout:
        fout.write(content_el[0].text)

In [86]:
# Gather the article links from every listing page into one flat list.
link_list = []
for page in pages:
    link_list += get_links(page)

# Preview the first few collected links.
link_list[:5]

['https://president.yale.edu/speeches-writings/notes-woodbridge-hall/reunions-our-grand-finale', 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/yales-revolutionary-history', 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/great-ideas-making', 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/men-s-lacrosse-wins-national-championship', 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/building-powerful-partnerships']


In [None]:
# Run get_text() on every collected link; it writes the article text to a
# .txt file named after the last two parts of the URL.
for link in link_list:
    get_text(link)

In [84]:
# Collect the pagination URLs of the notes-woodbridge-hall listing.
more = get_pagination('https://president.yale.edu/speeches-writings/notes-woodbridge-hall')

In [85]:
# Display the pagination URLs returned by get_pagination().
more

['https://president.yale.edu/speeches-writings/notes-woodbridge-hall?page=1',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall?page=2',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall?page=3',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall?page=4',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall?page=5',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall?page=6',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall?page=7',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall?page=8']

In [91]:
# Gather the article links from each paginated listing page in `more`.
wood_pages = []
for link in more:
    wood_pages += get_links(link, paginate=True)

# Preview the first few collected links.
wood_pages[:5]

['https://president.yale.edu/speeches-writings/notes-woodbridge-hall/research-world',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/science-our-planet-and-our-future',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/legacy-public-service',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/traditions-service',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/big-dog-campus',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/living-your-life-yale',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/sharing-science-yale',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/celebrating-lives-courage-love-and-service',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/treasures-chapel-street',
 'https://president.yale.edu/speeches-writings/notes-woodbridge-hall/reflections-after-charlottesville',
 'https://president.yale.edu/speeches-wri

In [92]:
# Run get_text() on every link gathered from the paginated listing pages; it
# writes the article text to a .txt file named after the last two URL parts.
for link in wood_pages:
    get_text(link)