In [1]:
from urllib.parse import urlparse
def make_link_absolute(rel_url, current_url):
    """
    Resolve a link found on a page into a complete URL.

    Given a relative URL like "/abc/def" or "?page=2"
    and a complete URL like "https://example.com/1/2/3" this function will
    combine the two yielding a URL like "https://example.com/abc/def"

    Resolution is delegated to ``urllib.parse.urljoin``, so besides
    root-relative paths and bare query strings it also handles path-relative
    links ("abc", "../abc"), protocol-relative links ("//host/abc") and
    fragments ("#top").

    Parameters:
        * rel_url:      a URL or fragment
        * current_url:  a complete URL used to make the request that contained a link to rel_url

    Returns:
        A full URL with protocol & domain that refers to rel_url.
        A rel_url that is already absolute is kept; an empty or missing
        (None) rel_url resolves to current_url itself.
    """
    # Local import: the notebook's import cell only brings in urlparse.
    from urllib.parse import urljoin

    if not rel_url:
        # An empty reference points at the document it was found in.
        return current_url
    return urljoin(current_url, rel_url)

In [2]:
# NOTE(review): this binds the name `workbook` to the openpyxl.workbook
# submodule, but the same name is rebound to a loaded Workbook object a few
# lines below, so this import is effectively unused.
from openpyxl import workbook
import requests
import lxml.html
import openpyxl
# Excel file that the scrapers in this notebook write to. It is opened with
# load_workbook, so it has to exist before this cell runs.
path = "/Users/miaoli/Desktop/scraper.xlsx"

# Module-level workbook handle shared by the scrape_* functions, which fill
# `sheet` and then call workbook.save(path).
workbook = openpyxl.load_workbook(path)

# Write into the workbook's active sheet and rename it to "news".
sheet = workbook.active
sheet.title = "news"

In [None]:
def scrape_napp(end_date = '2032-01-01'):
    '''
    Scrape "Not Another Politics Podcast" episodes into the shared worksheet.

    Every episode entry found on the podcast listing page becomes one row:
    (episode title, podcast name, episode URL, date, description).
    Rows are written starting at row 1 of the module-level ``sheet`` and the
    module-level ``workbook`` is then saved to ``path``.

    input:
        end_date(string): cutoff date in a format like "2010-01-01"
        if no input: the default lies far in the future, so the entire
        website is scraped

    Raises:
        requests.exceptions.HTTPError: the listing page answered with an
            error status.
        ValueError: end_date or an episode date could not be parsed.
    '''
    import datetime

    end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    value = []
    url = "https://effectivegov.uchicago.edu/initiatives/podcast"
    response = requests.get(url)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    root = lxml.html.fromstring(response.text)
    podcast = "Not Another Politics Podcast"

    for container in root.find_class("container mb-16"):
        for element in container.getchildren():
            # The link to the episode sits on this element; everything
            # nested below it shares the same URL.
            pod_url = make_link_absolute(element.get("href"), url)
            for wrapper in element.getchildren():
                for card in wrapper.getchildren():
                    details = card.getchildren()[1].getchildren()
                    episode_title = details[1].text_content().strip()
                    description = details[2].text_content().strip()
                    # The first detail reads "<something> | <Month D, YYYY>";
                    # keep only the date part, without surrounding blanks.
                    date = details[0].text_content().split("|")[1].strip()
                    formatted_date = datetime.datetime.strptime(date, "%B %d, %Y")
                    value.append((episode_title, podcast, pod_url, date, description))
                    if formatted_date > end_date:
                        # NOTE(review): this leaves only the innermost loop;
                        # the outer loops keep scanning. Kept as in the
                        # original - confirm whether the whole scan should
                        # stop once the cutoff is passed.
                        break

    # Spreadsheet rows/columns are 1-based.
    for row_idx, row in enumerate(value, start=1):
        for col_idx, cell in enumerate(row, start=1):
            sheet.cell(row = row_idx, column = col_idx, value = str(cell))
    workbook.save(path)

In [70]:
def scrape_news(range_num):
    '''
    Scrape article listings from the Harris news website into the shared
    worksheet.

    Every article on a listing page becomes one row:
    (title, url, date, description). The url column is always empty because
    no article link is extracted. Rows are written starting at row 1 of the
    module-level ``sheet`` and the module-level ``workbook`` is then saved
    to ``path``.

    input:
        range_num: the number of pages to scrape (pages 0 .. range_num - 1)

    Raises:
        requests.exceptions.HTTPError: a listing page answered with an
            error status.
    '''
    value = []
    url_org = "https://harris.uchicago.edu/news-events/news?page="
    for page in range(range_num):
        # url_org already ends in "?page=", so only the page number is
        # appended; a second "?page=" or a trailing quote character would
        # corrupt the query string. The first page uses url_org as is.
        page_url = url_org if page == 0 else url_org + str(page)
        response = requests.get(page_url)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        root = lxml.html.fromstring(response.text)
        for article in root.find_class("node--expanded"):
            children = article.getchildren()
            title = children[1].text_content().strip()
            date = children[2].getchildren()[0].text_content().strip()
            summaries = article.find_class("node--expanded--summary")
            description = summaries[0].text_content().strip() if summaries else ""
            # Placeholder column: the article link is not scraped. Kept in
            # its own variable so it cannot be confused with the page URL.
            article_url = ""
            value.append((title, article_url, date, description))

    # Spreadsheet rows/columns are 1-based.
    for row_idx, row in enumerate(value, start=1):
        for col_idx, cell in enumerate(row, start=1):
            sheet.cell(row = row_idx, column = col_idx, value = str(cell))
    workbook.save(path)

In [71]:
def scrape_in_news(range_num):
    '''
    Scrape the "in the news" listings of the Harris news website into the
    shared worksheet.

    Every item on a listing page becomes one row:
    (title, url, date, description). The url column is always empty because
    no link is extracted. Rows are written starting at row 1 of the
    module-level ``sheet`` and the module-level ``workbook`` is then saved
    to ``path``.

    input:
        range_num: the number of pages to scrape (pages 0 .. range_num - 1)

    Raises:
        requests.exceptions.HTTPError: a listing page answered with an
            error status.
    '''
    value = []
    url_org = "https://harris.uchicago.edu/news-events/news/in-the-news"
    for page in range(range_num):
        # The first page is the bare listing URL; later pages add the page
        # number as a query parameter (no trailing quote character, which
        # would become part of the page value).
        page_url = url_org if page == 0 else f"{url_org}?page={page}"
        response = requests.get(page_url)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        root = lxml.html.fromstring(response.text)
        for item in root.find_class("node--expanded"):
            children = item.getchildren()
            title = children[1].text_content().strip()
            date = children[2].getchildren()[0].text_content().strip()
            summaries = item.find_class("node--expanded--summary")
            description = summaries[0].text_content().strip() if summaries else ""
            # Placeholder column: the item link is not scraped. Kept in its
            # own variable so it cannot be confused with the page URL.
            item_url = ""
            value.append((title, item_url, date, description))

    # Spreadsheet rows/columns are 1-based.
    for row_idx, row in enumerate(value, start=1):
        for col_idx, cell in enumerate(row, start=1):
            sheet.cell(row = row_idx, column = col_idx, value = str(cell))
    workbook.save(path)

In [72]:
# Scrape 13 "in the news" listing pages into the workbook.
scrape_in_news(range_num=13)

In [22]:
def scrap_public_money_pd(end_date = '2032-01-01'):
    '''
    Scrape "Public Money Pod" episodes into the shared worksheet.

    Each embedded player on the podcast page is fetched and becomes one row:
    (title, podcast name, player URL, date formatted like "January 05, 2023").
    Episodes are processed in page order; scanning stops after the first
    episode dated later than end_date (that episode is still recorded).
    Rows are written starting at row 1 of the module-level ``sheet`` and the
    module-level ``workbook`` is then saved to ``path``.

    input:
        end_date(string): cutoff date in a format like "2010-01-01"
        if no input: the default lies far in the future, so the entire
        website is scraped

    Raises:
        requests.exceptions.HTTPError: the podcast page or a player page
            answered with an error status.
        ValueError: end_date or an episode date could not be parsed.
    '''
    import datetime
    import re

    value = []
    end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')

    url = 'https://munifinance.uchicago.edu/cmf-podcast/'
    response = requests.get(url)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    root = lxml.html.fromstring(response.text)
    episodes = root.find_class("et_pb_code_inner")[0].getchildren()
    podcast = 'Public Money Pod'

    for episode in episodes:
        # The first child's src attribute is the URL of the episode's
        # embedded player page, which carries the title and date.
        iframe = episode.getchildren()[0]
        sem = iframe.attrib['src']
        response_sem = requests.get(sem)
        response_sem.raise_for_status()
        sem_root = lxml.html.fromstring(response_sem.text)
        title = sem_root.find_class('episode-title')[0].text_content()
        subtitle = sem_root.find_class('episode-subtitle')[0].text_content()
        # The date follows the bullet and has an ordinal day ("5th January
        # 2023"). Drop the suffix so "1st", "2nd", "3rd", "21st", ... parse
        # as well, not only days ending in "th".
        raw_date = subtitle.split('•')[1].strip()
        plain_date = re.sub(r'^(\d{1,2})(?:st|nd|rd|th)\b', r'\1', raw_date)
        cleaned_date = datetime.datetime.strptime(plain_date, '%d %B %Y')
        value.append((title, podcast, sem, cleaned_date.strftime("%B %d, %Y")))
        if cleaned_date > end_date:
            break

    # Spreadsheet rows/columns are 1-based.
    for row_idx, row in enumerate(value, start=1):
        for col_idx, cell in enumerate(row, start=1):
            sheet.cell(row = row_idx, column = col_idx, value = str(cell))
    workbook.save(path)