In [1]:
# Set up Imports
from bs4 import BeautifulSoup
from IPython.display import clear_output
import re
import requests
import pandas as pd
import numpy as np

## Production Area

In [2]:
# Field labels for one scraped event; used both as the labels of each
# per-event Series and as the column order of the final DataFrame.
index = [
    "URL",
    "TITLE",
    "DATE",
    "START_TIME",
    "LOCATION",
    "DESCRIPTION",
]

# Number of listing pages to crawl. Starts at 0 (nothing is crawled) and is
# reassigned in the cell that launches the scrape.
page_amount = 0

In [3]:
def get_past_events_urls(verbose = False, num_pages = None, timeout = 30):
    """Collect the event-page URLs linked from the paginated events listing.

    Parameters
    ----------
    verbose : bool, default False
        When False, the cell output is cleared before each page is reported,
        so only the most recent progress block stays visible. When True,
        every progress block is kept.
    num_pages : int, optional
        How many listing pages to visit, starting at page 1. When omitted,
        the module-level ``page_amount`` is read at call time.
    timeout : float, default 30
        Value passed to ``requests.get`` as ``timeout`` for each listing page,
        so a stalled connection cannot hang the scrape indefinitely.

    Returns
    -------
    list of str
        The ``href`` of every matching anchor, in page order. Duplicates are
        kept. A page that lacks the expected section contributes no URLs.
    """
    if num_pages is None:
        num_pages = page_amount

    # Compiled once; only anchors whose href matches this are treated as events.
    link_pattern = re.compile(r'https:\/\/datascience\.ucsd\.edu\/events')
    url_of_events = []

    for page in range(1, num_pages + 1):
        if not verbose:
            clear_output(wait=True)
        page_url = "https://datascience.ucsd.edu/news-and-events/events-2/?page_id_all={}".format(page)
        page_request = requests.get(page_url, timeout=timeout)
        print("*** GETTING EVENT URLS *** ")
        print("Next Page: {}".format(page_url))
        print("Status: {}".format(page_request))

        soup = BeautifulSoup(page_request.text, 'html.parser')
        sections = soup.find_all("div", attrs="page-section")
        # The event links are read from the second matched div. A page with
        # fewer than two such divs (error page, layout change) yields no
        # links instead of raising IndexError and aborting the whole crawl.
        if len(sections) > 1:
            events = sections[1].find_all('a', {'href': link_pattern})
        else:
            events = []

        url_of_events.extend(event.get("href") for event in events)
        print("{} URL's Obtained \n".format(len(events)))

    return url_of_events

In [4]:
def get_event_details(url, timeout = 30):
    """Download one event page and extract its details into a pandas Series.

    Parameters
    ----------
    url : str
        Address of the event page to fetch.
    timeout : float, default 30
        Value passed to ``requests.get`` as ``timeout``.

    Returns
    -------
    pandas.Series
        Object-dtype Series labelled by the module-level ``index`` list.
        ``START_TIME`` holds the result of ``pd.to_datetime`` when the text
        parses, otherwise the raw text. Any field whose HTML element is not
        found on the page is left as NaN rather than raising.
    """
    # Explicit dtype: the fields are text/timestamps, and the default dtype of
    # a Series built without data has differed between pandas versions.
    page = pd.Series(index=index, dtype=object)
    page_request = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page_request.text, 'html.parser')

    # Add URL to page
    page["URL"] = url

    # Get Title of Event
    title_box = soup.find("div", ["pageinfo"])
    if title_box is not None:
        page["TITLE"] = title_box.text.strip()

    # Get Event Logistics. The <li> items are read by position:
    # 0 -> date, 1 -> time range, 2 -> location.
    options = soup.find("ul", ["post-options"])
    event_logistics = options.find_all("li") if options is not None else []

    if len(event_logistics) > 0:
        date_span = event_logistics[0].find("span", ["cs-event-time"])
        if date_span is not None:
            page["DATE"] = date_span.text.strip()

    if len(event_logistics) > 1:
        # Drop the "\xa0to \xa0<rest of line>" tail so only the start remains.
        event_start_time = re.sub("\xa0to \xa0.*", " ", event_logistics[1].text.strip()).strip()
        try:
            # NOTE(review): if the text is only a clock time, the parsed
            # Timestamp presumably takes its date from the current day - confirm.
            page["START_TIME"] = pd.to_datetime(event_start_time)
        except ValueError:
            # Unparseable text is kept verbatim instead of being discarded.
            page["START_TIME"] = event_start_time

    if len(event_logistics) > 2:
        page["LOCATION"] = event_logistics[2].text.strip()

    # Get Event Description: join the stripped text of every paragraph and
    # h1-h3 heading with newlines, then remove the "Event Description" label.
    description_box = soup.find("div", ["rich_editor_text"])
    if description_box is not None:
        text_blocks = description_box.find_all(["p", "h1", "h2", "h3"])
        event_description = "\n".join(block.text.strip() for block in text_blocks)
        page["DESCRIPTION"] = re.sub('Event Description\n', '', event_description).strip()

    return page

In [5]:
def events_list(urls, verbose = False):
    """Fetch the details of every event URL and assemble them into a DataFrame.

    Parameters
    ----------
    urls : sequence of str
        Event page addresses; must support ``len()`` because the progress
        percentage is computed against the total.
    verbose : bool, default False
        When False, the cell output is cleared before each progress line so
        only the latest percentage is visible. When True, all lines are kept.

    Returns
    -------
    pandas.DataFrame
        One row per URL, in input order, with the columns named by the
        module-level ``index`` list. Empty (columns only) when ``urls`` is empty.
    """
    total = len(urls)
    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame one row
    # at a time copies all earlier rows on every iteration.
    rows = []
    for urls_processed, url in enumerate(urls, start=1):
        rows.append(get_event_details(url))
        if not verbose:
            clear_output(wait=True)
        print("*** GETTING EVENT DESCRIPTIONS ***")
        print("{}%".format(np.round((urls_processed / total) * 100, 1)))
    print("*** EVENT DESCRIPTIONS DOWNLOADED  ***")
    return pd.DataFrame(rows, columns=index)

In [7]:
def get_events(verbose = False, export = True):
    """Run the full scrape: gather event URLs, fetch each event, optionally export.

    Parameters
    ----------
    verbose : bool, default False
        Forwarded to ``get_past_events_urls`` and ``events_list``. When False,
        the cell output is also cleared once more before the final messages.
    export : bool, default True
        When true, the resulting frame is written to ``past_events.csv`` in
        the current working directory.

    Returns
    -------
    The object returned by ``events_list`` for the collected URLs.
    """
    event_urls = get_past_events_urls(verbose)
    event = events_list(event_urls, verbose)

    if not verbose:
        clear_output(wait=True)

    if export:
        print("*** EXPORTING EVENTS ***")
        event.to_csv("past_events.csv")

    print("*** DONE ***")
    return event


# Number of listing pages to crawl; get_past_events_urls reads this global.
page_amount = 12

# Launch the scrape. verbose=True keeps every progress message in the output.
events = get_events(verbose=True)

*** GETTING EVENT URLS *** 
Next Page: https://datascience.ucsd.edu/news-and-events/events-2/?page_id_all=1
Status: <Response [200]>
1 URL's Obtained 

*** GETTING EVENT URLS *** 
Next Page: https://datascience.ucsd.edu/news-and-events/events-2/?page_id_all=2
Status: <Response [200]>
10 URL's Obtained 

*** GETTING EVENT URLS *** 
Next Page: https://datascience.ucsd.edu/news-and-events/events-2/?page_id_all=3
Status: <Response [200]>
10 URL's Obtained 

*** GETTING EVENT URLS *** 
Next Page: https://datascience.ucsd.edu/news-and-events/events-2/?page_id_all=4
Status: <Response [200]>
9 URL's Obtained 

*** GETTING EVENT URLS *** 
Next Page: https://datascience.ucsd.edu/news-and-events/events-2/?page_id_all=5
Status: <Response [200]>
10 URL's Obtained 

*** GETTING EVENT URLS *** 
Next Page: https://datascience.ucsd.edu/news-and-events/events-2/?page_id_all=6
Status: <Response [200]>
10 URL's Obtained 

*** GETTING EVENT URLS *** 
Next Page: https://datascience.ucsd.edu/news-and-events/