In [1]:
import requests
import time
import pickle
import os
import random
from bs4 import BeautifulSoup

#### Dealing with Robots.txt

The following code reads the robots.txt file and creates a dictionary. SUNY Brockport website only contains Disallowed, so we are concerned with that alone. Will use regex to parse links and make sure we don't crawl these wepages

In [2]:
result = os.popen("curl https://www2.brockport.edu/robots.txt").read()
result_data_set = {"Disallowed":[], "Allowed":[]}

for line in result.split("\n"):
    if line.startswith('Allow'):    # this is for allowed url
        result_data_set["Allowed"].append(line.split(': ')[1].split(' ')[0])    # to neglect the comments or other junk info
    elif line.startswith('Disallow'):    # this is for disallowed url
        result_data_set["Disallowed"].append(line.split(': ')[1].split(' ')[0])    # to neglect the comments or other junk info

disallow_list = result_data_set['Disallowed']

# Adding my own filters. These files tend to not work throughout.
disallow_list.extend(['/live/files/', '.html/', 'livewhale', '/bounce/', 'instagram.com', 'twitter.com', 'facebook.com', 'youtube.com', '/search/'])

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1620  100  1620    0     0   8475      0 --:--:-- --:--:-- --:--:--  8481


#### `search_page`
Given list of links, will retreive webpages for each. Stores data in dictionary where the link is the key, and the webpage response is the value

---
#### `get_links_from_webpage`
Given some response object (usually one of the values from the output of `search_page`) will get a list of links on that page, eventually gets passed back into `seach_page`

---
#### `recursive_scrape`
Given a base webpage and depth, will call `search_page` and `get_links_from_webpage` to recursively search a website. This obeys robots.txt (see above) and includes a number of other constraints to reduce the number of errors caught.

In [3]:
def search_page(links: list) -> dict:
    data = {}

    # loop over the links and scrape each page
    for link in links:
        try:
            data[link] = requests.get(link)

            # To not overload the website, sleep 1 sec
            time.sleep(random.uniform(1, 3))
        except Exception as e:
            print(f"An error occurred when processing {link}")
            continue

    return data

def get_links_from_webpage(responseObj) -> list:
    links = []

    soup = BeautifulSoup(responseObj.text, 'html.parser')

    # Find all the links on the page
    links = soup.find_all('a')

    # Filter the links to get only those that point to other pages on the same site
    # This is done by checking if the link starts with '/' (indicating it's a relative link)
    # and doesn't contain '#' (which would indicate it's a link to a specific part of the same page)
    links = [link for link in links if link.get('href') is not None]
    links = [link.get('href') for link in links if link.get('href').startswith('/') and '#' not in link.get('href')]

    # APPLY FILTER... Gets rid of robots.txt files, and some others.
    links = [link for link in links if not any(bad_link in link for bad_link in disallow_list)]

    links = [link.rstrip('/') for link in links]

    # Remove duplicates
    links = list(set(links))

    links = ["https://www2.brockport.edu" + link for link in links]

    return links

def recursive_scrape(webpage, depth):
    links_to_visit = [webpage]
    data = {}

    for interation in range(depth):
        print('Pass', interation)
        
        # visit only links we haven't searched before
        links_to_visit = [link for link in links_to_visit if link not in list(data.keys())]
        links_to_visit = list(set(links_to_visit))

        # add all the links_to_visit to the data
        data.update(search_page(links_to_visit))

        # reset links_to_visit
        links_to_visit = []

        # update the links to visit
        for response in data.values():
            links_to_visit.extend(get_links_from_webpage(response))

    return data

#### Use scraper, and save off the data.

Expected to take 2 hours atleast. ETA 9pm?

In [4]:
data = recursive_scrape("https://www2.brockport.edu", 4)

Pass 0
Pass 1
An error occurred when processing https://www2.brockport.edu/directory
Pass 2
An error occurred when processing https://www2.brockport.edu/academics/catalogs
An error occurred when processing https://www2.brockport.edu/tour
An error occurred when processing https://www2.brockport.edu/support/registration_records/GraduateTransferCreditApproval
An error occurred when processing https://www2.brockport.edu/academics/catalogs/2022
An error occurred when processing https://www2.brockport.edu/career
An error occurred when processing https://www2.brockport.edu/directory
An error occurred when processing https://www2.brockport.edu/support/lits/helpdesk
An error occurred when processing https://www2.brockport.edu/support/enrollment_management/basic_needs/index.html
An error occurred when processing https://www2.brockport.edu/support/lits
Pass 3
An error occurred when processing https://www2.brockport.edu/academics/catalogs/current/programs/museum-minor.html
An error occurred when p

In [5]:
pickle.dump(data, open('scraper_output.p', 'wb'))