In [1]:
import re
import time
import requests
from bs4 import BeautifulSoup
from urllib import robotparser
from matplotlib import pyplot as plt, rcParams
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
import string

In [2]:
def prob1(url='https://ldsminds.com/',
          pages=['/chronological-list-of-all-prophetsapostles/']):
    """Using urllib.robotparser, check if the provided webpages are allowed
    based on the website's robots.txt file.

    Parameters:
        url (str): The website's base url, with or without a trailing slash.
        pages (list): List of strings of webpages to check, each with or
            without a leading slash.
    Returns:
        (list, crawl delay): a list of bools, one per entry of ``pages`` and
        in the same order, telling whether the default User-agent ("*") may
        fetch that page, followed by the value ``crawl_delay("*")`` reports
        for the default User-agent (``None`` when robots.txt gives none).
    """
    # Join base and path with exactly one slash. Plain concatenation of a base
    # ending in "/" with a path starting with "/" yields "//path", which does
    # not match ordinary "Disallow: /path" rules and so hides restrictions.
    base = url.rstrip('/')

    rp = robotparser.RobotFileParser()
    # Set the URL for the robots.txt file. Note that the URL contains 'robots.txt'
    rp.set_url(base + "/robots.txt")
    rp.read()
    # Request the crawl-delay time for the default User-agent
    crawl_delay_time = rp.crawl_delay("*")  # * is the default User-agent
    # Check whether User-agent "*" can access each page
    can_access_list = [
        rp.can_fetch("*", base + "/" + page.lstrip('/')) for page in pages
    ]
    return can_access_list, crawl_delay_time
# Run the check with the default site and page; the cell displays the
# (list of access flags, crawl delay) tuple that prob1 returns.
prob1()

([True], None)

This shows that we are allowed to access the site above and the requested page. The robots.txt file specifies no crawl delay.

In [3]:
def clean_head(head):
    """Pull the title text out of an underlined header span.

    ``head`` is converted with ``str`` and searched for a
    ``<span style="text-decoration: underline;">`` element. The text between
    that opening tag and the last ``</span>`` on the same line is returned.
    Raises AttributeError when the text contains no such span.
    """
    underlined_span = re.compile(r"<span style=\"text-decoration: underline;\">(.*)</span>")
    # search() yields None when nothing matches, so .group fails loudly there.
    return underlined_span.search(str(head)).group(1)

In [4]:
def separate_callings(organized):
    """Split every tag of every group into its inner lines.

    Parameters:
        organized: a sequence of groups, each group a sequence of items
            (tags or strings) that are rendered with ``str``.
    Returns:
        list: one flat list of strings per group. Each item's text is split
        on newlines and its first and last lines are discarded; the remaining
        lines of all items in the group are concatenated in order.
    """
    flattened_groups = []
    for group in organized:
        inner_lines = []
        for tag in group:
            # Drop the first and last line of the rendered tag
            # (presumably the opening/closing wrapper lines -- verify against the page).
            inner_lines.extend(str(tag).split('\n')[1:-1])
        flattened_groups.append(inner_lines)
    return flattened_groups


In [5]:
def clean_list(dirty_list):
    """Given list of strings, return relevant info.

    Each string is searched for ``<p>Name (digits``: a run of letters,
    commas, periods and spaces directly after ``<p>``, then ``(`` and up to
    four digits.

    Parameters:
        dirty_list (list): strings to scan, e.g. ``"<p>Some Name (1900-1950)</p>"``.
    Returns:
        dict: maps the captured name text (everything between ``<p>`` and the
        ``(``, so a space before the parenthesis is kept) to the captured
        digit string. Strings that do not contain the pattern are skipped.
    """
    entry_pattern = re.compile(r"<p>([A-Za-z,. ]*)\((\d{0,4})")
    info_dict = dict()
    for p_tag in dirty_list:
        found = entry_pattern.search(p_tag)
        if found is None:
            # Not a "<p>Name (year" line (blank line, spacer paragraph, ...):
            # skip it instead of failing on None.group and aborting the parse.
            continue
        # info_dict[Name of Apostle] = year they started
        info_dict[found.group(1)] = found.group(2)
    return info_dict

In [6]:
def get_apostles(filename="apostles.html", callings=['Presidents of the Church', 'First Counselors in the First Presidency', 'Second Counselors in the First Presidency', 'Apostles in the Quorum of the Twelve Apostles']):
    """Read the specified file and load it into BeautifulSoup. Return list of apostles with service dates.

    Parameters:
        filename (str): path of the saved HTML page to parse.
        callings (list): accepted for interface compatibility; the body does
            not currently read it.
    Returns:
        list: one dict per collected section, in the order the sections'
        headers appear in the file, as produced by ``clean_list`` (name text
        mapped to the digits that follow the opening parenthesis).
    """
    # Only the read needs the file handle; parse after it is closed.
    with open(filename, "r") as my_file:
        file_string = my_file.read()
    file_soup = BeautifulSoup(file_string, 'html.parser')

    # find the header tags to separate callings
    header_tags = file_soup.find_all(style="text-decoration: underline;")
    # find the div tags to get the names and the dates (these include threecol-one last)
    div_tags = file_soup.find_all(class_="threecol-one")
    # clean the headers
    headers = [clean_head(header) for header in header_tags]

    # Walk the headers while sliding a window [prev_count:count_by] over
    # div_tags, three divs at a time.
    # NOTE(review): presumably each section is laid out as three column divs -- confirm against the page.
    count_by = 3
    prev_count = 0
    organized = []
    for head in headers:
        if head == 'Assistant Presidents of the Church':
            # Section is not collected and the window does not move.
            pass
        elif head == 'Assistant Counselors in the First Presidency':
            # Section is not collected, but its three divs are stepped over.
            prev_count += 3
            count_by += 3
        elif head == 'Apostles in the Quorum of the Twelve Apostles':
            # This section takes every remaining div.
            organized.append(div_tags[prev_count:])
        else:
            organized.append(div_tags[prev_count: count_by])
            prev_count = count_by
            count_by += 3

    chunked_callings = separate_callings(organized)
    information = [clean_list(calling) for calling in chunked_callings]
    return information

In [7]:
# Parse the default "apostles.html" file into a list with one
# name -> start-year dictionary per collected section.
information = get_apostles()

In [8]:
# Give names to the first four section dictionaries returned by get_apostles.
# NOTE(review): assumes the sections arrive in this order (presidents, first
# counselors, second counselors, Quorum of the Twelve) -- confirm against the
# header order in the parsed page.
Prophets = information[0]
First_Counselors = information[1]
Second_Counselors = information[2]
Quorum_12_Apostles = information[3]