### Bishop Data Collection Automation Framework

This notebook provides a basic framework for scraping the relevant data from Wikipedia pages where those pages include lists of bishops for specific dioceses. It can collect from a few basic list formats, but it may not work well with tables or other data sources. As I try to run it on more pages I will try to expand its scope to account for these.

Note: The final collection function takes a `path` argument, the location of the ChromeDriver executable that Selenium needs in order to launch Chrome; this path is generally something like '/Users/*yourname*/Downloads/chromedriver'.

#### *CURRENT ISSUES*

1. Using the *googletrans Translator* raised an error that I have not yet investigated. For now the translation call in *collector* is commented out, and *bio_collector* simply fills the "English Bio" field with an empty string.

2. I have not accounted for the binary *Archbishop* column yet... not sure how to do that... maybe we can check Yada's code? https://github.com/pruksmhc/JobDioceScrape/blob/master/main.py

#### Section I: Definition of Functions

In [620]:
# Package import cell
import os
import re

import numpy as np
import pandas as pd
import selenium
from bs4 import BeautifulSoup
from googletrans import Translator
from requests import get
from selenium import webdriver
# NOTE: a function named `translator` is defined later in this notebook; once that
# cell runs, the name no longer refers to this Translator instance.
translator = Translator()

In [621]:
# Define function to collect data from well-defined list Wikis and store in dictionary
def list_collector(path, url):
    """Scrape every ``<li>`` element of a German Wikipedia page into a dictionary.

    Parameters
    ----------
    path : str
        Location of the chromedriver executable passed to ``webdriver.Chrome``.
    url : str
        Address of the page to load.

    Returns
    -------
    dict
        Maps the text of each ``<li>`` element to a link built from the first
        anchor found inside it (``'https://de.wikipedia.org'`` + its ``href``).
        The value is ``''`` when the element has no anchor, when the anchor has
        no (or an empty) ``href``, or when the resulting link contains
        ``'redlink'`` or ``'index'``.
    """
    driver = webdriver.Chrome(executable_path=path)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails;
        # quit() ends the whole driver session instead of only closing the window.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    bishops_with_links = {}
    for item in soup.select('li'):
        anchor = item.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # No anchor, or an anchor without a usable href (previously a TypeError).
            link = ''
        else:
            # The site prefix is fixed, so hrefs are assumed to be site-relative.
            link = 'https://de.wikipedia.org' + href
            if 'redlink' in link or 'index' in link:
                link = ''
        bishops_with_links[item.text] = link
    return bishops_with_links

In [622]:
# Define function to clean bishop data string and return name, year in, year out
def cleaner(string):
    """Normalise one list entry into ``'name:year_in:year_out'``.

    The entry is expected to carry its years either as the first or as the last
    space-separated token (``'1200–1210 Name'`` or ``'Name 1200–1210'``).

    Parameters
    ----------
    string : str
        Raw text of a list entry.

    Returns
    -------
    str
        ``'name:year_in:year_out'`` when the cleaned text starts with three
        digits, ends with three digits, or ends with ``'??'``; ``year_out`` is
        empty when the year token holds no en dash. Otherwise the cleaned text
        is returned as-is (it then contains no ``':'``).
    """
    # Drop the qualifiers 'ca. ', 'um ' and 'seit ' only when they are whole
    # words, so that names such as 'Blum von ...' are left intact.
    string = re.sub(r'\b(?:ca\.|um|seit) ', '', string)
    # Turn a dash that touches a space into a bare en dash in a single pass.
    # Doing this as consecutive replacements left '1200 - 1210' as '1200 –1210'.
    # Hyphens inside words (no adjacent space) are untouched.
    string = re.sub(r' ?– ?| - ?|- ', '–', string)
    # NOTE(review): '0000' is presumably a placeholder year in some lists — confirm.
    string = string.replace('0000', '–')
    # Colons are removed because ':' is the field separator of the result.
    string = string.replace(':', '')
    string = string.replace('vakant', 'Vacant')
    # Remove parenthesised remarks together with one preceding space.
    string = re.sub(r' ?\([^)]+\)', '', string)
    # Surrounding whitespace would defeat the digit checks below.
    string = string.strip()

    tokens = string.split(' ')
    if string[0:3].isdigit():
        years, name_tokens = tokens[0], tokens[1:]
    elif string[-3:].isdigit() or string.endswith('??'):
        years, name_tokens = tokens[-1], tokens[:-1]
    else:
        return string

    year_parts = years.split('–')
    year_in = year_parts[0]
    year_out = year_parts[1] if len(year_parts) > 1 else ''
    return ' '.join(name_tokens) + ':' + year_in + ':' + year_out

In [647]:
# Define function to set up dataframe, pull in dictionary and clean
def dataframer(dictionary):
    """Build the bishop table from the scraped ``{entry text: link}`` dictionary.

    Each key is passed through ``cleaner`` and split on ``':'``. Entries that do
    not yield the three fields name / from / to are skipped, as are entries
    whose 'From' value starts with ``17``, ``18``, ``19`` or ``2`` or whose 'To'
    value starts with ``18``, ``19`` or ``2``.

    Parameters
    ----------
    dictionary : dict
        Mapping of raw entry text to biography link (``''`` when there is none).

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``From``, ``To`` (strings) and ``Bio Link``. The index
        holds each kept entry's position in ``dictionary``. An empty frame with
        these columns is returned when nothing qualifies; building the frame
        from explicit records avoids the column-count error raised when no
        entry could be parsed.
    """
    # NOTE(review): a 'To' value starting with '17' is kept while a 'From' value
    # starting with '17' is dropped, and the '2' prefix matches any year that
    # begins with 2 — confirm both are intended.
    excluded_from = ('17', '18', '19', '2')
    excluded_to = ('18', '19', '2')

    records = []
    positions = []
    for position, (raw_text, link) in enumerate(dictionary.items()):
        fields = cleaner(raw_text).split(':')
        if len(fields) != 3:
            # cleaner found no year token, so there is no From/To to report.
            continue
        name, year_from, year_to = fields
        if year_from.startswith(excluded_from) or year_to.startswith(excluded_to):
            continue
        records.append([name, year_from, year_to, link])
        positions.append(position)

    return pd.DataFrame(records, columns=['Name', 'From', 'To', 'Bio Link'], index=positions)

In [624]:
# Define function to process available links and collect biographies in dataframe
def bio_collector(path, dataframe):
    """Download the biography text behind every non-empty link in ``dataframe``.

    Parameters
    ----------
    path : str
        Location of the chromedriver executable passed to ``webdriver.Chrome``.
    dataframe : pandas.DataFrame
        Must contain a ``'Bio Link'`` column; ``''`` marks a missing link.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-empty link with the columns ``Bio Link``,
        ``German Bio`` (text of all ``<p>`` elements joined by spaces, newlines
        replaced by spaces) and ``English Bio`` (always ``''``). The frame is
        empty, but still has these columns, when there is nothing to fetch.
    """
    columns = ['Bio Link', 'German Bio', 'English Bio']
    # dict.fromkeys drops repeated links while keeping their first-seen order,
    # so each page is fetched once and a later merge on 'Bio Link' stays 1:1.
    links = [link for link in dict.fromkeys(dataframe['Bio Link']) if link != '']

    bio_list = []
    if links:
        # One browser session serves all pages instead of one launch per link.
        driver = webdriver.Chrome(executable_path=path)
        try:
            for link in links:
                driver.get(link)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                german_bio = ' '.join(
                    paragraph.text for paragraph in soup.find_all('p')
                ).replace('\n', ' ')
                bio_list.append([link, german_bio, ''])
        finally:
            # Shut the browser down even if a page fails to load.
            driver.quit()

    return pd.DataFrame(bio_list, columns=columns)

In [715]:
# Define function to create final, merged, clean dataframe
def merger(bishop_dataframe, bio_dataframe):
    """Attach biographies to the bishop table and discard implausible matches.

    Parameters
    ----------
    bishop_dataframe : pandas.DataFrame
        Needs the columns ``Name`` and ``Bio Link``. It is not modified.
    bio_dataframe : pandas.DataFrame
        Either empty, or a frame with ``Bio Link``, ``German Bio`` and
        ``English Bio`` columns.

    Returns
    -------
    pandas.DataFrame
        ``bishop_dataframe`` left-joined with the biographies on ``Bio Link``
        (or extended by blank biography columns when ``bio_dataframe`` is
        empty), with missing values replaced by ``''``. Where the first
        character of ``Name`` differs from the first character of
        ``German Bio``, both ``Bio Link`` and ``German Bio`` are blanked. Only
        the first row per ``Name`` is kept.
    """
    if not bio_dataframe.empty:
        final = pd.merge(bishop_dataframe, bio_dataframe, on='Bio Link', how='left')
    else:
        # Work on a copy so the caller's frame does not gain columns.
        final = bishop_dataframe.copy()
        final['German Bio'] = ''
        final['English Bio'] = ''
    final = final.fillna('')

    # A biography is only trusted when it starts with the same character as the
    # bishop's name; rows without a biography therefore lose their link too.
    mismatch = final['Name'].str.slice(start=0, stop=1) != final['German Bio'].str.slice(start=0, stop=1)
    final['Bio Link'] = np.where(mismatch, '', final['Bio Link'])
    final['German Bio'] = np.where(mismatch, '', final['German Bio'])

    # NOTE(review): de-duplicating on Name alone also drops different bishops
    # that share a name (and repeated 'Vacant' rows) — confirm this is intended.
    return final.drop_duplicates(subset='Name')

In [712]:
# Define function to translate German column to English column
def translator(dataframe):
    """Fill the ``'English Bio'`` column with translations of ``'German Bio'``.

    NOTE: this function shares its name with the module-level ``translator``
    instance created in the import cell and replaces it once this cell runs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs a ``'German Bio'`` column of strings. The ``'English Bio'`` column
        is written on this frame.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with ``'English Bio'`` holding the
        German-to-English translation of each biography (``''`` for an empty
        biography).
    """
    service = None
    translations = []
    for german_bio in dataframe['German Bio']:
        if not german_bio:
            # Nothing to translate; skip the remote call.
            translations.append('')
            continue
        if service is None:
            # One client for all rows, created only when it is needed.
            service = Translator()
        translations.append(service.translate(german_bio, src='de', dest='en').text)
    # Assign the whole column: writing to the row objects yielded by iterrows()
    # never reaches the DataFrame.
    dataframe['English Bio'] = translations
    return dataframe

In [713]:
# Define function to automate the collection process for a given diocese using above functions
def collector(path, url, country, diocese):
    """Run the whole scraping pipeline for a single diocese.

    Parameters
    ----------
    path : str
        Location of the chromedriver executable.
    url : str
        Page holding the list of bishops.
    country, diocese : str
        Labels written to every row and used as the index of the result.

    Returns
    -------
    pandas.DataFrame
        Output of ``merger`` for this page, indexed by ``Country`` and ``Diocese``.
    """
    bishop_links = list_collector(path, url)
    bishop_frame = dataframer(bishop_links)
    bio_frame = bio_collector(path, bishop_frame)
    # The translation step is currently disabled, so the merged frame is final.
    labelled = merger(bishop_frame, bio_frame).assign(Country=country, Diocese=diocese)
    return labelled.set_index(['Country', 'Diocese'])

In [735]:
# Define function to concatenate dataframes for list of country dioceses
def concatenator(path, input_list):
    """Collect several dioceses and stack their tables.

    Parameters
    ----------
    path : str
        Location of the chromedriver executable.
    input_list : list
        Entries of the form ``[url, country, diocese]``; each is passed to
        ``collector`` in that order.

    Returns
    -------
    pandas.DataFrame
        The per-diocese frames concatenated in input order, or an empty frame
        when ``input_list`` is empty.
    """
    frames = [collector(path, specs[0], specs[1], specs[2]) for specs in input_list]
    if not frames:
        # pd.concat refuses an empty list; an empty table is the natural result.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per diocese.
    return pd.concat(frames)

In [738]:
# Define function to export final dataframe to CSV
def exportify(dataframe, filename):
    """Write ``dataframe`` to the file ``filename`` with ``DataFrame.to_csv``.

    Nothing is returned.
    """
    dataframe.to_csv(filename)

#### Section II: Build & Export CSV files by country

In [743]:
# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to override the machine-specific default below.
mypath = os.environ.get('CHROMEDRIVER_PATH', '/Users/orion/Downloads/chromedriver')

In [742]:
# Each entry is [list page URL, country, diocese], the order `concatenator` reads them in.
denmark = [
    ['https://de.wikipedia.org/wiki/Bistum_Aalborg', 'Denmark', 'Aalborg'],
    ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Aarhus', 'Denmark', 'Aarhus'],
    ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_der_F%C3%A4r%C3%B6er', 'Denmark', 'Faroer'],
    ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_F%C3%BCnen', 'Denmark', 'Funen'],
    ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Gr%C3%B6nland', 'Denmark', 'Gronland'],
    ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Roskilde', 'Denmark', 'Roskilde'],
    ['https://de.wikipedia.org/wiki/Bistum_Viborg', 'Denmark', 'Viborg'],
]

finland = [
    ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Turku#Bisch%C3%B6fe_von_Turku_2', 'Finland', 'Turku'],
]

iceland = [
    ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Sk%C3%A1lholt', 'Iceland', 'Skalholt'],
]



In [None]:
# Scrape every diocese listed in `denmark` (this launches Chrome through selenium)
# and write the combined table to denmark.csv.
denmark_dataframe = concatenator(mypath, denmark)
exportify(denmark_dataframe, 'denmark.csv')

In [744]:
# Scrape every diocese listed in `finland` (this launches Chrome through selenium)
# and write the combined table to finland.csv.
finland_dataframe = concatenator(mypath, finland)
exportify(finland_dataframe, 'finland.csv')

In [None]:
# Scrape every diocese listed in `iceland` (this launches Chrome through selenium)
# and write the combined table to iceland.csv.
iceland_dataframe = concatenator(mypath, iceland)
exportify(iceland_dataframe, 'iceland.csv')