### Bishop Data Collection Automation Framework

This notebook provides a basic framework for scraping the relevant data from Wikipedia pages where those pages include lists of bishops for specific dioceses. It can collect from a few basic list formats, but it may not work well with tables or other data sources. As I try to run it on more pages I will try to expand its scope to account for these.

Note: The final collection function takes a `path` argument pointing to the ChromeDriver executable that Selenium uses to launch Chrome; this path is generally something like '/Users/*yourname*/Downloads/chromedriver'.

#### *CURRENT ISSUES*

1. The Google Translate package (`googletrans`) has a character limit, so the English translations of longer biographies are left blank (looking into additional solutions)

2. The binary 'Archbishop' indicator is going to be a manual operation for now (looking into additional solutions)

#### Section I: Definition of Functions

In [1]:
# Package import cell
import os
import re

import numpy as np
import pandas as pd
import selenium
from bs4 import BeautifulSoup
from googletrans import Translator
from requests import get
from selenium import webdriver
# NOTE(review): this binds a module-level Translator instance to the name `translator`,
# but the same name is rebound later in this notebook by `def translator(dataframe)`,
# which constructs its own Translator. This instance therefore appears to be unused --
# confirm before removing.
translator = Translator()

In [2]:
# Define function to collect data from well-defined list Wikis and store in dictionary
def list_collector(path, url, base_url='https://de.wikipedia.org'):
    """Scrape a Wikipedia list page and map each list entry to a biography link.

    Parameters
    ----------
    path : str
        Location of the ChromeDriver executable handed to Selenium.
    url : str
        Address of the list page to scrape.
    base_url : str, optional
        Origin prepended to the (site-relative) href of each entry's first
        link. Defaults to the German Wikipedia, which is what every caller in
        this notebook scrapes.

    Returns
    -------
    dict
        Maps the full text of every ``<li>`` inside the ``mw-parser-output``
        container to the absolute URL of its first link, or to ``''`` when the
        entry has no usable link.

    Raises
    ------
    ValueError
        If the page has no ``mw-parser-output`` container.
    """
    driver = webdriver.Chrome(executable_path = path)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # quit() (rather than close()) also ends the chromedriver process, and
        # the finally clause guarantees that happens even if the page load fails.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('div', {'class': 'mw-parser-output'})
    if body is None:
        raise ValueError("No 'mw-parser-output' container found at " + str(url))

    bishops_with_links = {}
    for item in body.find_all('li'):
        # An <a> tag is not guaranteed to carry an href attribute.
        href = item.a.get('href') if item.a is not None else None
        link = base_url + href if href else ''
        # Discard links to missing articles ('redlink'), to 'index' URLs and
        # to in-page anchors ('#'); none of them lead to a biography page.
        if 'redlink' in link or 'index' in link or '#' in link:
            link = ''
        bishops_with_links[item.text] = link
    return bishops_with_links

In [3]:
# Define function to set up dataframe, pull in dictionary and clean
def dataframer(dictionary):
    """Turn the scraped ``{entry text: link}`` mapping into a filtered DataFrame.

    Every key is passed through ``cleaner`` and the resulting
    ``'Name:From:To'`` string is split into the ``Name``, ``From`` and ``To``
    columns; the mapping's values become the ``Bio Link`` column.

    Rows are then dropped when
    * ``From`` starts with '17', '18', '19', '2' or 'None', or contains a '.';
    * ``To`` starts with '18', '19', '2' or 'None';
    * ``From``, ``To`` and ``Bio Link`` are all empty.
    Finally, if any remaining ``To`` starts with '17', everything after the
    first such row is cut off.

    Parameters
    ----------
    dictionary : dict
        Mapping of raw list-entry text to biography URL ('' when absent).

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``From``, ``To``, ``Bio Link``; the index keeps the
        positions of the surviving entries in ``dictionary``.
    """
    columns = ['Name', 'From', 'To']
    if not dictionary:
        # Nothing scraped: hand back an empty, correctly shaped frame.
        return pd.DataFrame(columns = columns + ['Bio Link'])

    rows = [cleaner(entry).split(':') for entry in dictionary]
    bishops = pd.DataFrame(rows, columns = columns)
    bishops['From'] = bishops['From'].astype(str)
    bishops['To'] = bishops['To'].astype(str)
    bishops['Bio Link'] = list(dictionary.values())

    year_from = bishops['From']
    year_to = bishops['To']
    # str.match anchors at the start of the string, so these are prefix tests.
    keep = (~year_from.str.match(r'17|18|19|2|None')
            & ~year_to.str.match(r'18|19|2|None')
            # Literal dot test; regex=False avoids the invalid '\.' escape.
            & ~year_from.str.contains('.', regex=False))
    bishops = bishops[keep]
    bishops = bishops[(bishops['To'] != '') | (bishops['From'] != '') | (bishops['Bio Link'] != '')]

    ends_in_1700s = bishops['To'].str.startswith('17')
    if ends_in_1700s.any():
        # Label-based slice: keep everything up to and including the first
        # row whose term ended in the 1700s.
        bishops = bishops.loc[: ends_in_1700s[ends_in_1700s].index[0], :]
    return bishops

In [4]:
# Define function to clean bishop data string and return name, year in, year out
def cleaner(string):
    """Split one raw list entry into ``'name:year_in:year_out'``.

    The text is normalised first: the qualifiers 'ca.', 'um' and 'seit' are
    dropped, spaced dashes/hyphens are collapsed to a bare en dash, '0000'
    becomes an en dash, colons and parentheses are removed, square brackets
    become spaces, 'vakant' becomes 'Vacant' and the filler words 'erwähnt',
    'genannt' and 'mellem' are removed.

    The result is split on single spaces. Every token containing a digit is
    taken out of the name. The first such token with at least three characters
    supplies the years: ``'a–b'`` gives ``year_in='a'``, ``year_out='b'``;
    a token without an en dash gives ``year_in`` only. Later digit tokens
    (footnote markers, secondary dates) are dropped without touching the
    years already found.

    Parameters
    ----------
    string : str
        Raw text of one list entry.

    Returns
    -------
    str
        ``name + ':' + year_in + ':' + year_out``; always exactly two colons.
    """
    # Only strip the qualifiers when they are whole words, so that a name
    # ending in e.g. 'um ' ("Baum ") is not mangled.
    string = re.sub(r'\b(?:ca\.|um|seit) ', '', string)
    replacements = (
        (' –', '–'), ('– ', '–'), ('- ', '–'), (' -', '–'),
        ('0000', '–'),
        (':', ''),  # ':' is the output separator, so it must not survive in the input
        ('vakant', 'Vacant'),
        ('(', ''), (')', ''), ('[', ' '), (']', ' '),
        ('erwähnt', ''), ('genannt', ''), ('mellem', ''),
    )
    for old, new in replacements:
        string = string.replace(old, new)

    year_in = ''
    year_out = ''
    years_found = False
    name_parts = []
    # Build a new list instead of removing from the one being iterated:
    # removing during iteration silently skips the following token.
    for token in string.split(' '):
        if not any(char.isdigit() for char in token):
            name_parts.append(token)
            continue
        # Digit-bearing tokens never belong to the name. Tokens shorter than
        # three characters are too short to be a year and are just dropped.
        if years_found or len(token) < 3:
            continue
        if '–' in token:
            year_elements = token.split('–')
            year_in = year_elements[0]
            year_out = year_elements[1]
        else:
            year_in = token
        years_found = True

    name = ' '.join(name_parts)
    return str(name) + ':' + str(year_in) + ':' + str(year_out)

In [5]:
# Define function to process available links and collect biographies in dataframe
def bio_collector(path, dataframe):
    """Download the biography page behind every non-empty ``Bio Link``.

    A single Chrome session is reused for all pages and is always shut down
    afterwards. Each distinct link is fetched once, in order of first
    appearance. No browser is started when there is nothing to fetch.

    Parameters
    ----------
    path : str
        Location of the ChromeDriver executable handed to Selenium.
    dataframe : pandas.DataFrame
        Must contain a ``Bio Link`` column; '' marks "no link".

    Returns
    -------
    pandas.DataFrame
        Columns ``Bio Link``, ``German Bio`` (text of all ``<p>`` elements
        joined by spaces, newlines replaced by spaces) and ``English Bio``
        (always ''). Empty when there are no links.
    """
    columns = ['Bio Link', 'German Bio', 'English Bio']
    # dict.fromkeys drops repeated links while keeping first-seen order.
    links = [link for link in dict.fromkeys(dataframe['Bio Link']) if link != '']
    if not links:
        return pd.DataFrame(columns = columns)

    bio_list = []
    driver = webdriver.Chrome(executable_path = path)
    try:
        for link in links:
            driver.get(link)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            paragraphs = soup.find_all('p')
            german_bio = ' '.join(paragraph.text for paragraph in paragraphs).replace('\n', ' ')
            bio_list.append([link, german_bio, ''])
    finally:
        # quit() also terminates the chromedriver process, even on failure.
        driver.quit()
    return pd.DataFrame(bio_list, columns = columns)

In [6]:
# Define function to create final, merged, clean dataframe
def merger(bishop_dataframe, bio_dataframe):
    """Attach biographies to the bishop table and discard implausible matches.

    Parameters
    ----------
    bishop_dataframe : pandas.DataFrame
        Columns ``Name``, ``From``, ``To``, ``Bio Link``. Not modified.
    bio_dataframe : pandas.DataFrame
        Columns ``Bio Link``, ``German Bio``, ``English Bio``; may be empty.

    Returns
    -------
    pandas.DataFrame
        The bishop rows with ``German Bio`` and ``English Bio`` added.
        A link (and its German text) is blanked when the first character of
        the biography differs from the first character of the name. Rows are
        then de-duplicated on ``Name`` and rows with empty ``From``, ``To``
        and ``Bio Link`` are dropped.
    """
    if not bio_dataframe.empty:
        final = pd.merge(bishop_dataframe, bio_dataframe, on='Bio Link', how='left')
    else:
        # Work on a copy so the caller's frame does not silently gain columns.
        final = bishop_dataframe.copy()
        final['German Bio'] = ''
        final['English Bio'] = ''
    final = final.fillna('')

    # Heuristic: a biography whose first character differs from the name's
    # first character is treated as belonging to a different page.
    name_initial = final['Name'].str.slice(start=0, stop=1)
    bio_initial = final['German Bio'].str.slice(start=0, stop=1)
    mismatch = name_initial != bio_initial
    final.loc[mismatch, ['Bio Link', 'German Bio']] = ''

    final = final.drop_duplicates(subset='Name')
    final = final[(final['To'] != '') | (final['From'] != '') | (final['Bio Link'] != '')]
    return final

In [7]:
# Define function to translate German column to English column
def translator(dataframe):
    """Fill ``English Bio`` with a German-to-English translation of ``German Bio``.

    Translation is best effort: when a request fails (for example because the
    text exceeds the service's length limit) the row keeps its existing
    ``English Bio`` value. Rows with an empty ``German Bio`` are not sent to
    the service at all.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``German Bio`` and ``English Bio`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with ``English Bio`` updated.
    """
    result = dataframe.copy()
    client = None
    translations = []
    for german_bio, existing in zip(result['German Bio'], result['English Bio']):
        english_bio = existing
        if german_bio:
            if client is None:
                # One client for the whole frame, created only when needed.
                client = Translator()
            try:
                english_bio = client.translate(german_bio, src='de', dest='en').text
            except Exception:
                # Deliberate best effort: keep the previous value on failure.
                english_bio = existing
        translations.append(english_bio)
    # Assign the whole column at once. Writing into the row objects yielded by
    # iterrows() is not guaranteed to reach the underlying frame.
    result['English Bio'] = translations
    return result

In [8]:
# Define function to automate the collection process for a given diocese using above functions
def collector(path, url, country, diocese):
    """Run the whole scrape-clean-translate pipeline for one diocese.

    Chains ``list_collector`` -> ``dataframer`` -> ``bio_collector`` ->
    ``merger`` -> ``translator``, then adds an empty ``Archbishop`` column and
    labels every row with ``country`` and ``diocese``.

    Parameters
    ----------
    path : str
        Location of the ChromeDriver executable.
    url : str
        Address of the diocese's list page.
    country, diocese : str
        Labels stored on every row and used as the index.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``Country``, ``Diocese``).
    """
    scraped_entries = list_collector(path, url)
    bishop_table = dataframer(scraped_entries)
    biographies = bio_collector(path, bishop_table)
    translated = translator(merger(bishop_table, biographies))
    labelled = translated.assign(Archbishop='', Country=country, Diocese=diocese)
    return labelled.set_index(['Country', 'Diocese'])

In [9]:
# Define function to concatenate dataframes for list of country dioceses
def concatenator(path, input_list):
    """Collect every diocese in ``input_list`` and stack the results.

    Parameters
    ----------
    path : str
        Location of the ChromeDriver executable.
    input_list : list
        Entries of the form ``[url, country, diocese]``; they are read by
        position and forwarded to ``collector``.

    Returns
    -------
    pandas.DataFrame
        The per-diocese frames concatenated in input order, or an empty
        DataFrame when ``input_list`` is empty.
    """
    frames = [collector(path, specs[0], specs[1], specs[2]) for specs in input_list]
    if not frames:
        return pd.DataFrame()
    # One concat over the full list instead of re-copying an accumulator
    # on every iteration.
    return pd.concat(frames)

In [10]:
# Define function to export final dataframe to CSV
def exportify(dataframe, filename):
    """Write ``dataframe`` (index included) to ``filename`` as CSV; returns None."""
    dataframe.to_csv(path_or_buf=filename)

#### Section II: Build & Export CSV files by country

*Orion's list of countries:*
Albania,
Macedonia,
Bosnia & Herzegovina,
Bulgaria,
Croatia,
Denmark,
Finland,
Greece,
Hungary,
Iceland,
Ireland,
Latvia,
Lithuania

In [11]:
# Location of the ChromeDriver executable passed to every collection call.
# Set the CHROMEDRIVER_PATH environment variable to override the default,
# which only exists on the original author's machine.
mypath = os.environ.get('CHROMEDRIVER_PATH', '/Users/orion/Downloads/chromedriver')

In [12]:
# Scrape targets, one list per country. Every entry has the layout
# [list-page URL, country label, diocese label]; `concatenator` reads the
# three fields by position (0, 1, 2) and forwards them to `collector`.
albania = [['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Lezha', 'Albania', 'Lezha'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Durr%C3%ABs', 'Albania', 'Durres'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Shkodra', 'Albania', 'Shkodra',], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Sapa', 'Albania', 'Sapa']]

bosniaherzegovina = [['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Mostar', 'Bosnia & Herzegovina', 'Mostar-Duvno'], 
                     ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Bosnien', 'Bosnia & Herzegovina', 'Vrhbosna'], 
                     ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Trebinje-Mrkan', 'Bosnia & Herzegovina', 'Trebinje-Mrkan']]

bulgaria = [['https://de.wikipedia.org/wiki/Bistum_Nicopolis', 'Bulgaria', 'Nicopolis']]

croatia = [['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Dubrovnik', 'Croatia', 'Dubrovnik'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_und_Erzbisch%C3%B6fe_von_Zagreb', 'Croatia', 'Zagreb'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Zadar', 'Croatia', 'Zadar'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_%C5%A0ibenik', 'Croatia', 'Sibenik'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Hvar', 'Croatia', 'Hvar'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Pedena', 'Croatia', 'Pedena'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Kri%C5%BEevci', 'Croatia', 'Krizevci'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Pore%C4%8D', 'Croatia', 'Porec'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Pula', 'Croatia', 'Pula'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Novigrad', 'Croatia', 'Novigrad'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Krk', 'Croatia', 'Krk']]

denmark = [['https://de.wikipedia.org/wiki/Bistum_Aalborg', 'Denmark', 'Aalborg'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Aarhus', 'Denmark', 'Aarhus'],
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_der_F%C3%A4r%C3%B6er', 'Denmark', 'Faroer'],
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_F%C3%BCnen', 'Denmark', 'Funen'],
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Gr%C3%B6nland', 'Denmark', 'Gronland'],
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Roskilde', 'Denmark', 'Roskilde'], 
           ['https://de.wikipedia.org/wiki/Bistum_Viborg', 'Denmark', 'Viborg']]

finland = [['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Turku#Bisch%C3%B6fe_von_Turku_2', 'Finland', 'Turku']]

greece = [['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Kefalonia', 'Greece', 'Kefalonia'], 
          ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Santorini', 'Greece', 'Santorini'], 
          ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Naxos', 'Greece', 'Naxos'], 
          ['https://de.wikipedia.org/wiki/Bistum_Tinos', 'Greece', 'Tinos'], 
          ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Athen', 'Greece', 'Athens'], 
          ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Syros', 'Greece', 'Syros'], 
          ['https://de.wikipedia.org/wiki/Bistum_Milos', 'Greece', 'Milos'], 
          ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Kreta', 'Greece', 'Krete'], 
          ['https://de.wikipedia.org/wiki/Bistum_Andros', 'Greece', 'Andros'], 
          ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Korfu', 'Greece', 'Korfu'], 
          ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Chios', 'Greece', 'Chios']]

hungary = [['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Veszpr%C3%A9m', 'Hungary', 'Veszprem'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_V%C3%A1c', 'Hungary', 'Vac'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Csan%C3%A1d', 'Hungary', 'Szeged-Csanad'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_F%C3%BCnfkirchen', 'Hungary', 'Pecs'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Kalocsa', 'Hungary', 'Kalocsa'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Gy%C5%91r', 'Hungary', 'Gyor'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Esztergom', 'Hungary', 'Esztergom-Budapest'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Eger', 'Hungary', 'Eger']]

iceland = [['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Sk%C3%A1lholt', 'Iceland', 'Skalholt'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_H%C3%B3lar', 'Iceland', 'Holar']]

ireland = [['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Waterford_und_Lismore', 'Ireland', 'Waterford & Lismore'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Tuam', 'Ireland', 'Tuam'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Ross', 'Ireland', 'Ross'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Raphoe', 'Ireland', 'Raphoe'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Ossory', 'Ireland', 'Ossory'],
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Clonmacnoise', 'Ireland', 'Clonmacnoise'],
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Ardagh', 'Ireland', 'Ardagh'],
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Limerick', 'Ireland', 'Limerick'],
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Cloyne', 'Ireland', 'Cloyne'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Derry', 'Ireland', 'Derry'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Connor', 'Ireland', 'Connor'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Clonfert', 'Ireland', 'Clonfert'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Clogher', 'Ireland', 'Clogher'], 
           ['https://de.wikipedia.org/wiki/Bistum_Killaloe', 'Ireland', 'Killaloe'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Annadown', 'Ireland', 'Annadown'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Down', 'Ireland', 'Down'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Ardfert', 'Ireland', 'Ardfert'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Dublin', 'Ireland', 'Dublin'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Ferns', 'Ireland', 'Ferns'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Emly', 'Ireland', 'Emly'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Cashel', 'Ireland', 'Cashel'], 
           ['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Achonry', 'Ireland', 'Achonry']]

lithuania = [['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Kaunas', 'Lithuania', 'Kaunas'], 
             ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Vilnius', 'Lithuania', 'Vilnius']]

macedonia = [['https://de.wikipedia.org/wiki/Liste_der_Bisch%C3%B6fe_von_Skopje', 'Macedonia', 'Skopje'], 
             ['https://de.wikipedia.org/wiki/Liste_der_Erzbisch%C3%B6fe_von_Ohrid', 'Macedonia', 'Ohrid']]

In [359]:
# Build one combined DataFrame per country (one cell each). Every call runs
# `concatenator`, which invokes `collector` for each diocese in the list and
# therefore launches Chrome through Selenium for the list page and for the
# biography links -- these cells need a working ChromeDriver at `mypath`.
# NOTE(review): presumably slow and network-dependent; consider caching the
# resulting frames before re-running.
albania_dataframe = concatenator(mypath, albania)

In [360]:
bosniaherzegovina_dataframe = concatenator(mypath, bosniaherzegovina)

In [361]:
bulgaria_dataframe = concatenator(mypath, bulgaria)

In [362]:
croatia_dataframe = concatenator(mypath, croatia)

In [363]:
denmark_dataframe = concatenator(mypath, denmark)

In [364]:
finland_dataframe = concatenator(mypath, finland)

In [365]:
greece_dataframe = concatenator(mypath, greece)

In [366]:
hungary_dataframe = concatenator(mypath, hungary)

In [367]:
iceland_dataframe = concatenator(mypath, iceland)

In [368]:
ireland_dataframe = concatenator(mypath, ireland)

In [369]:
lithuania_dataframe = concatenator(mypath, lithuania)

In [370]:
macedonia_dataframe = concatenator(mypath, macedonia)

In [376]:
# Write each country's combined frame to its own CSV file in the working directory.
for country_frame, csv_name in (
    (albania_dataframe, 'albania.csv'),
    (bosniaherzegovina_dataframe, 'bosnia&herzegovina.csv'),
    (bulgaria_dataframe, 'bulgaria.csv'),
    (croatia_dataframe, 'croatia.csv'),
    (denmark_dataframe, 'denmark.csv'),
    (finland_dataframe, 'finland.csv'),
    (greece_dataframe, 'greece.csv'),
    (hungary_dataframe, 'hungary.csv'),
    (iceland_dataframe, 'iceland.csv'),
    (ireland_dataframe, 'ireland.csv'),
    (lithuania_dataframe, 'lithuania.csv'),
    (macedonia_dataframe, 'macedonia.csv'),
):
    exportify(country_frame, csv_name)