> Note: Make sure to run `pip install -r requirements.txt` before running so you'll have all the required packages.

In [None]:
!python -m pip install -r requirements.txt

In [1]:
import requests
from lxml import etree
import pandas as pd
import re
import itertools

In [2]:
range = (325, 1700)
is_in_range = lambda x: x >= range[0] and range[1] >= x
all_diocese_link = lambda country: "http://www.catholic-hierarchy.org/country/d{}.html".format(country)
dioces_link = lambda dioces: "http://www.catholic-hierarchy.org/diocese/{}.html".format(dioces)

In [35]:
countries = ["Montenegro"]
all_countries_url = "http://www.catholic-hierarchy.org/diocese/qview7.html"
page_string = requests.get(all_countries_url).content
tree = etree.HTML(page_string)

In [37]:
links = tree.cssselect("table h2 a")
selected_links = [link for link in links if link.text in countries]
country_suffix = re.search("/.+/(.+)\.html", selected_links[0].get("href")).group(1)
diocese_link = all_diocese_link(country_suffix)
diocese_link

'http://www.catholic-hierarchy.org/country/dme.html'

In [12]:
def get_all_diocese_links(countries):
    all_countries_url = "http://www.catholic-hierarchy.org/diocese/qview7.html"
    page_string = requests.get(all_countries_url).content
    tree = etree.HTML(page_string)
    
    all_links = tree.cssselect("table h2 a")
    selected_links = [link for link in all_links if link.text in countries]
    
    country_suffixes = [re.search("/.+/(.+)\.html", link.get("href")).group(1) for link in selected_links]
    country_dioces_links = [all_diocese_link(suffix) for suffix in country_suffixes]
    return list(zip(country_dioces_links, countries))

In [38]:
dlinks = get_all_diocese_links(countries)
dlinks

[('http://www.catholic-hierarchy.org/country/dme.html', 'Montenegro')]

In [39]:
diocese_page_string = requests.get(diocese_link).content
diocese_tree = etree.HTML(diocese_page_string)

In [40]:
diocese_links = list(set([re.search("/.+/(.+)\.html", link.get("href")).group(1) for link in diocese_tree.xpath("/html/body/ul[2]")[0].cssselect("a")]))
diocese_links, dioces_link(diocese_links[0])

(['dkoto', 'db547', 'dbary'],
 'http://www.catholic-hierarchy.org/diocese/dkoto.html')

In [26]:
def get_country_dioces_links(all_dioces_link):
    diocese_page_string = requests.get(all_dioces_link).content
    diocese_tree = etree.HTML(diocese_page_string)
    
    diocese_links = list(set([re.search("/.+/(.+)\.html", link.get("href")).group(1) for link in diocese_tree.xpath("/html/body/ul[2]")[0].cssselect("a")]))
    return [dioces_link(link) for link in diocese_links]

In [41]:
d1s = get_country_dioces_links(dlinks[0][0])
d1s

['http://www.catholic-hierarchy.org/diocese/dkoto.html',
 'http://www.catholic-hierarchy.org/diocese/db547.html',
 'http://www.catholic-hierarchy.org/diocese/dbary.html']

In [43]:
dioces_string = requests.get(dioces_link(diocese_links[0])).content
d_tree = etree.HTML(dioces_string)

In [44]:
dioces_name = re.search("of (.+)", d_tree.cssselect("h1")[0].text).group(1).strip()
dioces_name

'Kotor (Cattaro)'

In [45]:
bishops = d_tree.cssselect("#d3 li")
bishops = [''.join(bishop.itertext()).strip() for bishop in bishops]
bishops[0]

'Marino Contarini † (10 Jul 1430 Appointed - 19 Nov 1453 Appointed, Bishop of Treviso)'

In [46]:
parens = re.search(".+\((.+)\)", bishops[0]).group(1)
parens

'10 Jul 1430 Appointed - 19 Nov 1453 Appointed, Bishop of Treviso'

In [49]:
frm, to = parens.split("-")
frm = int(frm.split()[-2])
frm

1430

In [48]:
to_splitted = to.split()
to = to_splitted[2] if not to_splitted[1].isnumeric() else to_splitted[0]
to = int(to)
to

1453

In [53]:
to = to.strip()

In [63]:
re.search("(\d+?\s?\w?\s?(\d+)", to).group(0)

'19'

In [74]:
# ValueError: invalid literal for int() with base 10: 'Mar 1619 Appointed, Bishop of Nitra'

def get_to(to):
    to_splitted = to.split()
    if len(to_splitted) == 2:
        return int(to_splitted[0])
    elif len(to_splitted) == 3:
        return int(to_splitted[1])
    elif len(to_splitted) > 3:
        if to_splitted[0].isnumeric() and to_splitted[2].isnumeric():
            # then 18 dec 2018
            to = int(to_splitted[2])
        elif not to_splitted[0].isnumeric() and to_splitted[1].isnumeric():
            # then dec 2018
            to = int(to_splitted[1])
        else:
            #then 2018 
            to = int(to_splitted[0])

    return int(to)

def get_from(frm):
    from_pos = -2
    if "Ordained" in frm:
        from_pos = -3
    return int(frm.split()[from_pos])
    
def get_dates(bishop_text, i):
    parens = re.search("\((.+)\)", bishop_text).group(1).strip()
    parens_splt = parens.split("-")
    if len(parens_splt) > 2:
        parens_splt = parens_splt[:2]
    
    frm, to = parens_splt
    frm, to = frm.strip(), to.strip()
    
    frm = get_from(frm)
    
    if len(to) == 0:
        to = None
    elif to == "Did Not Take Effect":
        frm = None
        to = None
    else:
        to = get_to(to)
    
    return frm, to

In [164]:
dates = [get_dates(bishop, i) for i, bishop in enumerate(bishops)]
dates

[(1430, 1453),
 (1453, 1457),
 (1457, 1459),
 (1459, 1471),
 (1471, None),
 (1471, None),
 (1493, 1513),
 (1514, 1540),
 (1540, 1565),
 (1565, 1578),
 (1578, 1581),
 (1581, 1603),
 (1604, 1611),
 (1611, 1620),
 (1620, 1622),
 (1622, 1655),
 (1656, 1688),
 (1688, 1708),
 (1709, 1715),
 (1716, 1718),
 (1718, 1742),
 (1743, 1744),
 (1744, 1761),
 (1762, 1788),
 (1789, 1793),
 (1794, 1796),
 (1796, 1801),
 (1801, 1815),
 (1828, 1853),
 (1854, 1856),
 (1856, 1866),
 (1868, 1879),
 (1879, 1887),
 (1888, 1895),
 (1895, 1937),
 (1938, 1950),
 (1981, 1983),
 (1983, 1996),
 (1996, None)]

In [176]:
re.search("(.+)\(\s?\d", bishops[0]).group(1).replace("†","").replace(", O.P.", "").strip()

'Marino Contarini'

In [8]:
def get_name(bishop_text):
    name = re.search("(.+)\(\s?\d", bishop_text).group(1)
    if "," in name:
        name = re.search("(.+),", name).group(1)
    return name.replace("†","").strip()

In [183]:
names = [get_name(bishop) for bishop in bishops]
names

['Marino Contarini',
 'Bernardo da Venezia',
 'Angelo Fasolo',
 'Marco Negro',
 'Pietro de Bruti',
 'Antonio de Pago',
 'Giovanni Chericato',
 'Trifone Bisanti',
 'Luca Bisanti',
 'Paolo Bisanti',
 'Franjo Župan',
 'Jerónimo Bucchia',
 'Angelo Baroni',
 'Girolamo Rusca',
 'Giuseppe (Giacomo) Pamphilj',
 'Vincenzo Bucchi (Buschio)',
 'Ivan Antun Zboronac',
 'Marino Drago',
 'Francesco Parchich (Parcic)',
 'Simone Gritti',
 'Giacinto Zanobetti',
 'Vincent Drago',
 'Giovanni Antonio Castelli',
 'Stefano dell’Oglio',
 'Giovanni Martino Bernardoni Baccolo',
 'Mihajlo Mate Spalatin',
 'Francesco Pietro Raccamarich',
 'Marco Antonio Gregorina',
 'Stjepan Pavlović-Lučić',
 'Vinko Zubranić',
 'Marko Kalogjera (Calogerà)',
 'Djordje Marčić',
 'Casimiro Forlani',
 'Trifon Radoničić',
 'Franjo Uccelini-Tice',
 'Pavao Butorac',
 'Marko Perić',
 'Ivo Gugic',
 'Ilija Janjić']

In [76]:
def get_dioces_info(dioces_link):
    dioces_string = requests.get(dioces_link).content
    d_tree = etree.HTML(dioces_string)
    
    dioces_name = re.search("of (.+)", d_tree.cssselect("h1")[0].text).group(1).strip()
    
    bishops = d_tree.cssselect("#d3 li")
    bishops = [''.join(bishop.itertext()).strip() for bishop in bishops]
    
    names = [get_name(bishop) for bishop in bishops]
    dates = [get_dates(bishop, i) for i, bishop in enumerate(bishops)]
    
    info = []
    for name, (frm, to) in zip(names, dates):
        info.append({
            "name": name,
            "from": frm,
            "to": to,
            "diocese": dioces_name,
            "source_link": dioces_link
        })
    
    return info

In [208]:
info = get_dioces_info(d1s[0])
info[:5]

[{'name': 'Marino Contarini',
  'from': 1430,
  'to': 1453,
  'diocese': 'Kotor (Cattaro)',
  'source_link': 'http://www.catholic-hierarchy.org/diocese/dkoto.html'},
 {'name': 'Bernardo da Venezia',
  'from': 1453,
  'to': 1457,
  'diocese': 'Kotor (Cattaro)',
  'source_link': 'http://www.catholic-hierarchy.org/diocese/dkoto.html'},
 {'name': 'Angelo Fasolo',
  'from': 1457,
  'to': 1459,
  'diocese': 'Kotor (Cattaro)',
  'source_link': 'http://www.catholic-hierarchy.org/diocese/dkoto.html'},
 {'name': 'Marco Negro',
  'from': 1459,
  'to': 1471,
  'diocese': 'Kotor (Cattaro)',
  'source_link': 'http://www.catholic-hierarchy.org/diocese/dkoto.html'},
 {'name': 'Pietro de Bruti',
  'from': 1471,
  'to': None,
  'diocese': 'Kotor (Cattaro)',
  'source_link': 'http://www.catholic-hierarchy.org/diocese/dkoto.html'}]

In [28]:
def get_info(countries):
    all_country_diocese_links = get_all_diocese_links(countries)
    info = []
    for all_country_diocese_link, country in all_country_diocese_links:
        dioces_links = get_country_dioces_links(all_country_diocese_link)
        for dioces_link in dioces_links:
            dioces_bishops = get_dioces_info(dioces_link)
            for bishop in dioces_bishops:
                bishop["country"] = country
            
            info += dioces_bishops
    return info

In [78]:
infs = get_info(["Romania", "Montenegro"])

In [79]:
len(infs)

292

In [86]:
df = pd.DataFrame(infs)
df

Unnamed: 0,country,diocese,from,name,source_link,to
0,Romania,Kotor (Cattaro),1430.0,Marino Contarini,http://www.catholic-hierarchy.org/diocese/dkot...,1453.0
1,Romania,Kotor (Cattaro),1453.0,Bernardo da Venezia,http://www.catholic-hierarchy.org/diocese/dkot...,1457.0
2,Romania,Kotor (Cattaro),1457.0,Angelo Fasolo,http://www.catholic-hierarchy.org/diocese/dkot...,1459.0
3,Romania,Kotor (Cattaro),1459.0,Marco Negro,http://www.catholic-hierarchy.org/diocese/dkot...,1471.0
4,Romania,Kotor (Cattaro),1471.0,Pietro de Bruti,http://www.catholic-hierarchy.org/diocese/dkot...,
5,Romania,Kotor (Cattaro),1471.0,Antonio de Pago,http://www.catholic-hierarchy.org/diocese/dkot...,
6,Romania,Kotor (Cattaro),1493.0,Giovanni Chericato,http://www.catholic-hierarchy.org/diocese/dkot...,1513.0
7,Romania,Kotor (Cattaro),1514.0,Trifone Bisanti,http://www.catholic-hierarchy.org/diocese/dkot...,1540.0
8,Romania,Kotor (Cattaro),1540.0,Luca Bisanti,http://www.catholic-hierarchy.org/diocese/dkot...,1565.0
9,Romania,Kotor (Cattaro),1565.0,Paolo Bisanti,http://www.catholic-hierarchy.org/diocese/dkot...,1578.0
