> Note: Make sure to run `pip install -r requirements.txt` before running so you'll have all the required packages.

In [None]:
!python -m pip install -r requirements.txt

In [153]:
import requests
from lxml import etree
import pandas as pd
import re
import itertools
import json
import googletrans
from googletrans import Translator

In [2]:
range = (325, 1700)
is_in_range = lambda x: x >= range[0] and range[1] >= x
all_diocese_link = lambda country: "http://www.catholic-hierarchy.org/country/d{}.html".format(country)
dioces_link = lambda dioces: "http://www.catholic-hierarchy.org/diocese/{}.html".format(dioces)

In [189]:
countries = ["Russian Federation"]
translator = Translator()
all_countries_url = "http://www.catholic-hierarchy.org/diocese/qview7.html"
page_string = requests.get(all_countries_url).content
tree = etree.HTML(page_string)

In [190]:
links = tree.cssselect("table h2 a")
selected_links = [link for link in links if link.text in countries]
country_suffix = re.search("/.+/(.+)\.html", selected_links[0].get("href")).group(1)
diocese_link = all_diocese_link(country_suffix)
diocese_link

'http://www.catholic-hierarchy.org/country/dru.html'

In [191]:
def get_all_diocese_links(countries):
    all_countries_url = "http://www.catholic-hierarchy.org/diocese/qview7.html"
    page_string = requests.get(all_countries_url).content
    tree = etree.HTML(page_string)
    
    all_links = tree.cssselect("table h2 a")
    selected_links = [link for link in all_links if link.text in countries]
    
    country_suffixes = [re.search("/.+/(.+)\.html", link.get("href")).group(1) for link in selected_links]
    country_dioces_links = [all_diocese_link(suffix) for suffix in country_suffixes]
    return list(zip(country_dioces_links, countries))

In [192]:
dlinks = get_all_diocese_links(countries)
dlinks

[('http://www.catholic-hierarchy.org/country/dru.html', 'Russian Federation')]

In [193]:
diocese_page_string = requests.get(diocese_link).content
diocese_tree = etree.HTML(diocese_page_string)

In [194]:
diocese_links = list(set([re.search("/.+/(.+)\.html", link.get("href")).group(1) for link in diocese_tree.xpath("/html/body/ul[2]")[0].cssselect("a")]))
diocese_links, dioces_link(diocese_links[0])

(['dc613',
  'dirku',
  'dk560',
  'dv532',
  'dv533',
  'dr528',
  'dnovo',
  'druru',
  'dsara',
  'dyuzh',
  'dl567',
  'dm584',
  'dmosc'],
 'http://www.catholic-hierarchy.org/diocese/dc613.html')

In [4]:
def get_country_dioces_links(all_dioces_link):
    diocese_page_string = requests.get(all_dioces_link).content
    diocese_tree = etree.HTML(diocese_page_string)
    
    diocese_links = list(set([re.search("/.+/(.+)\.html", link.get("href")).group(1) for link in diocese_tree.xpath("/html/body/ul[2]")[0].cssselect("a")]))
    return [dioces_link(link) for link in diocese_links]

In [41]:
d1s = get_country_dioces_links(dlinks[0][0])
d1s

['http://www.catholic-hierarchy.org/diocese/dkoto.html',
 'http://www.catholic-hierarchy.org/diocese/db547.html',
 'http://www.catholic-hierarchy.org/diocese/dbary.html']

In [43]:
dioces_string = requests.get(dioces_link(diocese_links[0])).content
d_tree = etree.HTML(dioces_string)

In [44]:
dioces_name = re.search("of (.+)", d_tree.cssselect("h1")[0].text).group(1).strip()
dioces_name

'Kotor (Cattaro)'

In [45]:
bishops = d_tree.cssselect("#d3 li")
bishops = [''.join(bishop.itertext()).strip() for bishop in bishops]
bishops[0]

'Marino Contarini † (10 Jul 1430 Appointed - 19 Nov 1453 Appointed, Bishop of Treviso)'

In [46]:
parens = re.search(".+\((.+)\)", bishops[0]).group(1)
parens

'10 Jul 1430 Appointed - 19 Nov 1453 Appointed, Bishop of Treviso'

In [49]:
frm, to = parens.split("-")
frm = int(frm.split()[-2])
frm

1430

In [48]:
to_splitted = to.split()
to = to_splitted[2] if not to_splitted[1].isnumeric() else to_splitted[0]
to = int(to)
to

1453

In [53]:
to = to.strip()

In [63]:
re.search("(\d+?\s?\w?\s?(\d+)", to).group(0)

'19'

In [6]:
def get_to(to):
    to_splitted = to.split()
    if len(to_splitted) == 2:
        return int(to_splitted[0])
    elif len(to_splitted) == 3:
        return int(to_splitted[1])
    elif len(to_splitted) > 3:
        if to_splitted[0].isnumeric() and to_splitted[2].isnumeric():
            # then 18 dec 2018
            to = int(to_splitted[2])
        elif not to_splitted[0].isnumeric() and to_splitted[1].isnumeric():
            # then dec 2018
            to = int(to_splitted[1])
        else:
            #then 2018 
            to = int(to_splitted[0])

    return int(to)

def get_from(frm):
    from_pos = -2
    if "Ordained" in frm:
        from_pos = -3
    return int(frm.split()[from_pos])
    
def get_dates(bishop_text, i):
    parens = re.search("\((.+)\)", bishop_text).group(1).strip()
    parens_splt = parens.split("-")
    if len(parens_splt) > 2:
        parens_splt = parens_splt[:2]
    
    frm, to = parens_splt
    frm, to = frm.strip(), to.strip()
    
    frm = get_from(frm)
    
    if len(to) == 0:
        to = None
    elif to == "Did Not Take Effect":
        frm = None
        to = None
    else:
        to = get_to(to)
    
    return frm, to

In [None]:
dates = [get_dates(bishop, i) for i, bishop in enumerate(bishops)]
dates

In [176]:
re.search("(.+)\(\s?\d", bishops[0]).group(1).replace("†","").replace(", O.P.", "").strip()

'Marino Contarini'

In [184]:
def get_name(bishop_text):
    if "†" in bishop_text:
        name = re.search("(.+)\s†", bishop_text).group(1)
    else:
        name = re.search("(.+)\(\s?\d", bishop_text).group(1)

    if "," in name:
        name = re.search("(.+),", name).group(1)
    return name.replace("†","").strip()

In [None]:
names = [get_name(bishop) for bishop in bishops]
names

In [161]:
t = requests.get("https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=text&page=Jan_Zamoyski")
html = t.json()["parse"]["text"]["*"]
tree = etree.HTML(html)

translator.translate("".join(tree.cssselect(".mw-parser-output > p")[2].itertext()), dest = "en").text

'Jan Zamoyski was born on 19 March 1542 to Stanisław Zamoyski and Anna Herburt in Skokówka.[2] He started his education in a school in Krasnystaw but when he was thirteen years old he was sent to study abroad; from 1555 to 1559 he was a page at the royal court in Paris.[3] Already at this young age he attended lectures at the Sorbonne University and Collège de France.[3] In 1559 he briefly visited Poland, then attended the University of Strasbourg; after a few months there he moved to University of Padua, where from 1561 he studied law and received a doctorate in 1564.[2][4] During his years abroad he converted from Calvinism to Roman Catholicism.[4]'

In [206]:
def get_bio(name, translator):
    wiki_suffixes = ["en", "pl", "de", "ro", "ie", "sr", "si", "sk"]
    link = "https://{}.wikipedia.org/w/api.php?action=parse&format=json&prop=text&redirects=True&page={}"
    
    spl = name.split()
    n = "{}_{}".format(spl[0], spl[-1])
    
    resps = []
    for suffix in wiki_suffixes:
        y = requests.get(link.format(suffix, n)).json()
        if "error" not in list(y.keys()):
            resps.append(y)
    
    if len(resps) > 0:
        tree = etree.HTML(resps[0]["parse"]["text"]["*"])
        ps = tree.cssselect(".mw-parser-output > p")
        return translator.translate("" .join(["".join(p.itertext()).strip() for p in ps[1:5]]), dest = "en").text
    else:
        return None

In [166]:
def get_dioces_info(dioces_link):
    dioces_string = requests.get(dioces_link).content
    d_tree = etree.HTML(dioces_string)
    
    dioces_name = re.search("of (.+)", d_tree.cssselect("h1")[0].text).group(1).strip()
    
    bishops = d_tree.cssselect("#d3 li")
    bishops = [''.join(bishop.itertext()).strip() for bishop in bishops]
    
    names = [get_name(bishop) for bishop in bishops]
    dates = [get_dates(bishop, i) for i, bishop in enumerate(bishops)]
    
    info = []
    translator = Translator()
    for name, (frm, to) in zip(names, dates):
        if (frm is None or (frm >= 325 and frm <= 1700)) and (to is None or (to < 1700 and to >= 325)):
            info.append({
                "name": name,
                "from": frm,
                "to": to,
                "diocese": dioces_name,
                "source_link": dioces_link,
                "bio": get_bio(name, translator)
            })
    
    return info

In [None]:
info = get_dioces_info(d1s[0])
info[:5]

In [198]:
def get_info(countries):
    all_country_diocese_links = get_all_diocese_links(countries)
    info = []
    for all_country_diocese_link, country in all_country_diocese_links:
        dioces_links = get_country_dioces_links(all_country_diocese_link)
        for dioces_link in dioces_links:
            dioces_bishops = get_dioces_info(dioces_link)
            for bishop in dioces_bishops:
                bishop["country"] = country
            
            info += dioces_bishops
    return info

In [204]:
infs = get_info(["Romania", "Montenegro", "Ireland", "Serbia", "Slovenia", "Slovakia"])
df = pd.DataFrame(infs)
df.describe()

http://www.catholic-hierarchy.org/diocese/dbrby.html
http://www.catholic-hierarchy.org/diocese/dspis.html
http://www.catholic-hierarchy.org/diocese/dkoby.html
http://www.catholic-hierarchy.org/diocese/dkosi.html
http://www.catholic-hierarchy.org/diocese/dtrna.html
http://www.catholic-hierarchy.org/diocese/dnitr.html
http://www.catholic-hierarchy.org/diocese/dprby.html
http://www.catholic-hierarchy.org/diocese/dzili.html
http://www.catholic-hierarchy.org/diocese/dbrat.html
http://www.catholic-hierarchy.org/diocese/dmlsk.html
http://www.catholic-hierarchy.org/diocese/drozn.html
http://www.catholic-hierarchy.org/diocese/dbaby.html


Unnamed: 0,from,to
count,29.0,28.0
mean,1535.310345,1546.035714
std,150.913671,150.688662
min,880.0,900.0
25%,1484.0,1498.75
50%,1550.0,1569.5
75%,1619.0,1629.25
max,1691.0,1695.0


In [205]:
df

Unnamed: 0,bio,country,diocese,from,name,source_link,to
0,,Slovakia,Nitra,880,Wiching,http://www.catholic-hierarchy.org/diocese/dnit...,900.0
1,He was born into a magnate family as the son N...,Slovakia,Nitra,1438,Dénes Szécsi,http://www.catholic-hierarchy.org/diocese/dnit...,1439.0
2,,Slovakia,Nitra,1440,Ladislav Bebek de Csetnek,http://www.catholic-hierarchy.org/diocese/dnit...,1447.0
3,,Slovakia,Nitra,1449,Miklós,http://www.catholic-hierarchy.org/diocese/dnit...,1456.0
4,,Slovakia,Nitra,1458,Albert Hangácsi,http://www.catholic-hierarchy.org/diocese/dnit...,1458.0
5,,Slovakia,Nitra,1460,Elias,http://www.catholic-hierarchy.org/diocese/dnit...,
6,,Slovakia,Nitra,1463,Thomas de Debrenthe (Branche),http://www.catholic-hierarchy.org/diocese/dnit...,1484.0
7,,Slovakia,Nitra,1484,Gregor,http://www.catholic-hierarchy.org/diocese/dnit...,1492.0
8,,Slovakia,Nitra,1493,Anton von Sankfalva,http://www.catholic-hierarchy.org/diocese/dnit...,1501.0
9,,Slovakia,Nitra,1501,Mikuláš Csáky (de Bačka),http://www.catholic-hierarchy.org/diocese/dnit...,1503.0


TODO:
- keep a list of when each diocese was erected. if we have diocese where the earliest bishop's start year is > erecrected year, go out to the country's wiki page and get the missing bishops. ex: https://ro.wikipedia.org/wiki/Lista_episcopilor_romano-catolici_ai_Transilvaniei (Romania, Alba Iulia)