In [1]:
import requests
from bs4 import BeautifulSoup
import lxml
import pickle
import csv
import re

In [2]:
# Base URL of English Wikipedia; the site-relative page refs below are appended to it.
prefix = "https://en.wikipedia.org"
# Site-relative ref of the René Descartes article ("%C3%A9" is the percent-encoded "é").
descartes = "/wiki/Ren%C3%A9_Descartes"
#print(parse_connections(prefix, descartes))
# Site-relative ref of the Ibn al-Haytham article.
ibn = "/wiki/Ibn_al-Haytham"
#print(parse_connections(prefix, ibn))

In [13]:
def parse_connections(prefix, ref):
    """Download a Wikipedia page and extract its influence lists.

    The page at ``prefix + ref`` is fetched and parsed with BeautifulSoup.
    A page is only accepted when it has a "div" whose text is "Influenced"
    and another whose text is either "Influences" or "Influenced by"
    (written with a non-breaking space on the page); these labels are
    treated as the sign that the page describes a philosopher or a
    related figure.

    Returns the 4-tuple produced by ``influenced_by_parse`` when the
    "Influenced by" label is present, otherwise the one produced by
    ``influences_parse``. Pages that do not carry the expected labels
    yield ``(None, None, None, None)``.
    """
    response = requests.get(prefix + ref)
    soup = BeautifulSoup(response.content, "lxml")

    def has_label(text):
        # True when some "div" on the page consists of exactly this text.
        return soup.find("div", string = text) is not None

    has_influences = has_label("Influences")
    has_influenced = has_label("Influenced")
    has_influenced_by = has_label("Influenced\xa0by")

    unrecognised = (None, None, None, None)

    # Need at least one "incoming" label and the "outgoing" label.
    if not (has_influences or has_influenced_by):
        return unrecognised
    if not has_influenced:
        return unrecognised

    # The "Influenced by" layout wins when both incoming labels are present.
    if has_influenced_by:
        return influenced_by_parse(soup, ref)
    return influences_parse(soup, ref)

# -----------------------------------------------------------------
# It's important to know that most philosophers had both "Influenced" and "Influences" as
# "div" components whereas most Islamic scholars had "Influenced" and "Influenced by". The
# division of the parse functions is to deal with these two cases.
# -----------------------------------------------------------------

def influences_parse(soup, ref):
    """Parse a page that labels its incoming list "Influences".

    Parameters:
        soup: the parsed page (BeautifulSoup object).
        ref:  the site-relative ref the page was fetched from; it is
              returned unchanged.

    The page title is read from the "h1" with id "firstHeading". The first
    two "div" elements of class "center" are taken to be the "Influences"
    and "Influenced" groups, in that order (this relies on the page layout
    and is not verified further).

    Returns ``(name, ref, influences, influenced)`` where both lists hold
    ``[link text, href]`` pairs as built by ``bs4_list_convert``. If the
    heading is missing or fewer than two such "div" elements exist, returns
    ``(None, None, None, None)`` -- the same "not usable" signal that
    ``parse_connections`` uses -- instead of raising.
    """
    heading = soup.find("h1", id = "firstHeading")
    sections = soup.find_all("div", class_ = "center")

    # An unexpected layout must not abort a long crawl with an
    # IndexError/AttributeError; report the page as unusable instead.
    if heading is None or len(sections) < 2:
        return None, None, None, None

    infs = bs4_list_convert(sections[0])
    infd = bs4_list_convert(sections[1])

    return heading.text, ref, infs, infd

def influenced_by_parse(soup, ref):
    """Parse a page that labels its incoming list "Influenced by".

    Parameters:
        soup: the parsed page (BeautifulSoup object).
        ref:  the site-relative ref the page was fetched from; it is
              returned unchanged.

    Works like ``influences_parse`` but, because these pages use a
    different html structure, the two groups are taken from the first two
    "ul" elements of class "NavContent" rather than from "div" elements of
    class "center".

    Returns ``(name, ref, influences, influenced)`` where both lists hold
    ``[link text, href]`` pairs as built by ``bs4_list_convert``. If the
    heading is missing or fewer than two such "ul" elements exist, returns
    ``(None, None, None, None)`` instead of raising.
    """
    heading = soup.find("h1", id = "firstHeading")
    sections = soup.find_all("ul", class_ = "NavContent")

    # An unexpected layout must not abort a long crawl with an
    # IndexError/AttributeError; report the page as unusable instead.
    if heading is None or len(sections) < 2:
        return None, None, None, None

    infs = bs4_list_convert(sections[0])
    infd = bs4_list_convert(sections[1])

    return heading.text, ref, infs, infd

def bs4_list_convert(div):
    """Turn an html component listing philosophers into ``[name, ref]`` pairs.

    Every "a" element under ``div`` that has an href contributes its text
    and its href. Links whose text contains "[" and links whose href
    contains "wikipedia" are left out. Note that the link text may differ
    from the heading of the page it points to.

    Returns a list of two-element lists ``[text, href]`` in document order.
    """
    anchors = div.find_all("a", href = True)
    candidates = ((anchor.get_text(), anchor["href"]) for anchor in anchors)

    return [
        [text, href]
        for text, href in candidates
        if "[" not in text and "wikipedia" not in href
    ]

In [14]:
def phil_crawl(prefix, ref, philosophers):
    """Parse one philosopher's page and add their neighbours to the network.

    Parameters:
        prefix:       base url, e.g. the English Wikipedia address.
        ref:          site-relative ref of the philosopher's page.
        philosophers: dict mapping page title to
                      ``[ref, influences names, influenced names]``,
                      or ``None`` to start a new network.

    Returns:
        * When ``philosophers`` is ``None``: a new one-entry dict for the
          page itself.
        * Otherwise: the same dict, updated in place with an entry for
          every influencer/influencee page that could be parsed.

    Raises:
        ValueError: when ``philosophers`` is ``None`` and the starting page
            is not recognised as a philosopher page (there is nothing to
            seed the network with).
    """
    name, ref, infs, infd = parse_connections(prefix, ref)

    # parse_connections signals an unusable page with four None values.
    if name is None:
        if philosophers is None:
            raise ValueError("starting page is not a recognisable philosopher page")
        return philosophers

    if philosophers is None:
        return {name: [ref, [row[0] for row in infs], [row[0] for row in infd]]}

    # Link text in a list often differs from the linked page's title, so a
    # check on the text alone re-downloads pages that are already stored.
    # Tracking refs as well avoids those repeated requests.
    attempted = {entry[0] for entry in philosophers.values()}

    for label, link in infs + infd:
        if label in philosophers or link in attempted:
            continue
        attempted.add(link)

        found_name, found_ref, ifs, ifd = parse_connections(prefix, link)
        if found_name is not None:
            philosophers[found_name] = [
                found_ref,
                [row[0] for row in ifs],
                [row[0] for row in ifd],
            ]

    return philosophers


In [15]:
def iterated_crawl(prefix, ref, iterations):
    """Collect the influence network around one starting philosopher.

    Parameters:
        prefix:     base url, e.g. the English Wikipedia address.
        ref:        site-relative ref of the page to start from.
        iterations: number of expansion passes, i.e. roughly how many steps
                    away from the starting philosopher the crawl reaches.

    The network is seeded with a one-entry dict from ``phil_crawl``. On each
    pass every entry that has not been expanded yet is handed to
    ``phil_crawl``, which adds that entry's connections. Entries added
    during a pass are expanded on the next pass. The size of the network is
    printed after seeding and after each pass.

    Returns the dict mapping page title to
    ``[ref, influences names, influenced names]``.
    """
    phils = phil_crawl(prefix, ref, None)
    searched = set()
    print(len(phils))

    for _ in range(iterations):
        # Iterate over a snapshot: the dict grows while it is being expanded,
        # and entries added in this pass must wait for the next pass. One
        # snapshot per pass replaces copying the whole dict after every page.
        for key, value in list(phils.items()):
            if key in searched:
                continue
            phils = phil_crawl(prefix, value[0], phils)
            searched.add(key)
        print(len(phils))

    return phils
    
# philosophers = iterated_crawl(prefix, descartes, 15)

In [118]:
# saves the network in .pkl file
# with open(r'philosophers3.pkl', 'wb') as f:
#     pickle.dump(philosophers, f)

# NOTE: pickle.load can run arbitrary code from the file it reads; only load
# .pkl files that were produced by this notebook.

# loads the first version of the network from .pkl file
# (didn't account for "influenced by" formats)
with open(r'philosophers.pkl', 'rb') as f:
    phil_1 = pickle.load(f)

# loads the second version of the network from .pkl file
# (now accounts for "influenced by")
with open(r'philosophers2.pkl', 'rb') as f:
    phil_2 = pickle.load(f)

# loads the third version of the network from .pkl file
# (more efficient crawling for some reason has more nodes)
with open(r'philosophers3.pkl', 'rb') as f:
    phil_3 = pickle.load(f)


In [17]:
# Print every entry that is present in network version 2
# but absent from network version 1.
for p in phil_2:
    if p in phil_1:
        continue
    print(p)

print("------------------")

# Print every entry that is present in network version 3
# but absent from network version 2.
for p in phil_3:
    if p in phil_2:
        continue
    print(p)

Ibn al-Haytham
Omar Khayyam
Al-Khazini
Mulla Sadra
Al-Khalil ibn Ahmad al-Farahidi
Miskawayh
Nasir al-Din al-Tusi
Ibn Tufail
Abu Amr of Basra
Sibawayh
Sam Harris
Michael Sandel
Rumi
Muhammad Asad
Israr Ahmed
Ibn Tumart
Ibn al-Nafis
Abu Madyan
Abu Said al-Baji
Al-Suyuti
Ismail Haqqi Bursevi
Said Nursî
Niftawayh
Abu Turab al-Zahiri
Bayazid Bastami
Muhammad Abduh
Rashid Rida
Ibn Hazm
Abu Hayyan al-Gharnati
Abdul Qadir Gilani
Abd as-Salam ibn Mashish al-Alami
Abul Hasan ash-Shadhili
Ibn Taymiyyah
Al-Kamal ibn al-Humam
Al-Sha'rani
Osman Fazli
Dawud al-Zahiri
Muhammad bin Dawud al-Zahiri
Abu Abd al-Rahman Ibn Aqil al-Zahiri
Al-Hallaj
Jamāl al-Dīn al-Afghānī
Muhammad Nasiruddin al-Albani
Ibn Khaldun
Ibn Mada'
Al-Dhahabi
Muhammad al-Jazuli
Al-Hasan ibn 'Ali al-Barbahari
Ibn Battah
Ibn Qudamah
Muhammad ibn Abd al-Wahhab
Abu Hanifa
Zakariyya al-Ansari
Ibn Hajar al-Haytami
Ibrahim al-Bajuri
Al-Shafiʽi
Ahmad ibn Hanbal
Ibn Kullab
Ishaq ibn Rahwayh
Yaḥya ibn Maʻin
Al-Tabari
Ruwaym
Imadaddin Nasimi


In [115]:

def date_clean(date):
    """Pull the first recognisable date out of a free-form infobox string.

    ``date`` is the raw text of a "Born"/"Died" cell, or ``None``.
    Four date shapes are tried, most specific first:

        1. day month year      e.g. "26 January 1917"
        2. month day[,] year   e.g. "June 30, 1935"
        3. digits-digits-digits e.g. "1917-01-26"
        4. a bare run of two to four digits, e.g. "1947"

    For each shape four placements are tried in this order: at the start
    of the string followed by a non-digit, somewhere inside the string,
    spanning the whole string, and at the end of the string. The first
    pattern that matches wins; the neighbouring characters that were only
    matched to delimit the date are trimmed off.

    Returns the matched date text, or ``None`` when ``date`` is ``None``
    or nothing matches.
    """
    if date is None:
        return None

    # How much of the raw match to keep for each placement.
    drop_last = slice(None, -1)    # delimiter matched after the date
    drop_both = slice(1, -1)       # delimiters matched on both sides
    keep_all = slice(None)         # the match is exactly the date
    drop_first = slice(1, None)    # delimiter matched before the date

    attempts = (
        # "[0|00] Month [000|0000]"
        ("^[0-9]{1,2} [A-Za-z]* [0-9]{3,4}[^0-9]", drop_last),
        ("[^0-9][0-9]{1,2} [A-Za-z]* [0-9]{3,4}[^0-9]", drop_both),
        ("^[0-9]{1,2} [A-Za-z]* [0-9]{3,4}$", keep_all),
        ("[^0-9][0-9]{1,2} [A-Za-z]* [0-9]{3,4}$", drop_first),
        # "Month [0|00], [000|0000]"
        ("^[A-Z][a-z]* [0-9]{1,2},? [0-9]{3,4}[^0-9]", drop_last),
        ("[^A-Z][A-Z][a-z]* [0-9]{1,2},? [0-9]{3,4}[^0-9]", drop_both),
        ("^[A-Z][a-z]* [0-9]{1,2},? [0-9]{3,4}$", keep_all),
        ("[^A-Z][A-Z][a-z]* [0-9]{1,2},? [0-9]{3,4}$", drop_first),
        # "[00|000|0000]-00-[00|000|0000]"
        ("^[0-9]{2,4}-[0-9]{2}-[0-9]{2,4}[^0-9]", drop_last),
        ("[^0-9][0-9]{2,4}-[0-9]{2}-[0-9]{2,4}[^0-9]", drop_both),
        ("^[0-9]{2,4}-[0-9]{2}-[0-9]{2,4}$", keep_all),
        ("[^0-9][0-9]{2,4}-[0-9]{2}-[0-9]{2,4}$", drop_first),
        # "[00|000|0000]"
        ("^[0-9]{2,4}[^0-9]", drop_last),
        ("[^0-9][0-9]{2,4}[^0-9]", drop_both),
        ("^[0-9]{2,4}$", keep_all),
        ("[^0-9][0-9]{2,4}$", drop_first),
    )

    for pattern, keep in attempts:
        found = re.search(pattern, date)
        if found is not None:
            return found.group()[keep]

    return None
    
# Print, for every entry of phil_4, the raw values stored at positions 3 and 4
# followed by what date_clean extracts from each of them.
# enumerate() walks the values once; rebuilding list(phil_4.values()) for every
# index made the loop quadratic in the size of the network.
for i, p in enumerate(phil_4.values()):
    print("index: ", i, "\t",p[3],"\t", p[4],"\n")
    print(date_clean(p[3]),"\n")
    print(date_clean(p[4]),"\n")


dex:  615 	 (1917-01-26)26 January 1917Vienna, Austria-Hungary 	 (2010-11-07)7 November 2010 

26 January 1917 

7 November 2010 

index:  616 	 1947Como, Italy 	 None 

1947 

None 

index:  617 	 28 January 1920Paris 	 24 August 2016 (96 years) 

28 January 1920 

24 August 2016 

index:  618 	 28 September 1828Wald near Solingen 	 21 November 1875(1875-11-21) (aged 47)Marburg 

28 September 1828 

21 November 1875 

index:  619 	 1017 	 1073 (age 56) 

1017 

1073 

index:  620 	 1033 	 1107 (aged 74) 

1033 

1107 

index:  621 	 (1934-09-12)September 12, 1934Daegu,Korea (now in S. Korea) 	 November 27, 2019(2019-11-27) (aged 85) 

September 12, 1934 

November 27, 2019 

index:  622 	 (1938-11-16)November 16, 1938Brooklyn, New York, U.S. 	 January 23, 2002(2002-01-23) (aged 63)Cambridge, Massachusetts, U.S. 

November 16, 1938 

January 23, 2002 

index:  623 	  (1935-06-30) June 30, 1935 (age 85)New York, United States 	 None 

June 30, 1935 

None 

index:  624 	 Étienne Émile M

In [123]:
# Inspect a single entry: the value stored at position 150 of phil_4 (insertion order).
list(phil_4.values())[150]

['/wiki/Emmanuel_Levinas',
 ['Henri Bergson',
  'Buber',
  'Monsieur Chouchani',
  'Descartes',
  'Heidegger',
  'Husserl',
  'Kierkegaard',
  'Shestov',
  'Maimonides',
  'Marcel',
  'Rosenzweig',
  'Wahl'],
 ['Agamben',
  'Bauman',
  'Bernasconi',
  'Blanchot',
  'Butler',
  'Critchley',
  'Derrida',
  'Dussel',
  'Finkielkraut',
  'B. Lévy',
  'B.-H. Lévy',
  'Marion',
  'Putnam',
  'Ricoeur',
  'Ronell'],
 '12 January 1906',
 '25 December 1995']

In [117]:

# Infobox row labels to extract. NOTE: this name shadows the builtin vars();
# it is kept because the calls below pass it by this name.
vars = ["Born","Died"]

def table_scrape(prefix, ref, vars):
    """Scrape selected infobox rows from a Wikipedia page.

    Parameters:
        prefix: base url, e.g. the English Wikipedia address.
        ref:    site-relative ref of the page.
        vars:   list of row labels to look for (the text of the row's "th").

    The rows are taken from the table of class "infobox biography vcard",
    or from the first "tbody" on the page when no such table exists.
    Only the labels "Born" and "Died" are actually recorded, and their cell
    text is reduced to a date with ``date_clean``; any other requested
    label is always reported as ``None``.

    Returns a list of ``[label, value]`` pairs, one per entry of ``vars``
    and in the same order, with ``None`` as the value for labels that were
    not found. A page with no table at all yields ``None`` for every label
    instead of raising.
    """
    page = requests.get(prefix + ref)
    soup = BeautifulSoup(page.content, "lxml")

    table = soup.find("table", class_ = "infobox biography vcard")
    if table is None:
        table = soup.find("tbody")

    # Some pages have neither an infobox nor any table body; treat them as
    # having no rows so one odd page cannot abort a whole enrichment run.
    rows = table.find_all("tr") if table is not None else []

    row_dict = {}
    for r in rows:
        th = r.find("th")
        td = r.find("td")
        if th is None or td is None:
            continue
        label = th.text
        if label in vars and label in ("Born", "Died"):
            row_dict[label] = date_clean(td.text)

    return [[var, row_dict.get(var)] for var in vars]

# Try the scraper on the Descartes page; as the last expression of the cell its result is displayed.
table_scrape(prefix, descartes, vars)


[['Born', '31 March 1596'], ['Died', '11 February 1650']]

In [119]:
def add_info(prefix, phils, vars):
    """Append scraped infobox values to every entry of the network.

    For each entry of ``phils`` the page named by the entry's first element
    is scraped with ``table_scrape`` and the scraped values (not their
    labels) are appended to the entry, in the order of ``vars``.

    The entries are modified in place; the same ``phils`` dict is returned.
    """
    for entry in phils.values():
        scraped = table_scrape(prefix, entry[0], vars)
        entry.extend(pair[1] for pair in scraped)

    return phils

# NOTE: add_info appends to the entry lists of the dict it is given and returns that
# same dict, so phil_4 and phil_3 refer to the same object after this call, and
# re-running the cell appends the fields a second time.
phil_4 = add_info(prefix, phil_3, vars)

In [124]:
#saves the network in .pkl file
#with open(r'philosophers4.pkl', 'wb') as f:
#    pickle.dump(phil_4, f)

# loads the fourth version of the network from .pkl file
# (includes date of birth and death)
# NOTE(review): the original comment also mentioned nationality, but the cells
# shown here only scrape "Born" and "Died" -- confirm what the file contains.
# NOTE: pickle.load can run arbitrary code from the file it reads; only load
# .pkl files that were produced by this notebook.
with open(r'philosophers4.pkl', 'rb') as f:
    phil_4 = pickle.load(f)

In [8]:
def edge_finder(network):
    """List the directed "influenced" edges of the network.

    ``network`` maps a name to a list whose element 1 holds the names that
    influenced it and whose element 2 holds the names it influenced (the
    structure built by ``iterated_crawl``).

    The result is the union of both kinds of listing: if Descartes's entry
    says he influenced Kant but Kant's entry does not mention Descartes,
    the edge still appears. Names that are not keys of ``network`` are
    ignored.

    Returns a list of ``[source, target]`` pairs meaning "source influenced
    target". Each pair appears once, in order of first discovery.
    """
    edges = []
    seen = set()

    def add_edge(source, target):
        # Record the edge unless an identical one was already recorded.
        # Both listings go through here, so a connection mentioned on both
        # pages (or twice on one page) yields one edge whatever the dict
        # order; the set keeps the check O(1).
        pair = (source, target)
        if pair not in seen:
            seen.add(pair)
            edges.append([source, target])

    for key, value in network.items():

        for influencer in value[1]:
            if influencer in network:
                add_edge(influencer, key)

        for influencee in value[2]:
            if influencee in network:
                add_edge(key, influencee)

    return edges

# NOTE(review): `philosophers` is only assigned in a commented-out line in the cells
# shown here -- confirm which network (e.g. one of the loaded phil_N dicts) is meant.
edges = edge_finder(philosophers)

In [9]:
# Write the network and its edge list to a .gdf file, a common
# plain-text format for network data. The file has a node section
# (id, label) followed by an edge section (source id, target id, directed).
# Format reference:
# https://gephi.org/users/supported-graph-formats/gdf-format/
with open("philosophers.gdf", "w", encoding = "utf-8") as f:
    fwrite = csv.writer(f, lineterminator = '\n')

    # Node section: ids are "n1", "n2", ... in the network's iteration order.
    fwrite.writerow(["nodedef>name VARCHAR","label VARCHAR"])

    node_list = {}
    for ctr, key in enumerate(philosophers, start = 1):
        node_list[key] = "n" + str(ctr)
        fwrite.writerow([node_list[key], key])

    # Edge section: every edge is written as directed, using the node ids.
    fwrite.writerow(["edgedef>node1 VARCHAR","node2 VARCHAR","directed BOOLEAN"])

    for source, target in edges:
        fwrite.writerow([node_list[source], node_list[target], "true"])