# Extracting the Graph of Mathematicians
Amirabbas Asadi

In [4]:
import requests as rq
from bs4 import BeautifulSoup

In [5]:
# Colab-only: mount Google Drive at /content/gdrive so that results can be
# written to Drive by a later cell.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Extracting doctoral advisors, doctoral students, and nationality

In [6]:
url_pref = 'https://en.wikipedia.org'


def _infobox_links(rows, keyword, excluded_word):
  """Return (link text, href) pairs from the first usable infobox row about `keyword`.

  A row is usable when its lower-cased text contains `keyword` and it has a
  `<td>` cell. Only the first such row is read. A link is kept when its text
  is non-empty, starts with an upper-case character, and does not contain
  `excluded_word` (case-insensitive).
  """
  for row in rows:
    if keyword not in row.get_text().lower():
      continue
    cell = row.find('td')
    if cell is None:
      # Header-only row (no data cell): nothing to read, keep looking.
      continue
    links = []
    for anchor in cell.find_all('a', href=True):
      text = anchor.get_text()
      # Links without text (e.g. images) or starting with a non-upper-case
      # character (e.g. "[1]" footnote markers) are not names.
      if not text or not text[0].isupper():
        continue
      # Check the whole text, not just its first character, so that links such
      # as "University of Bonn" are really filtered out.
      if excluded_word in text.lower():
        continue
      links.append((text, anchor['href']))
    return links
  return []


def get_info(url, timeout=30):
  """Scrape advisors, students and nationality from a Wikipedia infobox.

  Args:
    url: Site-relative page path such as '/wiki/Rudolf_Lipschitz'; it is
      appended to `url_pref`.
    timeout: Seconds to wait for the HTTP response before `requests` raises,
      so a stalled connection cannot hang a long crawl.

  Returns:
    A tuple `(advisors, students, nationality)`. Each element is a list of
    `(link text, href)` pairs; a list is empty when the page has no infobox
    or no matching row.

  Raises:
    Whatever `requests.get` raises on network failure or timeout.
  """
  page = rq.get(url_pref + url, timeout=timeout)
  soup = BeautifulSoup(page.content, 'html.parser')
  info = soup.find('table', class_='infobox')
  if info is None:
    # Page without an infobox: report "nothing found" instead of crashing.
    return [], [], []
  rows = info.find_all('tr')

  advisors = _infobox_links(rows, 'advisor', 'university')
  students = _infobox_links(rows, 'student', 'university')
  nationality = _infobox_links(rows, 'nationality', 'nationality')
  return advisors, students, nationality

In [7]:
# Spot-check the scraper on a single page; the cell displays the returned
# (advisors, students, nationality) tuple.
get_info('/wiki/Rudolf_Lipschitz')

([('Gustav Dirichlet', '/wiki/Peter_Gustav_Lejeune_Dirichlet'),
  ('Martin Ohm', '/wiki/Martin_Ohm')],
 [('Felix Klein', '/wiki/Felix_Klein')],
 [('Germany', '/wiki/Germany')])

## Crawling the advisor–student graph with breadth-first search (BFS)

In [8]:
from collections import deque

import networkx as nx

# Breadth-first crawl of Wikipedia infoboxes, starting from one mathematician
# and following both "advisor" and "student" links.
S = deque([('William Feller', '/wiki/William_Feller')])  # FIFO frontier of (name, url)
graph = nx.DiGraph()  # directed edge: advisor url -> student url
visited = {}          # url -> display name, for every page dequeued or queued
max_visit = 1000000   # stop once this many pages have been seen
attrs = {}            # url -> {"nationality": name}
failed = {}           # url -> repr of the error raised while scraping it

while S and len(visited) < max_visit:
  name, url = S.popleft()  # O(1), unlike list.pop(0)
  visited[url] = name

  try:
    advisors, students, nationality = get_info(url)
  except Exception as err:
    # Best effort: a page that cannot be scraped is skipped, but remembered.
    # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
    failed[url] = repr(err)
    continue

  for adv_n, adv_u in advisors:
    graph.add_edge(adv_u, url)
    if adv_u not in visited:
      S.append((adv_n, adv_u))
      # Mark on enqueue so the same page is never queued twice.
      visited[adv_u] = adv_n
      print(adv_n)

  for std_n, std_u in students:
    graph.add_edge(url, std_u)
    if std_u not in visited:
      S.append((std_n, std_u))
      visited[std_u] = std_n
      print(std_n)

  for natio_n, natio_u in nationality:
    # With several nationality links, the last one wins.
    # Keyed by the exact node id so the attribute always matches the node.
    attrs[url] = {"nationality": natio_n}  # node0: {attr0: val00, attr1: val01}
    print(natio_n)

  if url in attrs:
    # Apply only this node's attributes instead of re-applying the whole
    # `attrs` dict on every iteration; nodes absent from the graph are ignored.
    nx.set_node_attributes(graph, {url: attrs[url]})

  

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Hubert Newton
American
A.V. Balakrishnan
J. Thomas Beale
Andrew Majda
Michael C. Reed
American
Ronald W. Yeung
R. Cengiz Ertekin
Athanasios Papoulis
Lida Barrett
Arthur Milgram
D. Weldon Woodard
Leo Zippin
United States
James Munkres
Peter Shalen
Beatrice Aitchison
Edwin E. Floyd
M. K. Fort, Jr.
John L. Kelley
Alexander Doniphan Wallace
American
C. Anthony Anderson
Peter Andrews
George Alfred Barnard
William W. Boone
Martin Davis
William Easton
Alfred Foster
Leon Henkin
David Kaplan
John George Kemeny
Stephen Cole Kleene
Simon B. Kochen
Maurice L'Abbé
Isaac (Richard) Malitz
Gary R. Mar
Michael O. Rabin
Nicholas Rescher
Joel Robbin
Hartley Rogers, Jr
J. Barkley Rosser
Dana Scott
Norman Shapiro
Raymond Smullyan
Alan Turing
Alan Perlis
Seymour Geisser
United States
Carl B. Allendoerfer
Hugo von Seeliger
German
Scottish
J. C. Wichmannshausen
Abraham Gotthelf Kästner
German
German
Johann Schweigger
Johann Andreas Segner
Johann



```
# This is formatted as code
```

## Saving the graph

In [9]:
import pickle

# Persist the crawl results to Google Drive: the graph itself and the
# url -> name lookup table, one pickle file each.
pickle_targets = {
  'gdrive/My Drive/graph.data': graph,
  'gdrive/My Drive/visited.data': visited,
}
for target_path, payload in pickle_targets.items():
  with open(target_path, 'wb') as fp:
    pickle.dump(payload, fp, pickle.HIGHEST_PROTOCOL)

## Searching for a mathematician

In [10]:
# Print every node (page path) whose text contains the substring 'Henri'.
for node in (candidate for candidate in graph.nodes if 'Henri' in candidate):
  print(node)

/wiki/Peter_Henrici_(mathematician)
/wiki/Olaus_Henrici
/wiki/Peter_K._Henrici
/wiki/Henri_Villat
/wiki/Henri_Lebesgue
/wiki/Henri_Pad%C3%A9
/wiki/Henri_Poincar%C3%A9
/wiki/Henri_Bortoft
/wiki/%C3%89mile_Henriot_(chemist)
/wiki/Henri_Cartan
/wiki/Henri_Hogbe_Nlend
/wiki/Henri_Darmon
/wiki/Henrik_Steffens
/wiki/Henri_Gouraud_(computer_scientist)
/wiki/Henri_Gillet
/wiki/Jacobus_Henricus_van_%27t_Hoff
/wiki/Henri_Berestycki
/wiki/Henri_Victor_Regnault
/wiki/Henri_Moscovici
/wiki/Henricus_Regius


## Saving the graph as CSV for visualization

In [11]:
# Build one row per edge: [source name, target name, source nationality].
# The nationality column is '' when the source node has no such attribute.
attrs = nx.get_node_attributes(graph, 'nationality')  # get all attrs
print(attrs)
edges = [
  [visited[src], visited[dst], attrs.get(src, '')]
  for src, dst in graph.edges
]

{'/wiki/Hale_Trotter': 'American', '/wiki/Benjamin_Weiss': 'Israeli', '/wiki/David_A._Freedman': 'Canadian', '/wiki/David_Hilbert': 'German', '/wiki/Kurt_O._Friedrichs': 'German American', '/wiki/Hans_Lewy': 'American', '/wiki/Jacob_Tamarkin': 'Russian American', '/wiki/Richard_Brent_(scientist)': 'Australian', '/wiki/Elon_Lindenstrauss': 'Israeli', '/wiki/Ferdinand_von_Lindemann': 'German', '/wiki/Wilhelm_Ackermann': 'German', '/wiki/Werner_Boy': 'German', '/wiki/Rudolf_Fueter': 'Swiss', '/wiki/Ernst_Hellinger': 'German', '/wiki/Margarete_Kahn': 'German', '/wiki/Oliver_Dimon_Kellogg': 'American', '/wiki/Hellmuth_Kneser': 'Baltic German', '/wiki/Klara_L%C3%B6benstein': 'German', '/wiki/Hugo_Steinhaus': 'Polish', '/wiki/Teiji_Takagi': 'Japanese', '/wiki/Ernst_Zermelo': 'German', '/wiki/Peter_Lax': 'American', '/wiki/Cathleen_Synge_Morawetz': 'Canadian', '/wiki/Wolfgang_R._Wasow': 'American', '/wiki/Sergiu_Klainerman': 'Romanian American', '/wiki/Andrey_Markov': 'Russian', '/wiki/Derrick

In [12]:
import pandas as pd
# One row per edge; columns 0, 1, 2 hold the three entries of each `edges` row.
graph_df = pd.DataFrame(edges)
# Bare last expression: display the frame as the cell output.
graph_df


Unnamed: 0,0,1,2
0,Richard Courant,William Feller,
1,Richard Courant,Leifur Ásgeirsson,
2,Richard Courant,Herbert Busemann,
3,Richard Courant,Kurt Friedrichs,
4,Richard Courant,Harold Grad,
...,...,...,...
9070,Wolferd Senguerd,Pieter van Musschenbroek,
9071,Angelica Stacy,Amy Prieto,
9072,Michell J. Sienko,Angelica Stacy,
9073,Manuela Veloso,Peter Stone,Portuguese


In [13]:
# Write the edge table to the working directory without index or header row.
graph_df.to_csv('math-graph.csv', index=False, header=False)