Although there are some Python packages (wikipediaapi, wikipedia) that conveniently access Wikipedia, they are not powerful enough for our purposes.

In [30]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
from collections import Counter
import re
import numpy as np

In [50]:
def get_summary_links(title):
    """Collect the links found in the lead section of a Wikipedia article.

    The article is downloaded, page furniture (indicators, thumbnails, short
    descriptions, hatnotes, style/link tags, metadata tables) is removed, and
    every element that precedes the table of contents (or, failing that, the
    first ``h2`` heading) is scanned for ``<a>`` tags that are its direct
    children.

    Args:
        title: Article title; spaces are replaced by underscores for the URL.

    Returns:
        A list of ``(position, link_title, paragraph)`` tuples where
        ``position`` is the character offset of the link in the concatenated
        text of the scanned elements, ``link_title`` is the anchor's ``title``
        attribute (``None`` if it has none) and ``paragraph`` is the index of
        the element containing the link. ``None`` is returned when the page
        has a biography infobox or no usable content/cutoff element.
    """
    url = "https://en.wikipedia.org/wiki/" + str(title).replace(" ", "_")
    # A timeout keeps one stalled request from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    for elem in soup.find_all(class_='mw-indicators'):  # for good articles
        elem.decompose()

    # Element that contains the article body (and therefore the summary).
    summ = soup.find(class_='mw-parser-output')
    if summ is None:
        print(" unexpected page content on "+ url)
        return None

    # cleaning of thumbnails, shortdescriptions, hatnotes and style tags
    to_clean = (
        summ.find_all('div', class_='thumb tright')
        + summ.find_all('div', class_='shortdescription')
        + summ.find_all('div', class_='hatnote')
        + summ.find_all('style')
        + summ.find_all('link')
        + summ.find_all('table', class_='metadata')
    )
    for node in to_clean:
        node.decompose()

    if soup.find('table', class_=re.compile(r'infobox.*biography')):
        print("person detected on " + url)
        return None

    # The summary ends at the table of contents or, without one, at the
    # first section heading.
    cutoff_div = summ.find(id='toc')
    if cutoff_div is None:
        cutoff_div = summ.find('h2')
    if cutoff_div is None:
        print(" unexpected page content on "+ url)
        return None

    # Step up to the wrapper when the cutoff is nested in one (a toc wrapper,
    # or the "mw-heading" div newer MediaWiki output puts around headings);
    # otherwise the cutoff would have no preceding siblings to scan.
    # The parent may carry no class attribute at all, hence the `or []`.
    parent_classes = cutoff_div.parent.get('class') or []
    if parent_classes and ('toc' in parent_classes[0] or 'mw-heading' in parent_classes):
        cutoff_div = cutoff_div.parent

    links = []
    offset = 0  # length of the text of all elements scanned so far
    # find_previous_siblings() yields nearest-first; reverse to document order.
    for paragraph, element in enumerate(reversed(cutoff_div.find_previous_siblings())):
        text = element.get_text()
        cursor = 0
        for link in element.find_all('a', recursive=False):
            label = link.get_text()
            # Search from the end of the previous link so that a repeated
            # link text is not mapped onto an earlier occurrence.
            position = text.find(label, cursor)
            if position == -1:
                position = cursor
            links.append((offset + position, link.get('title'), paragraph))
            cursor = position + len(label)
        offset += len(text)

    return links

# In-memory cache: article title -> result of get_summary_links (may be None).
cached_links = {}


def get_cached_links(title):
    """Return the summary links of ``title``, fetching them only on first use.

    The result of ``get_summary_links`` is stored in ``cached_links`` under
    ``title`` (including a ``None`` result), so each title is fetched once.
    """
    if title in cached_links:
        return cached_links[title]
    fetched = get_summary_links(title)
    cached_links[title] = fetched
    return fetched

# Quick check: show the links extracted from the "Determinant" article.
list(get_summary_links("Determinant"))

[(3, 'Mathematics', 0),
 (37, 'Scalar (mathematics)', 0),
 (60, 'Function (mathematics)', 0),
 (89, 'Square matrix', 0),
 (159, 'Linear map', 0),
 (268, 'Invertible matrix', 0),
 (330, 'Linear isomorphism', 0),
 (1006, 'Leibniz formula for determinants', 5),
 (1230, 'Factorial', 5),
 (1297, 'Laplace expansion', 5),
 (1364, 'Linear combination', 5),
 (1479, 'Gaussian elimination', 5),
 (1568, 'Diagonal matrix', 5),
 (1620, 'Elementary row operation', 5),
 (1841, 'Identity matrix', 6),
 (2230, 'Coefficient', 7),
 (2248, 'System of linear equations', 7),
 (2331, "Cramer's rule", 7),
 (2462, 'Characteristic polynomial', 7),
 (2521, 'Eigenvalue', 7),
 (2537, 'Geometry', 7),
 (2572, 'Volume', 7),
 (2598, 'Parallelepiped', 7),
 (2660, 'Calculus', 7),
 (2674, 'Exterior differential form', 7),
 (2710, 'Jacobian determinant', 7),
 (2750, 'Integration by substitution', 7),
 (2774, 'Multiple integral', 7)]

In [51]:
#for concept, links in cached_links.items():
#    if links == "person": cached_links[concept] = None

In [10]:
# Collect the wiki concept of every index entry from each per-book index file.
path = "../dat/index_by_wiki/"
wiki_concepts = []

for book in os.listdir(path):
    index = pd.read_csv(path + book)
    wiki_concepts += list(index.wiki_concept)  # add concepts to list

# Tally the occurrences of each concept over all index rows
# (presumably one row per concept and book - verify against the index files).
count = Counter(wiki_concepts)
concepts = pd.DataFrame({
    "concept": list(count.keys()),
    "number_of_books": list(count.values()),
})

In [20]:
# Look up the summary links of every concept; repeated titles hit the cache.
concepts['links'] = concepts.concept.apply(get_cached_links)

 unexpected page content on https://en.wikipedia.org/wiki/Box_product
 unexpected page content on https://en.wikipedia.org/wiki/Elementary_operations
 unexpected page content on https://en.wikipedia.org/wiki/Included_angle
 unexpected page content on https://en.wikipedia.org/wiki/Summation_notation
 unexpected page content on https://en.wikipedia.org/wiki/Adjoint
 unexpected page content on https://en.wikipedia.org/wiki/Characteristic_vector
 unexpected page content on https://en.wikipedia.org/wiki/Diagonalization
 unexpected page content on https://en.wikipedia.org/wiki/Minimal_polynomial
 unexpected page content on https://en.wikipedia.org/wiki/Rooting
 unexpected page content on https://en.wikipedia.org/wiki/Subfield
person detected on https://en.wikipedia.org/wiki/Abraham_de_Moivre
person detected on https://en.wikipedia.org/wiki/Andrey_Markov
person detected on https://en.wikipedia.org/wiki/Augustin-Louis_Cauchy
person detected on https://en.wikipedia.org/wiki/Camille_Jordan
perso

In [23]:
# Count how often each link title occurs across all fetched summaries.
# Concepts without links (None) are skipped. Feeding the Counter from one
# generator avoids the repeated list copying of sum(list_of_lists, []).
count = Counter(
    link[1]
    for links in concepts.links.values
    if links is not None
    for link in links
)
count.most_common(30)

[('Mathematics', 478),
 ('Real number', 139),
 ('Vector space', 125),
 ('Linear algebra', 118),
 ('Matrix (mathematics)', 105),
 ('Function (mathematics)', 97),
 ('Complex number', 93),
 (None, 93),
 ('Geometry', 87),
 ('Field (mathematics)', 83),
 ('Set (mathematics)', 80),
 ('Integer', 76),
 ('Physics', 74),
 ('Polynomial', 72),
 ('Euclidean space', 62),
 ('Statistics', 52),
 ('Ring (mathematics)', 42),
 ('Square matrix', 41),
 ('Group (mathematics)', 41),
 ('Variable (mathematics)', 39),
 ('Basis (linear algebra)', 39),
 ('Algebra', 38),
 ('Computer science', 36),
 ('Functional analysis', 36),
 ('Engineering', 35),
 ('Domain of a function', 35),
 ('Euclidean geometry', 32),
 ('Determinant', 31),
 ('Addition', 31),
 ('Number theory', 31)]

In [27]:
# Link titles that are never accepted as a dependency; None stands for links
# that carry no title attribute.
blacklist = [
    'Mathematics',
    'Linear algebra',
    'Physics',
    'Mathematical object',
    'Engineering',
    'Geometry',
    'Statistics',
    'Computer science',
    None,
]

In [52]:
# Now we prune the candidates
def prune_deps(concept, deps, titles, k=5):
    """Reduce the summary links of ``concept`` to at most ``k`` dependencies.

    Args:
        concept: Title of the article whose links are being pruned.
        deps: ``(position, title, paragraph)`` tuples for that article.
        titles: Titles of all known concepts; only the disabled step 2 would
            use it.
        k: Maximum number of dependencies to keep.

    Returns:
        The surviving ``(position, title, paragraph)`` tuples in their
        original order. The result is also printed as
        ``"<concept>: <title>, <title>, ..."``.

    Note:
        Steps 4 and 5 call ``get_cached_links`` for each remaining candidate,
        which may trigger a download for titles not cached yet.
    """
    # 1. Blacklist
    deps = [dep for dep in deps if dep[1] not in blacklist]

    # 2. Link to another concept in our set of concepts (currently disabled)
    #deps = filter(lambda dep: dep[1] in titles, deps)

    # 3. Position in summary: keep only links from the first paragraph that
    #    still has a link, and only within the first 300 characters.
    first_paragraph_with_link = min(dep[2] for dep in deps) if deps else -1
    deps = [
        dep for dep in deps
        if dep[2] == first_paragraph_with_link and dep[0] < 300
    ]

    # 4. Drop dependencies on unexpected pages and persons (no links available)
    deps = [dep for dep in deps if get_cached_links(dep[1]) is not None]

    # 5. Backlinks: drop a dependency whose own summary links back to
    #    `concept`, to delete cycles.
    # NOTE(review): a dependency is dropped when the backlink sits at a
    # *larger* position than the forward link (`>`), although this step was
    # described as detecting an *earlier* backlink - confirm the intended
    # direction before relying on it.
    kept = []
    for dep in deps:
        backlinks = [bl for bl in get_cached_links(dep[1]) if bl[1] == concept]
        if backlinks and backlinks[0][0] > dep[0]:
            continue
        kept.append(dep)

    # 6. Maximum number of dependencies
    deps = kept[:min(k, len(kept))]
    print(concept + ": "+ ", ".join([dep[1] for dep in deps]))
    return deps
    
# Drop rows with missing values (e.g. concepts whose links are None), then
# prune the links of every remaining concept into its dependencies.
concepts = concepts.dropna()
concepts['dependencies'] = concepts.apply(
    lambda row: prune_deps(row.concept, row.links, concepts.concept.values),
    axis=1,
)

A Random Walk Down Wall Street: Burton Malkiel
Absolute value: Real number, Non-negative, Sign (mathematics), Negative number
Adjugate matrix: Square matrix, Transpose, Cofactor matrix, Hermitian adjoint
Affine transformation: Euclidean geometry, Geometric transformation, Line (geometry), Parallelism (geometry), Euclidean distance
Aircraft principal axes: Aircraft
Algebra over a field: 
Axiom of empty set: Axiomatic set theory, Axiom, Kripke–Platek set theory, General set theory, Zermelo set theory
Basic feasible solution: Linear programming, Polyhedron
Basis: 
Basis (linear algebra): Set (mathematics), Vector space, Linear combination
Box product: 
Cardioid: Greek language, Plane curve, Epicycloid, Cusp (singularity), Sinusoidal spiral
Cartesian coordinate system: Plane (geometry), Coordinate system, Point (geometry), Number, Positive and negative numbers
Cauchy–Schwarz inequality: Inequality (mathematics)
Change of basis: 
Characteristic equation: 
Cholesky decomposition: Help:Pronun

In [53]:
# Show the current state of the concepts table.
print(concepts)

                              concept  number_of_books  \
0      A Random Walk Down Wall Street                1   
1                      Absolute value                7   
2                     Adjugate matrix                5   
3               Affine transformation                6   
4             Aircraft principal axes                2   
...                               ...              ...   
1307                           Vertex                1   
1308               Vertical line test                1   
1309                      Y-intercept                1   
1310  Young's inequality for products                1   
1311            Zero-product property                1   

                                                  links  \
0     [(289, Burton Malkiel, 1), (314, Princeton Uni...   
1     [(4, Mathematics, 1), (52, Real number, 1), (1...   
2     [(3, Linear algebra, 0), (58, Square matrix, 0...   
3     [(3, Euclidean geometry, 0), (109, Geometric t...   
4     [(

In [55]:
# Adjacency list: concept -> titles of the dependencies kept for it.
adj_list = dict(zip(
    concepts.concept,
    ([dep[1] for dep in deps] for deps in concepts.dependencies),
))
adj_list

{'A Random Walk Down Wall Street': ['Burton Malkiel'],
 'Absolute value': ['Real number',
  'Non-negative',
  'Sign (mathematics)',
  'Negative number'],
 'Adjugate matrix': ['Square matrix',
  'Transpose',
  'Cofactor matrix',
  'Hermitian adjoint'],
 'Affine transformation': ['Euclidean geometry',
  'Geometric transformation',
  'Line (geometry)',
  'Parallelism (geometry)',
  'Euclidean distance'],
 'Aircraft principal axes': ['Aircraft'],
 'Algebra over a field': [],
 'Axiom of empty set': ['Axiomatic set theory',
  'Axiom',
  'Kripke–Platek set theory',
  'General set theory',
  'Zermelo set theory'],
 'Basic feasible solution': ['Linear programming', 'Polyhedron'],
 'Basis': [],
 'Basis (linear algebra)': ['Set (mathematics)',
  'Vector space',
  'Linear combination'],
 'Box product': [],
 'Cardioid': ['Greek language',
  'Plane curve',
  'Epicycloid',
  'Cusp (singularity)',
  'Sinusoidal spiral'],
 'Cartesian coordinate system': ['Plane (geometry)',
  'Coordinate system',
  'Po

In [56]:
# Save the concepts table to CSV.
concepts.to_csv("../dat/wiki/graph.csv")

In [58]:
# the format is needed for plotly dashboard
import json


def _graph_elements(adjacency, known_only=False):
    """Convert an adjacency list into a flat list of node and edge elements.

    Args:
        adjacency: Mapping ``target -> list of source titles``.
        known_only: When true, skip edges whose source is not itself a key
            of ``adjacency``.

    Returns:
        A list of ``{"data": {...}}`` dicts. Nodes carry ``id``/``label``,
        edges carry ``source``/``target``. Each node is emitted only once,
        at its first appearance, so the list holds no duplicate ids.
    """
    graph_elements = []
    seen_nodes = set()

    def add_node(name):
        # Emit the node element only the first time the id is encountered.
        if name not in seen_nodes:
            seen_nodes.add(name)
            graph_elements.append({"data": {"id": name, "label": name}})

    for target, sources in adjacency.items():
        add_node(target)
        for source in sources:
            if known_only and source not in adjacency:
                continue
            graph_elements.append({"data": {"source": source, "target": target}})
            add_node(source)
    return graph_elements


#all
elements_all = _graph_elements(adj_list)
with open('../dat/wiki/graph_all.json', 'w') as f:
    json.dump(elements_all, f)

#known
elements = _graph_elements(adj_list, known_only=True)
with open('../dat/wiki/graph_known.json', 'w') as f:
    json.dump(elements, f)