First Python API: the `wikipediaapi` package

In [1]:
import wikipediaapi

In [2]:
# Client configured with 'en' and the HTML extract format.
wiki_wiki = wikipediaapi.Wikipedia(
    'en',
    extract_format=wikipediaapi.ExtractFormat.HTML,
)

# Page object for the "Python (programming language)" article.
page_py = wiki_wiki.page('Python_(programming_language)')

# Quick inspection, left disabled:
#   page_py.text[0:5000]
#   print(page_py.links)

Second Python API: the `wikipedia` package

In [3]:
import wikipedia

In [4]:
# Same article, this time requested through the `wikipedia` package.
pypage = wikipedia.page(title="Python_(programming_language)")

# Quick inspection, left disabled:
#   print(pypage.content[0:2000])
#   print(pypage.links)

Now without an API: fetch the article HTML directly and read the anchors (links) from it.

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
from collections import Counter
import re

In [2]:
def get_summary_links(url):
    """Collect the links found in the lead (summary) section of a Wikipedia page.

    The page at ``url`` is downloaded and parsed with ``html.parser``. Inside
    the ``mw-parser-output`` container, every element that precedes the table
    of contents (element with id ``toc``) -- or, when there is none, the first
    ``h2`` heading -- is treated as part of the summary.

    Parameters
    ----------
    url : str
        Address of the article to fetch.

    Returns
    -------
    list of (int, str or None), or the string ``"person"``
        One ``(position, title)`` pair per ``<a>`` tag that is a *direct*
        child of a summary element. ``position`` is the text length of all
        earlier summary elements plus the index of the first occurrence of
        the anchor text inside its parent's text; ``title`` is the anchor's
        ``title`` attribute (``None`` when the anchor has none).
        The sentinel string ``"person"`` is returned when the page contains a
        biography infobox. An empty list is returned (after printing a
        notice) when the content container or the cutoff element is missing.
    """
    # A timeout keeps one stalled connection from blocking the whole crawl.
    soup = BeautifulSoup(requests.get(url, timeout=30).text, 'html.parser')
    for elem in soup.find_all(class_='mw-indicators'):  # for good articles
        elem.decompose()

    # Container that holds the article body, including the summary.
    summ = soup.find(class_='mw-parser-output')
    if summ is None:
        # Without the container there is nothing to analyse; report it the
        # same way as a page without a cutoff element.
        print(" unexpected page content on "+ url)
        return []

    # Remove thumbnails, short descriptions, hatnotes, style/link tags and
    # metadata tables so they contribute neither text nor links.
    to_clean = (
        summ.find_all('div', class_='thumb tright')
        + summ.find_all('div', class_='shortdescription')
        + summ.find_all('div', class_='hatnote')
        + summ.find_all('style')
        + summ.find_all('link')
        + summ.find_all('table', class_='metadata')
    )
    for node in to_clean:
        node.decompose()

    if soup.find('table', class_=re.compile(r'infobox.*biography')):
        print("person detected on " + url)
        return "person"

    # The summary ends at the table of contents, or at the first section
    # heading when the page has no element with id "toc".
    cutoff_div = summ.find(id='toc')
    if cutoff_div is None:
        cutoff_div = summ.find('h2')
        if cutoff_div is None:
            print(" unexpected page content on "+ url)
            return []

    # The cutoff element may sit inside a wrapper; the summary elements are
    # then siblings of the wrapper, not of the cutoff element itself.
    # `get('class')` is None for a parent without a class attribute, which
    # must not be indexed.
    parent_classes = cutoff_div.parent.get('class') or []
    if parent_classes and (
        'toc' in parent_classes[0]
        # NOTE(review): newer MediaWiki output wraps headings in
        # <div class="mw-heading ...">; confirm against the live markup.
        or 'mw-heading' in parent_classes
    ):
        cutoff_div = cutoff_div.parent
    preceding_elements = cutoff_div.find_previous_siblings()

    links = []
    offset = 0
    # find_previous_siblings() yields nearest-first, so reverse it to walk
    # the summary in document order while accumulating the text offset.
    for element in reversed(preceding_elements):
        # Only anchors that are direct children of the element are collected.
        element_links = element.find_all('a', recursive=False)
        links.extend([(offset, link) for link in element_links])
        offset += len(element.get_text())

    # str.find locates the FIRST occurrence of the anchor text in the parent,
    # which can precede the anchor itself when the same words appear earlier.
    return [
        (start + link.parent.get_text().find(link.get_text()), link.get('title'))
        for (start, link) in links
    ]

In [5]:
# Titles of the links collected from the summary of the "Determinant" article.
[
    title
    for _position, title in get_summary_links("https://en.wikipedia.org/wiki/Determinant")
]

['Mathematics',
 'Scalar (mathematics)',
 'Function (mathematics)',
 'Square matrix',
 'Linear map',
 'Invertible matrix',
 'Linear isomorphism',
 'Leibniz formula for determinants',
 'Factorial',
 'Laplace expansion',
 'Linear combination',
 'Gaussian elimination',
 'Diagonal matrix',
 'Elementary row operation',
 'Identity matrix',
 'Coefficient',
 'System of linear equations',
 "Cramer's rule",
 'Characteristic polynomial',
 'Eigenvalue',
 'Geometry',
 'Volume',
 'Parallelepiped',
 'Calculus',
 'Exterior differential form',
 'Jacobian determinant',
 'Integration by substitution',
 'Multiple integral']

In [8]:
wiki_concepts = []
path = "../dat/index_by_wiki/"
min_occurrences = 5  # a concept must appear at least this often across all index files

# sorted() makes the processing order (and therefore the order of every
# derived dict) independent of the file system's listing order.
for book in sorted(os.listdir(path)):
    book_path = os.path.join(path, book)
    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints),
    # which pd.read_csv cannot open.
    if not os.path.isfile(book_path):
        continue
    index = pd.read_csv(book_path)
    # Missing values (NaN) are not strings, so `.replace` below would fail
    # on them; leave them out of the tally.
    wiki_concepts.extend(index.wiki_concept.dropna())  # add concepts to list

count = Counter(wiki_concepts)

# Fetch the summary links of every frequent concept. Pages recognised as
# persons (sentinel "person") are not stored at all.
concept_links = {}
for concept in [c for c in count if count[c] >= min_occurrences]:
    summary_links = get_summary_links("https://en.wikipedia.org/wiki/" + concept.replace(" ", "_"))
    if summary_links == "person":
        continue
    concept_links[concept] = summary_links

# A real list instead of a one-shot filter object: a filter would be exhausted
# by the loop above, making every later `x in frequent_concepts` test False.
# It holds exactly the frequent concepts kept in concept_links, so
# `concept_links[c]` is valid for each member.
frequent_concepts = list(concept_links)

In [9]:
# Tally how often each link title occurs across all collected summaries.
# The generator feeds Counter directly and avoids the quadratic copying of
# sum(list_of_lists, []). NOTE: this rebinds the name `count`.
count = Counter(
    title
    for summary_links in concept_links.values()
    for _position, title in summary_links
)
count.most_common(20)

[('Mathematics', 60),
 ('Linear algebra', 42),
 ('Vector space', 29),
 ('Matrix (mathematics)', 28),
 ('Real number', 26),
 ('Complex number', 18),
 ('Field (mathematics)', 17),
 ('Square matrix', 16),
 ('Euclidean space', 15),
 ('Determinant', 14),
 ('Set (mathematics)', 12),
 (None, 11),
 ('Physics', 11),
 ('Dimension (vector space)', 9),
 ('Invertible matrix', 9),
 ('Geometry', 9),
 ('Linear map', 8),
 ('Basis (linear algebra)', 8),
 ('Transpose', 8),
 ('Inner product space', 8)]

In [10]:
# Link titles that are never accepted as dependency candidates.
# None stands for anchors that carry no `title` attribute.
filter_concepts = [
    'Mathematics',
    'Linear algebra',
    'Physics',
    'Mathematical object',
    'Engineering',
    'Geometry',
    None,
]


In [11]:
# url -> result of get_summary_links(url), filled on first request.
cached_links = {}


def get_cached_summary(url):
    """Return the summary links for ``url``, fetching them only once.

    The first request for a URL calls ``get_summary_links`` and stores the
    result in ``cached_links``; later requests return the stored value,
    whatever it is (a list, possibly empty, or the ``"person"`` sentinel).
    """
    if url in cached_links:
        return cached_links[url]
    fetched = get_summary_links(url)
    cached_links[url] = fetched
    return fetched

In [12]:
adj_list = {}
max_offset = 300  # only links this early in a summary count as dependency candidates

for concept, deps in concept_links.items():
    candidates = [dep for dep in deps if dep[0] < max_offset and dep[1] not in filter_concepts]
    # now check if these articles link earlier to this article to create directionality
    filtered_candidates = []
    for position, title in candidates:
        # Reuse the links already fetched for this title when available.
        # Testing the dict directly is reliable; a one-shot iterator of
        # frequent concepts would already be exhausted at this point.
        if title in concept_links:
            links_from_candidate = concept_links[title]
        else:
            links_from_candidate = get_cached_summary("https://en.wikipedia.org/wiki/" + title.replace(" ", "_"))
        if links_from_candidate == "person":
            continue  # persons should not be dependencies

        # Positions at which the candidate's summary links back to `concept`.
        back_positions = [link[0] for link in links_from_candidate if link[1] == concept]
        if back_positions and back_positions[0] <= position:
            # The candidate mentions `concept` at least as early as `concept`
            # mentions the candidate, so the dependency direction is rejected.
            print(concept + " links to " + title + " at " + str(position) + " which links back at " + str(back_positions[0]) + " what led to a rejection")
            continue
        filtered_candidates.append(title)

    adj_list[concept] = filtered_candidates

person detected on https://en.wikipedia.org/wiki/Georg_Hamel
Vector space links to Scalar multiplication at 145 which links back at 80 what led to a rejection
person detected on https://en.wikipedia.org/wiki/Euclid
Polynomial links to Coefficient at 103 which links back at 75 what led to a rejection
Real number links to Decimal expansion at 298 which links back at 43 what led to a rejection
person detected on https://en.wikipedia.org/wiki/Augustin-Louis_Cauchy
 unexpected page content on https://en.wikipedia.org/wiki/Vector_algebra
person detected on https://en.wikipedia.org/wiki/Pierre-Simon_Laplace


In [13]:
# Display the mapping from each concept to its accepted dependencies.
adj_list

{'Absolute value': ['Real number',
  'Non-negative',
  'Sign (mathematics)',
  'Positive number',
  'Negative number'],
 'Affine transformation': ['Euclidean geometry',
  'Geometric transformation',
  'Line (geometry)',
  'Parallelism (geometry)',
  'Euclidean distance',
  'Angle',
  'Automorphism',
  'Affine space'],
 'Augmented matrix': ['Matrix (mathematics)', 'Elementary row operations'],
 'Basis': [],
 'Change of basis': [],
 'Characteristic polynomial': ['Square matrix',
  'Polynomial',
  'Matrix similarity',
  'Eigenvalues',
  'Root of a polynomial',
  'Determinant',
  'Trace (linear algebra)',
  'Endomorphism'],
 'Complex conjugate': ['Complex number',
  'Real number',
  'Imaginary number',
  'Sign (mathematics)'],
 'Complex number': ['Number system',
  'Real number',
  'Imaginary unit',
  'Equation'],
 'Conjugate transpose': ['Complex number',
  'Matrix (mathematics)',
  'Transpose'],
 'Consistent and inconsistent equations': ['Algebra',
  'System of equations',
  'Linear equa

In [14]:
#pd.DataFrame({'concept': adj_list.keys(), 'deps':adj_list.values()}).to_csv("../dat/wiki/graph.csv")

In [16]:
# the format is needed for plotly dashboard
# (presumably Dash Cytoscape elements -- TODO confirm against the dashboard code)
import json


def _graph_elements(adjacency, known_only=False):
    """Turn an adjacency mapping into a flat list of node and edge elements.

    Parameters
    ----------
    adjacency : dict
        Maps each target concept to the list of its source concepts.
    known_only : bool, default False
        When True, only sources that are themselves keys of ``adjacency``
        produce a node and an edge.

    Returns
    -------
    list of dict
        First one ``{"data": {"id", "label"}}`` element per distinct node,
        then one ``{"data": {"source", "target"}}`` element per edge. Each
        node id appears exactly once and every node precedes the edges that
        reference it.
    """
    nodes = {}  # id -> node element; dict keeps first-seen order and uniqueness
    edges = []
    for target, sources in adjacency.items():
        nodes.setdefault(target, {"data": {"id": target, "label": target}})
        for source in sources:
            if known_only and source not in adjacency:
                continue
            nodes.setdefault(source, {"data": {"id": source, "label": source}})
            edges.append({"data": {"source": source, "target": target}})
    return list(nodes.values()) + edges


#all
elements_all = _graph_elements(adj_list)
with open('../dash_wiki/data/wiki_graph_all.json', 'w') as f:
    json.dump(elements_all, f)

#known
elements = _graph_elements(adj_list, known_only=True)
with open('../dash_wiki/data/wiki_graph_known.json', 'w') as f:
    json.dump(elements, f)