# Wikipedia Concept Dependency Graph
Although there are some Python packages (wikipediaapi, wikipedia) that conveniently access Wikipedia, they are not powerful enough for our purposes. We hence developed our own API.

In [16]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import aiohttp
import asyncio
import sys
sys.path.append('../src/')
import wiki_api
import data_layer
from importlib import reload
wiki_api = reload(wiki_api)

MAX_NR_DEPS=5 # maximum number of dependencies kept

Read in wikipedia concepts from book indices

In [26]:
_, wiki_concepts = data_layer.read_index_and_wiki_concepts()
df_concepts = pd.DataFrame({"concept": wiki_concepts})

Scrape all outgoing for these concept on Wikipedia

In [28]:
async with aiohttp.ClientSession() as session:
    tasks = [wiki_api.get_summary_links(session, title) for title in df_concepts.concept]
    links = [await result for result in tqdm(asyncio.as_completed(tasks), total=len(tasks))]
links_df = pd.DataFrame(links, columns=['concept', 'links'])
df_concepts = df_concepts.merge(links_df)

100%|██████████| 1464/1464 [00:21<00:00, 68.92it/s]


In [29]:
count = Counter(sum([[x[1] for x in v] for v in df_concepts.links if v is not None], []))
count.most_common(50)

[('Mathematics', 635),
 ('Real number', 140),
 ('Vector space', 132),
 ('Function (mathematics)', 129),
 ('Matrix (mathematics)', 116),
 ('Linear algebra', 115),
 ('Geometry', 101),
 ('Set (mathematics)', 100),
 ('Physics', 96),
 ('Field (mathematics)', 95),
 ('Complex number', 90),
 ('Integer', 75),
 ('Polynomial', 72),
 ('Euclidean space', 63),
 ('Ring (mathematics)', 57),
 ('Statistics', 57),
 ('Computer science', 52),
 ('Group (mathematics)', 45),
 ('Engineering', 43),
 ('Algebra', 42),
 ('Algorithm', 41),
 ('Number theory', 40),
 ('Variable (mathematics)', 39),
 ('Mathematical analysis', 39),
 ('Square matrix', 38),
 ('Abstract algebra', 38),
 ('Quantum mechanics', 38),
 ('Numerical analysis', 37),
 ('Derivative', 36),
 ('Algebraic geometry', 36),
 ('Determinant', 35),
 ('Euclidean geometry', 35),
 ('Calculus', 35),
 ('Rational number', 34),
 ('Point (geometry)', 33),
 ('Functional analysis', 33),
 ('Domain of a function', 33),
 ('Basis (linear algebra)', 31),
 ('Probability theor

Manually create a blacklist of categories and fields in mathematics

In [30]:
blacklist = ['Mathematics', 'Linear algebra', 'Physics', 'Mathematical object', 'Engineering','Geometry','Statistics','Computer science', 'Algebra', 'Number theory','Algebraic geometry','Euclidean geometry', None]

Define functions to extract the final dependencies from the outgoing links

In [72]:
# this function returns if a dependency of a concept links earlier than before_pos to the concept
async def get_links(session, title):
    _, links = await wiki_api.get_summary_links(session, title)
    return links

async def has_earlier_backlink(concept, dep_title, before_pos):
    links = await get_links(session, dep_title)
    backlink = [l for l in links if l[1] == concept]
    if not len(backlink): return False
    return backlink[0][0] < before_pos

# this function deletes links that are presumably no dependencies (categories, persons, disambiguations, deps with earlier backlinks)
async def prune_deps(session, concept, deps, titles, k=MAX_NR_DEPS):
    # 1. Blacklist
    deps = [dep for dep in deps if dep[1] not in blacklist]

    # 2. Link to another concept in our set of concepts
    #deps = [dep for dep in deps if dep[1] in titles]

    # 3. Position in summary
    first_paragraph_with_link = np.min([dep[2] for dep in deps]) if len(deps) else -1
    first_sentence_with_link = np.min([dep[3] for dep in deps if dep[2] == first_paragraph_with_link]) if first_paragraph_with_link >= 0 else -1
    deps = [dep for dep in deps if 
            dep[2] == first_paragraph_with_link and
            dep[3] == first_sentence_with_link]

    # 4. dependencies on unexpected pages and persons
    deps = [dep for dep in deps if await get_links(session, dep[1]) is not None]

    # 5. Earlier Backlink: check if these articles link earlier to this article to delete cycles
    deps = [dep for dep in deps if not await has_earlier_backlink(concept, dep[1], dep[0])]

    # 5. maximum number of dependencies
    deps = deps[:min(k, len(deps))]
    print(concept + ": "+ ", ".join([dep[1] for dep in deps]))
    return deps

async def get_deps(session, concept):
    if concept.links is None: return []
    return await prune_deps(session, concept.concept, concept.links, [])

In [79]:
async with aiohttp.ClientSession() as session:
    all_deps = [await get_deps(session, concept) for _, concept in df_concepts.iterrows()]

Absolute value: Real number, Non-negative, Sign (mathematics), Negative number
Adjugate matrix: Square matrix, Transpose, Cofactor matrix
Affine space: Structure (mathematics), Euclidean space, Parallel (geometry), Line segment
Affine transformation: Geometric transformation, Line (geometry), Parallelism (geometry), Euclidean distance, Angle
Algebraic number theory: Abstract algebra, Integers, Rational numbers, Algebraic number field, Rings of integers
Axiom: Statement (logic), Truth, Premise
Basic solution (linear programming): Linear programming, Applied mathematics
Basis (linear algebra): Set (mathematics), Vector space, Linear combination
Bijection: Function (mathematics), Set (mathematics), Injective function, Surjective function, Injective function
Borel equivalence relation: Polish space, Equivalence relation, Borel algebra, Product topology
Canonical form: Mathematical expression
Cardioid: Greek language, Plane curve
Cauchy–Schwarz inequality: Inequality (mathematics)
Change of

CancelledError: 

In [77]:
df_concepts.to_json("../dat/wiki/graph.json")
df_concepts.to_csv("../dat/wiki/graph.csv",index=False)

# Test old graph before correction against new one

In [78]:
df_old = pd.read_json("../dat/wiki/graph_old.json").sort_values("concept")
old_deps = [[dep[1] for dep in deps] for deps in df_old.dep_articles]
df_new = pd.read_json("../dat/wiki/graph.json").sort_values("concept")
removed = [set(old) - set(new) for old, new in zip(old_deps, df_new.dep_articles)]
print(f"Removed {round(np.mean([len(r) for r in removed]),2)} concepts on average.")
added = [set(new) - set(old) for old, new in zip(old_deps, df_new.dep_articles)]
print(f"Added {round(np.mean([len(r) for r in added]),2)} concepts on average.")

Removed 2.94 concepts on average.
Added 2.83 concepts on average.


In [74]:
ana = pd.read_excel("../dat/validation/deps_to_validate_ana.xlsx")
hanqi = pd.read_excel("../dat/validation/deps_to_validate_hanqi.xlsx")
dom = pd.read_excel("../dat/validation/deps_to_validate_Dom.xlsx")
atc = pd.read_excel("../dat/validation/deps-all-validated-ATC.xlsx")
source = pd.read_csv("../dat/validation/source_column.csv")

In [83]:
concepts = [(c, d) for c, d, s in zip(dom.concept, dom.dependency, source.source) if s=="wiki"]

In [105]:
concepts_rem = [(c[0], c[1]) for c in concepts if not c[1] in list(df_new.loc[df_new.concept==c[0]].dep_articles)[0]]
concepts_rem

[('Equivalence relation', 'Transitive relation'),
 ('Substitution (logic)', 'Formal language'),
 ('Polynomial ring', 'Polynomial'),
 ('Matrix addition', 'Addition'),
 ('Set (mathematics)', 'Empty set'),
 ('Mathematics', 'Mathematical analysis'),
 ('Empty set', 'Axiomatic set theories')]

In [106]:
list(zip(df_concepts.concept, old_deps, added, removed))

[('Absolute value',
  ['Quantity',
   'Place-value notation',
   'Hindu%E2%80%93Arabic numeral system',
   'Numerical digit',
   'Multiplication'],
  {'Number'},
  {'Hindu%E2%80%93Arabic numeral system',
   'Multiplication',
   'Numerical digit'}),
 ('Adjugate matrix',
  ['Number', 'Entity', 'Numerical digit', 'Unit (measurement)', 'Counting'],
  {'Line segment', 'Measurement'},
  {'Counting', 'Number'}),
 ('Affine space',
  ['Dragon C2%2B',
   'Commercial spacecraft',
   'Higgs boson',
   'Guinea worm disease',
   '2012 in spaceflight'],
  set(),
  set()),
 ('Affine transformation',
  ['Digital image'],
  {'2D geometric model', 'Computer-generated imagery'},
  set()),
 ('Algebraic number theory',
  ['ALGOL', 'Programming language', 'ALGOL 60'],
  set(),
  {'ALGOL 60'}),
 ('Axiom', ['Infinity', 'Mathematician'], set(), set()),
 ('Basic solution (linear programming)',
  ['Real number', 'Non-negative', 'Sign (mathematics)', 'Negative number'],
  set(),
  set()),
 ('Basis (linear algebra)