# Index entry disambiguation
Here, the textbook index entries are mapped to Wikipedia articles using the Wikisearch disambiguation method.

In [9]:
import os
import pandas as pd
from tqdm import tqdm
from collections import Counter
import sys
sys.path.append('../src/')
import wiki_api
from importlib import reload
wiki_api = reload(wiki_api)

In [3]:
# Directory whose entries are listed with os.listdir and read with pd.read_csv
# by the disambiguation cells below (presumably one index CSV per textbook).
# Note: `path` is reassigned in the "Descriptive stats" section.
path = "../dat/index/"

def unique_sorted(pages):
    """Return the distinct page numbers of a comma-separated string, sorted numerically.

    Parameters
    ----------
    pages : str
        Comma-separated page numbers, e.g. ``"12, 3, 12, nan"``. Whitespace
        around each token is ignored.

    Returns
    -------
    str
        The unique page numbers in ascending integer order, joined by ``", "``.
        ``"nan"`` tokens (stringified missing values) and empty tokens are dropped.

    Raises
    ------
    ValueError
        If a remaining token cannot be parsed by ``int``.
    """
    # Strip *before* de-duplicating: splitting "1, 2, 1" on "," yields
    # "1" and " 1", which are distinct strings until the whitespace is removed.
    unique_pages = {token.strip() for token in pages.split(",")}
    # Drop stringified missing values and empty tokens (e.g. from a trailing comma).
    unique_pages -= {"nan", ""}
    return ", ".join(sorted(unique_pages, key=int))

In [13]:
# Single-term check of the disambiguation helper: look up "injection" with the
# context string "Mathematics".
# NOTE(review): the trailing True presumably turns on the verbose
# search/response/distances printout shown in this cell's output — confirm
# against wiki_api.disambiguate.
disamb = wiki_api.disambiguate("injection", None, "Mathematics", True)


Search for injection (Mathematics)
Response: ['Injection', 'Bijection', 'Function', 'Inclusion map', 'Injection locking', 'Injective function', 'Surjective function', 'Supervised injection site', 'Stratospheric aerosol injection', 'Bijection, injection and surjection']
Distances: [('Injection', 0), ('Bijection', 2), ('Function', 4), ('Inclusion map', 8), ('Injection locking', 8), ('Injective function', 9), ('Surjective function', 12), ('Supervised injection site', 16), ('Stratospheric aerosol injection', 22), ('Bijection, injection and surjection', 26)]


In [16]:
# Verbose walk-through: print every index entry of the first six index files
# next to the Wikipedia title returned by the disambiguation helper.
# os.listdir returns entries in arbitrary order, so sort before slicing to make
# the selected six files the same on every run.
for book in sorted(os.listdir(path))[:6]:
    index = pd.read_csv(os.path.join(path, book))
    for title in index["concept"]:
        print(f"Index entry: {title}")
        # str(): entries are not guaranteed to be strings after read_csv
        # (e.g. missing or numeric values).
        disamb = wiki_api.disambiguate(str(title), None, "Mathematics", True)
        print(f"Wiki result: {disamb}")


Index entry: ∩
Search for ∩ (Mathematics)
Response: ['Ring', 'Group', 'Matrix', 'Series', 'Function', 'Mathematics', 'Pure mathematics', 'Indian mathematics', 'Applied mathematics', 'Discrete mathematics']
Distances: [('Ring', 4), ('Group', 5), ('Matrix', 6), ('Series', 6), ('Function', 8), ('Mathematics', 11), ('Pure mathematics', 16), ('Indian mathematics', 18), ('Applied mathematics', 19), ('Discrete mathematics', 20)]
Wiki result: Ring (mathematics)
Index entry: ∪
Search for ∪ (Mathematics)
Response: ['Ring', 'Group', 'Matrix', 'Series', 'Function', 'Mathematics', 'Pure mathematics', 'Indian mathematics', 'Applied mathematics', 'Discrete mathematics']
Distances: [('Ring', 4), ('Group', 5), ('Matrix', 6), ('Series', 6), ('Function', 8), ('Mathematics', 11), ('Pure mathematics', 16), ('Indian mathematics', 18), ('Applied mathematics', 19), ('Discrete mathematics', 20)]
Wiki result: Ring (mathematics)
Index entry: \
Search for \ (Mathematics)
Response: ['Ring', 'Group', 'Matrix', 'Ser

KeyboardInterrupt: 

In [6]:
# For every index file: map each index entry to a Wikipedia title, merge the
# page lists of entries that map to the same title, and write the result to
# ../dat/index_by_wiki/ under the same file name.
wiki_index_dir = "../dat/index_by_wiki/"
os.makedirs(wiki_index_dir, exist_ok=True)  # to_csv does not create missing directories

for book in os.listdir(path):
    index = pd.read_csv(path + book)
    # Use bracket assignment: `index.wiki_concept = ...` only creates a column
    # if one with that name already exists; otherwise pandas sets a plain
    # attribute and the groupby below fails with a KeyError.
    index["wiki_concept"] = [
        wiki_api.disambiguate(str(title), None, "Mathematics")
        for title in tqdm(index["concept"])
    ]

    index["pages"] = index["pages"].apply(str)  # to prepare join
    # Concatenate the page strings of all entries sharing a Wikipedia title...
    wiki_index = index.groupby("wiki_concept").agg({"pages": ", ".join})
    # ...then de-duplicate and sort the merged page numbers.
    wiki_index["pages"] = wiki_index["pages"].apply(unique_sorted)
    wiki_index.to_csv(wiki_index_dir + book)

100%|██████████| 237/237 [03:23<00:00,  1.16it/s]
100%|██████████| 302/302 [04:13<00:00,  1.19it/s]
100%|██████████| 638/638 [07:58<00:00,  1.33it/s]
100%|██████████| 173/173 [02:20<00:00,  1.23it/s]
100%|██████████| 462/462 [05:58<00:00,  1.29it/s]
100%|██████████| 477/477 [05:48<00:00,  1.37it/s]


## Descriptive stats

In [13]:
# Gather the wiki_concept column of every file in the index_by_wiki directory
# into one flat list and tally how often each concept occurs.
path = "../dat/index_by_wiki/"
wiki_concepts = []
for book in os.listdir(path):
    index = pd.read_csv(os.path.join(path, book))
    wiki_concepts.extend(index["wiki_concept"])

count = Counter(wiki_concepts)

Number of unique concepts

In [17]:
# Number of distinct keys in `count`, i.e. distinct wiki_concept values
# collected across all files.
len(count)

1469

Concepts that appear in at least 5 books

In [15]:
# Keep the concepts whose tally in `count` is 5 or more, then show how many
# concepts pass that threshold.
common_concepts = [concept for concept, occurrences in count.items() if occurrences >= 5]
len(common_concepts)

175

In [16]:
# Dump the full list of concepts that passed the count threshold.
print(common_concepts)

['Absolute value', 'Adjoint', 'Adjugate matrix', 'Affine transformation', 'Basis (linear algebra)', 'Canonical form', 'Cauchy–Schwarz inequality', 'Change of basis', 'Closure (mathematics)', 'Coefficient', 'Complex conjugate', 'Complex conjugate root theorem', 'Complex number', 'Conjugate transpose', 'Consistency', 'Coordinate vector', "Cramer's rule", 'Cross product', 'Definite matrix', 'Determinant', 'Diagonal', 'Diagonal matrix', 'Diagonalizable group', 'Dimension', 'Dimension (vector space)', 'Diophantine equation', 'Distance', 'Distance matrix', 'Dot product', 'Dual space', 'Echelon formation', 'Eigendecomposition of a matrix', 'Eigenvalue algorithm', 'Elementary mathematics', 'Elementary matrix', 'Empty set', 'Equivalence relation', 'Euclidean vector', 'Function composition', 'Fundamental theorem of algebra', 'Gamma matrices', 'Gaussian elimination', 'Generalized inverse', 'Geometric transformation', 'Hermite normal form', 'Hyperplane', 'Intersection', 'Invariant (mathematics)', 