This experiment is designed to extract dependencies for all index entries based on common occurences of other entities around one entity's page of introduction. Hence, we need a mapping from page to character offset and a data structure that is able to capture the concepts that are common across books. In a second step, maybe a Gaussian or asymmetric weighting of distance is necessary.

In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from collections import Counter

In [2]:
# get character offset for page number
full_text = pd.read_json("../dat/parsed_books/parsed_books.json")
page_offsets = full_text.pages.apply(lambda pages: np.cumsum(list(map(len, pages))))

In [3]:
page_offsets

Beezer_First_Course                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 157...
CollegeAlgCoreq-WEB                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Hefferon_LinAlgebra                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1009, 2132, 357...
Kuttler-LinearAlgebra-AFirstCourse-2017A    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 224...
Linear algebra done right — Axler           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Math1410_print                              [0, 0, 0, 0, 0, 0, 0, 0, 2188, 4084, 7158, 972...
Nicholson-OpenLAWA-2019A                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
interactive_textbook                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
linear-Cherey, Denton                       [0, 0, 0, 0, 0, 0, 0, 0, 1208, 2553, 3655, 496...
textbook_Hoffman_Kunze                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 697, 3135, 5028, 7...
Name: pages, dtype: object

In [48]:
page_correction = {
    'Kuttler-LinearAlgebra-AFirstCourse-2017A.csv': 12,
    'Beezer_First_Course.csv': 14,
    'textbook_Hoffman_Kunze.csv': 9,
    'Nicholson-OpenLAWA-2019A.csv': 22,
    'Linear algebra done right — Axler.csv': 17,
    'Math1410_print.csv': 8,
    'Hefferon_LinAlgebra.csv': 10,
    'linear-Cherey, Denton.csv': 0,
    'interactive_textbook.csv': 18,
    'CollegeAlgCoreq-WEB.csv': 10
}

In [53]:
path = "../dat/index_by_wiki/"

def get_page_list(page_string):
    if not isinstance(page_string, float) and "nan" in page_string: return None
    pages_int = [int(page_string)] if isinstance(page_string, float) else list(map(int, page_string.split(",")))
    return pages_int

wiki_concepts = {}

for book in os.listdir(path):
    print(book)
    # get concepts we are interested in
    index = pd.read_csv(path + book).dropna()
    index['pages'] = index.pages.apply(get_page_list)
    index = index.dropna()
    index['first_page'] = index.pages.apply(min) # add first page of index entry
    
    # get annotations around this offset
    title = book.replace(".csv", "")
    annotations = pickle.load(open("../dat/annotations/" + title + ".pkl", "rb"))
    mentions_by_offset = {}

    for a in annotations:
        for s in a['support']:
            if s['pageRank'] < 0.001: continue
            if s['chTo'] - s['chFrom'] < 3: continue
            if s['chFrom'] in mentions_by_offset: # if there was already something at this offset
                if s['pageRank'] > mentions_by_offset[s['chFrom']]['pr']: # if this support has a higher PR
                    mentions_by_offset[s['chFrom']] = {'title': a['title'], 'pr': s['pageRank']} # override
            else: # if no mention by this offset
                mentions_by_offset[s['chFrom']] = {'title': a['title'], 'pr': s['pageRank']} # add

    offset_index = np.array(list(mentions_by_offset.keys()))

    for concept, page in zip(index.wiki_concept, index.first_page):
        #print(concept + str(page))
        page = page + page_correction[book]
        page_slice = slice(page-1, page+1) # current and previous page
        intro_offsets = page_offsets[title][page_slice]
        # get same page mentions
        after_page_start = (offset_index > intro_offsets[0])
        before_page_end = (offset_index < intro_offsets[1])
        is_same_page = after_page_start & before_page_end
        closest = offset_index[is_same_page]
        
        if concept not in wiki_concepts: wiki_concepts[concept] = []

        wiki_concepts[concept] += list(map(lambda x: mentions_by_offset[x]['title'], list(closest)))
        
for concept, deps in wiki_concepts.items():
    x = Counter(deps)
    print(concept + ": " + str(x.most_common()))
# ranking criteria:
# proximity, page_rank, number of occurences, ratio of introduced before


Kuttler-LinearAlgebra-AFirstCourse-2017A.csv
Beezer_First_Course.csv
textbook_Hoffman_Kunze.csv
Nicholson-OpenLAWA-2019A.csv
Linear algebra done right — Axler.csv
Math1410_print.csv
Hefferon_LinAlgebra.csv
linear-Cherey, Denton.csv
interactive_textbook.csv
CollegeAlgCoreq-WEB.csv
A Random Walk Down Wall Street: [('Stochastic matrix', 1)]
Absolute value: [('Absolute value', 9), ('Complex number', 3), ('Real number', 3), ('Line segment', 3), ('Triangle inequality', 2), ('Ordered pair', 2), ('F(x) (musical group)', 2), ('Exponentiation', 2), ('Product rule', 2), ('Differential equation', 1), ('Antiderivative', 1), ('Contraposition', 1), ('Distance', 1)]
Adjugate matrix: [('Cofactor (biochemistry)', 7), ('Minor (linear algebra)', 6), ('Transpose', 6), ('Adjugate matrix', 6), ('Commutative ring', 4), ('Matrix (mathematics)', 3), ('Laplace expansion', 3), ('System of linear equations', 2), ('Complex number', 1), ('Scalar multiplication', 1), ('Gaussian elimination', 1), ('.tn', 1), ('Diagona