This experiment extracts dependencies for all index entries based on the co-occurrence of other entities around the page on which an entity is introduced. Hence, we need a mapping from page to character offset and a data structure that captures the concepts that are common across books. In a second step, a Gaussian or asymmetric weighting of distance may be necessary.

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import sys
sys.path.append('../src/')
import data_layer
#from importlib import reload
#data_layer = reload(data_layer)

In [10]:
# Tunable parameters of the dependency extraction.
MAX_DEPS = 5         # upper bound on the ranked dependencies kept per concept
MIN_PAGERANK = 5e-5  # forwarded as `min_pr` to data_layer.get_mentions_by_offset
MIN_SPAN = 3         # forwarded as `min_span` to data_layer.get_mentions_by_offset

In [11]:
# Load the per-book index tables and the collection of wiki concepts.
# Later cells iterate `book_indices.items()` as (book, index) pairs and read
# `index.wiki_concept` / `index.first_page`; the exact container types are
# defined in data_layer — TODO confirm.
book_indices, wiki_concepts = data_layer.read_index_and_wiki_concepts()
# Page offsets per book; later cells look them up by the book name with the
# ".csv" suffix removed and slice them by pdf page number.
page_offsets = data_layer.get_page_offsets()

## Print annotations

In [None]:
# Debug cell: walk through the pages of every book and print the titles of the
# mentions located between the two page offsets selected for each page.
for book, _ in book_indices.items():
    mentions_by_offset, offset_index = data_layer.get_mentions_by_offset(book, min_pr=0.00005)
    book_key = book.replace(".csv", "")  # page_offsets is looked up without the extension

    def get_mentions_around(index_page):
        """Print the corrected pdf page and return the mention titles around it.

        ``index_page`` is shifted by the book's entry in
        ``data_layer.page_correction`` to obtain the pdf page. A shifted page
        below 2 yields an empty list, because two consecutive offsets are
        needed to delimit the search window.
        """
        pdf_page = index_page + data_layer.page_correction.get(book)
        print(f"1-indexed pdf page {pdf_page}")
        if pdf_page < 2:
            return []
        # The two offsets at positions pdf_page-2 and pdf_page-1 bound the window.
        bounds = page_offsets[book_key][pdf_page - 2:pdf_page]
        # Strict comparisons: mentions exactly on a boundary offset are excluded.
        inside_window = (offset_index > bounds[0]) & (offset_index < bounds[1])
        return [mentions_by_offset[offset]['title'] for offset in offset_index[inside_window]]

    # Exclusive upper bound of the book pages that still map onto a known offset.
    page_limit = len(page_offsets[book_key]) - data_layer.page_correction.get(book)
    for book_page in range(1, page_limit):
        print(book_page)
        print(book + " book page " + str(book_page))
        print(get_mentions_around(book_page))


## Extract dependencies

In [12]:
# For every wiki concept, collect one list of (title, pagerank) mentions per
# index entry that introduces the concept.
potential_deps = {}
for concept in wiki_concepts:
    potential_deps[concept] = []

for book, index in book_indices.items():
    mentions_by_offset, offset_index = data_layer.get_mentions_by_offset(
        book, min_pr=MIN_PAGERANK, min_span=MIN_SPAN
    )

    def get_mentions_around(page):
        """Return ``(title, pagerank)`` pairs of the mentions around ``page``.

        ``page`` is shifted by the book's entry in
        ``data_layer.page_correction``; a shifted page below 2 yields an empty
        list, because two consecutive page offsets are needed to delimit the
        search window.
        """
        pdf_page = page + data_layer.page_correction.get(book)
        if pdf_page < 2:
            return []
        # The two offsets at positions pdf_page-2 and pdf_page-1 bound the window.
        bounds = page_offsets[book.replace(".csv", "")][pdf_page - 2:pdf_page]
        # Strict comparisons: mentions exactly on a boundary offset are excluded.
        inside_window = (offset_index > bounds[0]) & (offset_index < bounds[1])
        found = []
        for offset in offset_index[inside_window]:
            mention = mentions_by_offset[offset]
            found.append((mention['title'], mention['pr']))
        return found

    for concept, first_page in zip(index.wiki_concept, index.first_page):
        # Resolve the bucket first so an unknown concept fails before any lookup work.
        bucket = potential_deps[concept]
        bucket.append(get_mentions_around(first_page))

In [24]:
# Rank, for every concept, the articles mentioned around its introduction
# pages and keep at most MAX_DEPS of them.
concept_names = []
dep_articles = []
for concept, deps in potential_deps.items():
    # `deps` holds one list of (title, pagerank) mentions per index entry of
    # the concept. The lists are flattened with comprehensions rather than
    # sum(list_of_lists, []), which re-copies the accumulated list on every
    # addition and is therefore quadratic in the number of mentions.

    # Total number of appearances of each title across all mention lists.
    title_counter = Counter(title for occs in deps for title, _ in occs)
    # Number of mention lists in which each title appears at least once.
    unique_title_counter = Counter(
        title for occs in deps for title in {t for t, _ in occs}
    )
    # Pagerank per title; if a title occurs repeatedly the last value wins.
    page_ranks = {title: pr for occs in deps for title, pr in occs}

    # Tuples sort lexicographically, so candidates are ranked by
    # (lists containing the title, average appearances per such list,
    # pagerank), with the title itself as the final tie-breaker.
    concepts = sorted(
        (
            (
                unique_title_counter[title],
                title_counter[title] / unique_title_counter[title],
                page_ranks[title],
                title,
            )
            for title in title_counter
            if title != concept  # a concept is never its own dependency
        ),
        reverse=True,
    )

    # Slicing clamps to the list length, so no explicit min() is required.
    concept_names.append(concept)
    dep_articles.append(concepts[:MAX_DEPS])

# Both columns are filled in the same loop, which keeps them aligned row by row.
df_concepts = pd.DataFrame({'concept': concept_names, 'dep_articles': dep_articles})

In [25]:
# Persist the ranked dependency table as JSON. No `orient` argument is passed,
# so pandas' default layout for DataFrames is used.
df_concepts.to_json("../dat/textbooks/de3.json")