This notebook tries to infer dependencies using embeddings. Only 7 textbooks are used for now; 3 additional books will be added to the dataset later. The dataset contains only the main text, without introductions, appendices, etc.

In [1]:
import pandas as pd

In [2]:
# Load the chapter-level dataset and list the distinct chapter titles it contains.
data = pd.read_json("data_chapters.json")
data["chapter_title"].unique()

array(['Systems of Linear Equations', 'Vectors', 'Matrices',
       'Vector Spaces', 'Determinants', 'Eigenvalues',
       'Linear Transformations', 'Representations', 'Preliminaries',
       'Reference', 'Linear Systems', 'Maps Between Spaces', 'Similarity',
       'Contributors to this textbook', 'Variants of this textbook',
       'Overview', 'Systems of Linear Equations: Algebra',
       'Systems of Linear Equations: Geometry',
       'Linear Transformations and Matrix Algebra',
       'Eigenvalues and Eigenvectors', 'Orthogonality', 'Complex Numbers',
       'Notation', 'GNU Free Documentation License',
       'Systems of Equations', 'Rn', 'Spectral Theory',
       'Some Curvilinear Coordinate Systems', 'CHAPTER 1', 'CHAPTER 2',
       'CHAPTER 3', 'CHAPTER 4', 'CHAPTER 5', 'CHAPTER 6', 'CHAPTER 7',
       'CHAPTER 8', 'CHAPTER 9', 'CHAPTER 10', 'What is Linear Algebra?',
       'Systems of Linear Equations ', 'The Simplex Method',
       'Vectors in Space, n-Vectors', 'Subspaces 

In [3]:
# Chapter titles that are not part of a book's main text; chapters matching them are
# dropped before the remaining text is merged per book.
EXCLUDED_TITLES = [
    'Preliminaries', 'Reference', 'Contributors to this textbook',
    'Variants of this textbook', 'Overview', 'Notation',
    'GNU Free Documentation License', 'List of Symbols', 'Fields',
]

# The titles are OR-ed into a single pattern and str.contains performs a substring
# (regex) search, so any chapter whose title merely contains one of them is removed too.
# na=False: a missing chapter_title counts as "no match" (the row is kept) instead of
# putting NaN into the mask, which would make the `~` negation fail.
is_excluded = data.chapter_title.str.contains("|".join(EXCLUDED_TITLES), na=False)
data = data.loc[~is_excluded, ["filename", "text"]]

# One row per book: concatenate the text of all of its remaining chapters.
data = data.groupby("filename")["text"].apply(" ".join).reset_index()
data = data.drop_duplicates()
data

Unnamed: 0,filename,text
0,Beezer_First_Course,Chapter SLE\nSystems of Linear Equations\nWe w...
1,Hefferon_LinAlgebra,Chapter One\nLinear Systems\nI Solving Linear ...
2,Kuttler-LinearAlgebra-AFirstCourse-2017A,1. Systems of Equations\n1.1 Systems of Equati...
3,Linear algebra done right — Axler,2 CHAPTER 1Vector Spaces\n1.A Rnand Cn\nComple...
4,Nicholson-OpenLAWA-2019A,1. Systems of Linear Equations\n1.1 Solutions ...
5,interactive_textbook,6 CHAPTER 1. SYSTEMS OF LINEAR EQUATIONS: ALGE...
6,"linear-Cherey, Denton",1\nWhat is Linear Algebra?\nMany dicult probl...


### Simple preprocessing

In [13]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import re

class utils:
    """Namespace of stateless text-cleaning helpers.

    Every helper takes a string and returns a new string. All of them are static
    methods, so they can be called on the class (``utils.text_only(s)``) as well as
    on an instance (``utils().text_only(s)``).
    """

    def __init__(self) -> None:
        ...

    @staticmethod
    def remove_repeated_sentences(text, times = 4):
        """Drop every sentence that occurs at least ``times`` times in ``text``.

        The text is split with NLTK's ``sent_tokenize``; all occurrences of a
        sentence whose count reaches ``times`` are removed, and the remaining
        sentences are joined with single spaces.

        Args:
            text (str): text to clean.
            times (int): minimal number of occurrences for a sentence to be dropped.
        Returns:
            (str): the text without the repeated sentences.
        """
        sentences = sent_tokenize(text)
        freqs = Counter(sentences)
        # A set gives O(1) membership tests; one entry per distinct sentence is enough.
        repeated = {sentence for sentence, count in freqs.items() if count >= times}
        return " ".join(sentence for sentence in sentences if sentence not in repeated)

    # TODO: a `remove_formulas(text)` helper was stubbed here but never implemented.

    @staticmethod
    def remove_new_line(text):
        """Replace every newline character in ``text`` with a space."""
        return text.replace("\n", " ")

    @staticmethod
    def text_only(text):
        """Keep only runs of ASCII letters, joined by single spaces.

        Digits, punctuation and any non-ASCII character act as separators and are
        dropped, so a word containing such a character is split into several pieces.
        """
        letters_only = re.compile(u"[A-Za-z]+")
        return " ".join(letters_only.findall(text))

    @staticmethod
    def remove_stopwords(text):
        """Lower-case ``text`` and drop NLTK's English stop words.

        Returns the remaining ``word_tokenize`` tokens joined by single spaces.
        """
        text = text.lower()
        word_tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        return " ".join([w for w in word_tokens if w not in stop_words])

    @staticmethod
    def lemmatize_text(text):
        """Lemmatize every ``word_tokenize`` token with NLTK's ``WordNetLemmatizer``.

        Returns the lemmatized tokens joined by single spaces.
        """
        lemmatizer = WordNetLemmatizer()
        word_tokens = word_tokenize(text)
        return " ".join([lemmatizer.lemmatize(w) for w in word_tokens])

    @staticmethod
    def remove_short_words(text):
        """Drop tokens of one or two characters; keep tokens longer than 2.

        Returns the remaining ``word_tokenize`` tokens joined by single spaces.
        """
        word_tokens = word_tokenize(text)
        return " ".join([word for word in word_tokens if len(word) > 2])


In [None]:
# Clean every book's text in three passes: drop sentences that repeat many times,
# flatten newlines, then keep alphabetic tokens only.
data["text"] = (
    data["text"]
    .apply(utils.remove_repeated_sentences)
    .apply(utils.remove_new_line)
    .apply(utils.text_only)
)

In [7]:
# Display the current per-book frame (filename, text).
data

Unnamed: 0,filename,text
0,Beezer_First_Course,Chapter SLE Systems of Linear Equations We wil...
1,Hefferon_LinAlgebra,Chapter One Linear Systems I Solving Linear Sy...
2,Kuttler-LinearAlgebra-AFirstCourse-2017A,Systems of Equations Systems of Equations Geom...
3,Linear algebra done right — Axler,CHAPTER Vector Spaces A Rnand Cn Complex Numbe...
4,Nicholson-OpenLAWA-2019A,Systems of Linear Equations Solutions and Elem...
5,interactive_textbook,CHAPTER SYSTEMS OF LINEAR EQUATIONS ALGEBRA Th...
6,"linear-Cherey, Denton",What is Linear Algebra Many di cult problems c...


### Chapters and subchapters collection

In [10]:
import os
# Directory containing the textbook PDFs, relative to the notebook's working directory.
path = "../dat/books/all_books/"
from PyPDF2 import PdfReader

def get_outline_first_level(reader=None):
    '''Get the titles of the first-level chapters from a PyPDF2 reader's outline.

    Note: only the top level of the outline is returned; nested lists (which hold
    the sublevels under the chapters) are ignored.

    Args:
        reader: object exposing an ``outline`` attribute (e.g. a PyPDF2 ``PdfReader``).
            When omitted, the module-level variable ``reader`` is used, which keeps
            existing zero-argument calls working.
    Returns:
        (list) : titles of chapters
    Raises:
        NameError: if no reader is passed and no global ``reader`` exists.
    '''
    if reader is None:
        # Backward compatibility: this function used to read the global `reader` implicitly.
        try:
            reader = globals()["reader"]
        except KeyError:
            raise NameError("name 'reader' is not defined") from None
    return [element["/Title"] for element in reader.outline if isinstance(element, dict)]

def get_subchapter(outline):
    '''Flatten a (possibly nested) outline into a single list of titles.

    Nested lists are descended into recursively, so entries from the 2nd and deeper
    levels are returned, in order, together with the entries of the current level.
    '''
    titles = []
    for node in outline:
        if not isinstance(node, list):
            # base case: a single entry, record its title
            titles.append(node["/Title"])
            continue
        # nested level: flatten it and splice the result in place
        titles += get_subchapter(node)
    return titles

def get_outline_all_levels(reader):
    '''Collect the titles of every outline entry of a PyPDF2 reader object, at any depth.

    Returns:
    (list) : titles of chapters and subchapters
    '''
    # get_subchapter flattens the nested outline structure into one list.
    return get_subchapter(reader.outline)
    
# Collect, per book: [first-level chapter titles, titles from all outline levels].
outlines = {}
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the log reproducible.
for book in sorted(os.listdir(path)):
    # Skip anything that is not a PDF (e.g. hidden files) instead of letting PdfReader fail on it.
    if not book.lower().endswith(".pdf"):
        continue
    # NOTE: `reader` must stay a module-level name: get_outline_first_level() is called
    # without arguments and reads it.
    reader = PdfReader(os.path.join(path, book))
    outlines[book] = [get_outline_first_level(), get_outline_all_levels(reader)]
    print(book)
    print(outlines[book][1])

Beezer_First_Course.pdf
['Preface', 'Acknowledgements', 'Systems of Linear Equations', 'What is Linear Algebra?', 'Solving Systems of Linear Equations', 'Reduced Row-Echelon Form', 'Types of Solution Sets', 'Homogeneous Systems of Equations', 'Nonsingular Matrices', 'Vectors', 'Vector Operations', 'Linear Combinations', 'Spanning Sets', 'Linear Independence', 'Linear Dependence and Spans', 'Orthogonality', 'Matrices', 'Matrix Operations', 'Matrix Multiplication', 'Matrix Inverses and Systems of Linear Equations', 'Matrix Inverses and Nonsingular Matrices', 'Column and Row Spaces', 'Four Subsets', 'Vector Spaces', 'Vector Spaces', 'Subspaces', 'Linear Independence and Spanning Sets', 'Bases', 'Dimension', 'Properties of Dimension', 'Determinants', 'Determinant of a Matrix', 'Properties of Determinants of Matrices', 'Eigenvalues', 'Eigenvalues and Eigenvectors', 'Properties of Eigenvalues and Eigenvectors', 'Similarity and Diagonalization', 'Linear Transformations', 'Linear Transformatio

### sciBert: 1. filtering

1. Named-entity recognition (NER) via sciBert + embeddings over the entire corpus (the 7 books merged)

2. Measure similarity score between concepts and titles of chapters, subchapters

3. Filter out the concepts that have a very low maximum similarity score

In [11]:
import pandas as pd
import spacy
#! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
nlp = spacy.load("en_core_sci_md")

# Run the pipeline once over the whole corpus (all books merged into one string).
corpus_text = ' '.join(data.text)
# spaCy raises an error for inputs longer than nlp.max_length, and whole textbooks merged
# together can exceed it. Raise the limit just enough to fit the corpus.
# NOTE(review): memory use of the pipeline grows with text length - confirm it fits in RAM.
nlp.max_length = max(nlp.max_length, len(corpus_text) + 1)
doc = nlp(corpus_text)
entities = doc.ents

In [None]:
# Merge all books, then lower-case, drop English stop words and drop tokens of 1-2 characters.
text = ' '.join(data.text)
text = utils.remove_stopwords(text)
text = utils.remove_short_words(text)

# spaCy raises an error for inputs longer than nlp.max_length. Raising the limit keeps the
# corpus in a single Doc; slicing the string into fixed-size pieces instead could cut
# words in half at the slice boundaries.
# NOTE(review): memory use of the pipeline grows with text length - confirm it fits in RAM.
nlp.max_length = max(nlp.max_length, len(text) + 1)

# Re-run the pipeline on the filtered text; this rebinds `doc` and `entities`.
doc = nlp(text)
entities = doc.ents