# Create a table of contents for theorems and definitions

In this notebook, text is extracted from a statistics textbook PDF and regular expressions are used to build a custom table of contents of its theorems and definitions. I used this during my stats class at UC Berkeley: the theorems and definitions were scattered across the book, and having them in one place was useful when preparing for the exam!

In [None]:
# Import libraries/packages
# https://anaconda.org/conda-forge/pdftotext
# conda install -c conda-forge pdftotext
import pdftotext
import numpy as np
import pandas as pd
import re

In [None]:
# Load your PDF: point `path` / `fname` at the textbook on your machine
path = '../../../desktop/'
fname = 'agnostic_statistics.pdf'
pdf_location = path + fname
# pdftotext reads from a binary file handle; `pdf` is then iterated
# page by page further down in the notebook.
with open(pdf_location, "rb") as pdf_file:
    pdf = pdftotext.PDF(pdf_file)

In [None]:
# Accumulators: each entry is [matched heading text, page number]
theorems = []
definitions = []
# Page counter starts negative so it lines up with the book's own page
# numbering instead of the PDF page index (offset is specific to this PDF).
page_num = -16 # Correct for pdf pages
# Raw strings so `\s` reaches the regex engine untouched (a plain string
# literal with `\s` is an invalid escape and warns on recent Pythons).
# A heading must start a line; the single group captures the number
# followed by the name, e.g. "1.1.1. Event Space".
pat_thm = r'\nTheorem\s([0-9. ]+[a-zA-Z -]+)'
pat_dfn = r'\nDefinition\s([0-9. ]+[a-zA-Z -]+)'

# Loop through pages of pdf
for page in pdf:
    # With one capture group, findall returns a list of strings; an empty
    # list simply extends by nothing, so no length check is needed.
    theorems.extend([heading, page_num] for heading in re.findall(pat_thm, page))
    definitions.extend([heading, page_num] for heading in re.findall(pat_dfn, page))
    # Update page number for next loop
    page_num += 1

def convert_to_df(data,type):
    """Convert extracted headings to a data frame, splitting number and name.

    Parameters
    ----------
    data : sequence of [heading, page_num] pairs
        `heading` is a string such as "1.1.1. Event Space".
    type : str
        Label used to name the name column, e.g. 'Theorem' gives the
        column 'Theorem name'. (The parameter shadows the builtin `type`;
        the name is kept because callers pass it by keyword.)

    Returns
    -------
    pandas.DataFrame
        Columns 'num', '<type> name' and 'page_num'. Rows whose name part
        is an empty string are dropped. Empty `data` yields an empty frame
        with those same three columns.
    """
    col_name = f'{type} name'
    out_cols = ['num', col_name, 'page_num']
    # An empty DataFrame has no column 0, so indexing it below would raise
    # KeyError; return the expected (empty) shape instead.
    if len(data) == 0:
        return pd.DataFrame(columns=out_cols)

    df = pd.DataFrame(data)
    # Split "1.1.1. Event Space" into the number and the name
    parts = df[0].str.extract(r'([0-9.]+)\s(.*)', expand=True)
    df['num'] = parts[0]
    df[col_name] = parts[1]
    df = df.rename(columns={1: 'page_num'})
    # Discard headings where nothing follows the number
    df = df[df[col_name] != '']
    return df[out_cols].copy()

# Build one table per kind of entry from the lists collected above
dft = convert_to_df(theorems, type='Theorem')
dfd = convert_to_df(definitions, type='Definition')

# Write each table to a CSV file (relative paths, so they land in the
# current working directory) -- definitions first, then theorems
for table, csv_name in ((dfd, 'as_definitions.csv'), (dft, 'as_theorems.csv')):
    table.to_csv(csv_name)
# Check out the results in your directory

In [None]:
# Show the results: preview the first rows of the definitions table
# (as the last expression in a notebook cell, its value is displayed)
dfd.head()

Unnamed: 0,num,Definition name,page_num
0,1.1.1.,Event Space,6
1,1.1.2.,Kolmogorov Axioms,7
2,1.1.5.,Joint Probability,9
3,1.1.8.,Conditional Probability,10
4,1.1.12.,Partition,12
