In [None]:
from pdfminer.high_level import extract_text
import json
import pandas
import pathlib
import tqdm
import uuid

# Convert the PDF to plain text and keep one stripped entry per line.

text = extract_text("20160920 Fiaf Manual-WEB.pdf")
segments = [line.strip() for line in text.split('\n')]

# Candidate page markers: for every number 1..399, record each position in
# `segments` whose whole content is exactly that number.

pages = {
    number: [pos for pos, segment in enumerate(segments) if segment == str(number)]
    for number in range(1, 400)
}

# Resolve numbers with several candidates. Walk from the highest number down and
# keep, for each number, the candidate nearest to the position chosen for the
# number handled just before it. The very large starting value makes the first
# (highest) number pick its last occurrence.

pages = {number: hits for number, hits in reversed(pages.items()) if hits}
current_page = 999999
for number, hits in pages.items():
    nearest = min(hits, key=lambda pos: abs(pos - current_page))
    pages[number] = [nearest]
    current_page = nearest

# Every remaining entry is a one-element list; restore ascending key order.

pages = dict(reversed(pages.items()))

# Blank out the chosen page-marker lines so they do not appear in the text.

for (position,) in pages.values():
    segments[position] = ''

# print(json.dumps(pages, indent=4))

In [None]:

def extract_page(row):

    ''' Flag a contents row: 1 if row['text'] contains a dot leader ('......'), otherwise 0. '''

    has_leader = '......' in row['text']
    return 1 if has_leader else 0

def extract_label(row):

    ''' Return the part of row['text'] before the first '...', stripped of surrounding whitespace. '''

    before_dots, _, _ = row['text'].partition('...')
    return before_dots.strip()

def find_segment(row):

    ''' Locate a section heading inside the page it is listed on.

    The search window is the slice of the global `segments` between the marker
    position recorded in `pages` for the previous page number and the one
    recorded for row['page'].

    Two case-insensitive comparisons are tried in turn:
      1. the segment starts with the complete label;
      2. the first 18 characters of segment and label agree once stripped.

    Returns the absolute position in `segments` of the first match, or None
    when neither comparison finds the heading.
    '''

    wanted = row['label'].upper()
    page_numb = int(row['page'])
    window_start = pages[page_numb-1][0]
    window_end = pages[page_numb][0]
    window = segments[window_start:window_end]

    # First attempt: compare against a prefix as long as the full label.
    width = len(wanted)
    for offset, candidate in enumerate(window):
        if candidate[:width].upper() == wanted:
            return window_start + offset

    # Second attempt: compare only the leading 18 characters.
    wanted_short = row['label'][:18].upper().strip()
    for offset, candidate in enumerate(window):
        if candidate[:18].upper().strip() == wanted_short:
            return window_start + offset

    return None

def reindex(row):

    ''' Reindex to four digit syntax.

    Builds an identifier from row['label']:
      - every occurrence of 'Appendix' is removed and the result stripped;
      - only the first space-delimited token is kept, with commas removed and
        one trailing full stop dropped;
      - the dot-separated parts are padded with zeros and cut to four parts;
      - '-' plus the first four characters of a random UUID is appended, so the
        result differs on every call.

    Example: '1.2. Title' -> '1.2.0.0-3fa9' (suffix varies).

    A label that reduces to an empty token (e.g. 'Appendix' alone) yields
    '.0.0.0-xxxx' instead of raising IndexError.
    '''

    index = row['label'].replace('Appendix','').strip()
    index = index.split(' ')[0]
    index = index.replace(',','')
    # endswith() is safe on an empty string, whereas index[-1] is not.
    if index.endswith('.'):
        index = index[:-1]
    parts = index.split('.')+[0,0,0,0]
    index = '.'.join([str(x) for x in parts[:4]])+'-'+str(uuid.uuid4())[:4]

    return index

# The table of contents is everything between its heading and the 'Introduction' heading.
contents_start = segments.index('Table of Contents')
contents_end = segments.index('Introduction')
contents = [x for x in segments[contents_start:contents_end] if x != '']
content_df = pandas.DataFrame(contents, columns=['text'])
# Keep only lines longer than five characters.
content_df = content_df.loc[content_df.text.str.len() > 5]
content_df = content_df.iloc[::-1]

# Walking bottom-up, every line with a dot leader opens a new group; the cumulative
# sum therefore gives a leader line and the leaderless lines that precede it in the
# document the same group number.
content_df['section'] = content_df.apply(extract_page, axis=1)
content_df['section'] = content_df['section'].cumsum()
content_df = content_df.iloc[::-1]
# Join the lines of each group into a single contents entry.
content_df = content_df.pivot_table(index=['section'], aggfunc=lambda x: ' '.join(x)).reset_index(drop=True)
# Groups were numbered bottom-up; renumber so that sorting restores document order.
content_df['section'] = content_df.index[::-1]
content_df = content_df.sort_values('section')
# Page = text after the last full stop; label = text before the first '...'.
content_df['page'] = content_df['text'].str.split('.').str[-1].str.strip()
content_df['label'] = content_df.apply(extract_label, axis=1)

# Keep entries whose page parses as a number from 2 to 998 (find_segment also
# looks up the marker of the preceding page).
content_df = content_df.loc[content_df.page.isin([str(x) for x in range(2, 999)])]
content_df['index'] = content_df.apply(reindex, axis=1)

# Each section runs from its own heading to the heading of the next entry.
# find_segment returns None when a heading is not located, which leaves a
# missing value in these columns.
content_df['segment_link_start'] = content_df.apply(find_segment, axis=1)
# The last section runs to the end of the document; a fixed bound such as 999
# would truncate or empty it whenever the document has more segments than that.
content_df['segment_link_end'] = list(content_df['segment_link_start'])[1:]+[len(segments)]

content_df.head()

In [None]:
# Extract chunks to markdown: one directory per contents entry, each holding an
# 'en.md' file with the segments between the entry's start and end positions.

# Iterating the list directly (no enumerate) lets tqdm see its length.
for x in tqdm.tqdm(content_df.to_dict('records')):
    path = pathlib.Path.cwd() / 'markdown' / x['index'] / 'en.md'
    path.parent.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly: text extracted from the PDF may contain characters
    # that the platform default encoding cannot represent.
    with open(path, 'w', encoding='utf-8') as export:
        export.write('\n'.join(segments[x['segment_link_start']:x['segment_link_end']]))