In [None]:
import os
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

def extract_text(filename):
    """Return the text of every page of a PDF as a single string.

    Args:
        filename: Path of the PDF file; it is opened in binary mode.

    Returns:
        Everything the pdfminer ``TextConverter`` wrote into an in-memory
        buffer while the pages were processed in document order, using a
        default-constructed ``LAParams()``.
    """
    text_buffer = StringIO()
    with open(filename, 'rb') as pdf_handle:
        document = PDFDocument(PDFParser(pdf_handle))
        resource_manager = PDFResourceManager()
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
    # The buffer is purely in memory, so it can be read after the PDF is closed.
    return text_buffer.getvalue()

In [None]:
# Extract the text of every PDF in wittgenstein_pdfs/ and write it to a
# same-named .txt file in extracted_text/.
# open() does not create missing directories, so make sure the target exists.
os.makedirs('extracted_text', exist_ok=True)
for filename in os.listdir('wittgenstein_pdfs'):
    if filename.endswith('.pdf'):
        output_text = extract_text(f'wittgenstein_pdfs/{filename}')
        # filename[:-4] drops the '.pdf' suffix checked just above.
        output_filename = f'extracted_text/{filename[:-4]}.txt'
        # The context manager closes the file even if the write raises.
        with open(output_filename, "w") as output_file:
            output_file.write(output_text)

In [None]:
def clean_text(filename, max_page=500):
    """Flatten an extracted text file and strip its page markers.

    The file is read, blank lines are dropped and the remaining lines are
    joined with single spaces. Every ``Page Break <n>`` marker is removed and
    every ``Page <n>`` marker is replaced by a paragraph break (two
    newlines), for ``n`` from ``max_page`` down to 0. Finally runs of spaces
    are collapsed and spaces touching a newline are removed.

    Args:
        filename: Path of the text file to clean.
        max_page: Highest page number whose markers are recognised
            (defaults to 500).

    Returns:
        The cleaned text as a single string.
    """
    with open(filename, 'r') as file:
        text = file.read()
    # Drop blank lines and join what is left into one space-separated string.
    text = ' '.join(line for line in text.split('\n') if line != '')
    # Count downwards so longer numbers are handled first: 'Page 12' must be
    # replaced before 'Page 1', otherwise a stray '2' would be left behind.
    for i in reversed(range(max_page + 1)):
        text = text.replace(f'Page Break {i}', '')
    for i in reversed(range(max_page + 1)):
        text = text.replace(f'Page {i}', '\n\n')
    # A single replace() pass turns three spaces into two, which happens
    # whenever adjacent markers are removed; repeat until no run is left.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text.replace(' \n', '\n').replace('\n ', '\n')

def clean_pi_2(filename):
    """Clean the extracted text of 'Philosophical Investigations, Part II'.

    Lines are dropped when they start with '9781405159289_4_002.qxd' or
    'Philosophy of Psychology a A Fragment', consist of digits followed by
    'e', are a lone '*', or are a lower-case roman numeral from 'i' to 'xiv'
    (presumably running headers, page markers and chapter numerals — verify
    against the source PDF). Runs of blank lines become one paragraph break,
    and a paragraph break is discarded when the following line starts with a
    lower-case letter. The surviving fragments are joined with spaces.

    Args:
        filename: Path of the extracted text file.

    Returns:
        The cleaned text as a single string.
    """
    with open(filename, 'r') as file:
        text = file.read()
    roman_numerals = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii',
                      'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv']
    lines = []
    linebreak = False  # True while the last emitted item is a paragraph break
    for line in text.split('\n'):
        if (line.startswith('9781405159289_4_002.qxd')
                or line.startswith('Philosophy of Psychology a A Fragment')
                or (line.endswith('e') and line[:-1].isdigit())
                or line == '*'
                or line in roman_numerals):
            # Dropped lines deliberately leave `linebreak` untouched, so blank
            # lines on either side of them still yield a single break.
            continue
        if line == '':
            if not linebreak:
                lines.append('\n\n')
                linebreak = True
        elif line.endswith('-'):
            # NOTE(review): the trailing hyphen is removed, but the fragments
            # are still joined with a space below, so a word split across two
            # lines comes out as two words — confirm whether that is intended.
            lines.append(line[:-1])
            linebreak = False
        else:
            lines.append(line)
            linebreak = False
    new_lines = []
    for current, following in zip(lines, lines[1:]):
        # A break followed by a lower-case start is treated as a continuation
        # of the same paragraph. Slicing with [:1] keeps an empty fragment
        # (from a line that was just '-') from raising IndexError.
        if current == '\n\n' and following[:1].islower():
            continue
        new_lines.append(current)
    # The pairwise loop never emits the final item; keep it unless it is only
    # a trailing paragraph break, so the last line of the file is not lost.
    if lines and lines[-1] != '\n\n':
        new_lines.append(lines[-1])
    text = ' '.join(new_lines)
    # Repeat until stable: one replace() pass leaves runs of 3+ spaces doubled.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text.replace(' \n', '\n').replace('\n ', '\n')

In [None]:
import os
# Clean every extracted .txt file and write the result to cleaned_texts_2/.
# 'Philosophical_Investigations_Part_II.txt' is routed through clean_pi_2;
# every other file goes through clean_text.
# open() does not create missing directories, so make sure the target exists.
os.makedirs('cleaned_texts_2', exist_ok=True)
for filename in os.listdir('extracted_text'):
    if filename.endswith('.txt'):
        if filename != 'Philosophical_Investigations_Part_II.txt':
            output_text = clean_text(f'extracted_text/{filename}')
        else:
            output_text = clean_pi_2(f'extracted_text/{filename}')
        output_filename = f'cleaned_texts_2/{filename}'
        # The context manager closes the file even if the write raises.
        with open(output_filename, "w") as output_file:
            output_file.write(output_text)

In [None]:
def _find_section_marker(text, marker):
    """Return the index of the first `marker` in `text` that is not directly
    preceded by a digit, or -1 if there is none."""
    position = text.find(marker)
    # '2. ' inside '1912. ' or '12. ' is part of a larger number, not the
    # start of section 2, so keep searching past such hits.
    while position > 0 and text[position - 1].isdigit():
        position = text.find(marker, position + 1)
    return position


def create_section_dict(filename):
    """Split a cleaned text file into consecutively numbered sections.

    The text is expected to start (ignoring surrounding whitespace) with
    section 1, written either as '1. ' or as '1 '. Each section runs up to
    the marker of the next number; the marker style matching the current
    header is tried first and the other style is used as a fallback. Parsing
    stops at the first number whose header does not open the remaining text.

    A bare marker such as '2 ' can still match ordinary prose (e.g.
    'section 2 of'); only hits glued to a preceding digit are skipped.

    Args:
        filename: Path of the cleaned text file.

    Returns:
        A dict mapping the section number (int, starting at 1) to the section
        body with its number prefix and surrounding whitespace removed. The
        dict is empty when the text does not start with section 1.
    """
    with open(filename, 'r') as file:
        text = file.read()
    section_dict = {}
    i = 1
    while True:
        stripped = text.strip()
        dotted_next, bare_next = f'{i + 1}. ', f'{i + 1} '
        if stripped.startswith(f'{i}. '):
            header_len = len(str(i)) + 1  # the digits plus the '.'
            markers = (dotted_next, bare_next)
        elif stripped.startswith(f'{i} '):
            header_len = len(str(i))  # the digits only
            markers = (bare_next, dotted_next)
        else:
            break
        cut = -1
        for marker in markers:
            cut = _find_section_marker(text, marker)
            if cut != -1:
                break
        if cut == -1:
            # No further marker: the rest of the text is the last section.
            head, text = text, ''
        else:
            # Cut at the marker that was actually found, so the remainder
            # still starts with the next header whichever style it uses.
            head, text = text[:cut], text[cut:]
        # The prefix length depends on the current header's style, not on the
        # style of the marker that ended the section.
        section_dict[i] = head.strip()[header_len:].strip()
        i += 1
    return section_dict

In [None]:
import json
# Build a section dictionary for every cleaned text and store it as JSON in
# section_dicts/.
# NOTE(review): this reads from 'cleaned_texts', while the cleaning cell of
# this notebook writes to 'cleaned_texts_2' — confirm which directory is
# intended.
# open() does not create missing directories, so make sure the target exists.
os.makedirs('section_dicts', exist_ok=True)
for filename in os.listdir('cleaned_texts'):
    if filename.endswith('.txt'):
        output_dict = create_section_dict(f'cleaned_texts/{filename}')
        # filename[:-4] drops the '.txt' suffix checked just above.
        output_filename = f'section_dicts/{filename[:-4]}.json'
        # The context manager closes the file even if serialisation raises.
        # json.dump writes the integer section numbers as string keys.
        with open(output_filename, "w") as output_file:
            json.dump(output_dict, output_file)