## 1. Load data

In [19]:
import glob
import os
import re

In [20]:
# Collect the path of every .txt file directly inside books/.
# Later cells parse each file name as "<Title>_<Author>.txt".
paths = glob.glob('books/*.txt')

In [21]:
# Show the first matched path as a quick check of what glob found.
paths[0]

'books/The Iliad_Homer.txt'

In [22]:
# Read the first book as a sample to try the cleaning steps on.
# The encoding is pinned so the result does not depend on the platform's
# default locale (e.g. cp1252 on Windows).
# NOTE(review): assumes the book files are UTF-8 encoded — confirm.
with open(paths[0], encoding='utf-8') as f:
    text = f.read()

In [23]:
# Preview only the beginning; `text` holds a whole book, which is too large
# to display usefully and would bloat the saved notebook.
text[:1000]



## 2. Preprocess

In [24]:
def remove_start_end(text):
    """Strip the boilerplate that surrounds the body of a book.

    Everything up to and including the first ``*** START ... ***`` marker is
    dropped, and so is everything from the first following
    ``*** END ... ***`` marker onwards.  If a marker is not present, that side
    of the text is left untouched rather than raising ``StopIteration``.

    Args:
        text: Full contents of a book file.

    Returns:
        The text between the two markers; whitespace next to the markers is
        kept as-is.
    """
    # Raw strings keep `\*` and `\s` as regex escapes instead of (invalid)
    # Python string escapes.  The lazy `+?` stops at the first closing `***`,
    # so only the marker itself is consumed.
    start = re.search(r'\*\*\* ?START[\s\S]+? ?\*\*\*', text)
    if start is not None:
        text = text[start.end():]
    # The END marker is searched only in what follows the START marker.
    end = re.search(r'\*\*\* ?END[\s\S]+? ?\*\*\*', text)
    if end is not None:
        text = text[:end.start()]
    return text

In [25]:
def remove_special_characters(text):
    """Reduce text to ASCII letters, digits, periods and single spaces.

    The steps run in this order:
      1. delete runs of three or more dots (ellipses);
      2. delete every character that is not an ASCII letter, a digit, a
         space, a period or a newline;
      3. replace each run of newlines with one space;
      4. collapse each run of spaces into one space.

    Removed characters are deleted, not replaced by a space, so
    ``"well-known"`` becomes ``"wellknown"``.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text (case is preserved; ends are not stripped).
    """
    # remove three or more dots in a row
    text = re.sub(r'\.{3,}', '', text)
    # remove special characters (newlines survive until the next step)
    text = re.sub(r'[^A-Za-z0-9 .\n]+', '', text)
    # convert newlines to spaces
    text = re.sub(r'\n+', ' ', text)
    # collapse repeated spaces into a single space
    text = re.sub(r' +', ' ', text)
    return text

## 3. Save to JSON

In [26]:
# Build one record per book: title and author parsed from the file name,
# plus the cleaned, lower-cased text.
book_data = []
for path in paths:
    # File names follow "<Title>_<Author>.txt".  os.path copes with either
    # path separator, and splitting on the LAST underscore keeps a title that
    # itself contains underscores in one piece.
    stem = os.path.splitext(os.path.basename(path))[0]
    name, sep, author = stem.rpartition('_')
    if not sep:
        raise ValueError(
            "expected a file named '<Title>_<Author>.txt', got {!r}".format(path)
        )
    print(name, author)
    # Pin the encoding so reading does not depend on the platform default.
    # NOTE(review): assumes the book files are UTF-8 encoded — confirm.
    with open(path, encoding='utf-8') as f:
        text = f.read()
    text = remove_start_end(text)
    text = remove_special_characters(text)
    book_data.append(
        {
        "Name": name, 
        "Author": author, 
        "Text": text.lower()
        }
    )

The Iliad Homer
The War of the Worlds H. G. Wells
Cranford Elizabeth Cleghorn Gaskell
The Great Gatsby F. Scott Fitzgerald
Heidi Johanna Spyri
The Prince Niccolo Machiavelli
Ivanhoe: A Romance Walter Scott
The Importance of Being Earnest Oscar Wilde
Around the World in Eighty Days Jules Verne
A Doll's House Henrik Ibsen
Kim Rudyard Kipling
Grimm's Fairy Tales Jacob Grimm and Wilhelm Grimm
The Blue Castle L. M. Montgomery
The Trial Franz Kafka
The Picture of Dorian Gray Oscar Wilde
Oliver Twist Charles Dickens
Hamlet William Shakespeare
The Tempest William Shakespeare
Moby Dick; Or, The Whale Herman Melville
The Strange Case of Dr. Jekyll and Mr. Hyde Robert Louis Stevenson
The Hound of the Baskervilles Arthur Conan Doyle
Dracula Bram Stoker
Pollyanna Eleanor H. Porter
Great Expectations Charles Dickens
Tarzan and the Lost Empire Edgar Rice Burroughs
Frankenstein Mary Shelley
Winnie-the-Pooh A. A. Milne
The Murder on the Links Agatha Christie
Through the Looking-Glass Lewis Carroll
Beyo

In [27]:
import json

# Serialise all book records first, then write them to disk in one go.
serialized = json.dumps(book_data, indent=4)
with open('processed_books.json', 'w') as f:
    f.write(serialized)