In [1]:
import re
import numpy as np
from pathlib import Path
from bs4 import BeautifulSoup
from nltk import tokenize, download
from textwrap import TextWrapper

In [2]:
file_path = '1232-h.htm'

In [3]:
def read_html(file_path):      
    download('punkt', quiet=True)
#     wrapper = TextWrapper(cf.MAX_CHAR_LEN, fix_sentence_endings=True)
    wrapper = TextWrapper(140, fix_sentence_endings=True)

    with open(file_path) as f:
        book = f.read()
    
    input_text = BeautifulSoup(book, "html.parser").text
    text_list = []
    for paragraph in input_text.split('\n'):
        paragraph = paragraph.replace('â€”', '-')
        paragraph = re.sub(r'[^\x00-\x7f]', "", paragraph)
        sentences = tokenize.sent_tokenize(paragraph)
        
        # Wrap sentences to maximum character limit
        sentence_list = []
        for sentence in sentences:
            wrapped_sentences = wrapper.wrap(sentence)
            sentence_list.append(wrapped_sentences)
            
        # Flatten list of list of sentences and append
        trunc_sentences = [phrase for sublist in sentence_list for phrase in sublist]
        text_list.append(trunc_sentences)
    text_list = [[text for sentences in text_list for text in sentences]]
    
    # Parse out title from imported file path
    txt_title = Path(file_path).stem.lower().replace(' ', '_')

    return text_list, txt_title

In [4]:
corpus, title = read_html(file_path)

In [5]:
np.shape(corpus)

(1, 5476)

In [6]:
corpus[0][2]

'This eBook is for the use of anyone anywhere in the United States and'

In [7]:
title

'1232-h'

In [8]:
corpus

[['The Project Gutenberg eBook of The Prince, by Nicolo Machiavelli',
  'The Project Gutenberg eBook of The Prince, by Nicolo Machiavelli',
  'This eBook is for the use of anyone anywhere in the United States and',
  'most other parts of the world at no cost and with almost no restrictions',
  'whatsoever.',
  'You may copy it, give it away or re-use it under the terms',
  'of the Project Gutenberg License included with this eBook or online',
  'at www.gutenberg.org.',
  'If you',
  'are not located in the United States, you will have to check the laws of the',
  'country where you are located before using this eBook.',
  'Title: The Prince',
  'Author: Nicolo Machiavelli',
  'Translator: W. K. Marriott',
  'Release Date: March, 1998 [eBook #1232]',
  '[Most recently updated: July 1, 2022]',
  'Language: English',
  'Character set encoding: UTF-8',
  'Produced by: John Bickers, David Widger and Others',
  '*** START OF THE PROJECT GUTENBERG EBOOK THE PRINCE ***',
  'The Prince',
  'by 

In [9]:
assert title == "1232-h"
assert np.shape(corpus) == (1, 5476)
assert corpus[0][0] == 'The Project Gutenberg eBook of The Prince, by Nicolo Machiavelli'
assert corpus[0][2] == 'This eBook is for the use of anyone anywhere in the United States and'