In [1]:
# Import necessary libraries

import re, string
import nltk
from bs4 import BeautifulSoup
import codecs
import contractions
import inflect
from nltk.corpus import stopwords
import krovetzstemmer
from nltk.stem import WordNetLemmatizer

In [3]:
"""
The following preprocessing is performed on an HTML file. A plain text file can be used as well;
just modify the read_file function according to the data to be read.

1. Strip html tags and get raw text
2. Remove data between square brackets
3. Expand contracted words
4. Tokenize sentences into words
5. Remove any non-ascii character
6. Lower case complete corpus
7. Remove punctuations
8. Replace numbers with word equivalents
9. Remove stopwords using predefined list of words in english language
10. Stem words. The Krovetz stemmer is used; other stemmers are available as well
11. Lemmatize words

"""

def read_file(filename):
    """Return the entire contents of ``filename`` decoded as UTF-8.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as the contents have been read, even if reading raises.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded file contents as a single string.
    """
    # newline='' disables newline translation, so line endings are returned
    # exactly as stored in the file.
    with open(filename, 'r', encoding='utf-8', newline='') as handle:
        return handle.read()

def strip_html(text):
    """Parse ``text`` with BeautifulSoup's 'html.parser' and return its text content."""
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``.

    A span starts at ``[`` and runs up to the first following ``]``; it may
    contain newlines. Text outside the brackets, including the whitespace
    around them, is left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all bracketed spans removed.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\[[^]]*\]', '', text)

def replace_contractions(text):
    """Expand contracted forms in ``text`` via ``contractions.fix``.

    Examples of the intended effect:
        can't  --> cannot
        didn't --> did not
    """
    expanded = contractions.fix(text)
    return expanded

def tokenize(text):
    """Split ``text`` into tokens using ``nltk.word_tokenize`` and return them."""
    tokens = nltk.word_tokenize(text)
    return tokens

def remove_non_ascii(text):
    """Drop every character with a code point of 128 or above from each token.

    ``text`` is an iterable of strings. The result is a list with one entry
    per input token; a token made up only of non-ASCII characters becomes
    an empty string rather than being removed.
    """
    return [
        ''.join(char for char in token if ord(char) < 128)
        for token in text
    ]

def to_lowercase(text):
    """Return a new list holding the lower-cased form of every token in ``text``."""
    return [token.lower() for token in text]

def remove_punc(text):
    """Strip punctuation from each token and discard tokens left empty.

    Every character that is neither a word character (``\\w``, which
    includes the underscore) nor whitespace is removed from each token.
    Tokens that end up as the empty string are dropped from the result.
    """
    # Regex needs to be changed as per input text
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in text)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(text):
    """Replace all-digit tokens with their spelled-out form.

    Tokens for which ``str.isdigit()`` is true are converted with
    ``inflect``'s ``number_to_words``; every other token is passed through
    unchanged. Order and length of the token list are preserved.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in text
    ]
 
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in NLTK's English stopword list.

    The comparison is an exact string match against the list returned by
    ``stopwords.words('english')``.
    """
    english_stopwords = set(stopwords.words('english'))
    return list(filter(lambda token: token not in english_stopwords, text))

def stemmer(text):
    """Stem every token in ``text`` and return the stems as a list.

    A single ``krovetzstemmer.Stemmer`` instance is created per call.
    Other stemmers could be substituted here.
    """
    krovetz = krovetzstemmer.Stemmer()
    return list(map(krovetz.stem, text))
    
def lemmatizer(text):
    """Lemmatize every token in ``text`` with NLTK's ``WordNetLemmatizer``.

    ``lemmatize`` is called with its default arguments for each token; the
    results are returned as a list in the original order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, text))

def token_normalizer(text):
    """Normalize tokens by stemming them first and then lemmatizing the stems."""
    return lemmatizer(stemmer(text))
  
def denoise_text(text):
    """Run the full cleaning pipeline on ``text`` and return the final tokens.

    The raw input is printed first. The steps are then applied in order,
    each one receiving the previous step's output: the first three operate
    on a string, ``tokenize`` turns it into a list of tokens, and the rest
    operate on that list. The processed tokens are printed joined by
    single spaces and then returned as a list.
    """
    print('Actual Text:','\n\n',text, '\n\n')

    # Order matters: string-level cleanup, then tokenization, then
    # token-level cleanup and normalization.
    pipeline = (
        strip_html,
        remove_between_square_brackets,
        replace_contractions,
        tokenize,
        remove_non_ascii,
        to_lowercase,
        remove_punc,
        replace_numbers,
        remove_stopwords,
        token_normalizer,
    )
    for step in pipeline:
        text = step(text)

    print('Processed text:','\n\n',' '.join(text))
    return text


# Driver: read the sample document 'file.html' and run it through the whole
# cleaning pipeline. `text` holds the value returned by denoise_text.
file = read_file('file.html')
text = denoise_text(file)    

Actual Text: 

 <h1>Title Goes Here</h1>
<b>Bolded Text</b>
<i>Italicized Text</i>
<img src="this should all be gone"/>
<a href="this will be gone, too">But this will still be here!</a>
I run. He ran. She is running. Will they stop running?
I talked. She was talking. They talked to them about running. Who ran to the talking runner?
[Some text we don't want to keep is in here]
¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!
something... is! wrong() with.,; this :: sentence.
I can't do this anymore. I didn't know them. Why couldn't you have dinner at the restaurant?
My favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.
Don't do it.... Just don't. Billy! I know what you're doing. This is a great little house you've got here.
[This is some other unwanted text]
John: "Well, well, well."
James: "There, there. There, there."
&nbsp;&nbsp;
There are a lot of reasons not to do this. Ther