In [1]:
# Use the %pip magic so the package is installed into the running kernel's
# environment, and pin the version so a fresh kernel reproduces this notebook.
%pip install inflect==5.5.2

Collecting inflect
  Using cached inflect-5.5.2-py3-none-any.whl (33 kB)
Installing collected packages: inflect
Successfully installed inflect-5.5.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import string
import re

import inflect
# Module-level inflect engine, created once at import time.
# PreTextCleaner.clean calls p.number_to_words() on digit-only tokens.
p = inflect.engine()

import spacy 
# Module-level spaCy pipeline, loaded once at import time from the
# "en_core_web_sm" package. PreTextCleaner.clean runs it for lemmatization.
nlp = spacy.load("en_core_web_sm")

class PreTextCleaner:
    """Configurable text clean-up pipeline.

    Each step is switched on or off by a boolean flag stored in
    ``self.actions``. Every flag starts as ``False``; the known flag names
    are listed in ``self.actions_list``. ``clean`` applies the enabled steps
    in a fixed order.

    Args:
        actions: Optional configuration. One of:

            * ``None`` (or any falsy value) - every step stays disabled.
            * ``dict`` - maps flag names to booleans. Missing names and
              non-bool values are ignored, so a partial dict is fine.
            * ``str`` - path to a text file holding comma-separated
              ``name:True`` / ``name:False`` entries.

    Raises:
        TypeError: if ``actions`` is truthy but neither a ``str`` nor a ``dict``.
        ValueError: if an entry in the actions file has no ``:`` separator.
    """

    def __init__(self, actions=None):
        # NOTE: the misspelled names ('remove_puntuation',
        # 'remove_sentence_split_delimeters') are part of the configuration
        # interface and are kept as-is for backward compatibility.
        self.actions_list = ['lower_case', 'remove_numeric', 'numeric_to_word', 'remove_sentence_split_delimeters',
                             'remove_text_in_brackets', 'remove_links', 'negation_handling', 'remove_puntuation',
                             'remove_whitespace', 'lemmatization']
        self.actions = {name: False for name in self.actions_list}
        # Lazily filled by _load_apostrophe_map() the first time negation
        # handling actually runs.
        self._apostrophe_map = None

        if not actions:
            return

        if isinstance(actions, str):
            self._load_actions_file(actions)
        elif isinstance(actions, dict):
            for name in self.actions_list:
                # .get() lets callers pass only the flags they care about.
                value = actions.get(name)
                if isinstance(value, bool):
                    self.actions[name] = value
        else:
            raise TypeError('Expected dict or str; got %s' % type(actions).__name__)

    def _load_actions_file(self, path):
        """Read comma-separated ``name:True|False`` entries from ``path`` into ``self.actions``."""
        with open(path, "r") as f:
            actions_text = f.read()

        for entry in actions_text.split(','):
            # Tolerate a trailing comma / trailing newline in the file.
            if not entry.strip():
                continue
            tokens = entry.split(':')
            if len(tokens) < 2:
                raise ValueError('Malformed action entry %r; expected "name:True" or "name:False"' % entry.strip())
            # Only the literal string 'True' enables a step; anything else disables it.
            self.actions[tokens[0].strip()] = tokens[1].strip() == 'True'

    def _load_apostrophe_map(self):
        """Return the contraction mapping from 'apostrophe.pkl', loading it at most once per instance."""
        cached = getattr(self, '_apostrophe_map', None)
        if cached is None:
            import pickle
            # SECURITY: pickle.load can execute arbitrary code. Only use an
            # 'apostrophe.pkl' that comes from a trusted source.
            with open('apostrophe.pkl', 'rb') as pfile:
                cached = pickle.load(pfile)
            self._apostrophe_map = cached
        return cached

    def clean(self, text):
        """Apply every enabled step to ``text`` and return the result.

        Steps run in this order: lower-casing, removal of tokens containing
        digits, digit-to-word conversion, removal of ``?``/``.``/``!``,
        removal of bracketed text, link removal, negation handling,
        punctuation removal, whitespace normalisation, lemmatization.

        Negation handling and lemmatization lower-case the text themselves,
        regardless of the ``lower_case`` flag.

        Args:
            text: the string to clean.

        Returns:
            The cleaned string.
        """
        # convert text to lower case
        if self.actions['lower_case']:
            text = text.lower()

        # remove every word-token that contains at least one digit
        if self.actions['remove_numeric']:
            text = re.sub(r'\w*\d\w*', '', text)

        # spell out whitespace-separated tokens made up only of digits;
        # this also collapses runs of whitespace into single spaces
        if self.actions['numeric_to_word']:
            text = ' '.join(
                p.number_to_words(word) if word.isdigit() else word
                for word in text.split()
            )

        # remove sentence split delimiters
        if self.actions['remove_sentence_split_delimeters']:
            text = re.sub(r'\?|\.|!', '', text)

        # remove text enclosed in [], () or <> (non-greedy, same line only)
        if self.actions['remove_text_in_brackets']:
            text = re.sub(r'\[.*?\]', '', text)
            text = re.sub(r'\(.*?\)', '', text)
            text = re.sub(r'\<.*?\>', '', text)

        # remove links from text
        if self.actions['remove_links']:
            text = re.sub(r'https?://\S+|www\.\S+', '', text)

        # negation handling: replace each whitespace-separated token that is a
        # key of the apostrophe mapping with its mapped value
        if self.actions['negation_handling']:
            appos = self._load_apostrophe_map()
            words = text.lower().split()
            text = ' '.join(appos[w] if w in appos else w for w in words)

        # remove punctuation from text
        if self.actions['remove_puntuation']:
            text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

        # collapse runs of whitespace and strip both ends
        if self.actions['remove_whitespace']:
            text = " ".join(text.split())

        # lemmatization with the module-level spaCy pipeline
        if self.actions['lemmatization']:
            doc = nlp(text.lower())
            # Keep the surface form whenever the lemma is the '-PRON-'
            # placeholder (presumably the spaCy v2 pronoun lemma - confirm
            # against the installed spaCy version).
            text = " ".join([token.lemma_ if token.lemma_ != '-PRON-' else token.text for token in doc])

        return text