In [0]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
import itertools
from autocorrect import Speller

In [0]:
# Lookup table mapping contractions to their expanded form.  Almost every
# entry is a negative contraction ("n't" -> "... not"); "would've" is the one
# non-negative entry.  Expanding them exposes the word "not" as its own token.
#
# The first group is spelled with the apostrophe of "n't"; the second group
# lists the same words with that apostrophe omitted ("dont", "cant've", ...).
# Every key appears exactly once (duplicate keys in a dict literal are
# silently overwritten, which hides typos).

contractions = {
# --- spelled with the apostrophe ---
"ain't": "are not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have",
"couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not",
"don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not",
"haven't": "have not", "isn't": "is not", "mayn't": "may not", "mightn't": "might not",
"mightn't've": "might not have", "mustn't": "must not", "mustn't've": "must not have",
"needn't": "need not", "needn't've": "need not have", "oughtn't": "ought not",
"oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
"shan't've": "shall not have", "shouldn't": "should not", "shouldn't've": "should not have",
"wasn't": "was not", "weren't": "were not", "won't": "will not", "won't've": "will not have",
"would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",

# --- apostrophe of "n't" omitted ---
"aint": "are not", "arent": "are not", "cant": "cannot", "cant've": "cannot have",
"couldnt": "could not", "couldnt've": "could not have", "didnt": "did not", "doesnt": "does not",
"dont": "do not", "hadnt": "had not", "hadnt've": "had not have", "hasnt": "has not",
"havent": "have not", "isnt": "is not", "maynt": "may not", "mightnt": "might not",
"mightnt've": "might not have", "mustnt": "must not", "mustnt've": "must not have",
"neednt": "need not", "neednt've": "need not have", "oughtnt": "ought not",
"oughtnt've": "ought not have", "shant": "shall not",
"shant've": "shall not have", "shouldnt": "should not", "shouldnt've": "should not have",
"wasnt": "was not", "werent": "were not", "wont": "will not", "wont've": "will not have",
"wouldnt": "would not", "wouldnt've": "would not have"
}

In [0]:
# Build the stop-word set from NLTK's English list with two adjustments:
#   * 'not' and 'no' are taken out, because they signal that a sentence
#     has a negative meaning and must therefore be kept in the text;
#   * 'yet' and 'mine' are put in as extra stop words.
stop_words = (set(stopwords.words('english')) - {'not', 'no'}) | {'yet', 'mine'}

In [0]:
class TextCleaner():
    """Turn raw text into a lower-cased, lemmatized, stop-word-free string.

    `clean_text` is the main entry point and runs the full pipeline;
    `standardize_words` is an optional, separate pre-processing step.

    The class reads the module-level `contractions` mapping and `stop_words`
    set, and uses `WordNetLemmatizer` and `Speller` imported by the notebook.
    """

    # Non-greedy match of a single HTML/XML tag such as "<br />" or "</p>".
    _HTML_TAG_RE = re.compile(r'<.*?>')

    # Deletion table for punctuation.  The apostrophe (') is deliberately NOT
    # listed so that contractions such as "don't" survive until they are
    # expanded in __remove_stopwords.
    _PUNCTUATION_TABLE = str.maketrans('', '', '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')

    # Spell checker shared by all instances; created on first use because
    # building a Speller is costly compared with calling it.
    _speller = None

    def __stim_html_tags(self, text: str) -> str:
        """Strip HTML tags, newlines and tabs from `text` and lower-case it.

        Every tag, newline and tab is replaced by a single space rather than
        removed outright; otherwise the words on either side would be fused
        together (e.g. "great.<br />Loved" -> "great.loved").
        """
        without_tags = self._HTML_TAG_RE.sub(' ', text)
        single_line = re.sub(r'[\n\t]', ' ', without_tags)

        return single_line.lower()  # convert all letters to lowercase

    def __remove_numbers(self, text: str) -> str:
        """Return `text` with every run of digits removed."""
        return re.sub(r'\d+', '', text)

    def __remove_punctuation(self, text: str) -> str:
        """Return `text` without punctuation characters (apostrophes are kept)."""
        return text.translate(self._PUNCTUATION_TABLE)

    def __lemmatizing_words(self, text: str) -> list:
        """Split `text` on whitespace and lemmatize every token.

        Each token is lemmatized twice with WordNet: first as a verb
        (e.g. running -> run), then as an adjective (e.g. best -> good).

        Splitting on any whitespace run (instead of a single ' ') avoids the
        empty tokens that the gaps left by removed numbers, punctuation and
        tags would otherwise produce.

        Returns:
            The list of lemmatized tokens.
        """
        lemmatizer = WordNetLemmatizer()

        return [
            lemmatizer.lemmatize(lemmatizer.lemmatize(token, "v"), "a")
            for token in text.split()
        ]

    def __remove_stopwords(self, text: list) -> str:
        """Expand contractions, drop stop words and join the rest with spaces.

        Args:
            text: list of tokens (as produced by `__lemmatizing_words`).

        Returns:
            A single space-separated string of the remaining words.
        """
        # Expand contractions first so that the "not" they contain is kept
        # as an individual word.
        words = []
        for word in text:
            expansion = contractions.get(word)
            if expansion is None:
                words.append(word)
            else:
                words.extend(expansion.split())

        return ' '.join(word for word in words if word not in stop_words)

    # Original author's note: use this on user-supplied input; the training
    # data is already clean.
    def standardize_words(self, text: str) -> str:
        """Squeeze repeated characters and spell-correct `text`.

        Any run of the same character is cut down to at most two occurrences
        (e.g. "coooool" -> "cool"), then the English `Speller` is applied to
        the result.
        """
        squeezed = ''.join(
            ''.join(run)[:2] for _, run in itertools.groupby(text)
        )

        # Build the spell checker once and reuse it on later calls.
        if TextCleaner._speller is None:
            TextCleaner._speller = Speller(lang='en')

        return TextCleaner._speller(squeezed)

    def clean_text(self, text: str) -> str:
        """Run the full cleaning pipeline on `text` and return the result.

        The order matters: punctuation removal keeps apostrophes so that the
        last step can still recognise and expand contractions.
        """
        text = self.__stim_html_tags(text)     # step 1: remove html tags, lowercase
        text = self.__remove_numbers(text)     # step 2: remove numbers
        text = self.__remove_punctuation(text) # step 3: remove punctuation (except ')
        tokens = self.__lemmatizing_words(text)  # step 4: tokenize and lemmatize
        return self.__remove_stopwords(tokens)   # step 5: expand contractions, drop stopwords