New test cell

In [None]:
# Text preprocessing pipeline

In [None]:
%reset -f
from IPython.core.interactiveshell import InteractiveShell as IS; IS.ast_node_interactivity = "all"
import nltk, pandas as pd, numpy.testing as npt, unicodedata, contractions, re
from numpy.testing import assert_equal as eq
import unittest
from colorunittest import run_unittest
_ = nltk.download(['omw-1.4','brown','wordnet','stopwords','averaged_perceptron_tagger'], quiet=True)
from nltk.corpus import brown, stopwords
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ

class Pipe():
    """Chainable text-preprocessing pipeline over a list of word tokens.

    Each preprocessing step is exposed as a property that rewrites
    ``self.LsWords``, appends one row of statistics to ``self.df`` and returns
    the pipeline itself, so steps can be chained by attribute access, e.g.
    ``Pipe(words).Low.Norm.Words.Out()``.

    Attributes:
        LsWords: current list of word tokens.
        SsLex: set of lexicon words used to count "correct" vocabulary.
        SsStopWords: set of stop words removed by ``Stop``.
        df: DataFrame with one row per executed step and the columns
            ``Step``, ``Words``, ``Vocab`` and ``CorrVocab``.
    """

    # Patterns are compiled once instead of once per token.
    _reDigits = re.compile(r'\d+')
    _reNonWord = re.compile(r'[^\w\s]+')
    _reSpaces = re.compile(r'\s+')

    # First letter of a Penn Treebank POS tag -> WordNet POS tag.
    # Adjective tags are JJ/JJR/JJS, so the key must be 'J' (not 'A').
    _DsWNTag = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}

    ### TASK 1: Attribute Initialization
    def __init__(self, LsWords=None, SsLex=None, SsStopWords=None) -> None:
        """Store the tokens, resolve lexicon and stop words, record initial stats.

        Args:
            LsWords: list of word tokens; ``None`` (the default) means an
                empty list. The list is copied, so later changes to the
                caller's list do not affect the pipeline.
            SsLex: set of lexicon words, the string ``'nltk'`` to use the
                lower-cased words of the NLTK Brown corpus, or ``None`` for
                an empty lexicon.
            SsStopWords: set of stop words, the string ``'nltk'`` to use
                NLTK's English stop-word list, or ``None`` for no stop words.

        Raises:
            AssertionError: if an argument has an unsupported type.
        """
        # NOTE: asserts are kept (rather than raising TypeError) so callers
        # that expect AssertionError keep working.
        assert isinstance(LsWords, list) or LsWords is None, f'LsWords must be a list, not a {type(LsWords)}'
        assert SsLex is None or isinstance(SsLex, set) or (SsLex=='nltk'), f'SsLex must be "nltk" or a set of lexicon words, not a {type(SsLex)}'
        assert SsStopWords is None or isinstance(SsStopWords, set) or (SsStopWords=='nltk'), f'SsStopWords must be "nltk" or a set of words, not a {type(SsStopWords)}'
        self.df = pd.DataFrame(columns = ['Step', 'Words', 'Vocab', 'CorrVocab'])

        # A fresh list per instance: no shared mutable default, no aliasing.
        self.LsWords = [] if LsWords is None else list(LsWords)

        if SsLex is None:
            self.SsLex = set()
        elif isinstance(SsLex, set):
            self.SsLex = SsLex
        else:
            # Only touch the Brown corpus when it was actually requested.
            _ = nltk.download(['brown'], quiet=True)
            self.SsLex = {s.lower() for s in nltk.corpus.brown.words()}

        if SsStopWords is None:
            self.SsStopWords = set()
        elif isinstance(SsStopWords, set):
            self.SsStopWords = SsStopWords
        else:
            self.SsStopWords = set(stopwords.words('english'))

        self.AddStats('Initialize')     # Saves basic stats for LsWords

    ### TASK 2: Output
    def Out(self) -> str:
        """Return the tokens joined by single spaces, without outer whitespace.

        Runs of whitespace (e.g. produced by tokens that became empty in an
        earlier step) are collapsed into one space.
        """
        sJoined = ' '.join(self.LsWords)
        return self._reSpaces.sub(' ', sJoined).strip()

    ### TASK 3: Lowercase
    @property
    def Low(self) -> 'Pipe':
        """Lowercase every token, record the 'Lower' stats row, return self."""
        self.LsWords = [sWord.lower() for sWord in self.LsWords]
        self.AddStats('Lower')
        return self  # Return reference to self for method chaining

    ### TASK 4: Remove Digits
    @property
    def NoNum(self) -> 'Pipe':
        """Strip all digits from every token, record 'NoNum' stats, return self.

        Tokens that consist only of digits become empty strings; they are
        kept in the list, so the word count does not change.
        """
        self.LsWords = [self._reDigits.sub('', sWord) for sWord in self.LsWords]
        self.AddStats('NoNum')
        return self

    ### TASK 5: Keep Only Word Characters
    @property
    def Words(self) -> 'Pipe':
        """Remove every character that is neither a word character nor
        whitespace from each token, record 'Words' stats, return self.

        Tokens that become empty are kept in the list.
        """
        self.LsWords = [self._reNonWord.sub('', sWord) for sWord in self.LsWords]
        self.AddStats('Words')
        return self

    ### TASK 6: Remove Stop Words
    @property
    def Stop(self) -> 'Pipe':
        """Drop tokens whose lowercase form is in ``SsStopWords``, record
        'Stop' stats, return self."""
        self.LsWords = [sWord for sWord in self.LsWords if sWord.lower() not in self.SsStopWords]
        self.AddStats('Stop')
        return self

    ### TASK 7: Normalization
    @property
    def Norm(self) -> 'Pipe':
        """Reduce every token to ASCII, record 'Norm' stats, return self.

        Tokens are NFKD-decomposed first, so accented letters keep their base
        letter; characters with no ASCII representation are dropped.
        """
        self.LsWords = [
            unicodedata.normalize('NFKD', sWord).encode('ascii', 'ignore').decode('utf-8')
            for sWord in self.LsWords
        ]
        self.AddStats('Norm')
        return self

    ### TASK 8: Expand Contractions
    @property
    def Exp(self) -> 'Pipe':
        """Expand contractions, record 'Exp' stats, return self.

        The tokens are joined into one string, passed through
        ``contractions.fix`` and split on whitespace again, so the number of
        tokens may change.
        """
        sExpanded = contractions.fix(' '.join(self.LsWords))
        self.LsWords = sExpanded.split()
        self.AddStats('Exp')
        return self

    ### TASK 9: Stem
    @property
    def Stem(self) -> 'Pipe':
        """Stem every token with NLTK's Porter stemmer, record 'Stem' stats,
        return self."""
        pso = nltk.stem.PorterStemmer()       # instantiates Porter Stemmer object
        self.LsWords = [pso.stem(sWord) for sWord in self.LsWords]
        self.AddStats('Stem')
        return self

    @staticmethod
    def _WNTag(sTag) -> str:
        """Convert an NLTK (Penn Treebank) POS tag to a WordNet POS tag.

        Returns 'a' for adjectives (J*), 'r' for R* tags (adverbs), 'v' for
        verbs (V*) and 'n' for nouns and everything else, including an empty
        tag.
        """
        return Pipe._DsWNTag.get(sTag[:1], 'n')

    ### TASK 10: Lemmatize
    @property
    def Lem(self) -> 'Pipe':
        """Lemmatize every token with the WordNet lemmatizer, using the POS
        tag assigned by ``nltk.pos_tag``; record 'Lem' stats, return self."""
        wlo = nltk.stem.WordNetLemmatizer()   # instantiates WordNet Lemmatizer object
        # Tag the whole token list once; tagging is the expensive part.
        LTssWordTag = nltk.pos_tag(self.LsWords)
        self.LsWords = [wlo.lemmatize(sWord, self._WNTag(sTag)) for sWord, sTag in LTssWordTag]
        self.AddStats('Lem')
        return self

    def AddStats(self, sTask='') -> 'Pipe':
        """Append a statistics row for the current tokens to ``self.df``.

        The row holds the step name ``sTask``, the number of tokens, the
        number of distinct tokens, and the number of distinct tokens that
        are also in ``SsLex``.
        """
        SsWords = set(self.LsWords)
        self.df.loc[len(self.df)] = [sTask, len(self.LsWords), len(SsWords), len(SsWords & self.SsLex)]
        return self     # Finally, return reference to the object itself

In [None]:
# Demo: run the pipeline over one Gutenberg book and show the result.
_ = nltk.download(['gutenberg'], quiet=True)
LsBookWords = [*nltk.corpus.gutenberg.words('bryant-stories.txt')]  # append [:1000] for a quicker run
sSampleText = nltk.corpus.gutenberg.raw('bryant-stories.txt')[:500] + '...\n'

# Each chained attribute applies one preprocessing step and logs a row in pp.df.
pp = (
    Pipe(LsBookWords, SsLex='nltk', SsStopWords='nltk')
    .Low.Norm.Exp.Words.Stem.Stop.NoNum
)
pp.Out()[:500]   # first 500 characters of the cleaned text
pp.df            # per-step statistics table

'stori tell children sara cone bryant two littl riddl rhyme garden ken full littl gentlemen littl cap blue wear green ribbon veri fair flax hous hous goe messeng small slight whether rain snow sleep outsid night path littl yellow tulip onc wa littl yellow tulip live littl dark hous ground one day wa sit wa veri still suddenli heard littl _tap tap tap_ door said rain want come said soft sad littl voic come littl tulip said heard anoth littl _tap tap tap_ window pane said soft littl voic answer rai'

Unnamed: 0,Step,Words,Vocab,CorrVocab
0,Initialize,55563,4420,3307
1,Lower,55563,3940,3448
2,Norm,55563,3940,3448
3,Exp,55572,3935,3445
4,Words,55572,3896,3432
5,Stem,55572,2998,1955
6,Stop,32561,2882,1848
7,NoNum,32561,2880,1846
