### Activity 1.01: Preprocessing of Raw Text

We have a text corpus that is in an improper format. In this activity, we will perform all the preprocessing steps that were discussed earlier to get some meaning out of the text.

In [14]:
# Required libraries
from nltk import word_tokenize, download, pos_tag, stem, sent_tokenize
download(['punkt', 'averaged_perceptron_tagger', 'stopwords'])
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller

[nltk_data] Downloading package punkt to /Users/LNonyane/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/LNonyane/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/LNonyane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load text corpus to a variable.
text = "The reader of this course should have a basic knowledge of the Python programming lenguage.\
 He/she must have knowldge of data types in Python. He should be able to write functions,\
 and also have the ability to import and use libraries and packages in Python. Familiarity\
 with basic linguistics and probability is assumed although not required to fully\
 complete this course."

In [3]:
print(text)

The reader of this course should have a basic knowledge of the Python programming lenguage. He/she must have knowldge of data types in Python. He should be able to write functions, and also have the ability to import and use libraries and packages in Python. Familiarity with basic linguistics and probability is assumed although not required to fully complete this course.


In [4]:
# Apply the tokenization process to the text corpus and print the first 20 tokens.
#sentence = sent_tokenize(text)
word_token = word_tokenize(text)
print(word_token[:20])

['The', 'reader', 'of', 'this', 'course', 'should', 'have', 'a', 'basic', 'knowledge', 'of', 'the', 'Python', 'programming', 'lenguage', '.', 'He/she', 'must', 'have', 'knowldge']


In [5]:
# Apply spelling correction on each token and print the initial 20 corrected tokens as well as the corrected text corpus.
spell = Speller(lang='en')
def correct_spelling(tokens):
    sentence_corrected = ' '.join([spell(word) for word in tokens])
    tokens_corrected = [spell(word) for word in tokens]
    return sentence_corrected, tokens_corrected

In [6]:
word_token, word_token_correct = correct_spelling(word_token)
print(word_token)
print(word_token_correct)

The reader of this course should have a basic knowledge of the Python programming language . He/she must have knowledge of data types in Python . He should be able to write functions , and also have the ability to import and use libraries and packages in Python . Familiarity with basic linguistics and probability is assumed although not required to fully complete this course .
['The', 'reader', 'of', 'this', 'course', 'should', 'have', 'a', 'basic', 'knowledge', 'of', 'the', 'Python', 'programming', 'language', '.', 'He/she', 'must', 'have', 'knowledge', 'of', 'data', 'types', 'in', 'Python', '.', 'He', 'should', 'be', 'able', 'to', 'write', 'functions', ',', 'and', 'also', 'have', 'the', 'ability', 'to', 'import', 'and', 'use', 'libraries', 'and', 'packages', 'in', 'Python', '.', 'Familiarity', 'with', 'basic', 'linguistics', 'and', 'probability', 'is', 'assumed', 'although', 'not', 'required', 'to', 'fully', 'complete', 'this', 'course', '.']


In [7]:
# Apply PoS tags to each of the corrected tokens and print them.
def get_pos(tokens):
    tags = pos_tag(tokens)
    return tags

In [8]:
pos_tags = get_pos(word_token_correct)
print(pos_tags)

[('The', 'DT'), ('reader', 'NN'), ('of', 'IN'), ('this', 'DT'), ('course', 'NN'), ('should', 'MD'), ('have', 'VB'), ('a', 'DT'), ('basic', 'JJ'), ('knowledge', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Python', 'NNP'), ('programming', 'NN'), ('language', 'NN'), ('.', '.'), ('He/she', 'NNP'), ('must', 'MD'), ('have', 'VB'), ('knowledge', 'NN'), ('of', 'IN'), ('data', 'NNS'), ('types', 'NNS'), ('in', 'IN'), ('Python', 'NNP'), ('.', '.'), ('He', 'PRP'), ('should', 'MD'), ('be', 'VB'), ('able', 'JJ'), ('to', 'TO'), ('write', 'VB'), ('functions', 'NNS'), (',', ','), ('and', 'CC'), ('also', 'RB'), ('have', 'VBP'), ('the', 'DT'), ('ability', 'NN'), ('to', 'TO'), ('import', 'NN'), ('and', 'CC'), ('use', 'NN'), ('libraries', 'NNS'), ('and', 'CC'), ('packages', 'NNS'), ('in', 'IN'), ('Python', 'NNP'), ('.', '.'), ('Familiarity', 'NN'), ('with', 'IN'), ('basic', 'JJ'), ('linguistics', 'NNS'), ('and', 'CC'), ('probability', 'NN'), ('is', 'VBZ'), ('assumed', 'VBN'), ('although', 'IN'), ('not', 'RB'), (

In [9]:
# Remove stop words from the corrected token list and print the initial 20 tokens.
stop_words = stopwords.words('english') # English stopwords
stop_punc = list(punctuation)
stop_final = stop_words + stop_punc
def remove_stopwords(tokens, stop_final):
    return [token for token in tokens if token not in stop_final]

In [10]:
tokens_no_stop_words = remove_stopwords(word_token_correct, stop_final)
print(tokens_no_stop_words[:20])

['The', 'reader', 'course', 'basic', 'knowledge', 'Python', 'programming', 'language', 'He/she', 'must', 'knowledge', 'data', 'types', 'Python', 'He', 'able', 'write', 'functions', 'also', 'ability']


In [11]:
# Apply stemming and lemmatization to the corrected token list and the print the initial 20 tokens.
def get_stems(tokens, stemmer):
    return [stemmer.stem(token) for token in tokens]
porterStem = stem.PorterStemmer()

In [12]:
token_stems = get_stems(tokens_no_stop_words, porterStem)
token_stems[:20]

['the',
 'reader',
 'cours',
 'basic',
 'knowledg',
 'python',
 'program',
 'languag',
 'he/sh',
 'must',
 'knowledg',
 'data',
 'type',
 'python',
 'he',
 'abl',
 'write',
 'function',
 'also',
 'abil']

In [27]:
# Lemmatization
lemmatizer = stem.WordNetLemmatizer()
def get_lemma(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

In [28]:
token_lemma = get_lemma(tokens_no_stop_words)
token_lemma[:20]

['The',
 'reader',
 'course',
 'basic',
 'knowledge',
 'Python',
 'programming',
 'language',
 'He/she',
 'must',
 'knowledge',
 'data',
 'type',
 'Python',
 'He',
 'able',
 'write',
 'function',
 'also',
 'ability']

In [26]:
# Detect the sentence boundaries in the given text corpus and print the total number of sentences.
sentence = sent_tokenize(text)
print(sentence) # total number of sentences

['The reader of this course should have a basic knowledge of the Python programming lenguage.', 'He/she must have knowldge of data types in Python.', 'He should be able to write functions, and also have the ability to import and use libraries and packages in Python.', 'Familiarity with basic linguistics and probability is assumed although not required to fully complete this course.']
