# Preprocessing Techniques

### RegEx

RegEx is especially useful for cleaning out unwanted punctuation marks, captialized letters, special characters etc.

In [1]:
import re

Eg. Replacing Characters

In [3]:
string='Harper is a good girl.'
#'.' represents any character, while * represents zero or more occurences
#in this case, '..g.*d' will match with 'a good'
#re.sub replaces this with 'the goodest'
re.sub('..g.*d','the goodest',string)

'Harper is the goodest girl.'

Using RegEx to remove special chars and punctuation

In [8]:
string='''
One ring to rule them all,
One ring to find them, One ring to bring them all,
and in the darkness, bind them.
'''

string=re.sub('(<.*?>)', ' ', string)
string=re.sub('[,\.!?:()"]', '', string)
string=re.sub('[^a-zA-Z"]',' ',string)

print(string)

 One ring to rule them all One ring to find them One ring to bring them all and in the darkness bind them 


### Word Tokenization

Tokenizing a string into individual words

In [9]:
from nltk.tokenize import word_tokenize

words=word_tokenize(string)
print(words)

['One', 'ring', 'to', 'rule', 'them', 'all', 'One', 'ring', 'to', 'find', 'them', 'One', 'ring', 'to', 'bring', 'them', 'all', 'and', 'in', 'the', 'darkness', 'bind', 'them']


### Stemming

In [10]:
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from functools import reduce

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\liuru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
ps=PorterStemmer()

string='''
From the tip of his wand burst the silver doe. She landed on the office floor, bounded once across the office, and soared out of the window. Dumbledore watched her fly away, and as her silvery glow faded he turned back to Snape, and his eyes were full of tears.
“After all this time?”
“Always,” said Snape.
'''

words=word_tokenize(string)
stemmed_string = reduce(lambda x, y: x +" "+ps.stem(y), words, "")
print(stemmed_string)

 from the tip of hi wand burst the silver doe . she land on the offic floor , bound onc across the offic , and soar out of the window . dumbledor watch her fli away , and as her silveri glow fade he turn back to snape , and hi eye were full of tear . “ after all thi time ? ” “ alway , ” said snape .


### Lemmatization

For the lemmatizer to work as intended, we need to give th lemmatizer the context of each word. This is achieved through POS tagging, which will be covered in greatee detail next week. The default POS tagger assumes all words to be nouns if no context is given.

In [13]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')
lemmatizer= WordNetLemmatizer()
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\liuru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\liuru\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [14]:
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

In [16]:
string = '''
From the tip of his wand burst the silver doe. 
She landed on the office floor, bounded once across the office, and soared out of the window. 
Dumbledore watched her fly away, and as her silvery glow faded he turned back to Snape, and his eyes were full of tears.
“After all this time?”
“Always,” said Snape.”
'''
pos_tagged = nltk.pos_tag(nltk.word_tokenize(string))

wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
print(wordnet_tagged)

[('From', None), ('the', None), ('tip', 'n'), ('of', None), ('his', None), ('wand', 'n'), ('burst', 'v'), ('the', None), ('silver', 'n'), ('doe', 'n'), ('.', None), ('She', None), ('landed', 'v'), ('on', None), ('the', None), ('office', 'n'), ('floor', 'n'), (',', None), ('bounded', 'v'), ('once', 'r'), ('across', None), ('the', None), ('office', 'n'), (',', None), ('and', None), ('soared', 'v'), ('out', None), ('of', None), ('the', None), ('window', 'n'), ('.', None), ('Dumbledore', 'n'), ('watched', 'v'), ('her', None), ('fly', 'n'), ('away', 'r'), (',', None), ('and', None), ('as', None), ('her', None), ('silvery', 'n'), ('glow', 'n'), ('faded', 'v'), ('he', None), ('turned', 'v'), ('back', 'r'), ('to', None), ('Snape', 'n'), (',', None), ('and', None), ('his', None), ('eyes', 'n'), ('were', 'v'), ('full', 'a'), ('of', None), ('tears', 'n'), ('.', None), ('“', 'n'), ('After', None), ('all', None), ('this', None), ('time', 'n'), ('?', None), ('”', 'a'), ('“', 'n'), ('Always', 'n'), (

In [17]:
lemmatized_sentence = []

for word, tag in wordnet_tagged:
    if tag is None:
        lemmatized_sentence.append(word)
    else:       
        lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
lemmatized_sentence = " ".join(lemmatized_sentence)
 
print(lemmatized_sentence)

From the tip of his wand burst the silver doe . She land on the office floor , bound once across the office , and soar out of the window . Dumbledore watch her fly away , and as her silvery glow fade he turn back to Snape , and his eye be full of tear . “ After all this time ? ” “ Always , ” say Snape . ”


### Preprocessing for BOW

In [19]:
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

text = '''
Never gonna run around and desert you.
'''

dataset= nltk.word_tokenize(text)
for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r'/W', ' ', dataset[i])
    dataset[i] = re.sub(r'/s+', ' ', dataset[i])
filtered_sentence = [w for w in dataset if not w.lower() in stop_words]
print(filtered_sentence)

['never', 'gon', 'na', 'run', 'around', 'desert', '.']
