In [1]:
# regex for noise removal
import re

In [2]:
# nltk for some preprocessing wonders
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
# download the nltk resources the cells below rely on
import nltk
nltk.download('punkt')
# NLTK >= 3.8.2 loads the tokenizer data for word_tokenize() from 'punkt_tab'
# rather than the pickled 'punkt' models; fetch both so the notebook runs on
# old and new NLTK releases alike
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/mariel/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/mariel/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /home/mariel/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# sample passage from Lewis Carroll's "Through the Looking Glass";
# adjacent string literals are concatenated into one string by the parser
text = (
    "The shop seemed to be full of all manner of curious things — but the "
    "oddest part of it all was, that whenever she looked hard at any shelf, "
    "to make out exactly what it had on it, that particular shelf was always "
    "quite empty: though the others round it were crowded as full as they "
    "could hold."
)

In [5]:
# noise removal using regex: every run of non-word characters (punctuation,
# whitespace, the em dash) is collapsed into a single space.
# The pattern is a raw string so that \W reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape sequence.
# Note: the final '.' also becomes a space, so `cleaned` ends with one.
cleaned = re.sub(r'\W+', ' ', text)
print(cleaned)

The shop seemed to be full of all manner of curious things but the oddest part of it all was that whenever she looked hard at any shelf to make out exactly what it had on it that particular shelf was always quite empty though the others round it were crowded as full as they could hold 


In [6]:
# tokenization using word_tokenize() from nltk: split the cleaned passage
# into a list of word tokens (punctuation was already removed by the regex
# step, so only words remain)
tokenized = word_tokenize(cleaned)
print(tokenized)

['The', 'shop', 'seemed', 'to', 'be', 'full', 'of', 'all', 'manner', 'of', 'curious', 'things', 'but', 'the', 'oddest', 'part', 'of', 'it', 'all', 'was', 'that', 'whenever', 'she', 'looked', 'hard', 'at', 'any', 'shelf', 'to', 'make', 'out', 'exactly', 'what', 'it', 'had', 'on', 'it', 'that', 'particular', 'shelf', 'was', 'always', 'quite', 'empty', 'though', 'the', 'others', 'round', 'it', 'were', 'crowded', 'as', 'full', 'as', 'they', 'could', 'hold']


In [7]:
# stem every token with the Porter algorithm; stems are produced by stripping
# suffixes, so results such as 'curiou' or 'wa' are not always real words
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, tokenized))
print("Stemmed text:\n", stemmed)

Stemmed text:
 ['the', 'shop', 'seem', 'to', 'be', 'full', 'of', 'all', 'manner', 'of', 'curiou', 'thing', 'but', 'the', 'oddest', 'part', 'of', 'it', 'all', 'wa', 'that', 'whenev', 'she', 'look', 'hard', 'at', 'ani', 'shelf', 'to', 'make', 'out', 'exactli', 'what', 'it', 'had', 'on', 'it', 'that', 'particular', 'shelf', 'wa', 'alway', 'quit', 'empti', 'though', 'the', 'other', 'round', 'it', 'were', 'crowd', 'as', 'full', 'as', 'they', 'could', 'hold']


In [8]:
# lemmatize each token without passing a part of speech; lemmatize() then
# falls back to its default of treating the word as a noun, which is why
# verbs such as 'seemed' are left unchanged and 'was' becomes 'wa'
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, tokenized))
print("Lemmatized text:\n", lemmatized)

Lemmatized text:
 ['The', 'shop', 'seemed', 'to', 'be', 'full', 'of', 'all', 'manner', 'of', 'curious', 'thing', 'but', 'the', 'oddest', 'part', 'of', 'it', 'all', 'wa', 'that', 'whenever', 'she', 'looked', 'hard', 'at', 'any', 'shelf', 'to', 'make', 'out', 'exactly', 'what', 'it', 'had', 'on', 'it', 'that', 'particular', 'shelf', 'wa', 'always', 'quite', 'empty', 'though', 'the', 'others', 'round', 'it', 'were', 'crowded', 'a', 'full', 'a', 'they', 'could', 'hold']


In [9]:
# lemmatizing without a part of speech left words like 'seemed' and 'looked' unchanged; luckily we've prepared a helper for this...
from part_of_speech import get_part_of_speech

In [10]:
# lemmatize again, this time passing a part of speech for every token.
# get_part_of_speech comes from the local part_of_speech module (not shown
# here) -- presumably it returns a WordNet POS tag; verify against that module.
# map() is lazy, so each tag is computed right before its token is lemmatized.
lemmatized_with_pos = [
    lemmatizer.lemmatize(token, pos)
    for token, pos in zip(tokenized, map(get_part_of_speech, tokenized))
]
print("Text lemmatized with part of speech:\n", lemmatized_with_pos)

Text lemmatized with part of speech:
 ['The', 'shop', 'seem', 'to', 'be', 'full', 'of', 'all', 'manner', 'of', 'curious', 'thing', 'but', 'the', 'odd', 'part', 'of', 'it', 'all', 'be', 'that', 'whenever', 'she', 'look', 'hard', 'at', 'any', 'shelf', 'to', 'make', 'out', 'exactly', 'what', 'it', 'have', 'on', 'it', 'that', 'particular', 'shelf', 'be', 'always', 'quite', 'empty', 'though', 'the', 'others', 'round', 'it', 'be', 'crowd', 'a', 'full', 'a', 'they', 'could', 'hold']


In [13]:
# what if we removed unnecessary common words?
# stopwords.words('english') gives nltk's list of English stopwords
# (it needs the 'stopwords' corpus downloaded at the top of the notebook)
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print("Stopwords:", stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
# drop every token that is a stopword. Tokens are lowercased for the
# comparison because the stopword entries are lowercase while tokens such as
# 'The' keep their original capitalisation.
# Build a set once so each membership test is O(1) instead of a linear scan
# of the stopword list for every token.
stop_word_set = set(stop_words)
filtered = [word for word in tokenized if word.lower() not in stop_word_set]
print("Text Filtered for stopwords:\n", filtered)

Text Filtered for stopwords:
 ['shop', 'seemed', 'full', 'manner', 'curious', 'things', 'oddest', 'part', 'whenever', 'looked', 'hard', 'shelf', 'make', 'exactly', 'particular', 'shelf', 'always', 'quite', 'empty', 'though', 'others', 'round', 'crowded', 'full', 'could', 'hold']
