# Text Preprocessing

In [1]:
# importing packages & libraries
# nltk supplies the tokenizers, the stop-word corpus and the stemmer used below;
# string supplies string.punctuation for the punctuation filter
import nltk
import string

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
# NOTE(review): wildcard import; PorterStemmer is the only name from it used in
# the visible cells -- consider importing it explicitly once that is confirmed
from nltk.stem.porter import *

# fetching the stop-word corpus that stopwords.words('english') reads
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/msonjap/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Text Data

In [2]:
# defining the sample corpus: three sentences joined into one string
# (adjacent literals are concatenated; each piece keeps its trailing space)
sentences = (
    "Apple today unveiled completely redesigned Everyone Can Code curriculum to help introduce more elementary and middle school students to the world of coding. "
    "Now available, the new curriculum includes even more resources for teachers, a brand new guide for students and updated Swift Coding Club materials. "
    "Today millions of students worldwide already use Everyone Can Code curriculum to bring their ideas to life and develop important skills including creativity, collaboration and problem solving. "
)
sentences

'Apple today unveiled completely redesigned Everyone Can Code curriculum to help introduce more elementary and middle school students to the world of coding. Now available, the new curriculum includes even more resources for teachers, a brand new guide for students and updated Swift Coding Club materials. Today millions of students worldwide already use Everyone Can Code curriculum to bring their ideas to life and develop important skills including creativity, collaboration and problem solving. '

In [3]:
# splitting the raw text into a list of individual sentences
# (English is sent_tokenize's default language; spelled out here for clarity)
sentence_tokens = sent_tokenize(sentences, language='english')
sentence_tokens

['Apple today unveiled completely redesigned Everyone Can Code curriculum to help introduce more elementary and middle school students to the world of coding.',
 'Now available, the new curriculum includes even more resources for teachers, a brand new guide for students and updated Swift Coding Club materials.',
 'Today millions of students worldwide already use Everyone Can Code curriculum to bring their ideas to life and develop important skills including creativity, collaboration and problem solving.']

### Converting to Lowercase

In [4]:
# lowercasing the first sentence with the built-in str.lower() method
first_sentence = sentence_tokens[0]
lower_sentence = first_sentence.lower()
lower_sentence

'apple today unveiled completely redesigned everyone can code curriculum to help introduce more elementary and middle school students to the world of coding.'

### Tokenization

In [5]:
# breaking the lowercased sentence into word-level tokens
# (English is word_tokenize's default language; spelled out here for clarity)
tokenized_words = word_tokenize(lower_sentence, language='english')
tokenized_words

['apple',
 'today',
 'unveiled',
 'completely',
 'redesigned',
 'everyone',
 'can',
 'code',
 'curriculum',
 'to',
 'help',
 'introduce',
 'more',
 'elementary',
 'and',
 'middle',
 'school',
 'students',
 'to',
 'the',
 'world',
 'of',
 'coding',
 '.']

### Removing stop words and punctuation

In [6]:
# building one filter list: nltk's English stop words followed by every
# character of string.punctuation (unpacked one character per entry)
stpwrds = [*stopwords.words('english'), *string.punctuation]
stpwrds

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
# keeping only the tokens that are neither stop words nor punctuation
stpwrds_removed = [
    token
    for token in tokenized_words
    if token not in stpwrds
]
stpwrds_removed

['apple',
 'today',
 'unveiled',
 'completely',
 'redesigned',
 'everyone',
 'code',
 'curriculum',
 'help',
 'introduce',
 'elementary',
 'middle',
 'school',
 'students',
 'world',
 'coding']

### Stemming

In [8]:
# creating a PorterStemmer instance; its .stem() method is applied to each
# remaining token in the next cell
stemmer = PorterStemmer()

In [9]:
# reducing each remaining token to its Porter stem and displaying the result
list(map(stemmer.stem, stpwrds_removed))

['appl',
 'today',
 'unveil',
 'complet',
 'redesign',
 'everyon',
 'code',
 'curriculum',
 'help',
 'introduc',
 'elementari',
 'middl',
 'school',
 'student',
 'world',
 'code']

## Processing the full corpus

In [10]:
# lowercasing, tokenizing, and removing punctuation and stop words for every
# sentence in the corpus; the result is one list of kept tokens per sentence
#
# stpwrds is a list, so each `in` test against it is a linear scan; a set is
# built once here so that every token lookup is a constant-time hash check
stop_tokens = set(stpwrds)

processed_sentences = [
    [token for token in word_tokenize(sentence.lower()) if token not in stop_tokens]
    for sentence in sent_tokenize(sentences)
]

In [11]:
# displaying the tokens kept for the first sentence of the corpus
processed_sentences[0]

['apple',
 'today',
 'unveiled',
 'completely',
 'redesigned',
 'everyone',
 'code',
 'curriculum',
 'help',
 'introduce',
 'elementary',
 'middle',
 'school',
 'students',
 'world',
 'coding']

In [12]:
# counting the tokens left in the first sentence after stop words and
# punctuation were removed
len(processed_sentences[0])

16