In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

## Big Idea
- we want to reduce variability
- ex: both "math" and "Math" mean the same thing
    - lowercase to reduce variability


## Workflow
- establish a workflow to process our text data
- normalize

### Steps to Normalize:
1. original text
2. lowercase everything
3. remove accented and non-ASCII characters
4. remove special characters (/-?!)
5. tokenize the string into discrete units
6. stem or lemmatize the words (reduce each word to its base form, e.g. calling -> call)
7. remove stopwords (the, in, they)
8. store the transformed text for exploration

<hr style="border:2px solid black">

## Normalizing

### Step 1: original text

In [2]:
# Sample passage containing mixed case, accented letters and punctuation.
# Adjacent string literals inside the parentheses are concatenated into one string.
original = (
    "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

### Step 2: lowercase the text

In [3]:
# Lowercase everything so that "Math" and "math" are treated as the same word.
article = original.lower()
article

"paul erdős and george pólya are influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

### Step 3: remove accented and non-ASCII characters

In [4]:
# Normalization: remove inconsistencies in Unicode character encoding.
#   1. NFKD splits each accented character into its base letter plus separate
#      combining marks.
#   2. Encoding to ASCII with errors='ignore' drops the combining marks and any
#      other non-ASCII characters.
#   3. Decoding with the same codec turns the byte string back into a str.
article = (
    unicodedata.normalize('NFKD', article)
    .encode('ascii', 'ignore')
    .decode('ascii')
)

article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### Step 4: remove special characters (/-?!)

In [5]:
# Delete every character that is not a lowercase letter, a digit, an apostrophe,
# or whitespace. Apostrophes are kept here; they show up as separate tokens in
# the tokenizing step.
article = re.sub(r"[^a-z0-9'\s]", '', article)
article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### Step 5: tokenize 

In [6]:
# instantiate the Toktok tokenizer (class imported at the top of the notebook)
tokenizer = ToktokTokenizer()

# return_str=True gives back one space-delimited string instead of a token list
article = tokenizer.tokenize(article, return_str=True)

article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### Step 6: stem or lemmatize 

- **stemming**: 
    - reducing the word to its stem
    - the resulting stem is sometimes not a real dictionary word (e.g. 'influenti')
    - fast and efficient

In [7]:
#create porter stemmer
# Use the documented nltk.stem path (the same package the lemmatizer in this
# notebook comes from) rather than nltk.porter, which is not part of the
# documented top-level API.
ps = nltk.stem.PorterStemmer()

In [8]:
#look how it works: the result below is lowercased and has the "-ing" suffix removed
ps.stem('Calling')

'call'

In [9]:
# stem each whitespace-separated token of the article
stems = list(map(ps.stem, article.split()))
# preview the first 10 stems
stems[:10]

['paul',
 'erdo',
 'and',
 'georg',
 'polya',
 'are',
 'influenti',
 'hungarian',
 'mathematician',
 'who']

In [10]:
#put stemmed words back together into one space-separated string
article_stemmed = ' '.join(stems)
article_stemmed

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

- **lemmatize**:
    - similar to stemming
    - returns a real word, found by looking it up in a dictionary
    - slower than stemming because it has to do that lookup

In [11]:
#download the WordNet data the first time; re-running is harmless (it reports "already up-to-date")
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/natasharivers/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
#create the lemmatizer (a WordNet-based lemmatizer from nltk.stem)
wnl = nltk.stem.WordNetLemmatizer()

In [13]:
#look how it works: 'influential' comes back unchanged (the Porter stemmer produced 'influenti')
wnl.lemmatize('influential')

'influential'

In [14]:
# lemmatize each whitespace-separated token of the article
# (lemmatize is called with the word only, no part-of-speech argument)
lemmas = list(map(wnl.lemmatize, article.split()))
# preview the first 10 lemmas
lemmas[:10]

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician',
 'who']

In [15]:
#join lemmatized words into article
#note: in the output below 'as' has become 'a' -- likely because lemmatize was
#called without a part-of-speech hint; TODO confirm
article_lemmatized= ' '.join(lemmas)
article_lemmatized

"paul erdos and george polya are influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

### Step 7: remove stopwords

In [16]:
#download the NLTK stopwords corpus the first time; re-running is harmless
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natasharivers/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
#standard English language stopwords list from nltk
stopword_list = stopwords.words('english')

In [18]:
#see how many stop words there are before any customization
len(stopword_list)

179

In [19]:
# Customize the stopword list for this article:
#   - drop 'not' (presumably so negations stay in the text)
#   - add 'o' and "'" -- stray tokens the tokenizer produced from "'o'" and "erdos's"
# The list is rebuilt rather than mutated in place so the cell can be re-run:
# list.remove('not') would raise ValueError on a second run, and repeated
# append() calls would pile up duplicate entries.
words_to_keep = {'not'}
extra_stopwords = ['o', "'"]

stopword_list = [word for word in stopword_list if word not in words_to_keep]
stopword_list += [word for word in extra_stopwords if word not in stopword_list]

In [20]:
#see the first 10 stopwords
stopword_list[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### Step 8: store the transformed text for exploration

In [21]:
#split the lemmatized article on whitespace into a list of words
words = article_lemmatized.split()

In [22]:
# Keep only the words that are not stopwords. Membership is tested against a set
# (hash lookup) instead of the list, which would be scanned once per word.
stopword_set = set(stopword_list)
filtered_words = [word for word in words if word not in stopword_set]

In [23]:
# Rebuild the cleaned article and work out how many words were filtered out.
article_without_stopwords = ' '.join(filtered_words)
removed_count = len(words) - len(filtered_words)

# Report the number of removed stopwords, then show the cleaned text.
print('Removed {} stopwords'.format(removed_count))
print('---')

print(article_without_stopwords)

Removed 24 stopwords
---
paul erdos george polya influential hungarian mathematician contributed lot field erdos name contains hungarian letter double acute accent often incorrectly written erdos erdos either mistake typographical necessity
