In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [2]:
# String comparison is case-sensitive, so these two spellings are not equal.
'math' == 'Math'

False

### Big Idea
We want to reduce the variability between words.  
Both "math" and "Math" mean the same thing, so we lowercase things to reduce the variability of the same exact term.  
Erdős, Erdös, and Erdos refer to the same person. Again, we're looking to reduce variability before we start searching for relationships between values.  


### Workflow:

We will establish a workflow to process our text data and prepare it for further use in exploration and modeling. This preprocessing is known as text **normalization**. Normalization is when you perform a series of tasks like making all text lowercase, removing punctuation, expanding contractions, removing anything that's not an ASCII character, etc.

![Untitled%20presentation-3.png](attachment:Untitled%20presentation-3.png)


In [12]:
# Sample text containing accented characters; adjacent string literals are
# concatenated into one single-line string.
original = (
    "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to "
    "the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often "
    "incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

***

#### Lowercase the text

In [13]:
# Fold every letter to lowercase so that "Math" and "math" become the same term.
article = str.lower(original)
article

"paul erdős and george pólya are influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

***

#### Removing accented characters or non-ASCII characters


![Screen Shot 2021-05-04 at 11.25.23 AM.png](attachment:8d257e11-4d0f-413a-8852-fa0957425811.png)

![Screen Shot 2021-05-04 at 11.28.12 AM.png](attachment:4b2b538d-4918-4e7f-a702-873d45e1a575.png)



In [14]:
# NFKD is one of the Unicode normalization forms (see the tables above): it
# decomposes characters, so an accented letter becomes a base letter followed by
# combining marks. Encoding to ASCII with errors='ignore' then drops everything
# that is not plain ASCII, and decode() turns the bytes back into a str.
(
    unicodedata.normalize('NFKD', 'Erdős')
    .encode('ascii', 'ignore')
    .decode()
)

'Erdos'

In [15]:
# Strip accents and other non-ASCII characters from the whole article:
#   1. NFKD-normalize the text,
#   2. encode it to an ASCII bytestring, ignoring anything that is not ASCII,
#   3. decode the bytestring back into a (Unicode) str.
article = (
    unicodedata.normalize('NFKD', article)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
)

article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

***

#### Removing Special Characters

In [16]:
# Keep only lowercase letters a-z, digits, single quotes and whitespace;
# every other character (the leading ^ negates the class) is deleted.
article = re.compile(r"[^a-z0-9'\s]").sub('', article)

***

#### Tokenization - break words and punctuation into discrete units
Tokenization is essentially splitting a phrase, sentence, paragraph, or an entire text document into smaller units, such as **individual** words or terms. Each of these smaller units is called a token.

In [17]:
# Build a Toktok tokenizer using the class imported by name in the first cell.
tokenizer = ToktokTokenizer()

# Break the article into a list of tokens.
tokenizer.tokenize(article)

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematicians',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdos',
 "'",
 's',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'as',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [18]:
# Same tokenizer, but return_str=True yields one space-separated string
# instead of a list of tokens.
tokenizer = ToktokTokenizer()
tokenizer.tokenize(article, return_str=True)

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

***
### Stemming and Lemmatization - choose one


1. #### Stemming: reduce related words in your text to their common stem
    - suffix stripping
    - "calls", "called", and "calling" all share the base stem "call". It can make it easier when you are searching for a particular word in your text to search for their common stem rather than every form of the word.  
   
    - Algorithmic rules (non-linguistic)
    - Fast and efficient

2. #### Lemmatize: 
    - Similar to stemming, but the root word is a lexicographically correct word (one present in the dictionary)  
    - Slower than stemming

***

#### Stemming

In [19]:
# Create the Porter stemmer through the public nltk.stem package, the same
# namespace this notebook uses for nltk.stem.WordNetLemmatizer, rather than
# through the nltk.porter attribute.
ps = nltk.stem.PorterStemmer()

In [21]:
# Stemming the whole string in one call treats it as a single "word": in the
# output only the final word is shortened. That is why the stemmer is applied
# word by word in the next cell.
ps.stem(article)

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necess"

In [22]:
# Stem each whitespace-separated word of the article individually.
stems = list(map(ps.stem, article.split()))
stems[:10]

['paul',
 'erdo',
 'and',
 'georg',
 'polya',
 'are',
 'influenti',
 'hungarian',
 'mathematician',
 'who']

In [23]:
# Rebuild a single string from the stemmed words, separated by one space.
article_stemmed = str.join(' ', stems)
article_stemmed

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

***

#### Lemmatize
- Similar to stemming, but the root word is a lexicographically correct word (one present in the dictionary)  
- Slower than stemming

![Screen Shot 2021-05-04 at 11.48.10 AM.png](attachment:453fd857-a918-4123-b10e-de96ae3bcfb4.png)

In [24]:
# Fetch the WordNet corpus. Only needed the first time: the logged output
# shows it being saved under the local nltk_data directory.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/agomez/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [25]:
# Create the WordNet lemmatizer; it is reused for every word further down.

wnl = nltk.stem.WordNetLemmatizer()

In [26]:
# Sanity-check the lemmatizer on a single word. 'influence' comes back
# unchanged, as the output shows.

wnl.lemmatize('influence')

'influence'

In [28]:
# Lemmatize each whitespace-separated word of the article.
# NOTE(review): the joined result shown further down reads "written a erdos"
# where the input had "written as erdos", so "as" appears to be reduced to "a"
# here. Confirm whether stopwords should be removed before lemmatizing.
lemmas = list(map(wnl.lemmatize, article.split()))
lemmas[:10]

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician',
 'who']

In [29]:
# Rebuild a single string from the lemmas and keep it in its own variable.
article_lemmatized = str.join(' ', lemmas)
article_lemmatized

"paul erdos and george polya are influential hungarian mathematician who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

***

#### Removing Stopwords

- Words which have little or no significance, especially when constructing meaningful features from text, are known as stop words (or stopwords)
- Examples: "a", "an", "the", and the like
- we will use a standard English language stopwords list from nltk

In [32]:
# Standard English stopword list shipped with nltk. `stopwords` is already
# imported in the first cell of the notebook, so it is not re-imported here.
stopword_list = stopwords.words('english')

In [33]:
# Number of entries in the stock list, before any customisation below.
len(stopword_list)

179

In [34]:
# Customise the stopword list: keep 'no' as a meaningful word, and treat a
# lone "o" and a lone apostrophe as stopwords.
#
# The list is rebuilt rather than mutated in place so that re-running this cell
# is safe: list.remove() raises ValueError once 'no' is already gone, and
# repeated append() calls would keep adding duplicate entries.
words_to_keep = {'no'}
extra_stopwords = ["o", "'"]

stopword_list = [word for word in stopword_list if word not in words_to_keep]
# Only add extras that are not already present, so the list never grows on re-run.
stopword_list += [word for word in extra_stopwords if word not in stopword_list]

In [35]:
# Break the lemmatized article into words on whitespace.
words = str.split(article_lemmatized)
words[:10]

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician',
 'who']

In [36]:
# Drop every word that appears in the stopword list.
# Membership is tested against a set built once, instead of scanning the whole
# list for every word; the words kept are exactly the same.
# NOTE(review): words come from a whitespace split, so quoted tokens such as
# "'o'" do not match the bare "o" / "'" stopwords and remain in the result.
# Confirm whether the tokenized text was meant to be used here.
stopword_set = set(stopword_list)
filtered_words = [word for word in words if word not in stopword_set]
filtered_words[:10]

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematician',
 'contributed',
 'lot',
 'field']

In [37]:
# Rebuild a single string from the remaining words and keep it in its own variable.
article_without_stopwords = str.join(' ', filtered_words)
article_without_stopwords

"paul erdos george polya influential hungarian mathematician contributed lot field erdos's name contains hungarian letter 'o' 'o' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"