### **``Exercises: NLP Preparation``**
    date: Wednesday, August 31st 2022

----

In [206]:
# notebook dependencies 
import os # for caching purposeses
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# regular expression import
import re

# JSON import
import json

# importing BeautifulSoup for parsing HTML/XTML
from bs4 import BeautifulSoup

# request module for connecting to APIs
from requests import get

# text prepare modules
import acquire_codeup_blogs
import acquire_news_articles

# uni-code library
import unicodedata

# natural language toolkit library/modules
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

----
#### ``Exercise Number 1: Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:``

* Lowercase everything
* Normalize unicode characters
* Replace anything that is not a letter, number, whitespace or a single quote.

In [207]:
# creating a function titled, 'basic_clean'
# lowercase everything
# normalize unicode characters
# replace non-alphanumeric characters with whitespace

def basic_clean(string):

    # lowercase the text
    string = string.lower()

    # Handle curly quotes
    charmap = {0x201c: u'"',
                0x201d: u'"',
                0x2018: u"'",
                0x2019: u"'"}

    string = string.translate(charmap)

    # normalizing the text
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # return only alphanumeric values in text: everything else, convert to whitespace
    string = re.sub("[^a-z0-9\s']", '', string)
    
    # return the string text
    return string

In [208]:
# test the function

messy_string = messy_string = 'ThiS is% .thE mé_ssiest STRING o^f all timE.!!!!!'

clean_string = basic_clean(messy_string)

clean_string # checks out!

'this is the messiest string of all time'

----

#### ``Exercise Number 2: Define a function named tokenize``

* It should take in a string and tokenize all the words in the string.

**<u>Tokenization</u>**

After removing non-ASCII characters and special characters, it's common to tokenize the strings, to break words and any punctuation left over into discrete units.

Tokenization is the process of breaking something down into discrete units. In the context of NLP, this means breaking text down into discrete words, punctuation, etc.

In [209]:
# creating a function to tokenize the string text

def tokenize(string):
    
    # creating the tokenize object
    tokenizer = ToktokTokenizer()
    
    # using the tokenize object on the input string
    return tokenizer.tokenize(string, return_str = True)

In [210]:
# importing some data

df = acquire_codeup_blogs.get_codeup_blogs()
df.head()

df shape: (22, 3)


Unnamed: 0,article_title,publish_date,contents
0,Codeup X Superhero Car Show & Comic Con,"Aug 10, 2022",Codeup had a blast at the San Antonio Superher...
1,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo..."
2,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo..."
3,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...
4,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...


In [211]:
# testing the tokenize function

cleaned_df = df.applymap(basic_clean)

tokenized_df = cleaned_df.applymap(tokenize)

tokenized_df.head()

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show comic con,aug 10 2022,codeup had a blast at the san antonio superher...
1,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
2,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
3,what jobs can you get after a coding bootcamp ...,aug 2 2022,if you ' re considering a career in web develo...
4,what jobs can you get after a coding bootcamp ...,aug 2 2022,if you ' re considering a career in web develo...


----
#### ``Exercise Number 3: Define a function named stem.`` 

* It should accept some text and return the text after applying stemming to all the words.

*“**Stemming** usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes….Stemmers use language-specific rules, but they require less knowledge than a lemmatizer…”*


In [212]:
# creating the function to accept some text and apply stemming process
# using 'PorterStemmer' method

def porter_stem(string):

    # creating the object
    ps = PorterStemmer()
    
    # using list comprehension to return the stem of ea. word in the string as a list
    stems = [ps.stem(word) for word in string.split()]

    # then re-joining ea. word as a single string text w/ a space in between ea. word
    stemmed_string = ' '.join(stems)

    return stemmed_string

In [213]:
# creating the function to accept some text and apply stemming process
# using 'LancasterStemmer' method

def lancaster_stem(string):

    # creating the object
    lc = LancasterStemmer()
    
    # using list comprehension to return the stem of ea. word in the string as a list
    stems = [lc.stem(word) for word in string.split()]

    # then re-joining ea. word as a single string text w/ a space in between ea. word
    stemmed_string = ' '.join(stems)

    return stemmed_string

In [214]:
cleaned_df.head()

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show comic con,aug 10 2022,codeup had a blast at the san antonio superher...
1,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
2,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
3,what jobs can you get after a coding bootcamp ...,aug 2 2022,if you're considering a career in web developm...
4,what jobs can you get after a coding bootcamp ...,aug 2 2022,if you're considering a career in web developm...


In [215]:
# testing porter stemmer

tokenized_df.applymap(porter_stem).head()

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show comic con,aug 10 2022,codeup had a blast at the san antonio superher...
1,is a career in tech recessionproof,aug 12 2022,given the current econom climat mani economist...
2,is a career in tech recessionproof,aug 12 2022,given the current econom climat mani economist...
3,what job can you get after a code bootcamp par...,aug 2 2022,if you ' re consid a career in web develop but...
4,what job can you get after a code bootcamp par...,aug 2 2022,if you ' re consid a career in web develop but...


In [216]:
# testing lancaster stemmer
# more aggressive than porter stemmer

tokenized_df.applymap(lancaster_stem).head()

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show com con,aug 10 2022,codeup had a blast at the san antonio superher...
1,is a car in tech recessionproof,aug 12 2022,giv the cur econom clim many econom ar consid ...
2,is a car in tech recessionproof,aug 12 2022,giv the cur econom clim many econom ar consid ...
3,what job can you get aft a cod bootcamp part 3...,aug 2 2022,if you ' re consid a car in web develop but do...
4,what job can you get aft a cod bootcamp part 3...,aug 2 2022,if you ' re consid a car in web develop but do...


----

#### ``Exercise Number 4: Define a function named lemmatize ``

* It should accept some text and return the text after applying lemmatization to each word.

*“**Lemmatization** usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma….a lemmatizer, which needs a complete vocabulary and morphological analysis to correctly lemmatize words…”*

In [217]:
# create the function to lemmatize text

def lemmatize(string):
    
    # creating the lemmatizer object
    wnl = WordNetLemmatizer()
    
    # using list comprehension to apply the lemmatizer on ea. word and return words as a list
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # re-joining the individual words as a single string text
    lemmatized_string = ' '.join(lemmas)
    
    # return the tranformed string text
    return lemmatized_string

In [218]:
cleaned_df.head()

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show comic con,aug 10 2022,codeup had a blast at the san antonio superher...
1,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
2,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
3,what jobs can you get after a coding bootcamp ...,aug 2 2022,if you're considering a career in web developm...
4,what jobs can you get after a coding bootcamp ...,aug 2 2022,if you're considering a career in web developm...


In [219]:
# testing the lemmatizer

tokenized_df.applymap(lemmatize).head()

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show comic con,aug 10 2022,codeup had a blast at the san antonio superher...
1,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
2,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
3,what job can you get after a coding bootcamp p...,aug 2 2022,if you ' re considering a career in web develo...
4,what job can you get after a coding bootcamp p...,aug 2 2022,if you ' re considering a career in web develo...


----

#### ``Exercise Number 5: Define a function named remove_stopwords.`` 

* It should accept some text and return the text after removing all the stopwords

In [220]:
# list of english stop words

print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [221]:
# creating the function to remove stopwords from a string of text

def remove_stopwords(string, exclude_words = None, include_words = None):
    
    # creating the list of english stop words
    stopword_list = stopwords.words('english')
    
    # if there are words to exlude not in stopword_list, then add them to stop word list
    if include_words:
        
        stopword_list = stopword_list + include_words

    # if there are words we dont want to remove, then take them out of the stop words list
    if exclude_words:
        
        for word in exclude_words:
            
            stopword_list.remove(word)

    # split string text into individual words        
    words = string.split()
    
    # filter the string words, and only include words not in stop words list
    filtered_words = [word for word in words if word not in stopword_list]
    
    # re-join the words into individual string text
    filtered_string = ' '.join(filtered_words)
    
    # return the string text back: excluding stop words
    return filtered_string

In [222]:
tokenized_df.head()

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show comic con,aug 10 2022,codeup had a blast at the san antonio superher...
1,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
2,is a career in tech recessionproof,aug 12 2022,given the current economic climate many econom...
3,what jobs can you get after a coding bootcamp ...,aug 2 2022,if you ' re considering a career in web develo...
4,what jobs can you get after a coding bootcamp ...,aug 2 2022,if you ' re considering a career in web develo...


In [223]:
# testing the function

tokenized_df.applymap(remove_stopwords).head() # checks out! stop words removed

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show comic con,aug 10 2022,codeup blast san antonio superhero car show co...
1,career tech recessionproof,aug 12 2022,given current economic climate many economists...
2,career tech recessionproof,aug 12 2022,given current economic climate many economists...
3,jobs get coding bootcamp part 3 web development,aug 2 2022,' considering career web development ' know ex...
4,jobs get coding bootcamp part 3 web development,aug 2 2022,' considering career web development ' know ex...


In [224]:
# ensuring function "keyword arguments" are working: including extra words to remove

extra_words = ['get', 'recessionproof']

tokenized_df.applymap(remove_stopwords, include_words = extra_words).head() # checks out!

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show comic con,aug 10 2022,codeup blast san antonio superhero car show co...
1,career tech,aug 12 2022,given current economic climate many economists...
2,career tech,aug 12 2022,given current economic climate many economists...
3,jobs coding bootcamp part 3 web development,aug 2 2022,' considering career web development ' know ex...
4,jobs coding bootcamp part 3 web development,aug 2 2022,' considering career web development ' know ex...


In [225]:
# what about words to exlcude from filtering?

dont_filter = ["has", "is", "a"]

tokenized_df.applymap(remove_stopwords, exclude_words = dont_filter).head() # checks out!

Unnamed: 0,article_title,publish_date,contents
0,codeup x superhero car show comic con,aug 10 2022,codeup a blast san antonio superhero car show ...
1,is a career tech recessionproof,aug 12 2022,given current economic climate many economists...
2,is a career tech recessionproof,aug 12 2022,given current economic climate many economists...
3,jobs get a coding bootcamp part 3 web development,aug 2 2022,' considering a career web development ' know ...
4,jobs get a coding bootcamp part 3 web development,aug 2 2022,' considering a career web development ' know ...


----

#### ``Exercise 6: Use your data from the acquire to produce a dataframe of the news articles.``

* Name the dataframe news_df.

In [226]:
# importing the Inshorts dataset

df_articles = acquire_news_articles.get_articles_df()
df_articles.head()

df shape: (100, 6)


Unnamed: 0,genre,publish_date,source,title,authors,content
0,business,31 Aug,Twitter,India's GDP grows at 13.5% in first quarter of...,Anmol Sharma,India's GDP grew at 13.5% in the first quarter...
1,business,31 Aug,Reuters,"Snap to lay off 20% of staff, cancel several p...",Anmol Sharma,Snap said on Wednesday it will lay off 20% of ...
2,business,31 Aug,Reuters,2 top executives at Snap quit hours after repo...,Ananya Goyal,Two senior advertising executives at Snap quit...
3,business,31 Aug,Reuters,Musk seeks to delay Twitter trial to Nov amid ...,Ananya Goyal,Tesla CEO Elon Musk is seeking to delay the tr...
4,business,31 Aug,News18,Viral video shows Amazon parcels thrown out of...,Ridham Gambhir,A video from Guwahati railway station has gone...


----

#### ``Exercise 7: Make another dataframe for the Codeup blog posts.``

 * Name the dataframe codeup_df

In [227]:
# imporing the Codeup Blogs df

df_blogs = acquire_codeup_blogs.get_codeup_blogs()
df_blogs.head()

df shape: (22, 3)


Unnamed: 0,article_title,publish_date,contents
0,Codeup X Superhero Car Show & Comic Con,"Aug 10, 2022",Codeup had a blast at the San Antonio Superher...
1,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo..."
2,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo..."
3,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...
4,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...


----
#### ``Exercise Number 8: For each dataframe, produce the following columns:``

* title: to hold the title
* original: to hold the original article/post content
* clean: to hold the normalized and tokenized original with the stopwords removed.
* stemmed: to hold the stemmed version of the cleaned data.
* lemmatized: to hold the lemmatized version of the cleaned data.

In [229]:
# starting with the Articles df

df_articles["clean"] = df_articles["content"].apply(basic_clean).apply(tokenize).apply(remove_stopwords)
df_articles["stemmed"] = df_articles["clean"].apply(porter_stem)
df_articles["lemmatized"] = df_articles["clean"].apply(lemmatize)

df_articles.head()

Unnamed: 0,genre,publish_date,source,title,authors,content,clean,stemmed,lemmatized
0,business,31 Aug,Twitter,India's GDP grows at 13.5% in first quarter of...,Anmol Sharma,India's GDP grew at 13.5% in the first quarter...,india ' gdp grew 135 first quarter fy23 achiev...,india ' gdp grew 135 first quarter fy23 achiev...,india ' gdp grew 135 first quarter fy23 achiev...
1,business,31 Aug,Reuters,"Snap to lay off 20% of staff, cancel several p...",Anmol Sharma,Snap said on Wednesday it will lay off 20% of ...,snap said wednesday lay 20 staff shut original...,snap said wednesday lay 20 staff shut origin s...,snap said wednesday lay 20 staff shut original...
2,business,31 Aug,Reuters,2 top executives at Snap quit hours after repo...,Ananya Goyal,Two senior advertising executives at Snap quit...,two senior advertising executives snap quit ho...,two senior advertis execut snap quit hour repo...,two senior advertising executive snap quit hou...
3,business,31 Aug,Reuters,Musk seeks to delay Twitter trial to Nov amid ...,Ananya Goyal,Tesla CEO Elon Musk is seeking to delay the tr...,tesla ceo elon musk seeking delay trial twitte...,tesla ceo elon musk seek delay trial twitter n...,tesla ceo elon musk seeking delay trial twitte...
4,business,31 Aug,News18,Viral video shows Amazon parcels thrown out of...,Ridham Gambhir,A video from Guwahati railway station has gone...,video guwahati railway station gone viral show...,video guwahati railway station gone viral show...,video guwahati railway station gone viral show...


In [233]:
# codeup blogs

df_blogs["clean"] = df_blogs["contents"].apply(basic_clean).apply(tokenize).apply(remove_stopwords, include_words = ["codeup", "codeup's", "Codeup", "Codeup's", "'"])
df_blogs["stemmed"] = df_blogs["clean"].apply(porter_stem)
df_blogs["lemmatized"] = df_blogs["clean"].apply(lemmatize)

df_blogs.head()

Unnamed: 0,article_title,publish_date,contents,clean,stemmed,lemmatized
0,Codeup X Superhero Car Show & Comic Con,"Aug 10, 2022",Codeup had a blast at the San Antonio Superher...,blast san antonio superhero car show comic con...,blast san antonio superhero car show comic con...,blast san antonio superhero car show comic con...
1,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo...",given current economic climate many economists...,given current econom climat mani economist con...,given current economic climate many economist ...
2,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo...",given current economic climate many economists...,given current econom climat mani economist con...,given current economic climate many economist ...
3,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...,considering career web development know expect...,consid career web develop know expect continu ...,considering career web development know expect...
4,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...,considering career web development know expect...,consid career web develop know expect continu ...,considering career web development know expect...


In [234]:
# creating new dfs

df_articles.to_csv('clean_articles.csv', index = False)
df_blogs.to_csv('clean_blog_posts.csv', index = False)