In [1]:
import pandas as pd
import numpy as np

import unicodedata
import re
import os
import json
import acquire

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

## EXERCISE

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.


In [2]:
#original text
#NOTE(review): the backslash continuations after "player" and after "\'." have no space before them,
#so the resulting string reads "playerNovak" and has no space between "djokovici'." and the opening double quote.
original = 'Serbian scientists named a new species of beetle after ex-world number one men\'s tennis player\
Novak Djokovic. The insect, which belongs to Duvalius genus of ground beetles present in Europe and was \
discovered several years ago in underground pit in Serbia, has been named \'Duvalius djokovici\'.\
"We feel urged to pay Djokovic back in...way we can," a researcher said.'
original

'Serbian scientists named a new species of beetle after ex-world number one men\'s tennis playerNovak Djokovic. The insect, which belongs to Duvalius genus of ground beetles present in Europe and was discovered several years ago in underground pit in Serbia, has been named \'Duvalius djokovici\'."We feel urged to pay Djokovic back in...way we can," a researcher said.'

In [3]:
#Lowercase everything
article = original.lower()
article

'serbian scientists named a new species of beetle after ex-world number one men\'s tennis playernovak djokovic. the insect, which belongs to duvalius genus of ground beetles present in europe and was discovered several years ago in underground pit in serbia, has been named \'duvalius djokovici\'."we feel urged to pay djokovic back in...way we can," a researcher said.'

In [4]:
#import unicode character database
import unicodedata

In [5]:
#Normalize to NFKD, encode to ASCII while ignoring characters that cannot be encoded,
#then decode the bytes back into a str
article = unicodedata.normalize('NFKD', article)\
.encode('ascii', 'ignore')\
.decode('utf-8')

article

'serbian scientists named a new species of beetle after ex-world number one men\'s tennis playernovak djokovic. the insect, which belongs to duvalius genus of ground beetles present in europe and was discovered several years ago in underground pit in serbia, has been named \'duvalius djokovici\'."we feel urged to pay djokovic back in...way we can," a researcher said.'

In [6]:
#import regular expression operations
import re

In [7]:
#use re.sub to remove special characters
article = re.sub(r'[^a-z0-9\'\s]', '', article)
article

"serbian scientists named a new species of beetle after exworld number one men's tennis playernovak djokovic the insect which belongs to duvalius genus of ground beetles present in europe and was discovered several years ago in underground pit in serbia has been named 'duvalius djokovici'we feel urged to pay djokovic back inway we can a researcher said"

In [8]:
def basic_clean(string):
    """
    Apply basic text cleaning to a string and return the cleaned string.

    Steps, in order:
      1. Lowercase every character.
      2. NFKD-normalize the text, encode it to ASCII while ignoring characters
         that cannot be encoded, and decode the bytes back into a str.
      3. Delete every character that is not a-z, 0-9, whitespace or a single quote.
    """
    lowered = string.lower()

    # Round-trip through ASCII bytes to drop anything that is not plain ASCII
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('UTF-8')

    # Keep only lowercase letters, digits, whitespace and single quotes
    return re.sub(r"[^a-z0-9\s']", '', ascii_text)

In [9]:
basic_clean(original)

"serbian scientists named a new species of beetle after exworld number one men's tennis playernovak djokovic the insect which belongs to duvalius genus of ground beetles present in europe and was discovered several years ago in underground pit in serbia has been named 'duvalius djokovici'we feel urged to pay djokovic back inway we can a researcher said"

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [10]:
#import natural language toolkit
import nltk

In [11]:
def tokenize(string):
    """
    Tokenize a string with NLTK's ToktokTokenizer.

    The tokenizer is called with return_str=True, so the result is returned
    as a single string rather than a list of tokens.
    """
    # Build the tokenizer and apply it in one step
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [12]:
tokenize(original)

'Serbian scientists named a new species of beetle after ex-world number one men \' s tennis playerNovak Djokovic. The insect , which belongs to Duvalius genus of ground beetles present in Europe and was discovered several years ago in underground pit in Serbia , has been named \' Duvalius djokovici \' . " We feel urged to pay Djokovic back in ... way we can , " a researcher said .'

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [13]:
def stem(string):
    """
    Return the input text with each whitespace-separated word replaced by its
    Porter stem, joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()

    # Split on whitespace, stem every piece, and rebuild one string
    return ' '.join(map(stemmer.stem, string.split()))

In [14]:
# Test stem() on the cleaned and tokenized version of my string
# (note: the variable `stemmed` holds the tokenized text; the stems are in `string_stemmed`)
string = basic_clean(original)

stemmed = tokenize(string)

string_stemmed = stem(stemmed)

string_stemmed

"serbian scientist name a new speci of beetl after exworld number one men ' s tenni playernovak djokov the insect which belong to duvaliu genu of ground beetl present in europ and wa discov sever year ago in underground pit in serbia ha been name ' duvaliu djokovici ' we feel urg to pay djokov back inway we can a research said"

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [15]:
# Download the WordNet data used by the lemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patricknash/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
def lemmatize(string):
    """
    Return the input text with each whitespace-separated word passed through
    NLTK's WordNetLemmatizer, joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Lemmatize one token at a time, preserving the original word order
    lemmatized_words = []
    for token in string.split():
        lemmatized_words.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmatized_words)

In [17]:
# Test on the raw original string (it has not been cleaned or tokenized)
lemmatize(original)

'Serbian scientist named a new specie of beetle after ex-world number one men\'s tennis playerNovak Djokovic. The insect, which belongs to Duvalius genus of ground beetle present in Europe and wa discovered several year ago in underground pit in Serbia, ha been named \'Duvalius djokovici\'."We feel urged to pay Djokovic back in...way we can," a researcher said.'

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
### This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [18]:
# Download the nltk stopwords list
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patricknash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Remove stopwords from a string and return the remaining words joined by single spaces.

    The stopword list starts from nltk's standard English list and can be adjusted
    with the two optional parameters.

    Parameters
    ----------
    string : str
        Text to filter. It is split on whitespace, so it should already be tokenized.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to take out of the stopword list so that they are kept in the text.
        Words that are not in the stopword list are ignored.

    Returns
    -------
    str
        The text without the stopwords.
    """
    # A set gives constant-time membership checks for every word in the text
    stop_words = set(stopwords.words('english'))

    # Add the extra words to be removed
    stop_words.update(extra_words or [])

    # Drop the words to be kept; unlike list.remove, this does not raise
    # when a word is not a stopword to begin with
    stop_words.difference_update(exclude_words or [])

    # Keep only the words that are not stopwords
    filtered_words = [word for word in string.split() if word not in stop_words]

    return ' '.join(filtered_words)

In [20]:
# Test function using the tokenized version of my string
# NOTE(review): this cell repeats the stem() check from the stemming exercise;
# remove_stopwords() is not called here.
string = basic_clean(original)

stemmed = tokenize(string)

string_stemmed = stem(stemmed)

string_stemmed

"serbian scientist name a new speci of beetl after exworld number one men ' s tenni playernovak djokov the insect which belong to duvaliu genu of ground beetl present in europ and wa discov sever year ago in underground pit in serbia ha been name ' duvaliu djokovici ' we feel urg to pay djokov back inway we can a research said"

### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [21]:
#nltk.download('all')

In [22]:
#nltk.download('stopwords')

In [23]:
#let's define news category url
base_url = 'https://inshorts.com/en/read'

In [24]:
#acquire news url
news_df = acquire.get_all_shorts(base_url)
news_df

Unnamed: 0,title,category,body
0,"AAP drops Rajouri Garden candidate, a week bef...",india,"Only a week before Delhi Assembly polls, Aam A..."
1,Zimbabwe players ask India for cricketing tips,india,After getting thrashed by India by 5-0 in the ...
2,"Nigerian weightlifter in dope net, India may gain",india,India may move up after Nigerian weightlifter ...
3,"Samsung launches Galaxy Star 2 Plus at Rs.7,335",india,Samsung has unveiled the Galaxy start 2 Plus s...
4,Infosys Gifts Sikka Shares Worth Rs 8.2cr,india,"In a regulatory filing to the BSE on Friday, I..."
...,...,...,...
280,Porsche becomes Europe's most valuable automak...,automobile,Porsche overtook parent company Volkswagen to ...
281,Car rental startup Zoomcar may go public via S...,automobile,Bengaluru-based car rental startup Zoomcar has...
282,Record 5.4 lakh vehicles sold during Navratri ...,automobile,Federation of Automobile Dealers Associations ...
283,TVS Motor beats Hero MotoCorp to become 6th mo...,automobile,TVS Motor Company Limited has become the sixth...


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [27]:
#let's define codeup blog url
base_url = 'https://codeup.com/blog/'

In [28]:
#acquire codeup url
codeup_df = acquire.get_blog_content(base_url)
codeup_df


Unnamed: 0,title,content
0,Coding Bootcamp or Computer Science Degree?,"For many people, deciding between a coding boo..."
1,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...
2,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...
3,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...
4,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio..."
5,What is Cloud Computing and AWS?,With many companies switching to cloud service...


### 8. For each dataframe, produce the following columns:

### a) title to hold the title

In [33]:
news_df = news_df.rename(columns={'body':'original'})
news_df.head()

Unnamed: 0,title,category,original
0,"AAP drops Rajouri Garden candidate, a week bef...",india,"Only a week before Delhi Assembly polls, Aam A..."
1,Zimbabwe players ask India for cricketing tips,india,After getting thrashed by India by 5-0 in the ...
2,"Nigerian weightlifter in dope net, India may gain",india,India may move up after Nigerian weightlifter ...
3,"Samsung launches Galaxy Star 2 Plus at Rs.7,335",india,Samsung has unveiled the Galaxy start 2 Plus s...
4,Infosys Gifts Sikka Shares Worth Rs 8.2cr,india,"In a regulatory filing to the BSE on Friday, I..."


### b) original to hold the original article/post content

In [34]:
news_df = news_df[['title','original']]
news_df.head()

Unnamed: 0,title,original
0,"AAP drops Rajouri Garden candidate, a week bef...","Only a week before Delhi Assembly polls, Aam A..."
1,Zimbabwe players ask India for cricketing tips,After getting thrashed by India by 5-0 in the ...
2,"Nigerian weightlifter in dope net, India may gain",India may move up after Nigerian weightlifter ...
3,"Samsung launches Galaxy Star 2 Plus at Rs.7,335",Samsung has unveiled the Galaxy start 2 Plus s...
4,Infosys Gifts Sikka Shares Worth Rs 8.2cr,"In a regulatory filing to the BSE on Friday, I..."


### c) clean to hold the normalized and tokenized original with the stopwords removed.

In [35]:
# NOTE(review): only basic_clean is applied here; tokenizing and stopword removal are done later by clean_df
news_df['clean'] = news_df.original.apply(basic_clean)
news_df.head()

Unnamed: 0,title,original,clean
0,"AAP drops Rajouri Garden candidate, a week bef...","Only a week before Delhi Assembly polls, Aam A...",only a week before delhi assembly polls aam aa...
1,Zimbabwe players ask India for cricketing tips,After getting thrashed by India by 5-0 in the ...,after getting thrashed by india by 50 in the j...
2,"Nigerian weightlifter in dope net, India may gain",India may move up after Nigerian weightlifter ...,india may move up after nigerian weightlifter ...
3,"Samsung launches Galaxy Star 2 Plus at Rs.7,335",Samsung has unveiled the Galaxy start 2 Plus s...,samsung has unveiled the galaxy start 2 plus s...
4,Infosys Gifts Sikka Shares Worth Rs 8.2cr,"In a regulatory filing to the BSE on Friday, I...",in a regulatory filing to the bse on friday in...


### d) stemmed to hold the stemmed version of the cleaned data.

In [36]:
news_df['stemmed'] = news_df.clean.apply(stem)
news_df['lemmatized'] = news_df.clean.apply(lemmatize)
news_df.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,"AAP drops Rajouri Garden candidate, a week bef...","Only a week before Delhi Assembly polls, Aam A...",only a week before delhi assembly polls aam aa...,onli a week befor delhi assembl poll aam aadmi...,only a week before delhi assembly poll aam aad...
1,Zimbabwe players ask India for cricketing tips,After getting thrashed by India by 5-0 in the ...,after getting thrashed by india by 50 in the j...,after get thrash by india by 50 in the just co...,after getting thrashed by india by 50 in the j...
2,"Nigerian weightlifter in dope net, India may gain",India may move up after Nigerian weightlifter ...,india may move up after nigerian weightlifter ...,india may move up after nigerian weightlift ch...,india may move up after nigerian weightlifter ...
3,"Samsung launches Galaxy Star 2 Plus at Rs.7,335",Samsung has unveiled the Galaxy start 2 Plus s...,samsung has unveiled the galaxy start 2 plus s...,samsung ha unveil the galaxi start 2 plu smart...,samsung ha unveiled the galaxy start 2 plus sm...
4,Infosys Gifts Sikka Shares Worth Rs 8.2cr,"In a regulatory filing to the BSE on Friday, I...",in a regulatory filing to the bse on friday in...,in a regulatori file to the bse on friday info...,in a regulatory filing to the bse on friday in...


### e) lemmatized to hold the lemmatized version of the cleaned data.

In [37]:
#build the clean, stemmed and lemmatized columns for a dataframe
def clean_df(df, extra_words=None, exclude_words=None):
    """
    Return a new dataframe with the title and original text plus three derived columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'title' and 'original'. It is not modified.
    extra_words : iterable of str, optional
        Additional stopwords, forwarded to remove_stopwords.
    exclude_words : iterable of str, optional
        Words to keep even though they are stopwords, forwarded to remove_stopwords.

    Returns
    -------
    pandas.DataFrame
        Columns: 'title', 'original', 'clean' (basic_clean, then tokenize, then
        remove_stopwords applied to 'original'), 'stemmed' (stem applied to 'clean')
        and 'lemmatized' (lemmatize applied to 'clean').
    """
    # Turn the optional word lists into plain lists before forwarding them
    extra_words = list(extra_words) if extra_words is not None else []
    exclude_words = list(exclude_words) if exclude_words is not None else []

    # Copy the column subset so the assignments below write to an independent
    # dataframe instead of a slice of the caller's dataframe
    df = df[['title','original']].copy()

    df['clean'] = (
        df['original']
        .apply(basic_clean)
        .apply(tokenize)
        .apply(remove_stopwords,
               extra_words=extra_words,
               exclude_words=exclude_words)
    )
    df['stemmed'] = df['clean'].apply(stem)
    df['lemmatized'] = df['clean'].apply(lemmatize)

    return df

In [38]:
#rename the text column to 'original' so both dataframes have the columns clean_df selects
#news_df has no 'content' column at this point (its text column is already 'original'), so only codeup_df changes
news_df_new = news_df.rename(columns={'content':'original'})
codeup_df_new = codeup_df.rename(columns={'content':'original'})

In [39]:
clean_df(news_df_new, extra_words=['infosys'])

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,"AAP drops Rajouri Garden candidate, a week bef...","Only a week before Delhi Assembly polls, Aam A...",week delhi assembly polls aam aadmi party tues...,week delhi assembl poll aam aadmi parti tuesda...,week delhi assembly poll aam aadmi party tuesd...
1,Zimbabwe players ask India for cricketing tips,After getting thrashed by India by 5-0 in the ...,getting thrashed india 50 concluded odi series...,get thrash india 50 conclud odi seri zimbabw c...,getting thrashed india 50 concluded odi series...
2,"Nigerian weightlifter in dope net, India may gain",India may move up after Nigerian weightlifter ...,india may move nigerian weightlifter chika ama...,india may move nigerian weightlift chika amala...,india may move nigerian weightlifter chika ama...
3,"Samsung launches Galaxy Star 2 Plus at Rs.7,335",Samsung has unveiled the Galaxy start 2 Plus s...,samsung unveiled galaxy start 2 plus smartphon...,samsung unveil galaxi start 2 plu smartphon in...,samsung unveiled galaxy start 2 plus smartphon...
4,Infosys Gifts Sikka Shares Worth Rs 8.2cr,"In a regulatory filing to the BSE on Friday, I...",regulatory filing bse friday ltd decided give ...,regulatori file bse friday ltd decid give shar...,regulatory filing bse friday ltd decided give ...
...,...,...,...,...,...
280,Porsche becomes Europe's most valuable automak...,Porsche overtook parent company Volkswagen to ...,porsche overtook parent company volkswagen bec...,porsch overtook parent compani volkswagen beco...,porsche overtook parent company volkswagen bec...
281,Car rental startup Zoomcar may go public via S...,Bengaluru-based car rental startup Zoomcar has...,bengalurubased car rental startup zoomcar repo...,bengalurubas car rental startup zoomcar report...,bengalurubased car rental startup zoomcar repo...
282,Record 5.4 lakh vehicles sold during Navratri ...,Federation of Automobile Dealers Associations ...,federation automobile dealers associations fad...,feder automobil dealer associ fada monday said...,federation automobile dealer association fada ...
283,TVS Motor beats Hero MotoCorp to become 6th mo...,TVS Motor Company Limited has become the sixth...,tvs motor company limited become sixth mostval...,tv motor compani limit becom sixth mostvalu li...,tv motor company limited become sixth mostvalu...


In [40]:
#use the renamed dataframe: clean_df selects an 'original' column, which codeup_df (title, content) does not have
clean_df(codeup_df_new)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Coding Bootcamp or Computer Science Degree?,"For many people, deciding between a coding boo...",many people deciding coding bootcamp computer ...,mani peopl decid code bootcamp comput scienc d...,many people deciding coding bootcamp computer ...
1,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...,codeup excited launch first diversity equity i...,codeup excit launch first divers equiti inclus...,codeup excited launch first diversity equity i...
2,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...,codeup named 2022 diversity inclusion award wi...,codeup name 2022 divers inclus award winner sa...,codeup named 2022 diversity inclusion award wi...
3,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...,deciding transition tech career big step signi...,decid transit tech career big step signific co...,deciding transition tech career big step signi...
4,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio...",codeup strongly values diversity inclusion hon...,codeup strongli valu divers inclus honor ameri...,codeup strongly value diversity inclusion hono...
5,What is Cloud Computing and AWS?,With many companies switching to cloud service...,many companies switching cloud services implem...,mani compani switch cloud servic implement clo...,many company switching cloud service implement...


### 9. Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
 - 493KB is small, so I prefer lemmatized text.
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
 - 25MB is still manageable, so I prefer lemmatized text.
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
 - 200TB is a very large corpus and compute is billed by size, so I prefer stemmed text.
                             