In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
import prepare

import acquire

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guadalupeluna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# define categories
categories = ["business", "sports", "technology", "entertainment"]

# use get_all_new_article function from acquire.py file 

news_df = acquire.get_all_news_articles(categories)



  soup = BeautifulSoup(response.text)


In [4]:
# look at the head of dataframe
news_df.head()

Unnamed: 0,title,content,category
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,business
1,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,business
2,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,business
3,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,business
4,Govt may lower import duty on EVs if Tesla man...,The government is open to consider reducing im...,business


In [5]:
# lets use the content of first news item as 'article' to test my functions

article = news_df.content[0]
article

'Reliance Industries has said in a statement that over 98% of its workers have received at least one dose of COVID-19 vaccine so far. The billionaire Mukesh Ambani-led conglomerate had over 2.36 lakh employees, of March 31. Besides Reliance, Hindustan Unilever has also given at least one shot to 90% of employees, while Infosys inoculated 59% employees and TCS 70%.'

#### In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote

In [6]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

In [7]:
# use the function defined above

basic_clean(article)

'reliance industries has said in a statement that over 98 of its workers have received at least one dose of covid19 vaccine so far the billionaire mukesh ambaniled conglomerate had over 236 lakh employees of march 31 besides reliance hindustan unilever has also given at least one shot to 90 of employees while infosys inoculated 59 employees and tcs 70'

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [8]:
def tokenize(string):
    '''
    This function takes in a string and returns a tokenized string.
    
    '''
    # create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Use the tokenizer
    string = tokenizer.tokenize(string, return_str=True)
    
    return string

In [9]:
# Use the function defined above

tokenize(article)

'Reliance Industries has said in a statement that over 98 % of its workers have received at least one dose of COVID-19 vaccine so far. The billionaire Mukesh Ambani-led conglomerate had over 2.36 lakh employees , of March 31. Besides Reliance , Hindustan Unilever has also given at least one shot to 90 % of employees , while Infosys inoculated 59 % employees and TCS 70 % .'

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [10]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

In [11]:
# use the function defined above

stem(article)

'relianc industri ha said in a statement that over 98% of it worker have receiv at least one dose of covid-19 vaccin so far. the billionair mukesh ambani-l conglomer had over 2.36 lakh employees, of march 31. besid reliance, hindustan unilev ha also given at least one shot to 90% of employees, while infosi inocul 59% employe and tc 70%.'

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [12]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

In [13]:
# use the function defined above

lemmatize(article)

'Reliance Industries ha said in a statement that over 98% of it worker have received at least one dose of COVID-19 vaccine so far. The billionaire Mukesh Ambani-led conglomerate had over 2.36 lakh employees, of March 31. Besides Reliance, Hindustan Unilever ha also given at least one shot to 90% of employees, while Infosys inoculated 59% employee and TCS 70%.'

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [14]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

In [15]:
# use the function defined above

remove_stopwords(article)

'Reliance Industries said statement 98% workers received least one dose COVID-19 vaccine far. The billionaire Mukesh Ambani-led conglomerate 2.36 lakh employees, March 31. Besides Reliance, Hindustan Unilever also given least one shot 90% employees, Infosys inoculated 59% employees TCS 70%.'

### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [16]:
# check head of my news_df dataframe:

news_df.head()

Unnamed: 0,title,content,category
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,business
1,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,business
2,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,business
3,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,business
4,Govt may lower import duty on EVs if Tesla man...,The government is open to consider reducing im...,business


In [17]:
# use all the functions to see if they work on news_df's content column

news_df['content'].apply(basic_clean)\
.apply(tokenize)\
.apply(lemmatize)\
.apply(remove_stopwords)

0     reliance industry ha said statement 98 worker ...
1     tesla ceo world secondrichest person elon musk...
2     amazon monday denied speculation wa looking ac...
3     tesla billionaire ceo elon musk criticised app...
4     government open consider reducing import duty ...
                            ...                        
93    reacting rumour tiff cast show taarak mehta ka...
94    actor atul kulkarni share screen space actor a...
95    veteran actress savita bajaj unwell facing fin...
96    filmmaker aditya vikram senguptas upon time ca...
97    bigg bos 14 contestant jaan kumar sanu spoke i...
Name: content, Length: 98, dtype: object

### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [18]:
codeup_df = acquire.acquire_codeup_blog()



  soup = BeautifulSoup(response.text)


In [20]:
codeup_df.head()

Unnamed: 0,title,published_date,blog_image,content
0,Codeup’s Data Science Career Accelerator is Here!,"September 30, 2018",https://codeup.com/wp-content/uploads/2018/10/...,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,"October 31, 2018",https://codeup.com/wp-content/uploads/2018/10/...,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"October 17, 2018",https://codeup.com/wp-content/uploads/2018/10/...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,"August 14, 2018",,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,"August 14, 2018",,Competitor Bootcamps Are Closing. Is the Model...


### 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [31]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [32]:
# use the function defined above for news_df's content column.

prep_article_data(news_df, 'content', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,reliance industries said statement 98 workers ...,relianc industri said statement 98 worker rece...,reliance industry said statement 98 worker rec...
1,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,amazon monday denied speculations looking acce...,amazon monday deni specul wa look accept bitco...,amazon monday denied speculation wa looking ac...
2,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,teslas billionaire ceo elon musk criticised ap...,tesla billionair ceo elon musk criticis appl t...,tesla billionaire ceo elon musk criticised app...
3,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,tesla ceo worlds secondrichest person elon mus...,tesla ceo world secondrichest person elon musk...,tesla ceo world secondrichest person elon musk...
4,Govt may lower import duty on EVs if Tesla man...,The government is open to consider reducing im...,government open consider reducing import duty ...,govern open consid reduc import duti offer inc...,government open consider reducing import duty ...


### 9. Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?