# Data Preparation Exercises

The end result of this exercise should be a file named `prepare.py` that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.



In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

## 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = string.lower()
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string)
    return string

## 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [3]:
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    
    return string

## 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [4]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

## 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [5]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

## 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

## This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [6]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))

    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords


## 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [7]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [8]:
#Run acquire function to get the news articles. Is there an easier way?
news_json = acquire.get_news_articles()
news_json

  'content': 'Ahead of the debut of The Lord of the Rings\' prequel \'Rings of Power\' on Amazon\'s Prime Video, Jeff Bezos revealed a conversation he had with his son. He said, "My son came up to me one day, he looked me in the eyes, very sincerely, and he said: \'Dad, please don\'t eff this up\'...And he was right."',
  'category': 'business'},
 {'title': 'US sought records on Binance CEO for crypto money laundering probe: Report',
  'content': 'US prosecutors sought communication records involving Binance CEO Changpeng Zhao along with extensive internal records about Binance\'s anti-money laundering checks, Reuters reported. Binance was asked to voluntarily hand over messages from Zhao and 12 other executives and partners. Prosecutors sought records with instructions that "documents be destroyed, altered or removed from Binance\'s files".',
  'category': 'business'},
 {'title': 'Price of commercial LPG cylinders cut by up to ₹100; list of rates in cities released',
  'content': 'Sta

In [9]:
# putting json file into dataframe
news_df = pd.read_json('news_articles.json')
news_df.head()

Unnamed: 0,title,content,category
0,Don't eff this up: Bezos recalls warning from ...,Ahead of the debut of The Lord of the Rings' p...,business
1,US sought records on Binance CEO for crypto mo...,US prosecutors sought communication records in...,business
2,Price of commercial LPG cylinders cut by up to...,State-owned fuel retailers on Thursday announc...,business
3,Chairman of Russia's 2nd largest oil firm dies...,The chairman of Russia's second-largest oil pr...,business
4,SpiceJet shares fall nearly 15% after CFO resi...,SpiceJet shares declined nearly 15% during Thu...,business


## 7. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [13]:
#Apply a series of functions to clean the news articles text
news_df['clean'] = news_df['content'].apply(basic_clean).apply(tokenize).apply(remove_stopwords)

news_df.head()

Unnamed: 0,title,content,category,clean
0,Don't eff this up: Bezos recalls warning from ...,Ahead of the debut of The Lord of the Rings' p...,business,ahead debut lord rings prequel rings power ama...
1,US sought records on Binance CEO for crypto mo...,US prosecutors sought communication records in...,business,us prosecutors sought communication records in...
2,Price of commercial LPG cylinders cut by up to...,State-owned fuel retailers on Thursday announc...,business,stateowned fuel retailers thursday announced r...
3,Chairman of Russia's 2nd largest oil firm dies...,The chairman of Russia's second-largest oil pr...,business,chairman russias secondlargest oil producer lu...
4,SpiceJet shares fall nearly 15% after CFO resi...,SpiceJet shares declined nearly 15% during Thu...,business,spicejet shares declined nearly 15 thursdays i...


In [14]:
#Create a column of stemmed articles
news_df['stemmed'] = news_df['clean'].apply(stem)

news_df.head()

Unnamed: 0,title,content,category,clean,stemmed
0,Don't eff this up: Bezos recalls warning from ...,Ahead of the debut of The Lord of the Rings' p...,business,ahead debut lord rings prequel rings power ama...,ahead debut lord ring prequel ring power amazo...
1,US sought records on Binance CEO for crypto mo...,US prosecutors sought communication records in...,business,us prosecutors sought communication records in...,us prosecutor sought commun record involv bina...
2,Price of commercial LPG cylinders cut by up to...,State-owned fuel retailers on Thursday announc...,business,stateowned fuel retailers thursday announced r...,stateown fuel retail thursday announc reduct p...
3,Chairman of Russia's 2nd largest oil firm dies...,The chairman of Russia's second-largest oil pr...,business,chairman russias secondlargest oil producer lu...,chairman russia secondlargest oil produc lukoi...
4,SpiceJet shares fall nearly 15% after CFO resi...,SpiceJet shares declined nearly 15% during Thu...,business,spicejet shares declined nearly 15 thursdays i...,spicejet share declin nearli 15 thursday intra...


In [15]:
#Create a column of lemmatized articles
news_df['lemmatized'] = news_df['clean'].apply(lemmatize)

news_df.head()

Unnamed: 0,title,content,category,clean,stemmed,lemmatized
0,Don't eff this up: Bezos recalls warning from ...,Ahead of the debut of The Lord of the Rings' p...,business,ahead debut lord rings prequel rings power ama...,ahead debut lord ring prequel ring power amazo...,ahead debut lord ring prequel ring power amazo...
1,US sought records on Binance CEO for crypto mo...,US prosecutors sought communication records in...,business,us prosecutors sought communication records in...,us prosecutor sought commun record involv bina...,u prosecutor sought communication record invol...
2,Price of commercial LPG cylinders cut by up to...,State-owned fuel retailers on Thursday announc...,business,stateowned fuel retailers thursday announced r...,stateown fuel retail thursday announc reduct p...,stateowned fuel retailer thursday announced re...
3,Chairman of Russia's 2nd largest oil firm dies...,The chairman of Russia's second-largest oil pr...,business,chairman russias secondlargest oil producer lu...,chairman russia secondlargest oil produc lukoi...,chairman russia secondlargest oil producer luk...
4,SpiceJet shares fall nearly 15% after CFO resi...,SpiceJet shares declined nearly 15% during Thu...,business,spicejet shares declined nearly 15 thursdays i...,spicejet share declin nearli 15 thursday intra...,spicejet share declined nearly 15 thursday int...


In [17]:
news_df.to_csv('clean_articles.csv', index=False)