# Data Preparation Exercises

In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import numpy as np
import pandas as pd
import acquire
import time

import warnings
warnings.filterwarnings('ignore')

## Exercises
The end result of this exercise should be a file named `prepare.py` that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

In [2]:
# load data
blogs = acquire.get_blog_articles()
inshorts = acquire.get_inshorts()

### Exercise I
Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [3]:
# ty, AG
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    
    return string

### Exercise II
Define a function named `tokenize`. It should take in a string and tokenize all the words in the string

In [4]:
# ty AG
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    
    return string

### Exercise III
Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [5]:
# ty AG
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

### Exercise IV
Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [6]:
# ty AG
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

### Exercise V
Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.     This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [7]:
# ty AG

def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list).difference(set(exclude_words))
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))

    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

### Exercise VI
Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [8]:
# acquire news data
news_df = acquire.get_inshorts()

# preview data
news_df.head()

Unnamed: 0,category,publish_date,article,content,author
0,business,"03 Feb 2022,Thursday",RBI cancels licence of Maha-based Independence...,RBI has cancelled licence of Maharashtra-based...,Shalini Ojha
1,business,"04 Feb 2022,Friday",This is an infrastructure and growth-focused B...,Capex outlay has been increased and private ca...,Roshan Gupta
2,business,"04 Feb 2022,Friday","Self-taught beautician to micro-entrepreneur, ...",The latest episode of Urban Company Impact int...,Roshan Gupta
3,business,"03 Feb 2022,Thursday",Facebook parent Meta's $230-billion wipeout bi...,Facebook's parent Meta's shares plunged 27% an...,Pragya Swastik
4,business,"04 Feb 2022,Friday",Facebook facing unprecedented level of competi...,"At a virtual meeting, Meta CEO Mark Zuckerberg...",Kiran Khatri


In [9]:
# get dataframe info
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99 entries, 0 to 23
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   category      99 non-null     object
 1   publish_date  99 non-null     object
 2   article       99 non-null     object
 3   content       99 non-null     object
 4   author        99 non-null     object
dtypes: object(5)
memory usage: 4.6+ KB


### Exercise VII
Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [10]:
# acquire codeup blog data
codeup_df = acquire.get_blog_articles()

# preview data
codeup_df.head()

Unnamed: 0,publish_date,article,content,link
0,"Jan 26, 2022",Codeup Start Dates for March 2022,As we approach the end of January we wanted to...,https://codeup.com/codeup-news/codeup-start-da...
1,"Jan 7, 2022",VET TEC Funding Now Available For Dallas Veterans,We are so happy to announce that VET TEC benef...,https://codeup.com/codeup-news/vet-tec-funding...
2,"Dec 30, 2021",Dallas Campus Re-opens With New Grant Partner,We are happy to announce that our Dallas campu...,https://codeup.com/codeup-news/dallas-campus-r...
3,"Nov 30, 2021",Codeup Dallas Open House,Come join us for the re-opening of our Dallas ...,https://codeup.com/dallas-newsletter/codeup-da...
4,"Nov 19, 2021",Codeup’s Placement Team Continues Setting Records,Our Placement Team is simply defined as a grou...,https://codeup.com/codeup-news/codeups-placeme...


In [11]:
# get dataframe info
codeup_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18 entries, 0 to 17
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publish_date  18 non-null     object
 1   article       18 non-null     object
 2   content       18 non-null     object
 3   link          18 non-null     object
dtypes: object(4)
memory usage: 720.0+ bytes


### Exercise VIII
For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [12]:
# produce title and original columns for news articles
news_df = news_df.rename(columns = {'article': 'title',    
                                    'content': 'original'
                                   }
                        )
# produce title and original columns for codeup blogs
codeup_df = codeup_df.rename(columns = {'article': 'title',    
                                    'content': 'original'
                                   }
                            )

In [13]:
# produce 'clean' column. clean and then tokenize
news_df['clean'] = news_df.original.apply(basic_clean)\
.apply(tokenize)\
.apply(remove_stopwords)

# verify
news_df.clean.head()

0    rbi cancelled licence maharashtrabased indepen...
1    capex outlay increased private capital require...
2    latest episode urban company impact interviews...
3    facebooks parent metas shares plunged 27 thurs...
4    virtual meeting meta ceo mark zuckerberg expla...
Name: clean, dtype: object

In [14]:
# produce 'clean' column. clean and then tokenize
codeup_df['clean'] = codeup_df.original.apply(basic_clean)\
.apply(tokenize)\
.apply(remove_stopwords)

# verify
codeup_df.clean.head()

0    approach end january wanted look forward next ...
1    happy announce vet tec benefits available used...
2    happy announce dallas campus reopened better y...
3    come join us reopening dallas campus drinks sn...
4    placement team simply defined group manages re...
Name: clean, dtype: object

In [15]:
# produce 'stemmed' column from cleaned column
news_df['stemmed'] = news_df.clean.apply(stem)

# verify
news_df.stemmed

0     rbi cancel licenc maharashtrabas independ coop...
1     capex outlay increas privat capit requir finan...
2     latest episod urban compani impact interview a...
3     facebook parent meta share plung 27 thursday c...
4     virtual meet meta ceo mark zuckerberg explain ...
                            ...                        
19    actress deepika padukon made bollywood debut 2...
20    ajay devgn share first look charact gangubai k...
21    nitu chandra made hollywood debut never back r...
22    actor emraan hashmi took twitter thursday wish...
23    italian actor michel morron star film 365 day ...
Name: stemmed, Length: 99, dtype: object

In [16]:
# produce 'stemmed' column from cleaned column
codeup_df['stemmed'] = codeup_df.clean.apply(stem)

# verify
codeup_df.stemmed

0     approach end januari want look forward next st...
1     happi announc vet tec benefit avail use campu ...
2     happi announc dalla campu reopen better yet ne...
3     come join us reopen dalla campu drink snack co...
4     placement team simpli defin group manag relati...
5     aw googl azur red hat comptiathes big name pro...
6     last month us experienc dozen major cyberattac...
7     end militari servic get closer mani transit se...
8     career choos your think career lot direct coul...
9     codeup offer 13week train program system engin...
10    alicia gonzalez codeup home health speechlangu...
11                                                     
12    look best data scienc bootcamp world best code...
13    podcast enthusiast pleas announc releas codeup...
14    mani tech career demand choos system administr...
15    know even though independ school multipl regul...
16    codeup move anoth floor histor vogu build down...
17    happi pride month pride month dedic time c

In [17]:
# produce 'lemmatized' column from cleaned column
news_df['lemmatized'] = news_df.clean.apply(lemmatize)

# verify
news_df.lemmatized

0     rbi cancelled licence maharashtrabased indepen...
1     capex outlay increased private capital require...
2     latest episode urban company impact interview ...
3     facebooks parent metas share plunged 27 thursd...
4     virtual meeting meta ceo mark zuckerberg expla...
                            ...                        
19    actress deepika padukone made bollywood debut ...
20    ajay devgn shared first look character ganguba...
21    nitu chandra made hollywood debut never back r...
22    actor emraan hashmi took twitter thursday wish...
23    italian actor michele morrone starred film 365...
Name: lemmatized, Length: 99, dtype: object

In [18]:
# produce 'lemmatized' column from cleaned column
codeup_df['lemmatized'] = codeup_df.clean.apply(lemmatize)

# verify
codeup_df.lemmatized

0     approach end january wanted look forward next ...
1     happy announce vet tec benefit available used ...
2     happy announce dallas campus reopened better y...
3     come join u reopening dallas campus drink snac...
4     placement team simply defined group manages re...
5     aws google azure red hat comptiathese big name...
6     last month u experienced dozen major cyberatta...
7     end military service get closer many transitio...
8     career choose youre thinking career lot direct...
9     codeup offer 13week training program system en...
10    alicia gonzalez codeup home health speechlangu...
11                                                     
12    looking best data science bootcamp world best ...
13    podcast enthusiast pleased announce release co...
14    many tech career demand choose system administ...
15    know even though independent school multiple r...
16    codeup moving another floor historic vogue bui...
17    happy pride month pride month dedicated ti

In [19]:
news_df.head(1)

Unnamed: 0,category,publish_date,title,original,author,clean,stemmed,lemmatized
0,business,"03 Feb 2022,Thursday",RBI cancels licence of Maha-based Independence...,RBI has cancelled licence of Maharashtra-based...,Shalini Ojha,rbi cancelled licence maharashtrabased indepen...,rbi cancel licenc maharashtrabas independ coop...,rbi cancelled licence maharashtrabased indepen...


In [20]:
codeup_df.head(1)

Unnamed: 0,publish_date,title,original,link,clean,stemmed,lemmatized
0,"Jan 26, 2022",Codeup Start Dates for March 2022,As we approach the end of January we wanted to...,https://codeup.com/codeup-news/codeup-start-da...,approach end january wanted look forward next ...,approach end januari want look forward next st...,approach end january wanted look forward next ...


In [21]:
# ty AG
def prep_articles(df, original, extra_words = [], exclude_words = []):
    
    '''
    This function takes in a dataframe, the original corpus column, a list of extra words to supplement
    the list of stop words, and a list of words to exclude from the stop words list, and returns the dataframe 
    supplemented with clean, stemmed, and lemmatized columns.
    '''
    # produce 'original' column
    df = df.rename(columns = {original: 'original'})
    
    # produce 'clean' column. clean and then tokenize
    df['clean'] = df['original'].apply(basic_clean)\
    .apply(tokenize)\
    .apply(remove_stopwords, extra_words = extra_words, exclude_words = exclude_words)
          
    # produce 'stemmed' column from cleaned column
    df['stemmed'] = df.clean.apply(stem)
    
    # produce 'lemmatized' column from cleaned column
    df['lemmatized'] = df.clean.apply(lemmatize)
    
    return df

In [22]:
# test function
blogs = acquire.get_blog_articles()

# test prep_articles function
codeup_df = prep_articles(blogs, 'content')
codeup_df.head()

Unnamed: 0,publish_date,article,original,link,clean,stemmed,lemmatized
0,"Jan 26, 2022",Codeup Start Dates for March 2022,As we approach the end of January we wanted to...,https://codeup.com/codeup-news/codeup-start-da...,approach end january wanted look forward next ...,approach end januari want look forward next st...,approach end january wanted look forward next ...
1,"Jan 7, 2022",VET TEC Funding Now Available For Dallas Veterans,We are so happy to announce that VET TEC benef...,https://codeup.com/codeup-news/vet-tec-funding...,happy announce vet tec benefits available used...,happi announc vet tec benefit avail use campu ...,happy announce vet tec benefit available used ...
2,"Dec 30, 2021",Dallas Campus Re-opens With New Grant Partner,We are happy to announce that our Dallas campu...,https://codeup.com/codeup-news/dallas-campus-r...,happy announce dallas campus reopened better y...,happi announc dalla campu reopen better yet ne...,happy announce dallas campus reopened better y...
3,"Nov 30, 2021",Codeup Dallas Open House,Come join us for the re-opening of our Dallas ...,https://codeup.com/dallas-newsletter/codeup-da...,come join us reopening dallas campus drinks sn...,come join us reopen dalla campu drink snack co...,come join u reopening dallas campus drink snac...
4,"Nov 19, 2021",Codeup’s Placement Team Continues Setting Records,Our Placement Team is simply defined as a grou...,https://codeup.com/codeup-news/codeups-placeme...,placement team simply defined group manages re...,placement team simpli defin group manag relati...,placement team simply defined group manages re...


In [23]:
# testing, attention, please
inshorts = acquire.get_inshorts()
news_df = prep_articles(inshorts, 'content')
news_df.head()

Unnamed: 0,category,publish_date,article,original,author,clean,stemmed,lemmatized
0,business,"03 Feb 2022,Thursday",RBI cancels licence of Maha-based Independence...,RBI has cancelled licence of Maharashtra-based...,Shalini Ojha,rbi cancelled licence maharashtrabased indepen...,rbi cancel licenc maharashtrabas independ coop...,rbi cancelled licence maharashtrabased indepen...
1,business,"04 Feb 2022,Friday",This is an infrastructure and growth-focused B...,Capex outlay has been increased and private ca...,Roshan Gupta,capex outlay increased private capital require...,capex outlay increas privat capit requir finan...,capex outlay increased private capital require...
2,business,"04 Feb 2022,Friday","Self-taught beautician to micro-entrepreneur, ...",The latest episode of Urban Company Impact int...,Roshan Gupta,latest episode urban company impact interviews...,latest episod urban compani impact interview a...,latest episode urban company impact interview ...
3,business,"03 Feb 2022,Thursday",Facebook parent Meta's $230-billion wipeout bi...,Facebook's parent Meta's shares plunged 27% an...,Pragya Swastik,facebooks parent metas shares plunged 27 thurs...,facebook parent meta share plung 27 thursday c...,facebooks parent metas share plunged 27 thursd...
4,business,"04 Feb 2022,Friday",Facebook facing unprecedented level of competi...,"At a virtual meeting, Meta CEO Mark Zuckerberg...",Kiran Khatri,virtual meeting meta ceo mark zuckerberg expla...,virtual meet meta ceo mark zuckerberg explain ...,virtual meeting meta ceo mark zuckerberg expla...


### Exercise IX
Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

If my corpus is 493KB or even 25MB, I would prefer to lemmatize. If my corpus is 200TB of data, I'm leaving. Jk I would stem the text.