In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

import warnings
warnings.filterwarnings('ignore')

The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will define some functions to prepare textual data. These functions should apply equally well to both the Codeup blog articles and the news articles that were previously acquired.

1) Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove (replace with an empty string) any character that is not a letter, number, whitespace, or a single quote.


In [17]:
def basic_clean(string):
    '''
    Apply basic normalization to a piece of text.

    Steps, in order:
    1. NFKD-normalize the text and drop every character that has no ASCII
       representation (accented letters lose their accents, e.g. 'é' -> 'e').
    2. Lowercase the result.
    3. Remove every character that is not a letter, a digit, whitespace,
       or a single quote.

    Parameters
    ----------
    string : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. An empty string is returned unchanged.
    '''
    ascii_text = unicodedata.normalize('NFKD', string)\
                 .encode('ascii', 'ignore')\
                 .decode('ascii')
    # Lowercase only after the ASCII fold: some symbols decompose into
    # uppercase ASCII letters during NFKD normalization.
    lowered = ascii_text.lower()
    # Explicit whitelist instead of [^\w\s]: \w would strip single quotes
    # (breaking contractions such as "don't") and would keep underscores.
    return re.sub(r"[^a-z0-9'\s]", '', lowered)

2) Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [18]:
def tokenize(string):
    '''
    Tokenize all of the words in a string.

    Parameters
    ----------
    string : str
        The text to tokenize.

    Returns
    -------
    str
        The tokenized text as a single string (the tokenizer is called with
        return_str=True, so a string is returned rather than a list).
    '''
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Return the complete tokenized text. A previous version sliced the
    # result to its first 500 characters, which silently discarded the rest
    # of longer documents and could cut the final token in half.
    return tokenizer.tokenize(string, return_str=True)

3) Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [19]:
def stem(string):
    '''
    Return the text with every whitespace-separated word replaced by its
    Porter stem.

    The input is split on whitespace, each piece is passed through the
    stemmer, and the stems are re-joined with single spaces.
    '''
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = map(stemmer.stem, string.split())
    return ' '.join(stemmed_words)

4) Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [20]:
def lemmatize(string):
    '''
    Return the text with every whitespace-separated word replaced by its
    WordNet lemma.

    Each word is handed to the lemmatizer on its own, with no
    part-of-speech argument, and the lemmas are re-joined with single
    spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

5) Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [22]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    Remove English stopwords from a piece of text.

    Parameters
    ----------
    string : str
        The text to filter. It is split on whitespace, so it should already
        be cleaned/tokenized for the stopword comparison to be meaningful.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords (removed from the text).
    exclude_words : iterable of str, optional
        Standard stopwords that should NOT be removed from the text.

    Returns
    -------
    str
        The remaining words joined by single spaces.

    Notes
    -----
    exclude_words is applied before extra_words, so a word present in both
    ends up being treated as a stopword.
    '''
    # None defaults instead of mutable list defaults ([]), which are shared
    # between calls.
    if extra_words is None:
        extra_words = []
    if exclude_words is None:
        exclude_words = []

    stopword_set = set(stopwords.words('english'))
    # Drop the words the caller wants to keep in the text...
    stopword_set -= set(exclude_words)
    # ...then add the caller's additional stopwords.
    stopword_set |= set(extra_words)

    kept_words = [word for word in string.split() if word not in stopword_set]
    return ' '.join(kept_words)

6) Use your data from the acquire exercise to produce a dataframe of the news articles. Name the dataframe news_df.

In [2]:
# Load the news articles through the project's acquire module.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column, presumably
# a saved index from a cached CSV, and rows labelled 'automobile' whose
# titles do not look automotive; confirm both in acquire.
news_df = acquire.get_news_articles()
# Bare last expression so the notebook renders the frame.
news_df

Unnamed: 0,title,content,category
0,Veteran Bollywood singer Bhupinder Singh passe...,Veteran Bollywood playback and ghazal singer B...,national
1,West can't isolate Russia & reverse its develo...,Russian President Vladimir Putin said on Monda...,national
2,CBI arrests 8 persons for alleged malpractices...,The Central Bureau of Investigation (CBI) arre...,national
3,"If I get 20 minutes with Virat Kohli, I might ...","Amid Virat Kohli's poor form, Sunil Gavaskar s...",national
4,Hardik Pandya shares video of his journey from...,Team India all-rounder Hardik Pandya took to s...,national
...,...,...,...
295,J&K LG announces 10% reservation for 'Agniveer...,J&K LG Manoj Sinha on Sunday announced a 10% r...,automobile
296,Bajrang Dal activist attacked in Uttar Pradesh,"A Bajrang Dal activist, Prashant Saini, was at...",automobile
297,UK residents asked to stay indoors as temp exp...,People in the UK are being advised to stay ind...,automobile
298,Nearly half of EU territory at drought risk as...,Nearly half of EU's territory is at risk of dr...,automobile


7) Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [4]:
# Load the Codeup blog posts through the project's acquire module.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column and what
# appear to be repeated titles (e.g. rows 0 and 4, rows 2 and 9); confirm
# whether acquire is returning duplicate posts.
codeup_df = acquire.get_blog_posts()
# Bare last expression so the notebook renders the frame.
codeup_df

Unnamed: 0,title,content
0,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
1,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
2,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...
3,5 Reasons To Attend Our New Cloud Administrati...,Come Work In The Cloud\nWhen your Monday rolls...
4,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
5,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
6,In-Person Workshop: Learn to Code – JavaScript...,Join us for our live in-person JavaScript cras...
7,In-Person Workshop: Learn to Code – Python on ...,"According to LinkedIn, the “#1 Most Promising ..."
8,Free JavaScript Workshop at Codeup Dallas on 6/28,Event Info: \nLocation – Codeup Dallas\nTime –...
9,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...


8) For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

9) Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?