In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [2]:
# Fetch the blog articles through the project-local `acquire` module, feeding
# it the links returned by get_codeup_links().
# NOTE(review): presumably a list of dicts with a 'content' key (it is indexed
# that way in the next cell) -- confirm against acquire.py.
all_articles = acquire.get_blog_articles(acquire.get_codeup_links())

In [3]:
# Grab the 'content' field of the first entry in all_articles.
original = all_articles[0]['content']

In [4]:
# Show the raw, uncleaned text held in `original`.
print(original)


Black excellence in tech: Panelist Spotlight – Wilmarie De La Cruz Mejia

Codeup is hosting a Black Excellence in Tech Panel in honor of Black History Month on February 22, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!  
Meet Wilmarie!
Wilmarie De La Cruz Mejia is a current Codeup student on the path to becoming a Full-Stack Web Developer at our Dallas, TX campus. 
Wilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with Codeup. 
We asked Wilmarie to share more about her experience at Codeup. She shares, “I was able to meet other people who were passionate about coding and be in a positive learning environment.”
We hope you can join us on February 22nd to sit in on an insightful conversation with Wilmarie and all of our panelists!



<hr style="border:2px solid gray">

# Exercises

#### The end result of this exercise should be a file named `prepare.py` that defines the requested functions.

#### In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the Codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:
#### - Lowercase everything
#### - Normalize unicode characters
#### - Remove anything that is not a letter, number, whitespace, or a single quote.

In [None]:
def basic_clean(text):
    '''
        This function accepts a string of text, cleans,
        and then returns the cleaned text.

        Cleaning steps, in order:
        1. Normalize unicode (NFKD) and drop every character that has
           no ASCII representation.
        2. Lowercase everything.
        3. Remove every character that is not a lowercase letter, a digit,
           an apostrophe, or whitespace.

        params:
        ------
        text:
            string: input text to clean

        return:
        ------
        text:
            string: cleaned text
    '''

    # normalize unicode characters BEFORE lowercasing: NFKD can expand a
    # symbol into uppercase ASCII (e.g. the numero sign becomes "No"), and
    # those capitals would otherwise be stripped by the filter below
    text = (
        unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )

    # lowercase
    text = text.lower()

    # only alphanumeric, apostrophe, & whitespace survive
    text = re.sub(r"[^a-z0-9'\s]", '', text)

    return text

### 2. Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.

In [None]:
def tokenize(text):
    '''
        this function accepts a string of text, tokenizes,
        and then returns the tokenized text.

        params:
        ------
        text:
            string: input text to tokenize

        return:
        ------
        text:
            string: tokenized text (a single string, because the
            tokenizer is called with return_str=True)
    '''

    # create tokenizer object
    tokenizer = nltk.tokenize.ToktokTokenizer()

    # use the tokenizer and hand back its output; previously the result was
    # stored in an unused variable and the untouched input was returned
    text = tokenizer.tokenize(text, return_str=True)

    return text

### 3. Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [None]:
def stem(text):
    '''
        this function takes a string, stems every
        whitespace-separated word with a Porter stemmer,
        and returns the stems joined by single spaces.

        params:
        ------
        text:
            string: input string to stem

        return:
        ------
        string: stemmed string
    '''

    # one stemmer instance is reused for every word
    porter = nltk.porter.PorterStemmer()

    # stem word by word and stitch the results back together
    return ' '.join(porter.stem(token) for token in text.split())

### 4. Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [None]:
def lemmatize(text):
    '''
        this function accepts a string of text, lemmatizes,
        and then returns the lemmatized string.

        params:
        ------
        text:
            string: input string to lemmatize

        return:
        ------
        text:
            string: lemmatized string, words joined by single spaces
    '''

    # create lemmatizer object
    wnl = nltk.stem.WordNetLemmatizer()

    # split the input into words and lemmatize each one
    # (this must read from the `text` parameter; the name `string` was
    # never defined and raised NameError)
    lemmas = [wnl.lemmatize(word) for word in text.split()]

    # join lemmatized words back into a single string
    text = ' '.join(lemmas)

    return text

### 5. Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.
#### This function should define two optional parameters, `extra_words` and `exclude_words`. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [None]:
def remove_stopwords(text, extra_words=None, exclude_words=None):
    '''
        this function accepts a string of text, removes stopwords,
        and then returns the transformed string.

        params:
        ------
        text:
            string: input string to transform
        extra_words:
            iterable of strings (optional): additional words to treat
            as stopwords; None means no additions
        exclude_words:
            iterable of strings (optional): words to keep even though
            they are in the standard stopword list; None means no exclusions

        return:
        ------
        text:
            string: transformed string, words joined by single spaces
    '''
    # treat the None defaults as "nothing to add / nothing to exclude";
    # set(None) would raise TypeError
    extra_words = [] if extra_words is None else extra_words
    exclude_words = [] if exclude_words is None else exclude_words

    # start from the english stopwords, drop the excluded words first and
    # then add the extras, so a word given in both ends up being removed
    stopword_set = set(stopwords.words('english')) - set(exclude_words)
    stopword_set |= set(extra_words)

    # split the input into words (reads the `text` parameter; the name
    # `string` was never defined and raised NameError)
    words = text.split()

    # keep only the words that are not stopwords
    filtered_words = [word for word in words if word not in stopword_set]

    # join the remaining words back into a string
    text = ' '.join(filtered_words)

    return text

### 6. Use your data from `acquire` to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [None]:
# Assemble the news dataframe with one column per text representation.
# NOTE(review): titles, originals, cleaned, stemmed and lemmatized are not
# assigned anywhere in the visible notebook, so this cell raises NameError
# unless they are defined first.
news_df = pd.DataFrame({"title":titles,"original":originals,"clean":cleaned,"stemmed":stemmed,"lemmatized":lemmatized})

### 7. Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [7]:
# Assemble the Codeup blog dataframe with one column per text representation.
# NOTE(review): titles, originals, cleaned, stemmed and lemmatized are not
# assigned anywhere in the visible notebook, so this cell raises NameError
# unless they are defined first.
codeup_df = pd.DataFrame({"title":titles,"original":originals,"clean":cleaned,"stemmed":stemmed,"lemmatized":lemmatized})

NameError: name 'titles' is not defined

In [6]:
# Bare expression: evaluates codeup_df so the notebook renders it as cell output.
codeup_df

NameError: name 'codeup_df' is not defined

### 8. For each dataframe, produce the following columns:
#### - `title` to hold the title
#### - `original` to hold the original article/post content
#### - `clean` to hold the normalized and tokenized original with the stopwords removed.
#### - `stemmed` to hold the stemmed version of the cleaned data.
#### - `lemmatized` to hold the lemmatized version of the cleaned data.

In [None]:
def prep_article_data(df, column, extra_words=None, exclude_words=None):
    '''
    This function takes in a df and the string name of a text column, with
    the option to pass lists for extra_words and exclude_words, and
    returns a df with the article title, original text, cleaned text
    (normalized, tokenized, stopwords removed), stemmed text and
    lemmatized text.

    params:
    ------
    df:
        DataFrame: must contain a 'title' column and the column named
        by `column`
    column:
        string: name of the column holding the raw text
    extra_words:
        list of strings (optional): additional stopwords to remove
    exclude_words:
        list of strings (optional): stopwords that should be kept

    return:
    ------
    DataFrame with the columns
    ['title', 'original', 'clean', 'stemmed', 'lemmatized']

    Note: the 'original', 'clean', 'stemmed' and 'lemmatized' columns are
    also added to the df that was passed in.
    '''
    # None defaults avoid mutable default arguments; always forward real
    # lists to remove_stopwords
    extra_words = [] if extra_words is None else extra_words
    exclude_words = [] if exclude_words is None else exclude_words

    # original text comes from the requested column (it used to be
    # hard-coded to 'content', ignoring the `column` argument)
    df['original'] = df[column]

    # chain together clean, tokenize, remove stopwords
    df['clean'] = (
        df[column]
        .apply(basic_clean)
        .apply(tokenize)
        .apply(
            remove_stopwords,
            extra_words=extra_words,
            exclude_words=exclude_words,
        )
    )

    # stem the already-cleaned text
    df['stemmed'] = df['clean'].apply(stem)

    # lemmatize the already-cleaned text
    df['lemmatized'] = df['clean'].apply(lemmatize)

    return df[['title', 'original', 'clean', 'stemmed', 'lemmatized']]

### 9. Ask yourself:
#### - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
#### - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
#### - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

- lemmatized
- stemmed
- stemmed