In [8]:
# unicode, regex, json for text digestion
import unicodedata
import re
import json

# nltk: natural language toolkit -> tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# pandas dataframe manipulation, acquire script, time formatting
import pandas as pd
import acquire as a
from time import strftime

# shh, down in front
import warnings
warnings.filterwarnings('ignore')

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote.

In [37]:
def basic_clean(string):
    '''
    Takes in a string, normalizes unicode characters, makes
    everything lowercase, and removes anything that isn't a
    letter, number, whitespace or single quote.

    Returns the cleaned string.
    '''
    # NFKD splits accented characters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then drops the marks (and any other
    # non-ASCII character), and decoding turns the bytes back into a string
    normalized = (
        unicodedata.normalize('NFKD', string)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower()
    )

    # keep only a-z, digits, single quotes and whitespace; an explicit a-z
    # range is used instead of \w because \w would also let underscores through
    return re.sub(r"[^a-z0-9'\s]", '', normalized)

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [19]:
def tokenize(string):
    '''
    Takes in a string, runs it through NLTK's ToktokTokenizer,
    and returns the tokenizer's string output (return_str=True).
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [20]:
def stem(string):
    '''
    Takes in a string, applies Porter stemming to every
    whitespace-separated word, and returns the stems joined
    back together with single spaces.
    '''
    # reach the stemmer through nltk.stem, the same package lemmatize uses
    # for WordNetLemmatizer, rather than the nltk.porter shortcut
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in string.split())

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [21]:
def lemmatize(string):
    '''
    Takes in a string, lemmatizes every whitespace-separated word
    with NLTK's WordNetLemmatizer (called with its default arguments),
    and returns the lemmas joined back together with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [22]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    Takes in a string and returns it with the NLTK English stopwords removed.

    extra_words: optional word or collection of words to treat as
        additional stopwords (default None: add nothing)
    exclude_words: optional word or collection of words that should NOT
        be removed even though they are stopwords (default None: keep the
        full stopword list)

    Exclusions are applied before additions, so a word passed in both
    parameters is still removed. Matching is exact (case-sensitive) on
    whitespace-separated words.
    '''
    def _as_set(words):
        # None means "no words"; a bare string is one word, not a bag of
        # characters (set('the') would otherwise become {'t', 'h', 'e'})
        if words is None:
            return set()
        if isinstance(words, str):
            return {words}
        return set(words)

    # start from nltk's english list, drop the exclusions, add the extras
    stopword_set = set(stopwords.words('english')) - _as_set(exclude_words)
    stopword_set |= _as_set(extra_words)

    # keep every word that is not a stopword and glue the rest back together
    return ' '.join(word for word in string.split() if word not in stopword_set)

### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [38]:
# load the news articles through the acquire script (imported above as `a`)
news_df = a.get_news_articles_data()

### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df (this notebook uses the name `blog_df` instead).

In [13]:
# load the blog posts through the acquire script (imported above as `a`);
# the rest of this notebook refers to this frame as blog_df
blog_df = a.get_blog_articles_data()

### 8. For each dataframe, produce the following columns:

- ```title``` to hold the title
- ```original``` to hold the original article/post content
- ```clean``` to hold the normalized and tokenized original with the stopwords removed.
- ```stemmed``` to hold the stemmed version of the cleaned data.
- ```lemmatized``` to hold the lemmatized version of the cleaned data.

In [39]:
# rename the raw text column to 'original'; rebind each name to the renamed
# frame instead of using inplace=True so the frames produced by the load
# cells are not mutated behind the reader's back
news_df = news_df.rename(columns={'content': 'original'})
blog_df = blog_df.rename(columns={'content': 'original'})

In [40]:
# display the news frame to check the column names after the rename
news_df

Unnamed: 0,title,original,category
0,"'Mr India' cinematographer Peter Pereira, who ...",Veteran cinematographer and special effects pr...,national
1,Ashneer promises Mercedes cars to staff on com...,BharatPe's former Managing Director Ashneer Gr...,national
2,"Woman, fiance beaten up by bouncers at Gurugra...",A woman and her fiance suffered injuries after...,national
3,"NOCs of over 2,000 CBSE schools in Maha to be ...",Maharashtra Education Commissioner Suraj Mandh...,national
4,"When coach asked me to pad up vs Pak, the feel...",Indian hockey goalkeeper PR Sreejesh recalled ...,national
...,...,...,...
295,German Foreign Minister Annalena Baerbock visi...,German Foreign Minister Annalena Baerbock on T...,automobile
296,24 aid workers go on trial in Greece for 'faci...,The trial of 24 aid workers who helped rescue ...,automobile
297,Court acquits Austria's ex-Vice Chancellor in ...,A court in Austria on Tuesday acquitted the co...,automobile
298,Fed has only limited role to play on climate c...,US Federal Reserve Chair Jerome Powell said th...,automobile


In [41]:
# build the 'clean' column: normalize the original text, tokenize it,
# then drop the default english stopwords
news_df['clean'] = (
    news_df['original']
    .apply(basic_clean)
    .apply(tokenize)
    .apply(remove_stopwords)
)

In [42]:
# stem the already-cleaned text (not the original) into its own column
news_df['stemmed'] = news_df['clean'].apply(stem)

In [43]:
# lemmatize the already-cleaned text (not the stemmed text) into its own column
news_df['lemmatized'] = news_df['clean'].apply(lemmatize)

In [44]:
# display the frame again to compare the clean, stemmed and lemmatized columns
news_df

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,"'Mr India' cinematographer Peter Pereira, who ...",Veteran cinematographer and special effects pr...,national,veteran cinematographer special effects provid...,veteran cinematograph special effect provid pe...,veteran cinematographer special effect provide...
1,Ashneer promises Mercedes cars to staff on com...,BharatPe's former Managing Director Ashneer Gr...,national,bharatpe ' former managing director ashneer gr...,bharatp ' former manag director ashneer grover...,bharatpe ' former managing director ashneer gr...
2,"Woman, fiance beaten up by bouncers at Gurugra...",A woman and her fiance suffered injuries after...,national,woman fiance suffered injuries allegedly beate...,woman fianc suffer injuri allegedli beaten bou...,woman fiance suffered injury allegedly beaten ...
3,"NOCs of over 2,000 CBSE schools in Maha to be ...",Maharashtra Education Commissioner Suraj Mandh...,national,maharashtra education commissioner suraj mandh...,maharashtra educ commission suraj mandhar tues...,maharashtra education commissioner suraj mandh...
4,"When coach asked me to pad up vs Pak, the feel...",Indian hockey goalkeeper PR Sreejesh recalled ...,national,indian hockey goalkeeper pr sreejesh recalled ...,indian hockey goalkeep pr sreejesh recal first...,indian hockey goalkeeper pr sreejesh recalled ...
...,...,...,...,...,...,...
295,German Foreign Minister Annalena Baerbock visi...,German Foreign Minister Annalena Baerbock on T...,automobile,german foreign minister annalena baerbock tues...,german foreign minist annalena baerbock tuesda...,german foreign minister annalena baerbock tues...
296,24 aid workers go on trial in Greece for 'faci...,The trial of 24 aid workers who helped rescue ...,automobile,trial 24 aid workers helped rescue migrants co...,trial 24 aid worker help rescu migrant coast g...,trial 24 aid worker helped rescue migrant coas...
297,Court acquits Austria's ex-Vice Chancellor in ...,A court in Austria on Tuesday acquitted the co...,automobile,court austria tuesday acquitted country ' exvi...,court austria tuesday acquit countri ' exvic c...,court austria tuesday acquitted country ' exvi...
298,Fed has only limited role to play on climate c...,US Federal Reserve Chair Jerome Powell said th...,automobile,us federal reserve chair jerome powell said fe...,us feder reserv chair jerom powel said fed lim...,u federal reserve chair jerome powell said fed...


In [45]:
def prep_article_data(df, column, extra_words=None, exclude_words=None):
    '''
    Takes in a df and the string name of a text column, with the option
    to pass extra_words and exclude_words for stopword handling, and
    returns a new df holding:

    - title: the article title (df must already have a 'title' column)
    - column: the original text
    - clean: the text after basic_clean, tokenize and remove_stopwords
    - stemmed: the stemmed version of clean
    - lemmatized: the lemmatized version of clean

    The df that is passed in is left untouched; the new columns only
    exist on the returned frame.
    '''
    # None stands in for "no words" so no list object is shared between calls
    extra_words = [] if extra_words is None else extra_words
    exclude_words = [] if exclude_words is None else exclude_words

    clean = (
        df[column]
        .apply(basic_clean)
        .apply(tokenize)
        .apply(remove_stopwords,
               extra_words=extra_words,
               exclude_words=exclude_words)
    )

    # assign returns a new frame, so the caller's df does not gain columns
    prepped = df.assign(
        clean=clean,
        stemmed=clean.apply(stem),
        lemmatized=clean.apply(lemmatize),
    )

    return prepped[['title', column, 'clean', 'stemmed', 'lemmatized']]

### 9. Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text? **lemmatized**
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text? **lemmatized**
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text? **stemmed**