In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [3]:
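# One-time setup: uncomment and run once to download the NLTK data (e.g. stopwords, WordNet) used below.
# nltk.download('all')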

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

    - Lowercase everything
    - Normalize unicode characters
    - Remove anything that is not a letter, number, whitespace, or a single quote (i.e. replace it with an empty string).

In [4]:
def basic_clean(string):
    """Apply basic normalization to a piece of text.

    Steps, in order:
      1. Lowercase the text.
      2. NFKD-normalize it and drop every character that cannot be encoded
         as ASCII (accented letters keep their base letter).
      3. Remove every character that is not a-z, 0-9, a single quote, or
         whitespace.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = string.lower()

    # NFKD splits an accented character into base letter + combining mark;
    # encoding to ASCII with 'ignore' then discards whatever is not ASCII.
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8')

    return re.sub(r'[^a-z0-9\'\s]', '', ascii_text)

In [6]:
# Quick check of basic_clean on a single name.
# NOTE(review): the recorded output ('angarita') contains an 'i' that this
# literal lacks — the input probably lost a character in export; confirm.
basic_clean('Angarta')

'angarita'

In [7]:
# Check that an accented letter is folded to its plain ASCII base letter.
basic_clean('Angaríta')

'angarita'

In [12]:
# Check that punctuation ('!') is stripped while single quotes are kept.
basic_clean("Angaríta '!s'")

"angarita 's'"

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [13]:
def tokenize(string):
    """Tokenize a string with NLTK's ToktokTokenizer.

    Parameters
    ----------
    string : str
        Text to tokenize.

    Returns
    -------
    The value returned by ``ToktokTokenizer.tokenize`` — a list of token
    strings, as in the recorded example output.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string)

In [15]:
# Example: a plain lowercase sentence should yield one token per word.
tokenize('hello my name is nico')

['hello', 'my', 'name', 'is', 'nico']

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [22]:
def stem(text):
    """Return ``text`` with every word replaced by its Porter stem.

    The text is split on whitespace, each piece is stemmed, and the stems
    are joined back together with single spaces (so runs of whitespace in
    the input collapse to one space).

    Parameters
    ----------
    text : str
        Text whose words should be stemmed.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    # Use the documented ``nltk.stem`` path. ``nltk.porter`` is not a
    # documented module path (it appears to resolve only through NLTK's
    # top-level re-exports), so it is fragile to rely on.
    stemmer = nltk.stem.PorterStemmer()

    return ' '.join(stemmer.stem(word) for word in text.split())

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [27]:
def lemmatize(text):
    """Return ``text`` with every word replaced by its WordNet lemma.

    The text is split on whitespace, each piece is passed to
    ``WordNetLemmatizer.lemmatize`` (word only, so the lemmatizer's default
    part of speech applies), and the results are joined with single spaces.

    Parameters
    ----------
    text : str
        Text whose words should be lemmatized.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

 - This function should define two optional parameters, extra_words and exclude_words.
 - These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [36]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """Remove English stopwords from whitespace-separated text.

    Matching is exact and case-sensitive against NLTK's English stopword
    list, and words are split on whitespace only. Capitalized words or
    words with attached punctuation therefore do not match, so clean and
    lowercase the text before calling this function.

    Parameters
    ----------
    string : str
        Text to filter.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Stopwords that should NOT be removed.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # ``None`` sentinels replace the mutable ``[]`` defaults so no list
    # object is shared between calls.
    extra = set() if extra_words is None else set(extra_words)
    excluded = set() if exclude_words is None else set(exclude_words)

    # Exclusions are applied before additions, so a word present in both
    # arguments is still treated as a stopword.
    stopword_set = (set(stopwords.words('english')) - excluded) | extra

    return ' '.join(word for word in string.split() if word not in stopword_set)

In [41]:
# Demo on raw (uncleaned) text. Matching is exact on whitespace-split words,
# so in the recorded output the capitalized 'I' is kept and 'dollars,' keeps
# its comma — run basic_clean first for the intended result.
remove_stopwords('I would like a 1 million dollars, they have too much money')

'I would like 1 million dollars, much money'

### 6. Use your data from the acquire module to produce a dataframe of the news articles. Name the dataframe news_df.

In [43]:
# NOTE(review): the recorded output below is the function's repr, and its
# signature shows a required `topic_list` parameter — calling with no
# arguments will presumably raise TypeError. The exercise also asks for the
# result to be stored as `news_df`; pass the topics and assign the result.
# TODO confirm the expected topic values and return type in acquire.py.
acquire.get_news_articles()

<function acquire.get_news_articles(topic_list)>

### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

### 8. For each dataframe, produce the following columns:

    - title to hold the title
    - original to hold the original article/post content
    - clean to hold the normalized and tokenized original with the stopwords removed.
    - stemmed to hold the stemmed version of the cleaned data.
    - lemmatized to hold the lemmatized version of the cleaned data.

### 9. Ask yourself:

    - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?