In [1]:
import unicodedata
import re
import json
from functools import reduce
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
import acquire_codeup_blog

Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:

* lowercase everything
* normalize unicode characters
* replace anything that is not a letter, number, whitespace or a single quote

In [2]:
with open('docs/codeup_blog_articles.json') as f:
    original = json.load(f)

In [3]:
article = original[0]['Article']

In [4]:
article = article.lower()

In [5]:
def basic_clean(text):
    return pipe(text, lowercase_text, normalize_text, remove_special)
    
def lowercase_text(text):    
    return text.lower()

def normalize_text(text):
    return unicodedata.normalize('NFKD', article)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')

def remove_special(text):
    return re.sub(r'[^\w\s]', ' ', text)

In [6]:
def pipe(v, *fns):
    return reduce(lambda x, f: f(x), fns, v)

Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.

In [7]:
def tokenize(text):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(text, return_str=True)

Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [8]:
def stem(text):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in text.split()]
    return ' '.join(stems)

Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [9]:
def lemmatize(text):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    return ' '.join(lemmas)

Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [10]:
def remove_stopwords(text, include=[], exclude=[]):
    stopword_list = stopwords.words('english')
    # add in new stopwords
    stopword_list.extend(include)
    # remove stopwords
    for word in exclude:
        stopword_list.remove(word)
    
    words = text.split()
    filtered_words = [w for w in words if w not in stopword_list]
    print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    print('Added {} stopwords'.format(len(include)))
    print('---')
    return ' '.join(filtered_words)

Define a function named `prep_article` that takes in the dictionary representing an article and returns a dictionary that looks like this:
```
{
    'title': 'the original title'.
    'original': original,
    'stemmed': article_stemmed,
    'lemmatized': article_lemmatized,
    'clean': article_without_stopwords
}
```

Note that if the orignal dictionary has a title property, it should remain unchanged (same goes for the `category` property).

In [14]:
def prep_article(dictionary):
    
    clean_stem = pipe(dictionary['Article'],
                      basic_clean,
                      tokenize,
                      remove_stopwords,
                      stem)
    
    clean_lemm = pipe(dictionary['Article'],
                      basic_clean,
                      tokenize,
                      remove_stopwords,
                      lemmatize)
       
    new_dict = {
        'title': dictionary['Title'].lower(),
        'category': dictionary['Category'].lower(),
        'original': dictionary['Article'].lower(),
        'stemmed': stem(dictionary['Article']),
        'lemmatized': lemmatize(dictionary['Article']),
        'clean_stem': clean_stem,
        'clean_lemm': clean_lemm
    }
    return new_dict

In [15]:
def prep_article_data(dict_list):
    cleaned_dict_list = [prep_article(d) for d in dict_list]
    return cleaned_dict_list

In [16]:
full_list = prep_article_data(original)

Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 stopwords
---
Removed 663 stopwords
Added 0 st

In [19]:
full_list

[{'title': '5-common-excuses-keeping-you-from-breaking-into-the-tech-field',
  'category': '\xa0in uncategorized',
  'original': '\njust a few months before starting at codeup in the redwood cohort, i was sitting in the football stadium at the university of colorado at boulder, pondering what i would do after graduation. the commencement speaker that year was kate fagan, a sports reporter and commentator at espn. in her speech, something she said stuck out to me: “try replacing ‘should’ with ‘want’ and, as frequently as you are able, make decisions with that rubric. life is best when your ‘should’ and your ‘want’ are aligned.” sitting there in that stadium, i realized that i knew exactly what i should be doing after graduating, which was applying to attend graduate school for the next five years. but the actual truth was, i didn’t know what i truly wanted. did i really want to jump into something for five years that i wasn’t completely sure about? \nwith this in mind i moved to san ant