In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [29]:
def basic_clean(unclean):
    """Normalize raw text into lowercase ASCII words separated by single spaces.

    The steps run in this order: trim surrounding whitespace, lowercase,
    turn hyphens into spaces, NFKD-normalize and drop every character that
    cannot be encoded as ASCII, delete every character that is neither a
    word character nor whitespace, and collapse runs of whitespace.

    Parameters
    ----------
    unclean : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = unclean.strip().lower()
    dehyphenated = lowered.replace('-', ' ')
    # NFKD splits an accented letter into base letter + combining mark, so the
    # ASCII round-trip below keeps the base letter and discards the mark.
    decomposed = unicodedata.normalize('NFKD', dehyphenated)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_only)
    return ' '.join(without_punctuation.split())

In [8]:
# Fetch the blog articles through the local `acquire` module.
articles = acquire.get_blog_articles()

In [9]:
# Take the 'content' field of the first blog article and display it.
article1 = articles[0]['content']
article1

'\nThe rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in\xa0Glassdoor’s #1 Best Job in America.\nData Science is a method of providing actionable intelligence from data.\xa0The data revolution has hit San Antonio,\xa0resulting in an explosion in Data Scientist positions\xa0across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen\xa0UTSA invest $70 M for a Cybersecurity Center and School of Data Science.\xa0We built a program to specifically meet the growing demands of this industry.\nOur program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspace, along with input from dozens of practitioners and hiring partners. Stude

In [15]:
# Run the first blog article's text through basic_clean.
basic_clean_a1 = basic_clean(article1)

In [16]:
# NOTE(review): this cell's execution count (16) is lower than that of the
# current basic_clean definition (29); re-run in order to refresh the output.
basic_clean_a1

'the rumors are true the time has arrived codeup has officially opened applications to our new data science career accelerator with only 25 seats available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in america data science is a method of providing actionable intelligence from data the data revolution has hit san antonio resulting in an explosion in data scientist positions across companies like usaa accenture booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecurity center and school of data science we built a program to specifically meet the growing demands of this industry our program will be 18 weeks long fulltime handson and projectbased our curriculum development and instruction is led by senior data scientist maggie giust who has worked at heb capital group and rackspace along with input from dozens of practitioners and hiring partners students will work with real data sets realistic problems 

In [30]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# import cell at the top so all dependencies are visible in one place.
import acquire_zach
# Fetch the news articles through the local `acquire_zach` module.
news = acquire_zach.get_news_articles()

In [31]:
# Take the 'content' field of the first news article.
news1 = news[0]['content']

In [32]:
# NOTE(review): overwrites the raw text held in news1 with its cleaned form,
# so the original string is only recoverable by re-running the cell above.
news1 = basic_clean(news1)

In [33]:
# Display the cleaned text of the first news article.
news1

'after worlds fourth richest person warren buffett claimed carmaker tesla cannot sell insurance teslas ceo elon musk responded with a series of tweets on sunday in his first tweet musk urged users to buy tesla model 3 claiming its cheaper than a honda accord also 1000 times more fun and thats a direct quote from warren buffett his next tweets read'

In [26]:
def tokenize(some_text):
    """Run NLTK's ``ToktokTokenizer`` over ``some_text``.

    ``return_str=True`` is passed to the tokenizer, so the tokenized text is
    handed back as one string rather than as a list of tokens.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(some_text, return_str=True)

In [28]:
# Tokenize the cleaned first article and the raw second article.
tokenized_news = tokenize(news1)
# NOTE(review): a bare expression that is not the last line of a cell is not
# displayed, so only tokenized_news2 shows up in this cell's output.
tokenized_news
tokenized_news2 = tokenize(news[1]['content'])
tokenized_news2

"Gujarat-based diamond trader Savji Dholakia , who ' s known for gifting cars to his employees , has said he ' ll remove a ' kutcha ' road built by him across the dry Narmada river to reach his farmhouse. This comes after government officials visited the spot on learning about the illegal structure. Meanwhile , Dholakia said his only intention was to enable people to reach the island ."

In [34]:
def stem(some_text):
    """Stem every whitespace-separated word of ``some_text``.

    Each word is passed through NLTK's ``PorterStemmer`` and the resulting
    stems are re-joined with single spaces.
    """
    porter = nltk.stem.PorterStemmer()
    return ' '.join(map(porter.stem, some_text.split()))

In [35]:
def lemmatize(some_text):
    """Lemmatize every whitespace-separated word of ``some_text``.

    Each word is passed to NLTK's ``WordNetLemmatizer.lemmatize`` (called
    without a part-of-speech argument) and the results are re-joined with
    single spaces.
    """
    wordnet = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for token in some_text.split():
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

In [45]:
def remove_stopwords(some_text, extra_words = [], exclude_words = []):
    """Drop English stopwords from ``some_text``.

    The stopword set starts from NLTK's English stopword list, is extended
    with ``extra_words`` and then has ``exclude_words`` taken out, so a word
    listed in both ends up being kept in the text.

    Parameters
    ----------
    some_text : str
        Text whose whitespace-separated words are filtered.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that must not be treated as stopwords.

    Returns
    -------
    str
        The remaining words joined with single spaces.
    """
    # The default lists are only read, never mutated, so sharing them between
    # calls is harmless.
    # A set gives constant-time membership checks for every token and avoids
    # building throwaway lists just to append/remove entries.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words)
    stopword_set.difference_update(exclude_words)
    kept_words = [word for word in some_text.split() if word not in stopword_set]
    return ' '.join(kept_words)

In [46]:
# NOTE(review): overwrites news1 again, this time with its stopwords removed.
news1 = remove_stopwords(news1)

In [47]:
# Display the text after stopword removal.
news1

'worlds fourth richest person warren buffett claimed carmaker tesla cannot sell insurance teslas ceo elon musk responded series tweets sunday first tweet musk urged users buy tesla model 3 claiming cheaper honda accord also 1000 times fun thats direct quote warren buffett next tweets read'

In [51]:
# Collect the key names of the first news article dictionary.
list_of_keys = list(news[0].keys())

In [52]:
# Display the collected key names.
list_of_keys

['title', 'content', 'category']

In [57]:
# Each key needs its own membership test: `'title' and 'content' in keys`
# reduces to `'content' in keys`, because a non-empty string literal is
# always truthy.
if 'title' in list_of_keys and 'content' in list_of_keys:
    print('yes!')

yes!


In [87]:
def prep_article(d_content, extra_words = [], exclude_words = []):
    """Build the cleaned, stemmed and lemmatized variants of one article.

    Parameters
    ----------
    d_content : dict
        Article record; must contain both a 'title' and a 'content' key.
    extra_words : list of str, optional
        Forwarded to ``remove_stopwords`` as additional stopwords.
    exclude_words : list of str, optional
        Forwarded to ``remove_stopwords`` as words to keep.

    Returns
    -------
    dict or None
        A dict with the keys 'title', 'original', 'stemmed', 'lemmatized'
        and 'clean'. When a required key is missing, a message is printed
        and None is returned.
    """
    keys = d_content.keys()
    # Each key needs its own membership test: `'title' and 'content' in keys`
    # reduces to `'content' in keys`, which let title-less dicts through and
    # then failed with a KeyError while building the result.
    if 'title' not in keys or 'content' not in keys:
        print('~~~Function requires a dictionary with a \'title\' and \'content\' key.~~~')
        return None

    cleaned = basic_clean(d_content['content'])
    tokened = tokenize(cleaned)
    return {
        'title': d_content['title'],
        'original': d_content['content'],
        'stemmed': stem(tokened),
        'lemmatized': lemmatize(tokened),
        'clean': remove_stopwords(tokened, extra_words, exclude_words)
    }
              

In [89]:
# Call prep_article with a dict that has neither required key: the function
# prints its message and returns nothing, so preppedtest ends up as None.
preppedtest = prep_article({'hi': 'this isnt right'})

Function requires a dictionary with a 'title' and 'content' key.


In [85]:
# NOTE(review): this cell's execution count (85) precedes the assignment cell
# above (89), so the output shown reflects an earlier value of preppedtest;
# re-run the notebook in order to refresh it.
preppedtest

{'title': "I'll never hesitate to fly on 737 MAX: Buffett on crashes killing 346",
 'original': 'World\'s fourth-richest person, Warren Buffett, has said that he will "never hesitate even for a second to fly on a Boeing 737 MAX plane" which was involved in two crashes that claimed the lives of 346 people. He was responding to a question about damage to Boeing\'s reputation after the crashes. "Planes have never been so safe," Buffett added.',
 'stemmed': 'world fourth richest person warren buffett ha said that he will never hesit even for a second to fli on a boe 737 max plane which wa involv in two crash that claim the live of 346 peopl he wa respond to a question about damag to boe reput after the crash plane have never been so safe buffett ad',
 'lemmatized': 'world fourth richest person warren buffett ha said that he will never hesitate even for a second to fly on a boeing 737 max plane which wa involved in two crash that claimed the life of 346 people he wa responding to a question

In [90]:
def prepare_article_data(list_o_dicts):
    """Apply ``prep_article`` to every article dictionary, in order.

    Returns a list holding one ``prep_article`` result per input element.
    """
    prepared = []
    for article in list_o_dicts:
        prepared.append(prep_article(article))
    return prepared

In [91]:
# Run prep_article over every news article.
prepped_news = prepare_article_data(news)

In [92]:
# Inspect the prepared form of the fourth news article.
prepped_news[3]

{'title': 'Infosys makes Hyd staff pay for parking, activists call it illegal',
 'original': "Activists in Hyderabad have claimed that Infosys deducting parking charges from employees' salary is illegal. Infosys charges ₹500 for four-wheelers and ₹250 for two-wheelers per month to park on the campus located in the Pocharam Special Economic Zone in city's outskirts. The management reportedly responded to employee complaints by saying the fee was being collected to maintain the parking lot.",
 'stemmed': 'activist in hyderabad have claim that infosi deduct park charg from employe salari is illeg infosi charg 500 for four wheeler and 250 for two wheeler per month to park on the campu locat in the pocharam special econom zone in citi outskirt the manag reportedli respond to employe complaint by say the fee wa be collect to maintain the park lot',
 'lemmatized': 'activist in hyderabad have claimed that infosys deducting parking charge from employee salary is illegal infosys charge 500 for f