In [1]:
import acquire
import pandas as pd
import unicodedata
import re
import json
import nltk
#nltk.download()
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


### Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(string):
    string = string.lower()
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # remove anything that is not a through z, a number, a single quote, or whitespace
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    return string

In [21]:
sample = 'My name is Christopher John Francis Boone. I know all the countries of the world and their capital cities and every prime number up to 7,057.'
sample = basic_clean(sample)
sample

'my name is christopher john francis boone i know all the countries of the world and their capital cities and every prime number up to 7057'

### Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [4]:
def tokenize(string):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(string, return_str=True)

In [5]:
tokenize(sample)

'my name is christopher john francis boone i know all the countries of the world and their capital cities and every prime number up to 7057'

### Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [26]:
def stem(string):
    # Create the nltk stemmer object, then use it
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    return ' '.join(stems)
    

In [27]:
stem(sample)

'my name is christoph john franci boon i know all the countri of the world and their capit citi and everi prime number up to 7057'

### Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [28]:
def lemmatize(string):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    return ' '.join(lemmas)

In [29]:
lemmatize(sample)

'my name is christopher john francis boone i know all the country of the world and their capital city and every prime number up to 7057'

### Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [30]:
def remove_stopwords(string, extra_words = None, exclude_words = None):
    stopword_list = stopwords.words('english')
    if extra_words != None:
        stopword_list = stopword_list.append(extrawords)
    if exclude_words != None:
        stopword_list = stopword_list.remove(exclude_words)
    words = string.split()
    filtered_words = [word for word in words if word not in stopword_list]
    return ' '.join(filtered_words)

In [31]:
remove_stopwords(sample)

'name christopher john francis boone know countries world capital cities every prime number 7057'

### Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [32]:
news_df = acquire.get_news_articles_data()

In [33]:
news_df.head()

Unnamed: 0,title,content,category
0,Veteran Bollywood singer Bhupinder Singh passe...,Veteran Bollywood playback and ghazal singer B...,national
1,West can't isolate Russia & reverse its develo...,Russian President Vladimir Putin said on Monda...,national
2,CBI arrests 8 persons for alleged malpractices...,The Central Bureau of Investigation (CBI) arre...,national
3,"If I get 20 minutes with Virat Kohli, I might ...","Amid Virat Kohli's poor form, Sunil Gavaskar s...",national
4,Report claims Ronaldo to join Sporting CP as c...,Manchester United forward Cristiano Ronaldo ha...,national


### Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [34]:
codeup_df = acquire.get_blog_articles_data()

In [35]:
codeup_df

Unnamed: 0,title,content
0,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
1,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
2,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...
3,5 Reasons To Attend Our New Cloud Administrati...,Come Work In The Cloud\nWhen your Monday rolls...
4,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
5,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
6,In-Person Workshop: Learn to Code – JavaScript...,Join us for our live in-person JavaScript cras...
7,In-Person Workshop: Learn to Code – Python on ...,"According to LinkedIn, the “#1 Most Promising ..."
8,Free JavaScript Workshop at Codeup Dallas on 6/28,Event Info: \nLocation – Codeup Dallas\nTime –...
9,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...


### For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [36]:
news_df.rename({'content':'original'}, axis=1, inplace=True)

In [37]:
news_df['clean'] = news_df.original.apply(basic_clean)

In [38]:
news_df['clean'] = news_df.clean.apply(tokenize)

In [39]:
news_df['stemmed'] = news_df.clean.apply(stem)

In [40]:
news_df['lemmatized'] = news_df.clean.apply(lemmatize)

In [41]:
news_df

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,Veteran Bollywood singer Bhupinder Singh passe...,Veteran Bollywood playback and ghazal singer B...,national,veteran bollywood playback and ghazal singer b...,veteran bollywood playback and ghazal singer b...,veteran bollywood playback and ghazal singer b...
1,West can't isolate Russia & reverse its develo...,Russian President Vladimir Putin said on Monda...,national,russian president vladimir putin said on monda...,russian presid vladimir putin said on monday t...,russian president vladimir putin said on monda...
2,CBI arrests 8 persons for alleged malpractices...,The Central Bureau of Investigation (CBI) arre...,national,the central bureau of investigation cbi arrest...,the central bureau of investig cbi arrest eigh...,the central bureau of investigation cbi arrest...
3,"If I get 20 minutes with Virat Kohli, I might ...","Amid Virat Kohli's poor form, Sunil Gavaskar s...",national,amid virat kohli ' s poor form sunil gavaskar ...,amid virat kohli ' s poor form sunil gavaskar ...,amid virat kohli ' s poor form sunil gavaskar ...
4,Report claims Ronaldo to join Sporting CP as c...,Manchester United forward Cristiano Ronaldo ha...,national,manchester united forward cristiano ronaldo ha...,manchest unit forward cristiano ronaldo ha bra...,manchester united forward cristiano ronaldo ha...
...,...,...,...,...,...,...
295,Terror hideout busted by security forces in J&...,Security forces busted a terror hideout and se...,automobile,security forces busted a terror hideout and se...,secur forc bust a terror hideout and seiz a ca...,security force busted a terror hideout and sei...
296,J&K LG announces 10% reservation for 'Agniveer...,J&K LG Manoj Sinha on Sunday announced a 10% r...,automobile,jk lg manoj sinha on sunday announced a 10 res...,jk lg manoj sinha on sunday announc a 10 reser...,jk lg manoj sinha on sunday announced a 10 res...
297,Nearly half of EU territory at drought risk as...,Nearly half of EU's territory is at risk of dr...,automobile,nearly half of eu ' s territory is at risk of ...,nearli half of eu ' s territori is at risk of ...,nearly half of eu ' s territory is at risk of ...
298,"4 people shot at nightclub in Spain, suspected...",Four people were shot and one other was stabbe...,automobile,four people were shot and one other was stabbe...,four peopl were shot and one other wa stab wit...,four people were shot and one other wa stabbed...


In [42]:
codeup_df.rename({'content':'original'}, axis=1, inplace=True)

In [43]:
codeup_df['clean'] = codeup_df.original.apply(basic_clean)

In [44]:
codeup_df['clean'] = codeup_df.clean.apply(tokenize)

In [45]:
codeup_df['stemmed'] = codeup_df.clean.apply(stem)

In [46]:
codeup_df['lematized'] = codeup_df.clean.apply(lemmatize)

In [47]:
codeup_df

Unnamed: 0,title,original,clean,stemmed,lematized
0,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...,have you been considering a career in cloud ad...,have you been consid a career in cloud adminis...,have you been considering a career in cloud ad...
1,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...,if you are interested in embarking on a career...,if you are interest in embark on a career in t...,if you are interested in embarking on a career...
2,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...,changing careers can be scary the first thing ...,chang career can be scari the first thing you ...,changing career can be scary the first thing y...
3,5 Reasons To Attend Our New Cloud Administrati...,Come Work In The Cloud\nWhen your Monday rolls...,come work in the cloud\nwhen your monday rolls...,come work in the cloud when your monday roll a...,come work in the cloud when your monday roll a...
4,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...,have you been considering a career in cloud ad...,have you been consid a career in cloud adminis...,have you been considering a career in cloud ad...
5,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...,if you are interested in embarking on a career...,if you are interest in embark on a career in t...,if you are interested in embarking on a career...
6,In-Person Workshop: Learn to Code – JavaScript...,Join us for our live in-person JavaScript cras...,join us for our live inperson javascript crash...,join us for our live inperson javascript crash...,join u for our live inperson javascript crash ...
7,In-Person Workshop: Learn to Code – Python on ...,"According to LinkedIn, the “#1 Most Promising ...",according to linkedin the 1 most promising job...,accord to linkedin the 1 most promis job is da...,according to linkedin the 1 most promising job...
8,Free JavaScript Workshop at Codeup Dallas on 6/28,Event Info: \nLocation – Codeup Dallas\nTime –...,event info \nlocation codeup dallas\ntime 6 pm...,event info locat codeup dalla time 6 pm come l...,event info location codeup dallas time 6 pm co...
9,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...,changing careers can be scary the first thing ...,chang career can be scari the first thing you ...,changing career can be scary the first thing y...


### Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text? **Lemmatized**
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text? **Lemmatized**
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text? **Stemmed**