In [1]:
# from __future__ import division
import itertools

# To get rid of those blocks of red warnings
import warnings
warnings.filterwarnings("ignore")

# Standard Imports
import numpy as np
from scipy import stats
import pandas as pd
from math import sqrt
import os
from scipy.stats import spearmanr
from sklearn import metrics
from random import randint


# Vis Imports
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import plotly.express as px
from pandas.plotting import register_matplotlib_converters
from mpl_toolkits.mplot3d import Axes3D

# Modeling Imports
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import sklearn.preprocessing
import statsmodels.api as sm
from sklearn.cluster import DBSCAN

# NLP Imports
import unicodedata
import re
import json
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# Custom Module Imports
import env
import acquire

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

- This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [99]:
url = 'https://codeup.com/blog/'

In [100]:
original = acquire.get_blog_articles(url)

In [4]:
original

Unnamed: 0,title,content
0,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...
1,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...
2,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio..."
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...
5,2022 SABJ C-Suite Award Winner: Stephen Noteboom,"Codeup’s Chief Operating Officer, Stephen Note..."


In [85]:
def basic_clean(original):
    """
    Apply basic text cleaning to a string.

    The text is lowercased, unicode characters are normalized to their closest
    ASCII form (characters with no ASCII form are dropped), and every character
    that is not a lowercase letter, digit, single quote or whitespace is removed.

    Parameters
    ----------
    original : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace, including newlines, is left as it was.
    """
    lowered = original.lower()
    #decompose accented characters, then drop whatever is still outside ascii
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8')
    #strip everything except letters, digits, single quotes and whitespace
    return re.sub(r'[^a-z0-9\'\s]', '', ascii_text)

In [39]:
def tokenize(article):
    """
    Tokenize a string with NLTK's ToktokTokenizer.

    Parameters
    ----------
    article : str
        The text to tokenize.

    Returns
    -------
    str
        The tokenized text as a single string (return_str=True), not a list.
    """
    tokenizer = nltk.tokenize.ToktokTokenizer()
    #return_str=True hands back one string rather than a list of tokens
    return tokenizer.tokenize(article, return_str=True)

In [40]:
def stem(article):
    """
    Apply Porter stemming to every whitespace-separated word in a string.

    Parameters
    ----------
    article : str
        The text to stem.

    Returns
    -------
    str
        The stemmed words joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    #stem word by word, then rebuild the string
    return ' '.join(map(stemmer.stem, article.split()))

In [97]:
def lemmatize(article):
    """
    Apply WordNet lemmatization to every whitespace-separated word in a string.

    Parameters
    ----------
    article : str
        The text to lemmatize.

    Returns
    -------
    str
        The lemmatized words joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    #lemmatize word by word, then rebuild the string
    return ' '.join(map(lemmatizer.lemmatize, article.split()))

In [80]:
def remove_stopwords(article, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a string.

    Parameters
    ----------
    article : str
        The text to filter. Words are matched exactly (case-sensitive), so the
        text should already be lowercased.
    extra_words : iterable of str or str, optional
        Additional words to treat as stopwords. A single string is split on
        whitespace.
    exclude_words : iterable of str or str, optional
        Words that should NOT be removed even though they are in the standard
        stopword list. A single string is split on whitespace.

    Returns
    -------
    str
        The remaining words joined back together with single spaces.
    """
    def _as_set(words):
        #accept None, a single whitespace-separated string, or any iterable of words
        if words is None:
            return set()
        if isinstance(words, str):
            return set(words.split())
        return set(words)

    #a set makes the membership test below constant-time per word
    stopword_set = set(stopwords.words('english'))
    stopword_set |= _as_set(extra_words)
    #exclusions are applied last so they win over both the base list and extra_words
    stopword_set -= _as_set(exclude_words)

    #remove stopwords and join the remaining words back together
    return ' '.join(word for word in article.split() if word not in stopword_set)

In [57]:
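# NOTE: superseded by the prepare_article defined in the next cell; kept only as a reference draft.
# This loop version overwrote clean/stemmed/lemmatized on every iteration, so each
# column ended up holding the last article's text in every row.
# def prepare_article(original):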
#     for x in original.content:
#         article = x.lower()
#         article = unicodedata.normalize('NFKD', article)\
#         .encode('ascii', 'ignore')\
#         .decode('utf-8')
#         #use re.sub to remove special characters
#         article = re.sub(r'[^a-z0-9\'\s]', '', article)
#         #create the tokenizer
#         tokenize = nltk.tokenize.ToktokTokenizer()
#         #use the tokenizer
#         article = tokenize.tokenize(article, return_str=True)
#         #save stopwords
#         stopwords_ls = stopwords.words('english')
#         words = article.split()
#         #remove stopwords from list of words
#         filtered_words = [word for word in words if word not in stopwords_ls]
#         #join words back together
#         clean = ' '.join(filtered_words)
#         #create porter stemmer
#         ps = nltk.porter.PorterStemmer()
#         stems = [ps.stem(word) for word in clean.split()]
#         #join words back together
#         stemmed = ' '.join(stems)
#         #create the lemmatizer
#         wnl = nltk.stem.WordNetLemmatizer()
#         lemmas = [wnl.lemmatize(word) for word in article.split()]
#         #join words back together
#         lemmatized = ' '.join(lemmas)
#     original.rename(columns = {'content':'original'}, inplace = True)
#     original['clean'] = clean
#     original['stemmed'] = stemmed
#     original['lemmatized'] = lemmatized
#     return original

In [107]:
def prepare_article(original):
    """
    Build the clean, stemmed and lemmatized text columns for a frame of articles.

    Parameters
    ----------
    original : pandas.DataFrame
        Must contain a 'content' column holding the raw article text.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with 'content' renamed to 'original' and three added
        columns:
        - clean: basic_clean -> tokenize -> remove_stopwords applied to the content
        - stemmed: stem applied to the clean text
        - lemmatized: lemmatize applied to the clean text

    The input dataframe is left untouched, so the function can be re-run on the
    same frame (the in-place rename used previously made a second call fail on
    the missing 'content' column).
    """
    #run the shared clean -> tokenize -> remove stopwords pipeline only once
    clean = original['content'].apply(basic_clean).apply(tokenize).apply(remove_stopwords)
    #rename returns a new dataframe, so the caller's frame keeps its 'content' column
    prepared = original.rename(columns={'content': 'original'})
    prepared['clean'] = clean
    prepared['stemmed'] = clean.apply(stem)
    prepared['lemmatized'] = clean.apply(lemmatize)
    return prepared

In [15]:
article = original.content[0].lower()

In [7]:
# nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/mph/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /Users/mph/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zi

[nltk_data]    |   Unzipping corpora/pl196x.zip.
[nltk_data]    | Downloading package porter_test to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    |   Unzipping stemmers/porter_test.zip.
[nltk_data]    | Downloading package ppattach to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    |   Unzipping corpora/ppattach.zip.
[nltk_data]    | Downloading package problem_reports to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    |   Unzipping corpora/problem_reports.zip.
[nltk_data]    | Downloading package product_reviews_1 to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    |   Unzipping corpora/product_reviews_1.zip.
[nltk_data]    | Downloading package product_reviews_2 to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    |   Unzipping corpora/product_reviews_2.zip.
[nltk_data]    | Downloading package propbank to
[nltk_data]    |     /Users/mph/nltk_data...
[nltk_data]    | Downloading package pros_cons to
[nltk_data]    |     /Use

True

In [16]:
article = unicodedata.normalize('NFKD', article)\
.encode('ascii', 'ignore')\
.decode('utf-8')

article

'codeup is excited to launch our first diversity equity, and inclusion (dei) report! in over eight years as an organization, weve implemented policies and grown our dei efforts. we are extremely proud of the progress weve made as a staff and codeup community, and we recognize there is more to learn. this report captures some of the ways that weve lived our value of cultivating inclusive growth, and how we will continue doing so as we look to the future.\nwe wanted to shine a light on the demographics of our students and staff, and in particular how that compares to the tech industry as a whole. how we collect, organize, and share employee demographic data is informed by standards set by the equal employment opportunity commission (eeoc).\nwe are proud to celebrate how weve grown and are motivated and committed to do more and be better. to view the report visit the link here, or download it below.'

In [17]:
#use re.sub to remove special characters
article = re.sub(r'[^a-z0-9\'\s]', '', article)
article

'codeup is excited to launch our first diversity equity and inclusion dei report in over eight years as an organization weve implemented policies and grown our dei efforts we are extremely proud of the progress weve made as a staff and codeup community and we recognize there is more to learn this report captures some of the ways that weve lived our value of cultivating inclusive growth and how we will continue doing so as we look to the future\nwe wanted to shine a light on the demographics of our students and staff and in particular how that compares to the tech industry as a whole how we collect organize and share employee demographic data is informed by standards set by the equal employment opportunity commission eeoc\nwe are proud to celebrate how weve grown and are motivated and committed to do more and be better to view the report visit the link here or download it below'

In [18]:
#create the tokenizer
tokenize = nltk.tokenize.ToktokTokenizer()
tokenize

<nltk.tokenize.toktok.ToktokTokenizer at 0x159ce3ca0>

In [19]:
#use the tokenizer
article = tokenize.tokenize(article, return_str=True)
article

'codeup is excited to launch our first diversity equity and inclusion dei report in over eight years as an organization weve implemented policies and grown our dei efforts we are extremely proud of the progress weve made as a staff and codeup community and we recognize there is more to learn this report captures some of the ways that weve lived our value of cultivating inclusive growth and how we will continue doing so as we look to the future\nwe wanted to shine a light on the demographics of our students and staff and in particular how that compares to the tech industry as a whole how we collect organize and share employee demographic data is informed by standards set by the equal employment opportunity commission eeoc\nwe are proud to celebrate how weve grown and are motivated and committed to do more and be better to view the report visit the link here or download it below'

In [20]:
#save stopwords
stopwords_ls = stopwords.words('english')
stopwords_ls[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [21]:
words = article.split()

In [22]:
#remove stopwords from list of words
filtered_words = [word for word in words if word not in stopwords_ls]
filtered_words

['codeup',
 'excited',
 'launch',
 'first',
 'diversity',
 'equity',
 'inclusion',
 'dei',
 'report',
 'eight',
 'years',
 'organization',
 'weve',
 'implemented',
 'policies',
 'grown',
 'dei',
 'efforts',
 'extremely',
 'proud',
 'progress',
 'weve',
 'made',
 'staff',
 'codeup',
 'community',
 'recognize',
 'learn',
 'report',
 'captures',
 'ways',
 'weve',
 'lived',
 'value',
 'cultivating',
 'inclusive',
 'growth',
 'continue',
 'look',
 'future',
 'wanted',
 'shine',
 'light',
 'demographics',
 'students',
 'staff',
 'particular',
 'compares',
 'tech',
 'industry',
 'whole',
 'collect',
 'organize',
 'share',
 'employee',
 'demographic',
 'data',
 'informed',
 'standards',
 'set',
 'equal',
 'employment',
 'opportunity',
 'commission',
 'eeoc',
 'proud',
 'celebrate',
 'weve',
 'grown',
 'motivated',
 'committed',
 'better',
 'view',
 'report',
 'visit',
 'link',
 'download']

In [23]:
#show how many words we removed
len(words) - len(filtered_words)


82

In [24]:
#join words back together
article_without_stopwords = ' '.join(filtered_words)
article_without_stopwords

'codeup excited launch first diversity equity inclusion dei report eight years organization weve implemented policies grown dei efforts extremely proud progress weve made staff codeup community recognize learn report captures ways weve lived value cultivating inclusive growth continue look future wanted shine light demographics students staff particular compares tech industry whole collect organize share employee demographic data informed standards set equal employment opportunity commission eeoc proud celebrate weve grown motivated committed better view report visit link download'

In [26]:
#create porter stemmer
ps = nltk.porter.PorterStemmer()
ps

<PorterStemmer>

In [27]:
#test stemmer
ps.stem('calling'), ps.stem('calls'), ps.stem('called'), ps.stem('call')

('call', 'call', 'call', 'call')

In [28]:
#use stemmer on the whole string - this treats the entire article as a single
#token, so the text comes back unstemmed; the following cell stems word by word
ps.stem(article)


'codeup is excited to launch our first diversity equity and inclusion dei report in over eight years as an organization weve implemented policies and grown our dei efforts we are extremely proud of the progress weve made as a staff and codeup community and we recognize there is more to learn this report captures some of the ways that weve lived our value of cultivating inclusive growth and how we will continue doing so as we look to the future\nwe wanted to shine a light on the demographics of our students and staff and in particular how that compares to the tech industry as a whole how we collect organize and share employee demographic data is informed by standards set by the equal employment opportunity commission eeoc\nwe are proud to celebrate how weve grown and are motivated and committed to do more and be better to view the report visit the link here or download it below'

In [29]:
stems = [ps.stem(word) for word in article.split()]

In [30]:
#join words back together
article_stemmed = ' '.join(stems)
article_stemmed

'codeup is excit to launch our first divers equiti and inclus dei report in over eight year as an organ weve implement polici and grown our dei effort we are extrem proud of the progress weve made as a staff and codeup commun and we recogn there is more to learn thi report captur some of the way that weve live our valu of cultiv inclus growth and how we will continu do so as we look to the futur we want to shine a light on the demograph of our student and staff and in particular how that compar to the tech industri as a whole how we collect organ and share employe demograph data is inform by standard set by the equal employ opportun commiss eeoc we are proud to celebr how weve grown and are motiv and commit to do more and be better to view the report visit the link here or download it below'

In [31]:
#create the lemmatizer
wnl = nltk.stem.WordNetLemmatizer()
wnl

<WordNetLemmatizer>

In [32]:
#use lemmatizer
lemmas = [wnl.lemmatize(word) for word in article.split()]
lemmas[:10]

['codeup',
 'is',
 'excited',
 'to',
 'launch',
 'our',
 'first',
 'diversity',
 'equity',
 'and']

In [33]:
#join words back together
article_lemmatized = ' '.join(lemmas)
article_lemmatized

'codeup is excited to launch our first diversity equity and inclusion dei report in over eight year a an organization weve implemented policy and grown our dei effort we are extremely proud of the progress weve made a a staff and codeup community and we recognize there is more to learn this report capture some of the way that weve lived our value of cultivating inclusive growth and how we will continue doing so a we look to the future we wanted to shine a light on the demographic of our student and staff and in particular how that compare to the tech industry a a whole how we collect organize and share employee demographic data is informed by standard set by the equal employment opportunity commission eeoc we are proud to celebrate how weve grown and are motivated and committed to do more and be better to view the report visit the link here or download it below'

6. Use the data from your acquire module to produce a dataframe of the news articles. Name the dataframe news_df.

7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.


In [101]:
codeup_df = prepare_article(original)

In [102]:
codeup_df.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...,codeup excited launch first diversity equity i...,codeup excit launch first divers equiti inclus...,codeup excited launch first diversity equity i...
1,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...,codeup named 2022 diversity inclusion award wi...,codeup name 2022 divers inclus award winner sa...,codeup named 2022 diversity inclusion award wi...
2,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...,deciding transition tech career big step signi...,decid transit tech career big step signific co...,deciding transition tech career big step signi...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio...",codeup strongly values diversity inclusion hon...,codeup strongli valu divers inclus honor ameri...,codeup strongly value diversity inclusion hono...
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...,many companies switching cloud services implem...,mani compani switch cloud servic implement clo...,many company switching cloud service implement...


In [103]:
url = 'https://inshorts.com/en/read'

In [104]:
original = acquire.get_news_articles(url)

In [105]:
original

Unnamed: 0,title,content,category
0,"Indian Navy gets VLF, easy communication with ...",The Indian navy has a new communication system...,india
1,India beat NZ 3-2 to enter CWG hockey finals,In the CWG men's hockey semi-final against New...,india
2,Bharti Airtel rakes in 61% profit,"Bharti Airtel, India's top telecommunications ...",india
3,Kashmir's famous Dal Lake freezes,After the recent snowfall in upper reaches of ...,india
4,"Nigerian weightlifter in dope net, India may gain",India may move up after Nigerian weightlifter ...,india
...,...,...,...
280,Withdraw rule that makes 6 airbags mandatory i...,International Road Federation (IRF) has urged ...,automobile
281,Fix for wheel issue that caused electric car r...,Toyota Motor said it has found a fix for the d...,automobile
282,Mercedes-Benz sees 28% rise in sales in India ...,Mercedes-Benz India has registered 28% rise in...,automobile
283,TVS Motor beats Hero MotoCorp to become 6th mo...,TVS Motor Company Limited has become the sixth...,automobile


In [106]:
news_df = prepare_article(original)
news_df.head()

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,"Indian Navy gets VLF, easy communication with ...",The Indian navy has a new communication system...,india,indian navy new communication system critical ...,indian navi new commun system critic pass code...,indian navy new communication system critical ...
1,India beat NZ 3-2 to enter CWG hockey finals,In the CWG men's hockey semi-final against New...,india,cwg men ' hockey semifinal new zealand saturda...,cwg men ' hockey semifin new zealand saturday ...,cwg men ' hockey semifinal new zealand saturda...
2,Bharti Airtel rakes in 61% profit,"Bharti Airtel, India's top telecommunications ...",india,bharti airtel india ' top telecommunications c...,bharti airtel india ' top telecommun compani r...,bharti airtel india ' top telecommunication co...
3,Kashmir's famous Dal Lake freezes,After the recent snowfall in upper reaches of ...,india,recent snowfall upper reaches kashmir himalaya...,recent snowfal upper reach kashmir himalayan p...,recent snowfall upper reach kashmir himalayan ...
4,"Nigerian weightlifter in dope net, India may gain",India may move up after Nigerian weightlifter ...,india,india may move nigerian weightlifter chika ama...,india may move nigerian weightlift chika amala...,india may move nigerian weightlifter chika ama...


9. Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

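For the 493KB and 25MB corpora I would use lemmatized text: at that size the slower, dictionary-based lemmatizer is affordable and it returns real words. A 200TB corpus is never going to load on a single machine, and lemmatizing it while being charged by the megabyte would result in a large bill, so there I would use the cheaper stemmed text.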