In [9]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

# Exercises

The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.


1) Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

    - Lowercase everything
    - Normalize unicode characters
    - Replace anything that is not a letter, number, whitespace or a single quote.


In [10]:
def basic_clean(article):
    '''
    This function takes in a article in string format.
    
    Turns all letters into lowercase.
    
    Normalizes the unicode characters using the NFKD method,
    while ignoring any unknow characters.
    
    Will replace anything that is NOT letters, numbers, whitespace or single quote.
    
    This funtion will return a basic cleaned article in string format
    '''
    
    # Lowercase 
    article = article.lower()
    
    # Normalization
    article = unicodedata.normalize('NFKD', article)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    
    # Replace
    article = re.sub(r"[^a-z0-9'\s]", '', article)
    
    return article

In [11]:
article = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to\
the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often\
incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"


In [13]:
article = basic_clean(article)
article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot tothe field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is oftenincorrectly written as erdos or erdos either by mistake or out of typographical necessity"

2) Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [16]:
def tokenize(article):
    '''
    This function takes in an article as a string.
    Creates a tokenizer using nltk.
    Uses the tokenize on the artical and returns the article in string fromat.
    '''
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    tokenizer.tokenize(article, return_str = True)
    
    return article

In [18]:
article = tokenize(article)
article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot tothe field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is oftenincorrectly written as erdos or erdos either by mistake or out of typographical necessity"

3) Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [20]:
def stem(article):
    '''
    This function takes in an article as a string.
    Creates a porter stemmer.
    Applies the stemmer to each word in the article/string.
    Joins the stems into a single string called article_stemmed.
    Returns article_stemmed with all stemed characters. 
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Apply the stemmer to each word in our string.
    stems = [ps.stem(word) for word in article.split()]
    
    # Join stems
    article_stemmed = ' '.join(stems)
    
    return article_stemmed

In [22]:
article_stemmed = stem(article)
article_stemmed

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot toth field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is oftenincorrectli written as erdo or erdo either by mistak or out of typograph necess"

4) Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [27]:
def lemmatize(article):
    '''
    This function takes in an article as a string.
    Downloads 'wordnet' from nltk.
    Creates a lemmatizer.
    Applies the lemmatizer to each word in the article/string.
    Joins the lemmmas into a single string called article_lemmatized.
    Returns artical_lemmatized with all lemmatized characters. 
    '''
    # Download the first time.
    nltk.download('wordnet')
    
    # Create the Lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Apply the lemmatize to each word in our string.
    lemmas = [wnl.lemmatize(word) for word in article.split()]
    
    # Join lemmas
    article_lemmatized = ' '.join(lemmas)
    
    return article_lemmatized

In [28]:
article_lemmatized = lemmatize(article)
article_lemmatized

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/liamjackson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


"paul erdos and george polya are influential hungarian mathematician who contributed a lot tothe field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is oftenincorrectly written a erdos or erdos either by mistake or out of typographical necessity"

5) Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

   - This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.


In [94]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

In [96]:
extra_words = 'my'
exclude_words = 'are'

In [97]:
# use the function defined above
remove_stopwords(article)

"paul erdos george polya influential hungarian mathematicians contributed lot tothe field erdos's name contains hungarian letter 'o' 'o' double acute accent oftenincorrectly written erdos erdos either mistake typographical necessity"

6) Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [64]:
from requests import get
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [67]:
def get_article(article, category):
    """
    This function takes in a category and artical as a string. 
    Category must be an available category in inshorts, article is the link.
    Returns a single inshort article.
    """
    # Attribute selector
    title = article.select("[itemprop='headline']")[0].text
    
    # article body
    content = article.select("[itemprop='articleBody']")[0].text
    
    output = {}
    output["title"] = title
    output["content"] = content
    output["category"] = category
    
    return output

In [72]:
def news_df(category):
    """
    This function takes in a category as a string. Category must be an available category in inshorts
    Returns a list of dictionaries where each dictionary represents a single inshort article
    """
    base = "https://inshorts.com/en/read/"
    
    # We concatenate our base_url with the category
    url = base + category
    
    # Set the headers to show as Netscape Navigator on Windows 98, b/c I feel like creating an anomaly in the logs
    headers = {"User-Agent": "Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)"}

    # Get the http response object from the server
    response = get(url, headers=headers)

    # Make soup out of the raw html
    soup = BeautifulSoup(response.text)
    
    # Ignore everything, focusing only on the news cards
    articles = soup.select(".news-card")
    
    output = []
    
    # Iterate through every article tag/soup 
    for article in articles:
        
        # Returns a dictionary of the article's title, body, and category
        article_data = get_article(article, category) 
        
        # Append the dictionary to the list
        output.append(article_data)
    
    # Return the list of dictionaries
    return pd.DataFrame(output)


In [73]:
news_df("technology")

Unnamed: 0,title,content,category
0,Kangana Ranaut's Twitter account suspended for...,Actress Kangana Ranaut's Twitter account has b...,technology
1,Twitter issues statement after permanently sus...,Twitter has permanently suspended Kangana Rana...,technology
2,SMS worm that tricks Android users into downlo...,A malware said to be targeting Android users i...,technology
3,Melinda refuses spousal support from Bill Gate...,Melinda Gates officially filed for divorce fro...,technology
4,Apple hires former Google AI scientist who lef...,Apple Inc said it has hired former Google scie...,technology
5,Accenture pledges ₹185 cr for COVID-19 relief ...,Global IT and professional services company Ac...,technology
6,US baseball team makes '1st' Dogecoin transact...,US baseball team Oakland Athletics President D...,technology
7,Spotify CEO has contacted Arsenal owners for t...,Spotify CEO Daniel Ek has contacted Arsenal ow...,technology
8,Pro-Bitcoin messages projected on the walls of...,An unidentified cryptocurrency fan recently us...,technology
9,Sony strikes deal to integrate Discord in Play...,Sony has invested in gaming-focused chat app D...,technology


7) Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.


In [79]:
def get_codeup_blog(url):
    
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0"}
    response = get(url, headers = headers)
    
    soup = BeautifulSoup(response.text)
    
    # Title
    Title = soup.select('h1.jupiterx-post-title')[0].text

    # Body
    Content = soup.select('.jupiterx-post-content')[0].text

    # Time
    Time = soup.time.text
    
    output = {}
    output['Title'] = Title
    output['Time'] = Time
    output['Content'] = Content
    
    return output

In [80]:
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
        'https://codeup.com/data-science-myths/',
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

In [121]:
def codeup_df(urls):
    posts = [get_codeup_blog(url) for url in urls]
    posts = pd.DataFrame(posts)
    return posts

In [125]:
posts = codeup_df(urls)
posts.head()

Unnamed: 0,Title,Time,Content
0,Codeup’s Data Science Career Accelerator is Here!,"September 30, 2018",The rumors are true! The time has arrived. Cod...
1,Data Science Myths,"October 31, 2018",By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"October 17, 2018","By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,"August 14, 2018",SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,"August 14, 2018",Competitor Bootcamps Are Closing. Is the Model...


8) For each dataframe, produce the following columns:


    - title to hold the title
    - original to hold the original article/post content
    - clean to hold the normalized and tokenized original with the stopwords removed.
    - stemmed to hold the stemmed version of the cleaned data.
    - lemmatized to hold the lemmatized version of the cleaned data.


In [126]:
import pandas as pd

In [127]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [128]:
df[['title', column,'clean', 'stemmed', 'lemmatized']] = prep_article_data(df, column, extra_words=[], exclude_words=[])

NameError: name 'column' is not defined

9) Ask yourself:


   - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
   - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
   - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?


In [129]:
# Lemmatized.

In [130]:
# It depends, if you have a lot of time use lemmmatized if not use stemmed.

In [131]:
# Stemmed.