In [27]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

import pandas as pd
import acquire as aq


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mariamnaqvi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.


In [6]:
def basic_clean(word):
    word = word.lower()
    word = unicodedata.normalize('NFKD', word)\
    .encode('ascii', 'ignore')\
    .decode('utf-8')
    word = re.sub(r"[^a-z0-9'\s]", '', word)
    return word

In [7]:
assert basic_clean('Th!$ picTURE costs $40.00') == 'th picture costs 4000'

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [8]:
def tokenize(string):
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Use the tokenizer
    tokenized_string = tokenizer.tokenize(string, return_str = True)
    return tokenized_string

In [9]:
tokenize("paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity")

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [10]:
def stem(text):
    
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in text.split()]
    # Join our lists of words into a string again; assign to a variable to save changes
    text_stemmed = ' '.join(stems)
    return text_stemmed

In [11]:
stem("paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity")

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [12]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mariamnaqvi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
def lemmatize(text):
    # Create the Lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    lemms = [wnl.lemmatize(word) for word in text.split()]
    # Join our list of words into a string again; assign to a variable to save changes.
    text_lemmatized = ' '.join(lemms)
    return text_lemmatized

In [14]:
lemmatize("paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity")

"paul erdos and george polya are influential hungarian mathematician who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [25]:
def remove_stopwords(text, extra_words = [], exclude_words = []):
    '''
    This function takes in some text, optional extra_words and exclude_words parameters
    with default empty lists and returns the text after removing all stop words.
    '''
    # Create stopword_list
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in string.
    words = text.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords
    

### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [17]:
article_dict = {'Business':'https://inshorts.com/en/read/business',
               'Sports':'https://inshorts.com/en/read/sports',
               'Technology':'https://inshorts.com/en/read/technology',
               'Entertainment':'https://inshorts.com/en/read/entertainment'}

df_news = aq.get_news_articles(article_dict)
df_news.head()

Unnamed: 0,title,content,category
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,Business
1,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,Business
2,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,Business
3,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,Business
4,Factually incorrect: INOX on report of Amazon ...,INOX Leisure denied a report that claimed Amaz...,Business


In [18]:
# use the content of first news item as 'article' to test my functions

article = df_news.content[0]
article

'Reliance Industries has said in a statement that over 98% of its workers have received at least one dose of COVID-19 vaccine so far. The billionaire Mukesh Ambani-led conglomerate had over 2.36 lakh employees, of March 31. Besides Reliance, Hindustan Unilever has also given at least one shot to 90% of employees, while Infosys inoculated 59% employees and TCS 70%.'

In [19]:
basic_clean(article)

'reliance industries has said in a statement that over 98 of its workers have received at least one dose of covid19 vaccine so far the billionaire mukesh ambaniled conglomerate had over 236 lakh employees of march 31 besides reliance hindustan unilever has also given at least one shot to 90 of employees while infosys inoculated 59 employees and tcs 70'

In [20]:
tokenize(article)

'Reliance Industries has said in a statement that over 98 % of its workers have received at least one dose of COVID-19 vaccine so far. The billionaire Mukesh Ambani-led conglomerate had over 2.36 lakh employees , of March 31. Besides Reliance , Hindustan Unilever has also given at least one shot to 90 % of employees , while Infosys inoculated 59 % employees and TCS 70 % .'

In [21]:
stem(article)


'relianc industri ha said in a statement that over 98% of it worker have receiv at least one dose of covid-19 vaccin so far. the billionair mukesh ambani-l conglomer had over 2.36 lakh employees, of march 31. besid reliance, hindustan unilev ha also given at least one shot to 90% of employees, while infosi inocul 59% employe and tc 70%.'

In [22]:
lemmatize(article)

'Reliance Industries ha said in a statement that over 98% of it worker have received at least one dose of COVID-19 vaccine so far. The billionaire Mukesh Ambani-led conglomerate had over 2.36 lakh employees, of March 31. Besides Reliance, Hindustan Unilever ha also given at least one shot to 90% of employees, while Infosys inoculated 59% employee and TCS 70%.'

In [28]:
remove_stopwords(article)

'Reliance Industries said statement 98% workers received least one dose COVID-19 vaccine far. The billionaire Mukesh Ambani-led conglomerate 2.36 lakh employees, March 31. Besides Reliance, Hindustan Unilever also given least one shot 90% employees, Infosys inoculated 59% employees TCS 70%.'

### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [16]:
input = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
'https://codeup.com/data-science-myths/',
'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

df_blogs = aq.get_blog_articles(input)
df_blogs.head()

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


### 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.


In [29]:
def prep_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function takes in a df and the string name for a text column with the
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [31]:
# use the function defined above for news_df's content column.

prep_data(df_news, 'content', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,reliance industries said statement 98 workers ...,relianc industri said statement 98 worker rece...,reliance industry said statement 98 worker rec...
1,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,tesla ' billionaire ceo elon musk criticised a...,tesla ' billionair ceo elon musk criticis appl...,tesla ' billionaire ceo elon musk criticised a...
2,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,amazon monday denied speculations looking acce...,amazon monday deni specul wa look accept bitco...,amazon monday denied speculation wa looking ac...
3,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,tesla ceo world ' secondrichest person elon mu...,tesla ceo world ' secondrichest person elon mu...,tesla ceo world ' secondrichest person elon mu...
4,Factually incorrect: INOX on report of Amazon ...,INOX Leisure denied a report that claimed Amaz...,inox leisure denied report claimed amazon indi...,inox leisur deni report claim amazon india dis...,inox leisure denied report claimed amazon indi...
