In [1]:
import pandas as pd
import numpy as np
#import acquire as aq

import unicodedata
import re
import os
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

## EXERCISE

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.


In [2]:
# Sample article text used by every exercise below.
# NOTE: the pieces are joined with no separator, so the text reads
# "playerNovak" and "'.\"We" without a space in between. This matches the
# value the notebook has always used; it is kept so the results do not change.
original = (
    "Serbian scientists named a new species of beetle after ex-world number one men's tennis player"
    "Novak Djokovic. The insect, which belongs to Duvalius genus of ground beetles present in Europe and was "
    "discovered several years ago in underground pit in Serbia, has been named 'Duvalius djokovici'."
    '"We feel urged to pay Djokovic back in...way we can," a researcher said.'
)
original

'Serbian scientists named a new species of beetle after ex-world number one men\'s tennis playerNovak Djokovic. The insect, which belongs to Duvalius genus of ground beetles present in Europe and was discovered several years ago in underground pit in Serbia, has been named \'Duvalius djokovici\'."We feel urged to pay Djokovic back in...way we can," a researcher said.'

In [3]:
# Lowercase everything; the result is stored in `article`, leaving `original` untouched
article = original.lower()
article

'serbian scientists named a new species of beetle after ex-world number one men\'s tennis playernovak djokovic. the insect, which belongs to duvalius genus of ground beetles present in europe and was discovered several years ago in underground pit in serbia, has been named \'duvalius djokovici\'."we feel urged to pay djokovic back in...way we can," a researcher said.'

In [4]:
#import unicode character database
import unicodedata

In [5]:
# Decompose characters with NFKD, drop whatever cannot be encoded as ASCII,
# then decode the bytes back into a str
article = (
    unicodedata.normalize('NFKD', article)
    .encode('ascii', 'ignore')
    .decode('utf-8')
)

article

'serbian scientists named a new species of beetle after ex-world number one men\'s tennis playernovak djokovic. the insect, which belongs to duvalius genus of ground beetles present in europe and was discovered several years ago in underground pit in serbia, has been named \'duvalius djokovici\'."we feel urged to pay djokovic back in...way we can," a researcher said.'

In [6]:
#import regular expression operations
import re

In [7]:
# Delete every character that is not a lowercase letter, a digit, a single quote or whitespace
article = re.compile(r'[^a-z0-9\'\s]').sub('', article)
article

"serbian scientists named a new species of beetle after exworld number one men's tennis playernovak djokovic the insect which belongs to duvalius genus of ground beetles present in europe and was discovered several years ago in underground pit in serbia has been named 'duvalius djokovici'we feel urged to pay djokovic back inway we can a researcher said"

In [8]:
def basic_clean(string):
    """
    This function will take in a string and perform basic cleaning procedures. It will normalize
    unicode characters and drop everything that has no ASCII equivalent (accents are stripped),
    convert all characters to lower case, and remove every character that is not a letter,
    a number, whitespace or a single quote.

    Parameters:
        string (str): the text to clean.

    Returns:
        str: the cleaned text.
    """

    #Normalize first, encode into ascii byte strings and ignore unknown chars, decode back
    #into a string. This must happen BEFORE lowercasing: NFKD can expand a symbol into
    #uppercase ASCII letters (the trademark sign becomes 'TM', the numero sign becomes 'No'),
    #and those capitals would otherwise be deleted by the regex below instead of lowercased.
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('UTF-8')

    #Convert to lower case
    string = string.lower()

    #Use regex to remove everything except lowercase letters, digits, whitespace and single quotes
    string = re.sub(r"[^a-z0-9\s']", '', string)

    return string

In [9]:
# Run basic_clean on the sample article to check it against the step-by-step result
basic_clean(original)

"serbian scientists named a new species of beetle after exworld number one men's tennis playernovak djokovic the insect which belongs to duvalius genus of ground beetles present in europe and was discovered several years ago in underground pit in serbia has been named 'duvalius djokovici'we feel urged to pay djokovic back inway we can a researcher said"

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [10]:
#import natural language toolkit
import nltk

In [11]:
def tokenize(string):
    """
    Tokenize the given text with NLTK's ToktokTokenizer.

    Because return_str=True is passed, the result comes back as a single
    string rather than a list of tokens.
    """
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [12]:
# Tokenize the raw (uncleaned) article
tokenize(original)

'Serbian scientists named a new species of beetle after ex-world number one men \' s tennis playerNovak Djokovic. The insect , which belongs to Duvalius genus of ground beetles present in Europe and was discovered several years ago in underground pit in Serbia , has been named \' Duvalius djokovici \' . " We feel urged to pay Djokovic back in ... way we can , " a researcher said .'

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [13]:
def stem(string):
    """
    Return the text with every whitespace-separated word replaced by its
    Porter stem; the stems are joined back together with single spaces.
    """
    porter = nltk.porter.PorterStemmer()

    # Stem word by word and glue the results back into one string
    return ' '.join(porter.stem(token) for token in string.split())

In [14]:
# Test stem() on the article after it has been cleaned and tokenized
string = basic_clean(original)

# NOTE: despite its name, `stemmed` holds the tokenized (not yet stemmed) text
stemmed = tokenize(string)

string_stemmed = stem(stemmed)

string_stemmed

"serbian scientist name a new speci of beetl after exworld number one men ' s tenni playernovak djokov the insect which belong to duvaliu genu of ground beetl present in europ and wa discov sever year ago in underground pit in serbia ha been name ' duvaliu djokovici ' we feel urg to pay djokov back inway we can a research said"

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [15]:
# Download the WordNet corpus (needed for the WordNet lemmatizer used in this exercise)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patricknash/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
def lemmatize(string):
    """
    Return the text with every whitespace-separated word replaced by its
    WordNet lemma; the lemmas are joined back together with single spaces.

    No part-of-speech tag is passed, so the lemmatizer's default is used
    for every word.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Lemmatize word by word and glue the results back into one string
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [17]:
# Test on the raw article text. NOTE: `original` has NOT been cleaned or tokenized,
# so capitalization is kept and punctuation stays attached to the words.
lemmatize(original)

'Serbian scientist named a new specie of beetle after ex-world number one men\'s tennis playerNovak Djokovic. The insect, which belongs to Duvalius genus of ground beetle present in Europe and wa discovered several year ago in underground pit in Serbia, ha been named \'Duvalius djokovici\'."We feel urged to pay Djokovic back in...way we can," a researcher said.'

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
### This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [18]:
# Download the NLTK stopwords corpus
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patricknash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    This function will take in a string, filter out stop words from the nltk standard english list
    as well as any other extra words, and return a version of the text without these stopwords.
    It includes optional parameters allowing the user to add extra words to remove
    or to exclude words from the stopword list.

    Parameters:
        string (str): the text to filter; it is split on whitespace.
        extra_words (iterable of str, optional): additional words to treat as stop words.
        exclude_words (iterable of str, optional): words that must NOT be removed. Entries
            that are not stop words are ignored, and exclude_words takes precedence over
            extra_words.

    Returns:
        str: the remaining words joined with single spaces.
    """
    #Get the standard english stop word list from nltk as a set, so each membership
    #check below is a hash lookup instead of a scan of the whole list
    stop_words = set(stopwords.words('english'))

    #Add extra words to be removed to the stop words
    if extra_words is not None:
        stop_words.update(extra_words)

    #Remove words to be excluded from the stop words; difference_update silently skips
    #words that are not present (list.remove would raise ValueError for them)
    if exclude_words is not None:
        stop_words.difference_update(exclude_words)

    #Filter out all of the stop words
    filtered_words = [word for word in string.split() if word not in stop_words]

    #Join the list of filtered words into a string
    return ' '.join(filtered_words)

In [20]:
# Test remove_stopwords using the cleaned, tokenized and stemmed version of my string
string = basic_clean(original)

stemmed = tokenize(string)

string_stemmed = stem(stemmed)

# Exercise remove_stopwords itself: default list, extra_words and exclude_words
assert 'the' in string_stemmed.split()
assert 'the' not in remove_stopwords(string_stemmed).split()
assert 'serbian' not in remove_stopwords(string_stemmed, extra_words=['serbian']).split()
assert 'the' in remove_stopwords(string_stemmed, exclude_words=['the']).split()

# The stemmed text is still the value displayed by this cell
string_stemmed

"serbian scientist name a new speci of beetl after exworld number one men ' s tenni playernovak djokov the insect which belong to duvaliu genu of ground beetl present in europ and wa discov sever year ago in underground pit in serbia ha been name ' duvaliu djokovici ' we feel urg to pay djokov back inway we can a research said"

### 6. Use your data from acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [21]:
#nltk.download('all')

In [22]:
#nltk.download('stopwords')