In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [30]:
# nltk itself should already come with Anaconda, so there is no install step
# here; the WordNet data it uses still has to be downloaded separately.
!python -c "import nltk; nltk.download('wordnet')"

[nltk_data] Downloading package wordnet to /Users/dusts/nltk_data...


In [32]:
# Also download the omw-1.4 package
# (presumably needed next to wordnet by the lemmatizer used below -- TODO confirm).
!python -c "import nltk; nltk.download('omw-1.4')"

[nltk_data] Downloading package omw-1.4 to /Users/dusts/nltk_data...


In [11]:
# Get the Codeup links first, then hand them to the blog-article helper
codeup_links = acquire.get_codeup_links()
all_articles = acquire.get_blog_articles(codeup_links)

In [16]:
# Show the 'content' value of the first article
all_articles[0]['content']

'\nBlack excellence in tech: Panelist Spotlight – Wilmarie De La Cruz Mejia\n\nCodeup is hosting a Black Excellence in Tech Panel in honor of Black History Month on February 22, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!\xa0\xa0\nMeet Wilmarie!\nWilmarie De\xa0La Cruz Mejia is a current Codeup student on the path to becoming a Full-Stack Web Developer at our Dallas, TX campus.\xa0\nWilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with Codeup.\xa0\nWe asked Wilmarie to share more about her experience at Codeup. She shares, “I was able to meet other people who were passionate about coding and be in a positive learning environment.”\nWe hope you can join us on February 22nd to sit in on an insightful conversation with Wilmarie and all of our panelists!\n'

In [17]:
# Keep the untouched text of the first article and preview its first 500 characters
original = all_articles[0]['content']
print(original[:500])


Black excellence in tech: Panelist Spotlight – Wilmarie De La Cruz Mejia

Codeup is hosting a Black Excellence in Tech Panel in honor of Black History Month on February 22, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!  
Meet Wilmarie!
Wilmarie De La Cruz Mejia is a current Codeup student on the path to becoming a Full-Stack Web Developer at our Dallas


In [18]:
# Lowercase the whole article, then preview the first 500 characters
article = original.lower()
print(article[:500])


black excellence in tech: panelist spotlight – wilmarie de la cruz mejia

codeup is hosting a black excellence in tech panel in honor of black history month on february 22, 2023! to further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!  
meet wilmarie!
wilmarie de la cruz mejia is a current codeup student on the path to becoming a full-stack web developer at our dallas


In [19]:
# NFKD-normalize, drop every character that cannot be encoded as ASCII,
# then decode the bytes back into a str
normalized = unicodedata.normalize('NFKD', article)
ascii_bytes = normalized.encode('ascii', 'ignore')
article = ascii_bytes.decode('utf-8', 'ignore')

print(article[:500])


black excellence in tech: panelist spotlight  wilmarie de la cruz mejia

codeup is hosting a black excellence in tech panel in honor of black history month on february 22, 2023! to further celebrate, wed like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!  
meet wilmarie!
wilmarie de la cruz mejia is a current codeup student on the path to becoming a full-stack web developer at our dallas, 


In [20]:
# special characters:
# drop everything that is not a-z, a digit, a single quote, or whitespace
not_allowed = re.compile(r"[^a-z0-9'\s]")
article = not_allowed.sub('', article)
print(article)


black excellence in tech panelist spotlight  wilmarie de la cruz mejia

codeup is hosting a black excellence in tech panel in honor of black history month on february 22 2023 to further celebrate wed like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry  
meet wilmarie
wilmarie de la cruz mejia is a current codeup student on the path to becoming a fullstack web developer at our dallas tx campus 
wilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with codeup 
we asked wilmarie to share more about her experience at codeup she shares i was able to meet other people who were passionate about coding and be in a positive learning environment
we hope you can join us on february 22nd to sit in on an insightful conversation with wilmarie and all of our panelists



In [21]:
# tokenize
tokenizer = nltk.tokenize.ToktokTokenizer()

# The tokenized string is only previewed here; `article` itself is not reassigned.
tokenized_preview = tokenizer.tokenize(article, return_str=True)
print(tokenized_preview[:500])

black excellence in tech panelist spotlight wilmarie de la cruz mejia

codeup is hosting a black excellence in tech panel in honor of black history month on february 22 2023 to further celebrate wed like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry 
meet wilmarie
wilmarie de la cruz mejia is a current codeup student on the path to becoming a fullstack web developer at our dallas tx campus 



In [23]:
# Build the nltk Porter stemmer, then try it on three forms of the same verb
ps = nltk.porter.PorterStemmer()

tuple(ps.stem(form) for form in ('call', 'called', 'calling'))

('call', 'call', 'call')

In [24]:
# Stem every whitespace-separated word, then glue the stems back together
stems = list(map(ps.stem, article.split()))
article_stemmed = ' '.join(stems)
print(article_stemmed)

black excel in tech panelist spotlight wilmari de la cruz mejia codeup is host a black excel in tech panel in honor of black histori month on februari 22 2023 to further celebr wed like to spotlight each of our panelist lead up to the discuss to learn a bit about their respect experi as black leader in the tech industri meet wilmari wilmari de la cruz mejia is a current codeup student on the path to becom a fullstack web develop at our dalla tx campu wilmari is a veteran expand her knowledg of program languag and technolog on her journey with codeup we ask wilmari to share more about her experi at codeup she share i wa abl to meet other peopl who were passion about code and be in a posit learn environ we hope you can join us on februari 22nd to sit in on an insight convers with wilmari and all of our panelist


In [25]:
# Ten most frequent stems
stem_counts = pd.Series(stems).value_counts()
stem_counts.head(10)

to         8
a          6
in         6
wilmari    6
on         5
black      4
of         4
codeup     4
and        3
about      3
dtype: int64

In [33]:
wnl = nltk.stem.WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."

# Show the stem and the lemma of each whitespace-separated word side by side
for word in sentence.split():
    stemmed_word = ps.stem(word)
    lemma = wnl.lemmatize(word)
    print('stem:', stemmed_word, '-- lemma:', lemma)

stem: he -- lemma: He
stem: wa -- lemma: wa
stem: run -- lemma: running
stem: and -- lemma: and
stem: eat -- lemma: eating
stem: at -- lemma: at
stem: same -- lemma: same
stem: time. -- lemma: time.
stem: he -- lemma: He
stem: ha -- lemma: ha
stem: bad -- lemma: bad
stem: habit -- lemma: habit
stem: of -- lemma: of
stem: swim -- lemma: swimming
stem: after -- lemma: after
stem: play -- lemma: playing
stem: long -- lemma: long
stem: hour -- lemma: hour
stem: in -- lemma: in
stem: the -- lemma: the
stem: sun. -- lemma: Sun.


In [34]:
# lemmatize every whitespace-separated word, then join the lemmas back together
lemmas = list(map(wnl.lemmatize, article.split()))
article_lemmatized = ' '.join(lemmas)

print(article_lemmatized)

black excellence in tech panelist spotlight wilmarie de la cruz mejia codeup is hosting a black excellence in tech panel in honor of black history month on february 22 2023 to further celebrate wed like to spotlight each of our panelist leading up to the discussion to learn a bit about their respective experience a black leader in the tech industry meet wilmarie wilmarie de la cruz mejia is a current codeup student on the path to becoming a fullstack web developer at our dallas tx campus wilmarie is a veteran expanding her knowledge of programming language and technology on her journey with codeup we asked wilmarie to share more about her experience at codeup she share i wa able to meet other people who were passionate about coding and be in a positive learning environment we hope you can join u on february 22nd to sit in on an insightful conversation with wilmarie and all of our panelist


In [40]:
# Ten most frequent lemmas
pd.Series(lemmas).value_counts().head(10)

to          8
a           7
in          6
wilmarie    6
on          5
codeup      4
of          4
black       4
the         3
our         3
dtype: int64

In [37]:
# Start from nltk's english stopwords, but take 'no' and 'not' out of the list
stopword_list = stopwords.words('english')

for negation in ('no', 'not'):
    stopword_list.remove(negation)

stopword_list[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [38]:
# Keep only the words that are not in the stopword list
words = article.split()
filtered_words = [token for token in words if token not in stopword_list]
removed_count = len(words) - len(filtered_words)

print('Removed {} stopwords'.format(removed_count))
print('---')

article_without_stopwords = ' '.join(filtered_words)

print(article_without_stopwords)

Removed 70 stopwords
---
black excellence tech panelist spotlight wilmarie de la cruz mejia codeup hosting black excellence tech panel honor black history month february 22 2023 celebrate wed like spotlight panelists leading discussion learn bit respective experiences black leaders tech industry meet wilmarie wilmarie de la cruz mejia current codeup student path becoming fullstack web developer dallas tx campus wilmarie veteran expanding knowledge programming languages technologies journey codeup asked wilmarie share experience codeup shares able meet people passionate coding positive learning environment hope join us february 22nd sit insightful conversation wilmarie panelists


In [67]:
# Split the stopword-free article back into a list of words
no_stopwords = article_without_stopwords.split()

In [68]:
# Put the remaining words into a pandas Series
article_series = pd.Series(no_stopwords)

In [69]:
# Display the Series of remaining words
article_series

0            black
1       excellence
2             tech
3         panelist
4        spotlight
          ...     
84             sit
85      insightful
86    conversation
87        wilmarie
88       panelists
Length: 89, dtype: object

In [70]:
# Ten most frequent words once the stopwords are gone
article_series.value_counts().head(10)

wilmarie      6
black         4
codeup        4
tech          3
meet          2
excellence    2
mejia         2
cruz          2
la            2
de            2
dtype: int64

<hr style="border:2px solid gray">

# Exercises

#### The end result of this exercise should be a file named `prepare.py` that defines the requested functions.

#### In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:
#### - Lowercase everything
#### - Normalize unicode characters
#### - Remove anything that is not a letter, number, whitespace, or a single quote.

In [None]:
def basic_clean(text):
    '''
        This function cleans a string of text
        and returns the cleaned string.

        Steps, in order:
          1. NFKD-normalize the unicode characters and drop every character
             that cannot be encoded as ASCII.
          2. Lowercase everything.
          3. Remove anything that is not a lowercase letter, a digit,
             a single quote or whitespace.

        Normalization runs before lowercasing on purpose: some symbols
        (for example the trademark or degree-Celsius signs) only become
        uppercase ASCII letters after NFKD. Lowercasing first would leave
        those letters uppercase, and step 3 would then silently delete them.

    '''
    # normalize unicode characters, keeping only what survives as ASCII
    text = unicodedata.normalize('NFKD', text)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')

    # lowercase (after normalization, see docstring)
    text = text.lower()

    # only lowercase alphanumerics, apostrophes & whitespace
    text = re.sub(r"[^a-z0-9'\s]", '', text)

    return text

### 2. Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.

In [None]:
def tokenize(text):
    '''
    Placeholder for exercise 2: hands `text` back unchanged.

    A working `tokenize` is defined further down in this notebook and
    takes over this name once that cell has been run.
    '''
    return text

### 3. Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [None]:
def stem(text):
    '''
    Placeholder for exercise 3: hands `text` back unchanged.

    A working `stem` is defined further down in this notebook and
    takes over this name once that cell has been run.
    '''
    return text

### 4. Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [None]:
def lemmatize(text):
    '''
    Placeholder for exercise 4: hands `text` back unchanged.

    A working `lemmatize` is defined further down in this notebook and
    takes over this name once that cell has been run.
    '''
    return text

### 5. Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.
#### This function should define two optional parameters, `extra_words` and `exclude_words`. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [None]:
def remove_stopwords(text,extra_words=None,exclude_words=None):
    '''
    Placeholder for exercise 5: hands `text` back unchanged and ignores
    both `extra_words` and `exclude_words`.

    A working `remove_stopwords` is defined further down in this notebook
    and takes over this name once that cell has been run.
    '''
    return text

In [None]:
# Column values for the dataframes built in the cells below.
# These were bare `name =` statements with no right-hand side, which is a
# SyntaxError; they start out as empty lists so the notebook can run.
# TODO: fill these in from the acquired articles.
titles = []
originals = []
cleaned = []
stemmed = []
lemmatized = []

### 6. Use your data from the `acquire` module to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [None]:
# One column per prepared version of the text
news_df = pd.DataFrame({
    "title": titles,
    "original": originals,
    "clean": cleaned,
    "stemmed": stemmed,
    "lemmatized": lemmatized,
})

### 7. Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [None]:
# NOTE(review): this uses the very same lists as news_df, so both frames
# hold identical data unless the lists are rebuilt in between.
codeup_df = pd.DataFrame({
    "title": titles,
    "original": originals,
    "clean": cleaned,
    "stemmed": stemmed,
    "lemmatized": lemmatized,
})

### 8. For each dataframe, produce the following columns:
#### - `title` to hold the title
#### - `original` to hold the original article/post content
#### - `clean` to hold the normalized and tokenized original with the stopwords removed.
#### - `stemmed` to hold the stemmed version of the cleaned data.
#### - `lemmatized` to hold the lemmatized version of the cleaned data.

### 9. Ask yourself:
#### - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
#### - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
#### - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

In [None]:
# 493KB corpus: lemmatized
# 25MB corpus: stemmed
# 200TB corpus, charged by the megabyte: stemmed

In [None]:
def tokenize(string):
    '''
    Tokenize a string with nltk's ToktokTokenizer.

    The tokenizer is called with return_str=True, so the tokens come back
    as one string rather than as a list.
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str=True)

In [None]:
def stem(string):
    '''
    Stem every whitespace-separated word of the text with nltk's
    PorterStemmer and return the stems joined by single spaces.
    '''
    porter = nltk.porter.PorterStemmer()
    return ' '.join(porter.stem(token) for token in string.split())

In [None]:
def lemmatize(string):
    '''
    Lemmatize every whitespace-separated word of the text with nltk's
    WordNetLemmatizer and return the lemmas joined by single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

In [None]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    This function takes in text, extra words and exclude words
    and returns the text as a single string with the stopwords removed.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords. None (the default) adds none.
    exclude_words : iterable of str, optional
        Words to take out of the stopword list so they are kept in the text.
        None (the default) excludes none.

    Returns
    -------
    str
        The words that are not stopwords, joined by single spaces.
    '''
    # None defaults instead of shared mutable lists; None means "no words"
    extra_words = [] if extra_words is None else extra_words
    exclude_words = [] if exclude_words is None else exclude_words

    # start from nltk's english stopwords
    stopword_set = set(stopwords.words('english'))

    # remove the excluded words first, then add the extra words
    stopword_set = (stopword_set - set(exclude_words)) | set(extra_words)

    # keep every word that is not a stopword
    filtered_words = [word for word in string.split() if word not in stopword_set]

    # join the remaining words back into one string
    return ' '.join(filtered_words)

In [None]:
def prep_article_data(df, column, extra_words=None, exclude_words=None):
    '''
    This function takes in a df and the string name of a text column, with
    the option to pass lists for extra_words and exclude_words, and returns
    a new df with the columns title, original, clean, stemmed and lemmatized.

    - original:   the text of `column`, untouched
    - clean:      `column` after basic_clean, tokenize and remove_stopwords
    - stemmed:    the clean text after stem
    - lemmatized: the clean text after lemmatize

    The df must already have a 'title' column. The df passed in is not
    modified; all new columns are added to a copy.
    '''
    # None defaults instead of shared mutable lists; None means "no words"
    extra_words = [] if extra_words is None else extra_words
    exclude_words = [] if exclude_words is None else exclude_words

    # work on a copy so the caller's frame does not gain extra columns
    prepped = df.copy()

    # original text comes from the requested column, not a hard-coded 'content'
    prepped['original'] = prepped[column]

    # chain together clean, tokenize, remove stopwords
    prepped['clean'] = prepped[column].apply(basic_clean)\
                                      .apply(tokenize)\
                                      .apply(remove_stopwords,
                                             extra_words=extra_words,
                                             exclude_words=exclude_words)

    # stem the already cleaned text
    prepped['stemmed'] = prepped['clean'].apply(stem)

    # lemmatize the already cleaned text
    prepped['lemmatized'] = prepped['clean'].apply(lemmatize)

    return prepped[['title', 'original', 'clean', 'stemmed', 'lemmatized']]