In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as a

import warnings
warnings.filterwarnings('ignore')

In [2]:
# nltk.download('all')

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

    - Lowercase everything
    - Normalize unicode characters
    - Replace anything that is not a letter, number, whitespace or a single quote.

In [3]:
def basic_clean(string):
    """
    Normalize a raw string for downstream text processing.

    Lowercases the input, decomposes unicode characters (NFKD) and drops
    whatever cannot be expressed in ASCII, then removes every character that
    is not a lowercase letter, a digit, a single quote or whitespace.
    """
    lowered = string.lower()
    # NFKD splits accented characters into base letter + combining mark; the
    # ASCII round-trip then discards the marks and any other non-ASCII symbol.
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8')
    return re.sub(r'[^a-z0-9\'\s]', '', ascii_text)

In [4]:
basic_clean('Angarta')

'angarta'

In [5]:
basic_clean('Angaríta')

'angarita'

In [6]:
basic_clean("Angaríta '!s'")

"angarita 's'"

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [7]:
def tokenize(string):
    """
    Split a string into tokens with NLTK's Toktok tokenizer.

    Returns whatever ``ToktokTokenizer.tokenize`` returns for the input,
    which is a list of token strings.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string)

In [8]:
tokenize('hello my name is nico')

['hello', 'my', 'name', 'is', 'nico']

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [64]:
def stem(text):
    """
    Apply Porter stemming to every word in ``text``.

    Parameters
    ----------
    text : str or iterable of str
        Either a whitespace-delimited string or an already-tokenized
        sequence of words.

    Returns
    -------
    str
        The stemmed words joined back together with single spaces.
    """
    # A bare string would otherwise be iterated character by character and
    # each letter stemmed on its own, so split it into words first.
    words = text.split() if isinstance(text, str) else text

    ps = nltk.porter.PorterStemmer()
    # glue it back together with spaces, as it was before
    return ' '.join(ps.stem(word) for word in words)

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [65]:
def lemmatize(text):
    """
    Apply WordNet lemmatization to every word in ``text``.

    Parameters
    ----------
    text : str or iterable of str
        Either a whitespace-delimited string or an already-tokenized
        sequence of words.

    Returns
    -------
    str
        The lemmatized words joined back together with single spaces.
    """
    # A bare string would otherwise be iterated character by character and
    # each letter lemmatized on its own, so split it into words first.
    words = text.split() if isinstance(text, str) else text

    wnl = nltk.stem.WordNetLemmatizer()
    # No part-of-speech tag is passed, so the lemmatizer's default is used.
    return ' '.join(wnl.lemmatize(word) for word in words)

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

 - This function should define two optional parameters, extra_words and exclude_words.
 - These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [47]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a whitespace-delimited string.

    Parameters
    ----------
    string : str
        Text to filter. It is split on whitespace and each piece is compared
        to the stopword set exactly as written (matching is case-sensitive
        and punctuation attached to a word is part of the comparison).
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to take out of the standard stopword list so they are kept.

    Returns
    -------
    str
        The remaining words joined with single spaces.
    """
    # None defaults instead of shared mutable lists in the signature.
    stopword_set = set(stopwords.words('english'))

    # Exclusions are applied before additions, so a word given in both
    # arguments ends up being treated as a stopword.
    stopword_set -= set(exclude_words or ())
    stopword_set |= set(extra_words or ())

    filtered_words = [word for word in string.split() if word not in stopword_set]

    return ' '.join(filtered_words)

In [48]:
remove_stopwords('I would like a 1 million dollars, because I could buy a lot of stuff')

'I would like 1 million dollars, I could buy lot stuff'

### 6. Use your data from the acquire module to produce a dataframe of the news articles. Name the dataframe news_df.

In [13]:
news_df = a.get_news_blog_articles()
news_df

Unnamed: 0,title,content,category
0,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...,business
1,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...,business
2,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...,business
3,Adani secures $3 bn credit from a sovereign we...,Adani Group has reportedly told creditors it h...,business
4,We can score a century for progress: Gates on ...,Microsoft Co-founder Bill Gates shared a messa...,business
...,...,...,...
95,India to be world's cheapest 5G market: Bill G...,At a session with Telecom Minister Ashwini Vai...,technology
96,Elon Musk plays fart sounds in 2 am Twitter Sp...,Twitter chief Elon Musk co-hosted a Twitter Sp...,technology
97,Woman loses ₹8 lakh after being locked out of ...,A woman in the US has said she was locked out ...,technology
98,Foxconn Chairman meets PM Modi for 2nd time in...,Apple supplier Foxconn's Chairman Young Liu on...,technology


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [14]:
codeup_df = a.acquire_codeup()
codeup_df

Unnamed: 0,title,content
0,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
1,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
2,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
3,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
4,Coding Bootcamp or Self-Learning? Which is Bes...,If you’re interested in embarking on a career ...
5,Codeup Among Top 58 Best Coding Bootcamps of 2023,Codeup is pleased to announce we have been ran...


### 8. For each dataframe, produce the following columns:

    - title to hold the title
    - original to hold the original article/post content
    - clean to hold the normalized and tokenized original with the stopwords removed.
    - stemmed to hold the stemmed version of the cleaned data.
    - lemmatized to hold the lemmatized version of the cleaned data.

In [16]:
# clean, stemmed, and lemmatized columns
codeup_df.columns.tolist()

['title', 'content']

In [21]:
codeup_df.rename(columns={'content':'original'},  inplace= True)
codeup_df

Unnamed: 0,title,original
0,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
1,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
2,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
3,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
4,Coding Bootcamp or Self-Learning? Which is Bes...,If you’re interested in embarking on a career ...
5,Codeup Among Top 58 Best Coding Bootcamps of 2023,Codeup is pleased to announce we have been ran...


In [25]:
basic_clean(codeup_df['original'][0])

'black excellence in tech panelist spotlight  wilmarie de la cruz mejia\n\ncodeup is hosting a black excellence in tech panel in honor of black history month on february 22 2023 to further celebrate wed like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry  \nmeet wilmarie\nwilmarie de la cruz mejia is a current codeup student on the path to becoming a fullstack web developer at our dallas tx campus \nwilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with codeup \nwe asked wilmarie to share more about her experience at codeup she shares i was able to meet other people who were passionate about coding and be in a positive learning environment\nwe hope you can join us on february 22nd to sit in on an insightful conversation with wilmarie and all of our panelists'

In [None]:
tokenize(codeup_df['original'][0])

In [40]:
remove_stopwords(codeup_df['original'][0])

'Black excellence tech: Panelist Spotlight – Wilmarie De La Cruz Mejia Codeup hosting Black Excellence Tech Panel honor Black History Month February 22, 2023! To celebrate, we’d like spotlight panelists leading discussion learn bit respective experiences black leaders tech industry! Meet Wilmarie! Wilmarie De La Cruz Mejia current Codeup student path becoming Full-Stack Web Developer Dallas, TX campus. Wilmarie veteran expanding knowledge programming languages technologies journey Codeup. We asked Wilmarie share experience Codeup. She shares, “I able meet people passionate coding positive learning environment.” We hope join us February 22nd sit insightful conversation Wilmarie panelists!'

In [60]:
num_1 = codeup_df['clean'][0]

In [67]:
stem(num_1)

'black excel tech panelist spotlight wilmari de la cruz mejia codeup host black excel tech panel honor black histori month februari 22 2023 celebr wed like spotlight panelist lead discuss learn bit respect experi black leader tech industri meet wilmari wilmari de la cruz mejia current codeup student path becom fullstack web develop dalla tx campu wilmari veteran expand knowledg program languag technolog journey codeup ask wilmari share experi codeup share abl meet peopl passion code posit learn environ hope join us februari 22nd sit insight convers wilmari panelist'

In [66]:
lemmatize(num_1)

'black excellence tech panelist spotlight wilmarie de la cruz mejia codeup hosting black excellence tech panel honor black history month february 22 2023 celebrate wed like spotlight panelist leading discussion learn bit respective experience black leader tech industry meet wilmarie wilmarie de la cruz mejia current codeup student path becoming fullstack web developer dallas tx campus wilmarie veteran expanding knowledge programming language technology journey codeup asked wilmarie share experience codeup share able meet people passionate coding positive learning environment hope join u february 22nd sit insightful conversation wilmarie panelist'

In [68]:
clean_text = []
stemmed_text = []
lemmatized_text = []

# Walk the column's values directly instead of codeup_df['original'][i]:
# that lookup is label-based, so a positional counter only works while the
# dataframe still has its default 0..n-1 index.
for original in codeup_df['original']:
    text = basic_clean(original)
    no_stopwords = remove_stopwords(text)
    tokens = tokenize(no_stopwords)
    stemmed = stem(tokens)
    lemmatized = lemmatize(tokens)

    # 'clean' keeps the token list; the other two hold space-joined strings.
    clean_text.append(tokens)
    stemmed_text.append(stemmed)
    lemmatized_text.append(lemmatized)

codeup_df['clean'] = clean_text
codeup_df['stemmed'] = stemmed_text
codeup_df['lemmatized'] = lemmatized_text

codeup_df

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,"[black, excellence, tech, panelist, spotlight,...",black excel tech panelist spotlight wilmari de...,black excellence tech panelist spotlight wilma...
1,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,"[black, excellence, tech, panelist, spotlight,...",black excel tech panelist spotlight stephani j...,black excellence tech panelist spotlight steph...
2,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,"[black, excellence, tech, panelist, spotlight,...",black excel tech panelist spotlight jame coope...,black excellence tech panelist spotlight james...
3,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,"[black, excellence, tech, panelist, spotlight,...",black excel tech panelist spotlight jeanic fre...,black excellence tech panelist spotlight jeani...
4,Coding Bootcamp or Self-Learning? Which is Bes...,If you’re interested in embarking on a career ...,"[youre, interested, embarking, career, tech, l...",your interest embark career tech like taken lo...,youre interested embarking career tech likely ...
5,Codeup Among Top 58 Best Coding Bootcamps of 2023,Codeup is pleased to announce we have been ran...,"[codeup, pleased, announce, ranked, among, 58,...",codeup pleas announc rank among 58 best code b...,codeup pleased announce ranked among 58 best c...


In [72]:
def add_columns(df):
    """
    Add ``clean``, ``stemmed`` and ``lemmatized`` columns to an article dataframe.

    The ``content`` column (if present) is renamed to ``original``. Each
    original text is passed through ``basic_clean``, ``remove_stopwords`` and
    ``tokenize``; the resulting token list is stored in ``clean`` and is also
    fed to ``stem`` and ``lemmatize`` to fill the other two columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``content`` or an ``original`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in; it is modified in place.
    """
    # In-place rename: the caller's dataframe is changed as well as returned.
    df.rename(columns={'content': 'original'}, inplace=True)

    clean_text = []
    stemmed_text = []
    lemmatized_text = []

    # Iterate over the values rather than df['original'][i]: that lookup is
    # label-based, so a positional counter raises KeyError (or picks the wrong
    # row) whenever the index is not the default 0..n-1 range.
    for original in df['original']:
        text = basic_clean(original)
        no_stopwords = remove_stopwords(text)
        tokens = tokenize(no_stopwords)

        clean_text.append(tokens)
        stemmed_text.append(stem(tokens))
        lemmatized_text.append(lemmatize(tokens))

    # Plain lists are assigned positionally, so they line up with the rows
    # regardless of the dataframe's index labels.
    df['clean'] = clean_text
    df['stemmed'] = stemmed_text
    df['lemmatized'] = lemmatized_text

    return df

In [74]:
codeup_df = add_columns(codeup_df)
codeup_df

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,"[black, excellence, tech, panelist, spotlight,...",black excel tech panelist spotlight wilmari de...,black excellence tech panelist spotlight wilma...
1,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,"[black, excellence, tech, panelist, spotlight,...",black excel tech panelist spotlight stephani j...,black excellence tech panelist spotlight steph...
2,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,"[black, excellence, tech, panelist, spotlight,...",black excel tech panelist spotlight jame coope...,black excellence tech panelist spotlight james...
3,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,"[black, excellence, tech, panelist, spotlight,...",black excel tech panelist spotlight jeanic fre...,black excellence tech panelist spotlight jeani...
4,Coding Bootcamp or Self-Learning? Which is Bes...,If you’re interested in embarking on a career ...,"[youre, interested, embarking, career, tech, l...",your interest embark career tech like taken lo...,youre interested embarking career tech likely ...
5,Codeup Among Top 58 Best Coding Bootcamps of 2023,Codeup is pleased to announce we have been ran...,"[codeup, pleased, announce, ranked, among, 58,...",codeup pleas announc rank among 58 best code b...,codeup pleased announce ranked among 58 best c...


In [77]:
news_df = add_columns(news_df)
news_df    

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...,business,"[10, adani, group, stocks, closed, higher, wed...",10 adani group stock close higher wednesday fi...,10 adani group stock closed higher wednesday f...
1,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...,business,"[hours, central, government, raised, price, co...",hour central govern rais price commerci lpg cy...,hour central government raised price commercia...
2,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...,business,"[indianamericans, punit, renjen, rajesh, subra...",indianamerican punit renjen rajesh subramaniam...,indianamericans punit renjen rajesh subramania...
3,Adani secures $3 bn credit from a sovereign we...,Adani Group has reportedly told creditors it h...,business,"[adani, group, reportedly, told, creditors, se...",adani group reportedli told creditor secur 3 b...,adani group reportedly told creditor secured 3...
4,We can score a century for progress: Gates on ...,Microsoft Co-founder Bill Gates shared a messa...,business,"[microsoft, cofounder, bill, gates, shared, me...",microsoft cofound bill gate share messag twitt...,microsoft cofounder bill gate shared message t...
...,...,...,...,...,...,...
95,India to be world's cheapest 5G market: Bill G...,At a session with Telecom Minister Ashwini Vai...,technology,"[session, telecom, minister, ashwini, vaishnaw...",session telecom minist ashwini vaishnaw micros...,session telecom minister ashwini vaishnaw micr...
96,Elon Musk plays fart sounds in 2 am Twitter Sp...,Twitter chief Elon Musk co-hosted a Twitter Sp...,technology,"[twitter, chief, elon, musk, cohosted, twitter...",twitter chief elon musk cohost twitter space l...,twitter chief elon musk cohosted twitter space...
97,Woman loses ₹8 lakh after being locked out of ...,A woman in the US has said she was locked out ...,technology,"[woman, us, said, locked, apple, account, thie...",woman us said lock appl account thief stole ip...,woman u said locked apple account thief stole ...
98,Foxconn Chairman meets PM Modi for 2nd time in...,Apple supplier Foxconn's Chairman Young Liu on...,technology,"[apple, supplier, foxconn, ', s, chairman, you...",appl supplier foxconn ' s chairman young liu t...,apple supplier foxconn ' s chairman young liu ...


### 9. Ask yourself:

    - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

- For a corpus of 493KB, lemmatized text is likely the better option: the corpus is small enough that the extra processing time lemmatization needs is negligible. Lemmatization maps each word to a real dictionary base form, which may result in slightly larger text than stemming, but it gives more accurate and readable results, especially when the part of speech of each word is supplied to the lemmatizer.

- For a corpus of 25MB, the decision between using stemmed or lemmatized text depends on the specific use case and requirements. Stemming is faster and results in a smaller file size, which may be beneficial if the focus is on processing speed and efficiency. On the other hand, lemmatization may be preferable if accuracy is a priority, as it produces more meaningful and contextually relevant words.

- For a corpus of 200TB, using stemmed text may be a more practical option as it results in a smaller file size and can be processed faster. However, this decision also depends on the specific use case and requirements. If the accuracy of the analysis is a top priority and the added file size from lemmatization does not significantly impact the computational cost, then lemmatization may be the better option.