#### Imports

In [1]:
#standard imports
import pandas as pd
import numpy as np

#import unicode character database (used to normalize accented characters)
import unicodedata
#import regular expression operations
import re
#import natural language toolkit
import nltk
#import stopwords list
from nltk.corpus import stopwords

import acquire as a

In [33]:
import requests
from requests import get
from bs4 import BeautifulSoup

import os
from pprint import pprint 
import time
import json

# Exercises

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

#### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote (replace it with an empty string).

In [3]:
# Sample sentence containing accented characters, apostrophes and punctuation;
# used as the running example for the cleaning functions.
original = (
    "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

In [6]:
new = original.lower()
new

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [8]:
new = unicodedata.normalize('NFKD', new)\
.encode('ascii', 'ignore')\
.decode('utf-8')
new

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [9]:
new = re.sub(r'[^a-z0-9\'\s]', '', new)
new

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [4]:
def basic_clean(original):
    '''
    Input: original text string
    Actions:
    converts the text to lowercase,
    decomposes unicode characters (NFKD) and drops whatever cannot be
    represented in ASCII, so accented letters become their base letter,
    deletes every character that is not a-z, 0-9, whitespace, or a single quote
    Output: the cleaned text string
    '''
    lowered = original.lower()

    # NFKD separates a base letter from its accent mark; encoding to ASCII
    # with 'ignore' then discards the accent marks and any other non-ASCII byte
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8')

    # whatever is left that is not a letter, digit, whitespace or ' is removed
    return re.sub(r'[^a-z0-9\'\s]', '', ascii_text)

In [5]:
basic_cleaned = basic_clean(original)

In [6]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [7]:
basic_cleaned

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

#### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [None]:
#create the tokenizer
tokenize = nltk.tokenize.ToktokTokenizer()

In [None]:
#use the tokenizer
article = tokenize.tokenize(article, return_str=True)

In [8]:
def tokenize(basic_cleaned):
    '''
    Input: basic_cleaned text string
    Actions:
    builds a ToktokTokenizer,
    runs it over the text asking for a string back (return_str=True)
    Output: the tokenized text as a single string
    '''
    # local name is deliberately different from the function name
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(basic_cleaned, return_str=True)

In [9]:
clean_tokenize = tokenize(basic_cleaned)

In [10]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [11]:
basic_cleaned

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [12]:
clean_tokenize

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

#### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [None]:
#create porter stemmer
ps = nltk.porter.PorterStemmer()

In [None]:
#use stemmer - apply stem to each word in our string
stems = [ps.stem(word) for word in article.split()]

In [None]:
#join words back together
article_stemmed = ' '.join(stems)

In [13]:
def stem(clean_tokenize):
    '''
    Inputs: clean_tokenize text string
    Actions: splits the text on whitespace and runs a Porter stemmer on each word
    Outputs: the stemmed words joined back together with single spaces
    '''
    porter = nltk.porter.PorterStemmer()
    # stem word by word, then rebuild one space-separated string
    return ' '.join(porter.stem(token) for token in clean_tokenize.split())

In [14]:
clean_tokenize_stem = stem(clean_tokenize)

In [15]:
clean_tokenize_stem

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

In [16]:
clean_tokenize

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

#### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [None]:
#create the lemmatizer
wnl = nltk.stem.WordNetLemmatizer()

In [None]:
#use lemmatizer - apply lemmatize to each word in our string
lemmas = [wnl.lemmatize(word) for word in article.split()]

In [None]:
#join words back together
article_lemma = ' '.join(lemmas)

In [17]:
def lemmatize(clean_tokenize):
    '''
    Inputs: clean_tokenize text string
    Actions: splits the text on whitespace and runs a WordNet lemmatizer on each word
    Outputs: the lemmatized words joined back together with single spaces
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # lemmatize word by word, then rebuild one space-separated string
    return ' '.join(map(lemmatizer.lemmatize, clean_tokenize.split()))

In [18]:
clean_tokenize_lemma = lemmatize(clean_tokenize)

In [19]:
clean_tokenize_lemma

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

In [20]:
clean_tokenize

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

#### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
- This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [None]:
#save stopwords
stopwords_ls = stopwords.words('english')

In [None]:
#split words in lemmatized article
words = article_lemma.split()

In [None]:
stopwords_ls.append("'")

In [None]:
#remove stopwords from list of words
filtered = [word for word in words if word not in stopwords_ls]

In [None]:
#join words back together
parsed_article = ' '.join(filtered)

In [21]:
def remove_stopwords(lemma_or_stem, extra_words=None, exclude_words=None):
    '''
    Remove English stopwords from a whitespace-delimited text string.

    Input:
    lemma_or_stem: text string (can also be used with Series.apply);
        words are separated on whitespace
    extra_words: optional iterable of additional words to treat as stopwords
    exclude_words: optional iterable of words to keep even though they are
        in the standard stopword list
    (a single string is accepted for either option and is split on whitespace)
    Action:
    starts from nltk's English stopword list, drops exclude_words,
    then adds extra_words (a word given in both lists is therefore removed
    from the text)
    Output: parsed_article, the remaining words joined by single spaces
    '''
    def _as_word_set(words):
        # None means "no words"; a bare string is split on whitespace so that
        # set() does not break it into individual characters
        if words is None:
            return set()
        if isinstance(words, str):
            return set(words.split())
        return set(words)

    # save stopwords as a set for fast membership checks
    stopwords_ls = set(stopwords.words('english'))
    # removing any stopwords in exclude list
    stopwords_ls -= _as_word_set(exclude_words)
    # adding any stopwords in extra list
    stopwords_ls |= _as_word_set(extra_words)

    # keep only the words that are not stopwords
    filtered = [word for word in lemma_or_stem.split() if word not in stopwords_ls]
    # join words back together
    parsed_article = ' '.join(filtered)

    return parsed_article

In [22]:
parsed_article = remove_stopwords(clean_tokenize_lemma)

In [23]:
parsed_article

'paul erdos george polya influential hungarian mathematician contributed lot field erdos name contains hungarian letter double acute accent often incorrectly written erdos erdos either mistake typographical necessity'

In [24]:
clean_tokenize_lemma

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

#### 6. Use your data from the acquire module to produce a dataframe of the news articles. Name the dataframe news_df.

In [2]:
a.acquire_info()

get_blog_articles2()
scrape_one_page(topic)
get_news_articles(topic_list)


In [25]:
topic_list = ['business', 'sports', 'technology', 'entertainment']

In [26]:
final_list = a.get_news_articles(topic_list)

In [29]:
#pd.set_option("display.max_colwidth", None)

In [89]:
#pd.reset_option("display.max_colwidth")

In [31]:
news_df = pd.DataFrame(final_list)
news_df

Unnamed: 0,category,title,content
0,business,"Sensex, Nifty end at fresh closing highs","Benchmark indices Sensex and Nifty ended at record closing highs on Wednesday. Sensex ended 195 points higher at 63,523 while the Nifty ended at 18,856.85, up 40 points. The gains were led by stocks like HDFC, Reliance Industries and TCS. During the intraday trade, Sensex rose to its fresh record high level of 63,588."
1,business,TIME releases list of the world's 100 most influential companies,"TIME magazine has released its annual list of the world's 100 most influential companies, which features OpenAI, SpaceX, Chess.com, Google DeepMind and Kim Kardashian's SKIMS among others. The National Payments Corporation of India (NPCI) and e-commerce platform Meesho also featured on the list. ""NPCI launched UPI...which accounted for 52% of India's digital transactions in FY22,"" TIME said."
2,business,Which are the world's top 10 airlines according to passengers?,"Singapore Airlines is the world's best airline, according to Skytrax World Airline Awards 2023, an annual poll of flyers released at the Paris Air Show. It is followed by Qatar Airways, All Nippon Airways, Emirates, Japan Airlines, Turkish Airlines, Air France, Cathay Pacific, EVA Air, and Korean Air. Vistara, ranked 16th, is the only Indian airline in the top 20."
3,business,"Loves India, is a fan of PM: Paytm Founder on Musk after Modi meet","Paytm Founder Vijay Shekhar Sharma shared a video of Tesla CEO Elon Musk's media interaction after meeting with PM Modi and wrote, ""Musk may be in India, next year."" In another tweet, Sharma said, ""He [Musk] won't find population problem in India. Finds clean energy, energy storage and EVs as great potential...Loves India and is a fan of our PM."""
4,business,UK's net debt passes 100% of GDP for the first time since 1961,"The United Kingdom's public sector net debt in May exceeded 100% of the nation's GDP for the first time since 1961, the UK government's Office for National Statistics said on Wednesday. Public sector net debt, excluding that of state-controlled banks, hit £2.567 trillion, equivalent to 100.1% of the UK's GDP. The UK government's borrowing in May totalled £20.045 billion.\n\n"
...,...,...,...
95,entertainment,"Of course, why not: Gautam Gulati on if he would ever date a fan","Actor Gautam Gulati, when asked if he would ever date a fan, said, ""Of course, why not."" He also said that if he had to change his name, he would change it to 'Kabir' because he likes the name. He added that he loves his routine and says no to anyone asking him to go out for food or drinks."
96,entertainment,Rajamouli said Baahubali was litmus test for Mahabharata: Prasad,"\nSS Rajamouli's father and writer Vijayendra Prasad shared that the filmmaker told him that with 'Baahubali' he was ""trying to check"" his own readiness for handling 'Mahabharata' at a later stage. ""Rajamouli said ['Baahubali'] is going to be...litmus test for that ultimate goal,"" Vijayendra shared. He added that it is because of Rajamouli that his work has gained recognition."
97,entertainment,Admire her audacity: Mahesh Bhatt on Pooja entering 'Bigg Boss...',"Filmmaker Mahesh Bhatt reacted to his daughter Pooja Bhatt's participation in 'Bigg Boss OTT 2'. ""Life's greatest adventures begin when we step into the realm of the unknown with courage and curiosity. She has done just that. I admire her audacity,"" Bhatt told ETimes. Pooja, on the show, had opened up about battling alcohol addiction at the age of 44."
98,entertainment,I lost reputation the first day I started working: Cryus Broacha,"Cyrus Broacha said he ""needed new ways of earning money"", which is why he participated in 'Bigg Boss OTT 2'. When questioned if he's worried about his image, Cyrus answered, ""Do you think coming on 'Bigg Boss' would affect my reputation? I lost [it] the...day I started working. I've no fears."" However, he mentioned that he's worried about mental health."


#### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [34]:
# Collect the "read more" links from the Codeup blog landing page.
url = "https://codeup.com/blog/"
headers = {'User-Agent': 'Codeup Data Science'}
# timeout keeps a stalled connection from hanging the notebook indefinitely
response = get(url, headers=headers, timeout=30)
# stop here on a 4xx/5xx response instead of parsing an error page into an empty list
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
more_links = soup.find_all('a', class_='more-link')
# skip any anchor that has no href rather than raising KeyError
links_list = [link['href'] for link in more_links if link.has_attr('href')]

In [35]:
# NOTE(review): in the codeup_df output displayed below, every row shows the same
# `link` value even though the titles and dates differ — verify how
# a.get_blog_articles2 records the URL for each article.
article_info = a.get_blog_articles2(links_list)

In [37]:
codeup_df = pd.DataFrame(article_info)
codeup_df

Unnamed: 0,title,link,date_published,content
0,Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa,https://codeup.com/codeup-news/panelist-spotlight-4/,"May 24, 2023","\nMay is traditionally known as Asian American and Pacific Islander (AAPI) Heritage Month. This month we celebrate the history and contributions made possible by our AAPI friends, family, and community. We also examine our level of support and seek opportunities to better understand the AAPI community.\n\nIn an effort to address real concerns and experiences, we sat down with Arbeena Thapa, one of Codeup’s Financial Aid and Enrollment Managers.\nArbeena identifies as Nepali American and Desi. Arbeena’s parents immigrated to Texas in 1988 for better employment and educational opportunities. Arbeena’s older sister was five when they made the move to the US. Arbeena was born later, becoming the first in her family to be a US citizen.\nAt Codeup we take our efforts at inclusivity very seriously. After speaking with Arbeena, we were taught that the term AAPI excludes Desi-American individuals. Hence, we will now use the term Asian Pacific Islander Desi American (APIDA).\nHere is how the rest of our conversation with Arbeena went!\nHow do you celebrate or connect with your heritage and cultural traditions?\n“I celebrate Nepal’s version of Christmas or Dashain. This is a nine-day celebration also known as Dussehra. I grew up as Hindu and I identify as Hindu, this is a very large part of my heritage. “\n“Other ways I connect with my culture include sharing food! Momos are South Asian Dumplings and they’re my favorite to make and share.”\n“On my Asian American side, I am an advocate of immigrant justice and erasure within APIDA social or political movements. I participate in events to embrace my identity such as immigrant justice advocacy because I come from a mixed-status family. I’ve always been in a community with undocumented Asian immigrants. 
.”\nWhat are some of the challenges you have faced as an APIDA individual, personally or professionally?\n“I often struggle with being gendered as compliant or a pushover. Professionally, I am often stereotyped as meek, so I’ve been overlooked for leadership roles. We are seen as perpetually foreign; people tend to other us in that way, yet put us on a pedestal for what a model minority looks like. This has made me hesitant to share my heritage in the past because these assumptions get mapped onto me. ”\nCan you describe some common barriers of entry that APIDA individuals, specifically women may face when trying to enter or advance in the workplace?\n“Being overlooked for leadership. In the past, I have not been viewed as a leader. People sometimes have preconceived stereotypes of Asian women not being able to be bold, or being vocal can be mistaken for being too emotional. “\nHow do you believe microaggressions impact APIDA individuals in the workplace? Can you provide examples of such microaggressions?\n“Erasure is big. To me, only saying ‘Merry Christmas’ isn’t inclusive to other religions. People are often resistant to saying ‘Happy Holidays,’ but saying Merry Christmas excludes, and does not appreciate my heritage. “\n“Often microaggressions are not micro at all. They typically are not aggressive racialized violence, but the term ‘micro’ minimizes impact.”\n“Some that I’ve heard are ‘What kind of Asian are you?’ or ‘Where are you from?’ This automatically makes me the ‘other’ and not seen as American. Even within the APIDA community, South Asians are overlooked as “Asian”.”\nHow important is representation, specifically APIDA representation, in organizational leadership positions?\n“I want to say that it is important to have someone who looks like you in leadership roles, and it is, but those leaders may not share the same beliefs as you. 
Certain privileges such as wealth, resources, or lack of interaction with lower-socioeconomic-status Asian Americans may cause a difference in community politics. I do not think the bamboo ceiling is acceptable, but the company you work for plays a big part in your politics and belief alignment.”\nHow do you feel about code-switching, and have you ever felt it necessary to code-switch?\n“I like sharing South Asian terms or connecting with others that have similar heritage and culture. A workplace that is welcoming to going into this sort of breakout is refreshing and makes space for us. However, having to code-switch could also mean a workplace that is not conducive and welcoming of other cultures. “\nFinally, in your opinion, what long-term strategies can create lasting change in the workplace and ensure support, equality, and inclusion for APIDA individuals?\n“Prior to a career in financial aid, I did a lot of research related to the post-9/11 immigration of the South Asian diaspora. This background made me heavily rely on grassroots organizing. Hire the people that want to innovate, hire the changemakers, hire the button-pushers. Reduce reliance on whiteness as change. This will become natural for the organization and become organizational change. Change comes from us on the ground.”\nA huge thank you to Arbeena Thapa for sharing her experiences, and being vulnerable with us. Your words were inspiring and the opportunity to understand your perspective more has been valuable. We hope we can become better support for the APIDA community as we learn and grow on our journey of cultivating inclusive growth.\n"
1,Women in tech: Panelist Spotlight – Magdalena Rahn,https://codeup.com/codeup-news/panelist-spotlight-4/,"Mar 28, 2023","\nWomen in tech: Panelist Spotlight – Magdalena Rahn\nCodeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!\n\nMeet Magdalena!\nMagdalena Rahn is a current Codeup student in a Data Science cohort in San Antonio, Texas. She has a professional background in cross-cultural communications, international business development, the wine industry and journalism. After serving in the US Navy, she decided to complement her professional skill set by attending the Data Science program at Codeup; she is set to graduate in March 2023. Magdalena is fluent in French, Bulgarian, Chinese-Mandarin, Spanish and Italian.\nWe asked Magdalena how Codeup impacted her career, and she replied “Codeup has provided a solid foundation in analytical processes, programming and data science methods, and it’s been an encouragement to have such supportive instructors and wonderful classmates.”\nDon’t forget to tune in on March 29th to sit in on an insightful conversation with Magdalena.\n"
2,Women in tech: Panelist Spotlight – Rachel Robbins-Mayhill,https://codeup.com/codeup-news/panelist-spotlight-4/,"Mar 20, 2023","\nWomen in tech: Panelist Spotlight – Rachel Robbins-Mayhill\nCodeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry! Meet Rachel!\n\nRachel Robbins-Mayhill is a Decision Science Analyst I in San Antonio, Texas. Rachel has had a varied career that includes counseling, teaching, training, community development, and military operations. Her focus has always been on assessing needs, identifying solutions, and educating individuals and groups on aligning needs and solutions in different contexts. Rachel’s passion for data science stems from her belief that data is a powerful tool for communicating patterns that can lead to hope and growth in the future.\nIn June 2022, Rachel graduated from Codeup’s Innis cohort, where she honed her skills in data science. Shortly after, she started working as a Data Science Technical Writer with Apex Systems as a Contractor for USAA in July 2022. Her unconventional role allowed her to understand where her skills could be best utilized to support USAA in a non-contract role.\nRachel recently joined USAA’s Data Science Delivery team as a Decision Science Analyst I in February 2023. The team is focused on delivering machine learning models for fraud prevention, and Rachel’s particular role centers around providing strategic process solutions for the team in collaboration with Operational and Model Risk components.\nIn addition to her career, Rachel is currently pursuing a master’s degree in Applied Data Science from Syracuse University, further expanding her knowledge and skills in the field. 
Rachel is passionate about collaborating with individuals who share her belief in the potential of others and strive to achieve growth through logical, informed action. She welcomes LinkedIn connections and is excited about supporting the network of CodeUp alumni!\nWe asked Rachel how Codeup impacted her career, and she replied “Codeup delivered a comprehensive education in all facets of the data science pipeline, laying a strong foundation for me to build upon. Through repeated hands-on practice, I developed a reliable process that was immediately applicable in my job. Collaborative group projects were instrumental in helping me hone my skills in project management, allowing me to navigate complex data science projects with comfortability. Thanks to this invaluable experience, I was able to make significant strides in my career within just six months of graduating from Codeup.”\nDon’t forget to tune in on March 29th to sit in on an insightful conversation.\n"
3,Women in Tech: Panelist Spotlight – Sarah Mellor,https://codeup.com/codeup-news/panelist-spotlight-4/,"Mar 13, 2023","\nWomen in tech: Panelist Spotlight – Sarah Mellor \nCodeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!\nMeet Sarah!\nSarah Mellor currently works as the Director of People Operations. She joined Codeup four and a half years ago as an Admissions Manager. She went on to build out and lead the Marketing and Admissions team, while picking up People Ops tasks and projects here and there until moving over to lead the People Ops team two years ago. Prior to Codeup, she worked at education-focused non-profits in Washington, DC and Boulder, Colorado. She graduated from Wake Forest University.\nWe asked Sarah how Codeup has impacted her career, and her response was “I have absolutely loved having the privilege to grow alongside Codeup. In my time here across multiple different roles and departments, I’ve seen a lot of change. The consistent things have always been the high quality of passionate and hardworking people I get to work with; the impactful mission we get to work on; and the inspiring students who trust us with their career change.”\nDon’t forget to tune in on March 29th to sit in on an insightful conversation.\n"
4,Women in Tech: Panelist Spotlight – Madeleine Capper,https://codeup.com/codeup-news/panelist-spotlight-4/,"Mar 6, 2023","\nWomen in tech: Panelist Spotlight – Madeleine Capper\nCodeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!\nMeet Madeleine!\nMadeleine Capper is a Data Scientist in San Antonio, Texas. A long-standing San Antonio resident, she studied mathematics at the University of Texas San Antonio and has worked as a Data Scientist for Booz Allen Hamilton. Madeleine currently teaches Data Science at Codeup, where she works daily with burgeoning data professionals to help them actualize their career aspirations through technical education.\nMadeleine attended Codeup as a student in early 2019 as a pupil in the very first Codeup Data Science cohort. The program proved immediately effective and she was the first student to obtain a data career out of the program. After working at Booz Allen Hamilton, Madeleine’s passion for education in conjunction with her appreciation for Codeup’s capacity for transformative life change brought her back to the institution in an instructional capacity, where she has been teaching for two years.\nDon’t forget to tune in on March 29th to sit in on an insightful conversation.\n"
5,Black Excellence in Tech: Panelist Spotlight – Wilmarie De La Cruz Mejia,https://codeup.com/codeup-news/panelist-spotlight-4/,"Feb 16, 2023","\nBlack excellence in tech: Panelist Spotlight – Wilmarie De La Cruz Mejia\n\nCodeup is hosting a Black Excellence in Tech Panel in honor of Black History Month on February 22, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry! \nMeet Wilmarie!\nWilmarie De La Cruz Mejia is a current Codeup student on the path to becoming a Full-Stack Web Developer at our Dallas, TX campus. \nWilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with Codeup. \nWe asked Wilmarie to share more about her experience at Codeup. She shares, “I was able to meet other people who were passionate about coding and be in a positive learning environment.”\nWe hope you can join us on February 22nd to sit in on an insightful conversation with Wilmarie and all of our panelists!\n"


#### 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [63]:
news_df['clean_norm_token'] = news_df['content'].apply(basic_clean).apply(tokenize).apply(remove_stopwords)

In [68]:
news_df['stemmed'] = news_df.clean_norm_token.apply(stem)

In [83]:
news_df['lemmatized'] = news_df.clean_norm_token.apply(lemmatize)

In [91]:
news_df.head()

Unnamed: 0,category,title,content,clean_norm_token,stemmed,lemmatized
0,business,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,benchmark indices sensex nifty ended record cl...,benchmark indic sensex nifti end record close ...,benchmark index sensex nifty ended record clos...
1,business,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,time magazine released annual list world 100 i...,time magazin releas annual list world 100 infl...,time magazine released annual list world 100 i...
2,business,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...,singapore airlines world best airline accordin...,singapor airlin world best airlin accord skytr...,singapore airline world best airline according...
3,business,"Loves India, is a fan of PM: Paytm Founder on ...",Paytm Founder Vijay Shekhar Sharma shared a vi...,paytm founder vijay shekhar sharma shared vide...,paytm founder vijay shekhar sharma share video...,paytm founder vijay shekhar sharma shared vide...
4,business,UK's net debt passes 100% of GDP for the first...,The United Kingdom's public sector net debt in...,united kingdom public sector net debt may exce...,unit kingdom public sector net debt may exceed...,united kingdom public sector net debt may exce...


In [75]:
codeup_df['clean_norm_token'] = codeup_df.content.apply(basic_clean).apply(tokenize).apply(remove_stopwords)

In [78]:
codeup_df['stemmed'] = codeup_df.clean_norm_token.apply(stem)

In [81]:
codeup_df['lemmatized'] = codeup_df.clean_norm_token.apply(lemmatize)

In [90]:
codeup_df.head()

Unnamed: 0,title,link,date_published,content,clean_norm_token,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,https://codeup.com/codeup-news/panelist-spotli...,"May 24, 2023",\nMay is traditionally known as Asian American...,may traditionally known asian american pacific...,may tradit known asian american pacif island a...,may traditionally known asian american pacific...
1,Women in tech: Panelist Spotlight – Magdalena ...,https://codeup.com/codeup-news/panelist-spotli...,"Mar 28, 2023",\nWomen in tech: Panelist Spotlight – Magdalen...,women tech panelist spotlight magdalena rahn c...,women tech panelist spotlight magdalena rahn c...,woman tech panelist spotlight magdalena rahn c...
2,Women in tech: Panelist Spotlight – Rachel Rob...,https://codeup.com/codeup-news/panelist-spotli...,"Mar 20, 2023",\nWomen in tech: Panelist Spotlight – Rachel R...,women tech panelist spotlight rachel robbinsma...,women tech panelist spotlight rachel robbinsma...,woman tech panelist spotlight rachel robbinsma...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,https://codeup.com/codeup-news/panelist-spotli...,"Mar 13, 2023",\nWomen in tech: Panelist Spotlight – Sarah Me...,women tech panelist spotlight sarah mellor cod...,women tech panelist spotlight sarah mellor cod...,woman tech panelist spotlight sarah mellor cod...
4,Women in Tech: Panelist Spotlight – Madeleine ...,https://codeup.com/codeup-news/panelist-spotli...,"Mar 6, 2023",\nWomen in tech: Panelist Spotlight – Madelein...,women tech panelist spotlight madeleine capper...,women tech panelist spotlight madelein capper ...,woman tech panelist spotlight madeleine capper...


In [None]:
def clean_df(df, extra_words=None, exclude_words=None):
    '''
    Input:
    df: dataframe with the article text in an 'original' column, or in a
        'content' column (the name used by news_df and codeup_df)
    extra_words: optional list of additional stopwords to remove
    exclude_words: optional list of standard stopwords to keep
    Actions:
    copies 'content' into 'original' when 'original' is missing,
    clean = original -> basic_clean -> tokenize -> remove_stopwords,
    stemmed = stem applied to clean,
    lemmatized = lemmatize applied to clean
    Output: the same dataframe (columns are added in place) so the call can
    also be used in an assignment
    Raises: KeyError when the dataframe has neither 'original' nor 'content'
    '''
    # pass real lists on so remove_stopwords never receives None
    extra_words = [] if extra_words is None else extra_words
    exclude_words = [] if exclude_words is None else exclude_words

    if 'original' not in df.columns:
        if 'content' not in df.columns:
            raise KeyError("clean_df expects an 'original' or 'content' column")
        df['original'] = df['content']

    df['clean'] = df['original'].apply(
        lambda text: remove_stopwords(
            tokenize(basic_clean(text)),
            extra_words=extra_words,
            exclude_words=exclude_words,
        )
    )
    df['stemmed'] = df['clean'].apply(stem)
    df['lemmatized'] = df['clean'].apply(lemmatize)

    return df

#### 9. Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

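493KB: lemmatized text — at this size the extra time lemmatizing takes is negligible, and the output keeps real words.

25MB: lemmatized text — still small enough to lemmatize in a reasonable amount of time.

200TB: stemmed text — stemming is faster (rule-based suffix stripping with no dictionary lookup), and when compute is billed by the megabyte the cheaper option matters more than the cleaner output.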