### ``Exercises: NLP Acquire/Web Scrapping``

    30AUGUST2022

----

In [1]:
# notebook dependencies 
import os # for caching purposeses
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# regular expression import
import re

# JSON import
import json

# importing BeautifulSoup for parsing HTML/XTML
from bs4 import BeautifulSoup

# request module for connecting to APIs
from requests import get

#### ``Exercise Number 1: Web Scrapping -- Codeup Blog Articles``

**<u>``Prompt:``</u>**

* Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. 

* For each post, you should scrape at least the post's title and content.

* Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries:

    - With each dictionary representing one article. The shape of each dictionary should look like this:

>{
    'title': 'the title of the article',\
    'content': 'the full text content of the article'
}


- Plus any additional properties you think might be helpful

In [2]:
# let's connect to the Codeup url/domain

domain = 'https://codeup.com'
endpoint = '/blog/'

# creating the url
url = domain + endpoint

# creating the response element/object (including headers)
# note: some websites don't accept the pyhon-requests default user-agent
headers = {'User-Agent': 'Codeup Data Science'} 
response = get(url, headers = headers)

print(f'url: {url}')

url: https://codeup.com/blog/


In [3]:
# checking the response object/type

type(response)

requests.models.Response

In [4]:
# let's use the BeautifulSoup module to create an HTML object

soup = BeautifulSoup(response.content, 'html.parser')
type(soup)

bs4.BeautifulSoup

**``Beautiful Soup Methods and Properties``**

* ``soup.title.string`` gets the page's title (the same text in the browser tab for a page, this is the title element

* ``soup.prettify()`` is useful to print in case you want to see the HTML

* ``soup.find_all("a")`` find all the anchor tags, or whatever argument is specified.

* ``soup.find("h1")`` finds the first matching element

* ``soup.get_text()`` gets the text from within a matching piece of soup/HTML

* The ``soup.select()`` method takes in a CSS selector as a string and returns all matching elements. super useful

In [5]:
# in looking at the Codeup blog page, i notice the article titles at the 'h2 <a href = ' attribute level
# i can use the select method to hit this attribute and return back all text tagged as such

soup.select('h2 a[href]') # checks out!

[<a href="https://codeup.com/data-science/recession-proof-career/">Is a Career in Tech Recession-Proof?</a>,
 <a href="https://codeup.com/codeup-news/codeup-x-comic-con/">Codeup X Superhero Car Show &amp; Comic Con</a>,
 <a href="https://codeup.com/featured/series-part-3-web-development/">What Jobs Can You Get After a Coding Bootcamp? Part 3: Web Development</a>,
 <a href="https://codeup.com/codeup-news/codeup-dallas-campus/">Codeup’s New Dallas Campus</a>,
 <a href="https://codeup.com/codeup-news/codeup-tv-commercial/">Codeup TV Commercial</a>,
 <a href="https://codeup.com/featured/what-jobs-can-you-get-after-a-coding-bootcamp-part-2-cloud-administration/">What Jobs Can You Get After a Coding Bootcamp? Part 2: Cloud Administration</a>]

In [6]:
# what if we just want a single title or link?

url = soup.select('h2 a[href]')[0]['href'] # this is the link, what about the title?
url

'https://codeup.com/data-science/recession-proof-career/'

In [7]:
# what if we want just the links to iterate through?

urls = soup.select('h2 a[href]')[:]

type(urls)
urls

[<a href="https://codeup.com/data-science/recession-proof-career/">Is a Career in Tech Recession-Proof?</a>,
 <a href="https://codeup.com/codeup-news/codeup-x-comic-con/">Codeup X Superhero Car Show &amp; Comic Con</a>,
 <a href="https://codeup.com/featured/series-part-3-web-development/">What Jobs Can You Get After a Coding Bootcamp? Part 3: Web Development</a>,
 <a href="https://codeup.com/codeup-news/codeup-dallas-campus/">Codeup’s New Dallas Campus</a>,
 <a href="https://codeup.com/codeup-news/codeup-tv-commercial/">Codeup TV Commercial</a>,
 <a href="https://codeup.com/featured/what-jobs-can-you-get-after-a-coding-bootcamp-part-2-cloud-administration/">What Jobs Can You Get After a Coding Bootcamp? Part 2: Cloud Administration</a>]

In [8]:
# extracting the published date

published_date = soup.find('span', class_ = "published").text.strip() # checks out!
published_date

'Aug 12, 2022'

In [9]:
# sample title extraction code

# container = []

# # create the blog url
# url = 'https://codeup.com/blog/'

# # include the headers
# headers = {'User-Agent': 'Codeup Data Science'} 

# #create the response object
# response1 = get(url, headers = headers)

# # first soup
# soup1 = BeautifulSoup(response1.content, 'html.parser')

# # hit the blog domain and retrieve article link
# link_url = soup1.select('h2 a[href]')[counter]["href"]

# response2 = get(link_url, headers = headers)

# # new soup object
# soup2 = BeautifulSoup(response2.content, 'html.parser')

# soup2.find('h1', class_ = "entry-title").text # checks out!

In [10]:
# with the links accessible, i can hit the needed articles to extract more data

container = []

# let's extract all articles in the Codeup blog post website
for num in range(len(urls)):

    # extracting the article url from Codeup blog urls
    article_url = urls[num]['href']

    # creating the response object (Article Website)
    response = get(article_url, headers = headers)

    # create the soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # extract the title
    title = soup.find('h1', class_ = "entry-title").text
    
    # extract the publish date
    published = soup.find('span', class_ = "published").text.strip()
    
    # extract article body
    contents = soup.find('div', class_ = 'entry-content').text.strip()

    # create dictionary that holds article contents
    article_dict = { 
        "article_title": title,
        "publish_date": published,
        "contents": contents
    }

    # append article dictionary to the container list
    container.append(article_dict)

articles = pd.DataFrame(container).sort_values("publish_date").reset_index(drop = True)
articles

# notes to self:
# when creating a function that pulls information at scale, ensure the headers, tags, or required labeling of information is consistent and accurate

Unnamed: 0,article_title,publish_date,contents
0,Codeup X Superhero Car Show & Comic Con,"Aug 10, 2022",Codeup had a blast at the San Antonio Superher...
1,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo..."
2,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...
3,What Jobs Can You Get After a Coding Bootcamp?...,"Jul 14, 2022",Have you been considering a career in Cloud Ad...
4,Codeup TV Commercial,"Jul 20, 2022",Codeup has officially made its TV debut! Our c...
5,Codeup’s New Dallas Campus,"Jul 25, 2022",Codeup’s Dallas campus has a new location! For...


In [11]:
# extract article/blog contents 

soup.find('div', class_ = 'entry-content').text.strip() # checks out!

'Have you been considering a career in Cloud Administration, but have no idea what your job title or potential salary could be? Continue reading below to find out!\nIn this mini-series, we will take each of our programs here at Codeup: Data Science, Web Development, and Cloud Administration, and outline respectively potential job titles, as well as entry-level salaries.*\xa0Let’s discuss Cloud Administration.\nProgram Overview\nAt Codeup, we offer a 15-week Cloud Administration program, which was derived from our previous two programs: Systems Engineering and Cyber Cloud. We combined the best of both and blended hands-on practical knowledge with skilled instructors to create the Cloud Administration program.\nUpon completing this program, you’ll have the opportunity to take on two exams for certifications: Amazon Web Services (AWS) Cloud Practitioner and AWS Solutions Architect Associate.\xa0\nPotential Jobs\nAccording to A Cloud Guru, with an AWS Certification you’ll be equipped with 

In [12]:
# creating a function to scrape all Codeup blogs

def scrape_codeup_blogs(url):

    # providing url headers for referencing/access
    headers = {'User-Agent': 'Codeup Data Science'}

    # creating the response object to access to the url
    response = get(url, headers = headers)

    # creating the soup object
    soup = BeautifulSoup(response.content, "html.parser")
    
    # selecting/extracting all urls from the blog home page
    urls = soup.select('h2 a[href]')[:]

    # container list to store needed contents/attributes
    container = []

    # let's extract all articles in the Codeup blog post website
    for num in range(len(urls)):

        # extracting the article url from Codeup blog urls
        article_url = urls[num]['href']

        # creating the response object (Article Website)
        response = get(article_url, headers = headers)

        # create the soup object
        soup = BeautifulSoup(response.content, "html.parser")

        # extract the title
        title = soup.find('h1', class_ = "entry-title").text
        
        # extract the publish date
        published = soup.find('span', class_ = "published").text.strip()
        
        # extract article body
        contents = soup.find('div', class_ = 'entry-content').text.strip()

        # create dictionary that holds article contents
        article_dict = { 
            
            "article_title": title,
            "publish_date": published,
            "contents": contents
        }

        # append article dictionary to the container list
        container.append(article_dict)

    # create an articles/blogs dataframe
    df = pd.DataFrame(container).sort_values("publish_date").reset_index(drop = True)
    
    # print the shape
    print(f'dataframe shape: {df.shape}')

    # return articles/blogs in a Pandas Dataframe
    return df

In [13]:
# trying out the function

codeup_blogs = scrape_codeup_blogs('https://codeup.com/blog/')
codeup_blogs

dataframe shape: (6, 3)


Unnamed: 0,article_title,publish_date,contents
0,Codeup X Superhero Car Show & Comic Con,"Aug 10, 2022",Codeup had a blast at the San Antonio Superher...
1,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo..."
2,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...
3,What Jobs Can You Get After a Coding Bootcamp?...,"Jul 14, 2022",Have you been considering a career in Cloud Ad...
4,Codeup TV Commercial,"Jul 20, 2022",Codeup has officially made its TV debut! Our c...
5,Codeup’s New Dallas Campus,"Jul 25, 2022",Codeup’s Dallas campus has a new location! For...


In [14]:
# creating a function to scrape all Codeup blogs

def get_blogs_dict(url):

    # providing url headers for referencing/access
    headers = {'User-Agent': 'Codeup Data Science'}

    # creating the response object to access to the url
    response = get(url, headers = headers)

    # creating the soup object
    soup = BeautifulSoup(response.content, "html.parser")
    
    # selecting/extracting all urls from the blog home page
    urls = soup.select('h2 a[href]')[:]

    # container list to store needed contents/attributes
    container = []

    # let's extract all articles in the Codeup blog post website
    for num in range(len(urls)):

        # extracting the article url from Codeup blog urls
        article_url = urls[num]['href']

        # creating the response object (Article Website)
        response = get(article_url, headers = headers)

        # create the soup object
        soup = BeautifulSoup(response.content, "html.parser")

        # extract the title
        title = soup.find('h1', class_ = "entry-title").text
        
        # extract the publish date
        published = soup.find('span', class_ = "published").text.strip()
        
        # extract article body
        contents = soup.find('div', class_ = 'entry-content').text.strip()

        # create dictionary that holds article contents
        article_dict = { 
            "article_title": title,
            "publish_date": published,
            "contents": contents
        }

        # append article dictionary to the container list
        container.append(article_dict)

    with open("filename", 'w') as f:

        json.dump(container, f)

    # return articles/blogs in a Pandas Dataframe
    return container

In [15]:
# creating a function to scrape all Codeup blogs

def return_blogs_list(url):
    
    # providing url headers for referencing/access
    headers = {'User-Agent': 'Codeup Data Science'}

    # creating the response object to access to the url
    response = get(url, headers = headers)

    # creating the soup object
    soup = BeautifulSoup(response.content, "html.parser")

    # selecting/extracting all urls from the blog home page
    urls = soup.select('h2 a[href]')[:]

    # container list to store needed contents/attributes
    container = []

    # let's extract all articles in the Codeup blog post website
    for num in range(len(urls)):

        # extracting the article url from Codeup blog urls
        article_url = urls[num]['href']

        # creating the response object (Article Website)
        response = get(article_url, headers = headers)

        # create the soup object
        soup = BeautifulSoup(response.content, "html.parser")

        # extract the title
        title = soup.find('h1', class_ = "entry-title").text

        # extract the publish date
        published = soup.find('span', class_ = "published").text.strip()

        # extract article body
        contents = soup.find('div', class_ = 'entry-content').text.strip()

        # create dictionary that holds article contents
        article_dict = { 

        "article_title": title,
        "publish_date": published,
        "contents": contents

        }

        # append article dictionary to the container list
        container.append(article_dict)

In [16]:
# testing out the Codeup web scrape function
# if successful, it should return back the same/similar df to the one previously created

codeup_blogs = scrape_codeup_blogs("https://codeup.com/blog/")
codeup_blogs # checks out!

dataframe shape: (6, 3)


Unnamed: 0,article_title,publish_date,contents
0,Codeup X Superhero Car Show & Comic Con,"Aug 10, 2022",Codeup had a blast at the San Antonio Superher...
1,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo..."
2,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...
3,What Jobs Can You Get After a Coding Bootcamp?...,"Jul 14, 2022",Have you been considering a career in Cloud Ad...
4,Codeup TV Commercial,"Jul 20, 2022",Codeup has officially made its TV debut! Our c...
5,Codeup’s New Dallas Campus,"Jul 25, 2022",Codeup’s Dallas campus has a new location! For...


----
#### ``Exercise Number 2: News Articles``

We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

* Business
* Sports
* Technology
* Entertainment


``The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:``

>{
'title': 'The article title',\
'content': 'The article content',\
'category': 'business' # for example
}


In [17]:
# let's check out the initial site

url = 'https://inshorts.com/en/read/business'

response = get(url)
type(response)

requests.models.Response

In [18]:
# what's in the object

response # successful connection

<Response [200]>

In [19]:
# creating a beautifulsoup object and exploring the site further

soup = BeautifulSoup(response.content, 'html.parser')
type(soup) # object type checks out!

bs4.BeautifulSoup

In [20]:
# what's in the read page of inshorts: looking at one title

soup.find('span', itemprop = 'headline').text

"India's GDP grows at 13.5% in first quarter of FY23, fastest in a year"

In [21]:
# ok, but can we get all the tiles? using the find_all() method

soup.find_all('span', itemprop = 'headline') # checks out

[<span itemprop="headline">India's GDP grows at 13.5% in first quarter of FY23, fastest in a year</span>,
 <span itemprop="headline">Snap to lay off 20% of staff, cancel several projects to cut costs</span>,
 <span itemprop="headline">2 top executives at Snap quit hours after report about 20% layoffs emerges</span>,
 <span itemprop="headline">Musk seeks to delay Twitter trial to Nov amid whistleblower's claims</span>,
 <span itemprop="headline">Viral video shows Amazon parcels thrown out of train at station, Railways clarifies</span>,
 <span itemprop="headline">Dell among firms conducting stay interviews to contain high attrition rates: Report</span>,
 <span itemprop="headline">World's 3rd richest person Adani's wealth surged over 13 times in 2.5 years</span>,
 <span itemprop="headline">Russia's Gazprom halts gas supply to Europe via major pipeline</span>,
 <span itemprop="headline">Japan calls for $24 bn investment to boost battery competitiveness</span>,
 <span itemprop="headline">In

In [22]:
# notes to self: use the find_all() method and iterate through the needed attributes/tags
# ensure that the total numner of titles matches the total number of authors, publish date, content, etc. 
# 25 articles on the Business page

len(soup.find_all('span', itemprop = 'headline'))

25

In [23]:
# extracting the author 
# reminder that class is a reserved python word, so must use 'class_' to specify html tag
# here! i see that there can be multiple authors on one single blog; makes sense!

len(soup.find_all('span', class_ = 'author'))

50

In [24]:
# what about content blurbs/paragraphas
# checks out! 25 articles and 25 titles

len(soup.find_all('div', itemprop = 'articleBody'))

25

In [25]:
# understanding the contents object

contents = soup.find_all('div', itemprop = 'articleBody')
range(len(contents))

range(0, 25)

In [26]:
# creating the news article function

def get_news_articles(website_url):

    # create the unique response object
    response = get(website_url)

    # create the soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # creating a list of titles/headlines to iterate throug
    titles = soup.find_all('span', itemprop = 'headline')

    dates = soup.find_all('span', class_ = 'date')

    sources = soup.find_all('a', class_ = 'source')

    authors = soup.find_all('span', class_ = 'author')

    contents = soup.find_all('div', itemprop = 'articleBody')

    # creating a container list to hold article contents in
    container = []

    # iterate through the total number of headlines on website
    for num in range(len(titles)):
        
        published = dates[num].text

        title = titles[num].text

        author = authors[num].text

        content = contents[num].text
        
        '''IF Statement to handle instances where there is not a source.
        This code can probably be written more efficiently and/or across all collected attributes.'''
        
        if num in range(len(sources)):

                source = sources[num].text

        else: 

            source = None
            
        # creating a dictionary to save the articles contents
        article_dict = { 
            
            'publish_date': published, 
            'source': source, 
            'title': title,
            'authors': author,
            'content': content
        }

        # append to container list
        container.append(article_dict)
    
    # creating a dataframe from all scrapped articles
    article_df = pd.DataFrame(container)

    # printing the dataframe shape
    print(f'dataframe shape: {article_df.shape}')

    return article_df

In [27]:
# testing out the function

inshort_busns = get_news_articles('https://inshorts.com/en/read/business')
inshort_busns.head() # where there are 5 unique authors on the inshort business site

dataframe shape: (25, 5)


Unnamed: 0,publish_date,source,title,authors,content
0,31 Aug,Twitter,India's GDP grows at 13.5% in first quarter of...,Anmol Sharma,India's GDP grew at 13.5% in the first quarter...
1,31 Aug,Reuters,"Snap to lay off 20% of staff, cancel several p...",Anmol Sharma,Snap said on Wednesday it will lay off 20% of ...
2,31 Aug,Reuters,2 top executives at Snap quit hours after repo...,Ananya Goyal,Two senior advertising executives at Snap quit...
3,31 Aug,Reuters,Musk seeks to delay Twitter trial to Nov amid ...,Ananya Goyal,Tesla CEO Elon Musk is seeking to delay the tr...
4,31 Aug,News18,Viral video shows Amazon parcels thrown out of...,Ridham Gambhir,A video from Guwahati railway station has gone...


In [28]:
# testing the function on the 'Sports' section

inshort_sports = get_news_articles('https://inshorts.com/en/read/sports')
inshort_sports["source"].unique()

dataframe shape: (25, 5)


array(['BCCI', 'ICC', 'Times Now', 'Hindustan Times', 'The Independent',
       'CricTracker', 'ANI', 'Sportskeeda'], dtype=object)

In [29]:
# testing the function on the 'Technology' section

inshort_tech = get_news_articles('https://inshorts.com/en/read/technology')
inshort_tech.head()

dataframe shape: (25, 5)


Unnamed: 0,publish_date,source,title,authors,content
0,31 Aug,Reuters,"Snap to lay off 20% of staff, cancel several p...",Ananya Goyal,Snap said on Wednesday it will lay off 20% of ...
1,30 Aug,Reuters,Musk cites whistleblower's claims in new notic...,Ananya Goyal,Tesla CEO Elon Musk's legal team has filed ano...
2,30 Aug,Reuters,American man sues Tesla over car suddenly stop...,Ridham Gambhir,"Jose Alvarez Toledo, a Tesla Model 3 owner fro..."
3,31 Aug,Reuters,2 top executives at Snap quit hours after repo...,Ridham Gambhir,Two senior advertising executives at Snap quit...
4,31 Aug,Reuters,Facebook's Gaming app to be shut down in Octob...,Anmol Sharma,Facebook’s Gaming app for iOS and Android is s...


In [30]:
# testing the function on the 'Entertainment' section

inshort_ent = get_news_articles('https://inshorts.com/en/read/entertainment')
inshort_ent.head()

dataframe shape: (25, 5)


Unnamed: 0,publish_date,source,title,authors,content
0,31 Aug,Instagram,"Shah Rukh Khan celebrates Ganesh Chaturthi, sh...",Daisy Mowke,Actor Shah Rukh Khan took to social media to s...
1,31 Aug,Hindustan Times,Jacqueline knew about Sukesh's criminal past &...,Daisy Mowke,The ED's chargesheet filed against Jacqueline ...
2,31 Aug,ANI,Jacqueline Fernandez summoned by Delhi court i...,Daisy Mowke,Delhi's Patiala House Court has summoned actre...
3,31 Aug,The Associated Press,Aamir Khan to not charge fee for Laal Singh Ch...,Daisy Mowke,Aamir Khan's 'Laal Singh Chaddha' has not been...
4,31 Aug,Bollywood Hungama,S Korea may hold survey on BTS members' mandat...,Apaar Sharma,South Korea is considering a survey to determi...


----

#### ``Exercise Number 3: Caching the Data``

**<u>Notes:</u>**

* Write your code such that the acquired data is saved locally in some form or fashion. Your functions that retrieve the data should prefer to read the local data instead of having to make all the requests everytime the function is called. 
* Include a boolean flag in the functions to allow the data to be acquired "fresh" from the actual sources (re-writing your local cache)

In [31]:
# let's first cache the Codeup and Inshorts article dataframes

codeup_blogs.to_csv("/Users/mijailmariano/codeup-data-science/natural-language-processing-exercises/codeup_blogs.json", index = False)

In [32]:
# creating a function to first: check if the Codeup Blogs dataset exists, if not: scrape the web for it

def get_codeup_blogs():

    # creating the operating system filename for referencing
    filename = "codeup_blogs.csv"
    
    # check to see if the file path exists
    if os.path.isfile(filename):
        
        # if found, read the csv as a Pandas Dataframe
        df = pd.read_csv(filename)

        # let's print the shape
        print(f'df shape: {df.shape}')

        # return the blogs dataset
        return df
    
    # if not cached, then retrieve the data from Codeup's blog site
    else:

        # set the Codeup Blogs url
        url = "https://codeup.com/blog/"

        # providing url headers for referencing/web access
        headers = {'User-Agent': 'Codeup Data Science'}

        # creating the response object to access to the url
        response = get(url, headers = headers)

        # creating the Codeup Blogs soup object
        soup = BeautifulSoup(response.content, "html.parser")
        
        # selecting/extracting all urls from the blog home page
        urls = soup.select('h2 a[href]')[:]

        # container list to store needed contents/attributes
        container = []

        # let's extract all articles in the Codeup blog post website
        for num in range(len(urls)):

            # extracting the article url from Codeup blog urls
            article_url = urls[num]['href']

            # creating the response object (Article Website)
            response = get(article_url, headers = headers)

            # create the soup object
            soup = BeautifulSoup(response.content, "html.parser")

            # extract the title
            title = soup.find('h1', class_ = "entry-title").text
            
            # extract the publish date
            published = soup.find('span', class_ = "published").text.strip()
            
            # extract article body
            contents = soup.find('div', class_ = 'entry-content').text.strip()

            # create dictionary that holds article contents
            article_dict = { 
                "article_title": title,
                "publish_date": published,
                "contents": contents
            }

            # append article dictionary to the container list
            container.append(article_dict)

        # create an articles/blogs dataframe
        df = pd.DataFrame(container).sort_values("publish_date").reset_index(drop = True)
        
        # creating a .csv file in local directory for future referencing
        df.to_csv("codeup_blogs.csv", index = False)

        # print the shape
        print(f'dataframe shape: {df.shape}')

        # return articles/blogs in a Pandas Dataframe
        return df

In [33]:
# let's test the get codeup blogs function
# can add to acquire file

df = get_codeup_blogs()
df.head() # checks out!

df shape: (22, 3)


Unnamed: 0,article_title,publish_date,contents
0,Codeup X Superhero Car Show & Comic Con,"Aug 10, 2022",Codeup had a blast at the San Antonio Superher...
1,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo..."
2,Is a Career in Tech Recession-Proof?,"Aug 12, 2022","Given the current economic climate, many econo..."
3,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...
4,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...


In [34]:
# let's try extracting the genre name from the url with regex

re.findall(r'\w+\/?$', 'https://inshorts.com/en/read/entertainment')[0] # checks out!

'entertainment'

In [35]:
# does it find the regex as a variable?

url = 'https://inshorts.com/en/read/entertainment'

re.findall(r'\w+\/?$', 'https://inshorts.com/en/read/entertainment')[0] # checks out!

'entertainment'

In [36]:
# creating the news article function

def get_news_articles(website_url):

    # create the unique response object
    response = get(website_url)

    # creating a topic/genre object
    genre = re.findall(r'\w+\/?$', website_url)[0]

    # create the soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # creating a list of titles/headlines to iterate throug
    titles = soup.find_all('span', itemprop = 'headline')

    dates = soup.find_all('span', class_ = 'date')

    sources = soup.find_all('a', class_ = 'source')

    authors = soup.find_all('span', class_ = 'author')

    contents = soup.find_all('div', itemprop = 'articleBody')

    # creating a container list to hold article contents in
    container = []

    # iterate through the total number of headlines on website
    for num in range(len(titles)):
        
        published = dates[num].text

        title = titles[num].text

        author = authors[num].text

        content = contents[num].text
        
        '''IF Statement to handle instances where there is not a source.
        This code can probably be written more efficiently and/or across all collected attributes.'''
        
        if num in range(len(sources)):

                source = sources[num].text

        else: 

            source = None
            
        # creating a dictionary to save the articles contents
        article_dict = { 
            
            'genre': genre,
            'publish_date': published, 
            'source': source, 
            'title': title,
            'authors': author,
            'content': content
        }

        # append to container list
        container.append(article_dict)
    
    # creating a dataframe from all scrapped articles
    article_df = pd.DataFrame(container)

    # printing the dataframe shape
    print(f'dataframe shape: {article_df.shape}')

    return article_df

In [None]:
# creating the news article function

def get_news_articles(website_url):

    # create the unique response object
    response = get(website_url)

    # creating a topic/genre object
    genre = re.findall(r'\w+\/?$', website_url)[0]

    # create the soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # creating a list of titles/headlines to iterate throug
    titles = soup.find_all('span', itemprop = 'headline')

    dates = soup.find_all('span', class_ = 'date')

    sources = soup.find_all('a', class_ = 'source')

    authors = soup.find_all('span', class_ = 'author')

    contents = soup.find_all('div', itemprop = 'articleBody')

    # creating a container list to hold article contents in
    container = []

    # iterate through the total number of headlines on website
    for num in range(len(titles)):
        
        published = dates[num].text

        title = titles[num].text

        author = authors[num].text

        content = contents[num].text
        
        '''IF Statement to handle instances where there is not a source.
        This code can probably be written more efficiently and/or across all collected attributes.'''
        
        if num in range(len(sources)):

                source = sources[num].text

        else: 

            source = None
            
        # creating a dictionary to save the articles contents
        article_dict = { 
            
            'genre': genre,
            'publish_date': published, 
            'source': source, 
            'title': title,
            'authors': author,
            'content': content
        }

        # append to container list
        container.append(article_dict)
    
    # creating a dataframe from all scrapped articles
    article_df = pd.DataFrame(container)

    # printing the dataframe shape
    print(f'dataframe shape: {article_df.shape}')

    return article_df

In [37]:
# let's test this function

inshort_ent = get_news_articles('https://inshorts.com/en/read/entertainment')
inshort_ent.head() # checks out!

dataframe shape: (25, 6)


Unnamed: 0,genre,publish_date,source,title,authors,content
0,entertainment,31 Aug,Instagram,"Shah Rukh Khan celebrates Ganesh Chaturthi, sh...",Daisy Mowke,Actor Shah Rukh Khan took to social media to s...
1,entertainment,31 Aug,Hindustan Times,Jacqueline knew about Sukesh's criminal past &...,Daisy Mowke,The ED's chargesheet filed against Jacqueline ...
2,entertainment,31 Aug,ANI,Jacqueline Fernandez summoned by Delhi court i...,Daisy Mowke,Delhi's Patiala House Court has summoned actre...
3,entertainment,31 Aug,The Associated Press,Aamir Khan to not charge fee for Laal Singh Ch...,Daisy Mowke,Aamir Khan's 'Laal Singh Chaddha' has not been...
4,entertainment,31 Aug,Bollywood Hungama,S Korea may hold survey on BTS members' mandat...,Apaar Sharma,South Korea is considering a survey to determi...


In [38]:
# trying another website

inshort_sports = get_news_articles('https://inshorts.com/en/read/sports')
inshort_sports.head() # checks out!

dataframe shape: (25, 6)


Unnamed: 0,genre,publish_date,source,title,authors,content
0,sports,31 Aug,BCCI,India beat Hong Kong to reach Asia Cup Super 4...,Anmol Sharma,India beat Hong Kong by 40 runs to qualify for...
1,sports,31 Aug,ICC,Suryakumar Yadav smashes most sixes ever by an...,Anmol Sharma,Suryakumar Yadav on Wednesday broke the record...
2,sports,31 Aug,ICC,Hardik Pandya achieves his highest-ever spot i...,Anmol Sharma,Following his match-winning performance agains...
3,sports,31 Aug,ICC,India announce their playing XI for match agai...,Anmol Sharma,Hong Kong captain Nizakat Khan won the toss an...
4,sports,31 Aug,Times Now,Indian fan receives death threats for wearing ...,Anmol Sharma,"Indian fan, who was seen wearing a Pakistan je..."


In [39]:
# entertainment

inshort_bus = get_news_articles('https://inshorts.com/en/read/business')
inshort_bus.head() # checks out!

dataframe shape: (25, 6)


Unnamed: 0,genre,publish_date,source,title,authors,content
0,business,31 Aug,Twitter,India's GDP grows at 13.5% in first quarter of...,Anmol Sharma,India's GDP grew at 13.5% in the first quarter...
1,business,31 Aug,Reuters,"Snap to lay off 20% of staff, cancel several p...",Anmol Sharma,Snap said on Wednesday it will lay off 20% of ...
2,business,31 Aug,Reuters,2 top executives at Snap quit hours after repo...,Ananya Goyal,Two senior advertising executives at Snap quit...
3,business,31 Aug,Reuters,Musk seeks to delay Twitter trial to Nov amid ...,Ananya Goyal,Tesla CEO Elon Musk is seeking to delay the tr...
4,business,31 Aug,News18,Viral video shows Amazon parcels thrown out of...,Ridham Gambhir,A video from Guwahati railway station has gone...


In [40]:
# technology

inshort_tech = get_news_articles('https://inshorts.com/en/read/technology')
inshort_tech.head() # checks out!

dataframe shape: (25, 6)


Unnamed: 0,genre,publish_date,source,title,authors,content
0,technology,31 Aug,Reuters,"Snap to lay off 20% of staff, cancel several p...",Ananya Goyal,Snap said on Wednesday it will lay off 20% of ...
1,technology,30 Aug,Reuters,Musk cites whistleblower's claims in new notic...,Ananya Goyal,Tesla CEO Elon Musk's legal team has filed ano...
2,technology,30 Aug,Reuters,American man sues Tesla over car suddenly stop...,Ridham Gambhir,"Jose Alvarez Toledo, a Tesla Model 3 owner fro..."
3,technology,31 Aug,Reuters,2 top executives at Snap quit hours after repo...,Ridham Gambhir,Two senior advertising executives at Snap quit...
4,technology,31 Aug,Reuters,Facebook's Gaming app to be shut down in Octob...,Anmol Sharma,Facebook’s Gaming app for iOS and Android is s...


In [54]:
# let's now work on the inshort function
# where i want to capture all required genre datasets

frames = [inshort_bus, inshort_tech, inshort_ent, inshort_sports]

inshort_articles = pd.concat(frames, axis = 0).reset_index(drop = True)
inshort_articles

Unnamed: 0,genre,publish_date,source,title,authors,content
0,business,31 Aug,Twitter,India's GDP grows at 13.5% in first quarter of...,Anmol Sharma,India's GDP grew at 13.5% in the first quarter...
1,business,31 Aug,Reuters,"Snap to lay off 20% of staff, cancel several p...",Anmol Sharma,Snap said on Wednesday it will lay off 20% of ...
2,business,31 Aug,Reuters,2 top executives at Snap quit hours after repo...,Ananya Goyal,Two senior advertising executives at Snap quit...
3,business,31 Aug,Reuters,Musk seeks to delay Twitter trial to Nov amid ...,Ananya Goyal,Tesla CEO Elon Musk is seeking to delay the tr...
4,business,31 Aug,News18,Viral video shows Amazon parcels thrown out of...,Ridham Gambhir,A video from Guwahati railway station has gone...
...,...,...,...,...,...,...
95,sports,31 Aug,Sportskeeda,India faced a 'lot of difficulty' when they la...,Anmol Sharma,Ex-Team India opener Wasim Jaffer said Team In...
96,sports,31 Aug,Times Now,Pant can get into this side if Rahul doesn't f...,Anmol Sharma,Former India cricketer Saba Karim has said wit...
97,sports,31 Aug,Hindustan Times,I can't believe his place is under threat: Sty...,Anmol Sharma,During a discussion ahead of India's match aga...
98,sports,31 Aug,CricTracker,"Team India can tackle our bowlers, Pak can't: ...",Anmol Sharma,Former Afghanistan captain Asghar Afghan said ...


In [55]:
# creating an inshort_articles

inshort_articles.to_json("inshort_articles.json") # checks out!

----

``JSON Cache Functions:``

In [63]:
# creating a function to first: check if the Codeup Blogs dataset exists, if not: scrape the web for it

def get_codeup_blogs():

    # creating the operating system filename for referencing
    filename = "codeup_blogs.json"
    
    # check to see if the file path exists
    if os.path.isfile(filename):
        
        # if found, read the csv as a Pandas JSON
        codeup_blogs = pd.read_json(filename)

        # return the blogs data
        return codeup_blogs
    
    # if not cached, then retrieve the data from Codeup's blog site
    else:

        # set the Codeup Blogs url
        url = "https://codeup.com/blog/"

        # providing url headers for referencing/web access
        headers = {'User-Agent': 'Codeup Data Science'}

        # creating the response object to access to the url
        response = get(url, headers = headers)

        # creating the Codeup Blogs soup object
        soup = BeautifulSoup(response.content, "html.parser")
        
        # selecting/extracting all urls from the blog home page
        urls = soup.select('h2 a[href]')[:]

        # container list to store needed contents/attributes
        container = []

        # let's extract all articles in the Codeup blog post website
        for num in range(len(urls)):

            # extracting the article url from Codeup blog urls
            article_url = urls[num]['href']

            # creating the response object (Article Website)
            response = get(article_url, headers = headers)

            # create the soup object
            soup = BeautifulSoup(response.content, "html.parser")

            # extract the title
            title = soup.find('h1', class_ = "entry-title").text
            
            # extract the publish date
            published = soup.find('span', class_ = "published").text.strip()
            
            # extract article body
            contents = soup.find('div', class_ = 'entry-content').text.strip()

            # create dictionary that holds article contents
            article_dict = { 
                
                "article_title": title,
                "publish_date": published,
                "contents": contents
            }

            # append article dictionary to the container list
            container.append(article_dict)

        # create an articles/blogs dataframe
        df = pd.DataFrame(container).sort_values("publish_date").reset_index(drop = True)
        
        # creating a .json file in local directory for future referencing
        codeup_blogs = df.to_json("codeup_blogs.json")

        # return articles/blogs
        return codeup_blogs

In [64]:

codeup_blogs = get_codeup_blogs()
codeup_blogs