In [None]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
# Fetch one Codeup blog post; the response is reused by the cells below.
url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
headers = {'User-Agent': 'Codeup Data Science'} # Some websites don't accept the python-requests default user-agent
response = get(url, headers=headers)

In [None]:
print(response.text[:400])

In [None]:
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
article = soup.find('div', class_='jupiterx-post-content')
article.text

In [None]:
# Save the scraped text to 'article.txt'; get_article_text reads this same
# file as its local cache when it exists.
with open('article.txt', 'w') as f:
    f.write(article.text)

In [None]:
def get_article_text(url):
    '''
    Return the text of the blog article at `url`, using a local file cache.

    If 'article.txt' exists in the working directory, its contents are
    returned and no request is made. NOTE: the cache file name is fixed, so a
    cached article is returned regardless of which `url` is passed in.

    Otherwise the page is requested, the text of the
    <div class="jupiterx-post-content"> element is written to 'article.txt',
    and that text is returned.

    Raises
    ------
    ValueError
        If the fetched page has no 'jupiterx-post-content' div. Nothing is
        written to the cache file in that case.
    '''
    cache_file = 'article.txt'

    # if we already have the data, read it locally
    if os.path.exists(cache_file):
        with open(cache_file) as f:
            return f.read()

    # otherwise go fetch the data
    headers = {'User-Agent': 'Codeup Data Science'}
    response = get(url, headers=headers)
    # Name the parser explicitly, as the other cells do, so the result does
    # not depend on which parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('div', class_='jupiterx-post-content')
    if article is None:
        raise ValueError(f"no 'jupiterx-post-content' div found at {url}")

    # Extract the text before opening the file so a failure here cannot leave
    # behind an empty cache file that later calls would return.
    text = article.text

    # save it for next time
    with open(cache_file, 'w') as f:
        f.write(text)

    return text

# Exercises

### Codeup Blog Articles

Scrape the article text from the following pages:
- https://codeup.com/codeups-data-science-career-accelerator-is-here/
- https://codeup.com/data-science-myths/
- https://codeup.com/data-science-vs-data-analytics-whats-the-difference/
- https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/
- https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:

`{
    'title': 'the title of the article',
    'content': 'the full text content of the article'
}`

Plus any additional properties you think might be helpful.

##### Bonus:

- Scrape the text of all the articles linked on codeup's blog page.

In [None]:
# The five Codeup blog posts named in the exercise, one per line.

urls = [
    'https://codeup.com/codeups-data-science-career-accelerator-is-here/',
    'https://codeup.com/data-science-myths/',
    'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
    'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
    'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/',
]

In [None]:
def make_soup(url):
    '''
    Request `url` and return the page parsed as a BeautifulSoup object.

    The request is sent with a custom User-Agent header, and the response
    text is parsed with the 'html.parser' parser.
    '''
    page = get(url, headers={'User-Agent': 'Codeup Data Science'})
    return BeautifulSoup(page.text, 'html.parser')

In [None]:
def get_all_urls():
    '''
    Return the unique article urls linked from the Codeup resources page.

    The page is fetched with make_soup, and the href of every
    <a class="jet-listing-dynamic-link__link"> element is collected.

    Returns
    -------
    list of str
        De-duplicated hrefs in the order they first appear on the page.
        Anchors without an href are skipped.
    '''
    url = 'https://codeup.com/resources/#blog'

    soup = make_soup(url)

    links = soup.find_all('a', class_='jet-listing-dynamic-link__link')

    # link.get('href') is None for anchors lacking the attribute; drop those.
    hrefs = (link.get('href') for link in links)

    # dict.fromkeys de-duplicates while keeping first-seen order, so repeated
    # runs return the urls in the same order (a set does not guarantee that).
    return list(dict.fromkeys(href for href in hrefs if href))

In [None]:
# Faith's way:

def get_blog_articles(urls, cached=False):
    '''
    Scrape the title and body text of each Codeup blog url and return them
    as a DataFrame with one row per article and columns 'title' and 'content'.

    Parameters
    ----------
    urls : iterable of str
        Blog post urls to scrape. Not used when `cached` is truthy.
    cached : bool, default False
        If truthy, skip scraping and return the DataFrame read from
        'big_blogs.json'. If falsy, scrape every url and overwrite
        'big_blogs.json' with the fresh result.

    Returns
    -------
    pandas.DataFrame
        Columns 'title' and 'content'. The columns are present even when
        `urls` is empty.
    '''
    if cached:
        return pd.read_json('big_blogs.json')

    # cached is falsy: complete a fresh scrape
    articles = []
    for url in urls:
        # Make request and soup object using helper
        soup = make_soup(url)

        # The first <h1> holds the blog title; the post body lives in the
        # 'jupiterx-post-content' div.
        title = soup.find('h1').text
        content = soup.find('div', class_="jupiterx-post-content").text

        articles.append({'title': title, 'content': content})

    # Name the columns explicitly so an empty scrape still yields a frame
    # with the expected schema instead of one with no columns.
    df = pd.DataFrame(articles, columns=['title', 'content'])

    # Write df to a json file for faster access
    df.to_json('big_blogs.json')

    return df

In [None]:
# This was my original get articles, but it's not as robust as the primary way that Faith showed me.

urls = [
    'https://codeup.com/codeups-data-science-career-accelerator-is-here/',
    'https://codeup.com/data-science-myths/',
    'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
    'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
    'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/',
]
def get_blog_articles_luke(url_list, cached = False):
    '''
    Scrape each url in `url_list` and return a list of article dictionaries.

    For every url the page <title> string and the text of the
    'jupiterx-post-content' div are collected as
    {'title': ..., 'content': ...}.

    Side effects: the collected articles are written to 'big_blogs.json'
    (via a DataFrame), and the text of the last scraped article is written
    to 'article.txt'. When `url_list` is empty, 'article.txt' is left alone.

    Parameters
    ----------
    url_list : iterable of str
        Blog post urls to scrape.
    cached : bool, default False
        Accepted for signature compatibility; currently not used.

    Returns
    -------
    list of dict
    '''
    # Some websites don't accept the python-requests default user-agent
    headers = {'User-Agent': 'Codeup Data Science'}

    final = []
    article = None  # stays None when url_list is empty
    for url in url_list:
        response = get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        article = soup.find('div', class_='jupiterx-post-content')
        final.append({
            'title': soup.title.string,
            'content': article.text
        })

    df = pd.DataFrame(final)
    df.to_json('big_blogs.json')

    # save the last article for next time; skip when nothing was scraped so
    # we neither crash on an undefined article nor truncate an existing file
    if article is not None:
        with open('article.txt', 'w') as f:
            f.write(article.text)
    return final

# Big thanks to Matt for his help getting me pointed in the right direction on this!

#### Big thanks to Matt for his help getting me pointed in the right direction on this!

In [None]:
# get_blog_articles returns a pandas DataFrame (despite the variable name),
# so .head() previews its first rows.
article_list = get_blog_articles(urls)
article_list.head()

#### 1st webpage

In [None]:
# Doing it manually, no dictionary... these are the steps to create the function and understand it.

url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
headers = {'User-Agent': 'Codeup Data Science'}
response = get(url, headers = headers)


In [None]:
response.content

In [None]:
soup = BeautifulSoup(response.content, 'html.parser')
soup

In [None]:
soup.get_text()

In [None]:
soup.title.string

In [None]:
# Returns the same thing, review this SO article: https://stackoverflow.com/questions/35496332/differences-between-text-and-get-text

### News Articles

We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

- Business
- Sports
- Technology
- Entertainment

The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:

`{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business' # for example
}`

In [None]:
# Start simple; function that handles a single article and returns the dictionary I need.

In [None]:
# Request the inshorts landing page; parsing is done in the following cells.
url = 'https://inshorts.com/en/read'
headers = {'User-Agent': 'Codeup Data Science'}
response = get(url, headers=headers)

In [None]:
# Name the parser explicitly (as the Codeup cells do) so the parse result
# does not depend on which parsers are installed and no parser-guess warning
# is emitted.
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
soup.get_text()

In [None]:
# Exploratory lookup by CSS class.
# NOTE(review): a later cell finds the article body via itemprop='articleBody'
# instead, so 'articleBody' may not be a class on this page — confirm what
# this returns.
article = soup.find('div', class_='articleBody')
article

In [None]:
soup.title.string

In [None]:
url = 'https://inshorts.com/en/read'
headers = {'User-Agent': 'Codeup Data Science'}
response = get(url, headers=headers)

In [None]:
response.text

In [None]:
# Name the parser explicitly so the parse result does not depend on which
# parsers are installed and no parser-guess warning is emitted.
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
article_content = soup.find('div', itemprop='articleBody').text

In [None]:
article_content

In [None]:
soup.find('span', itemprop='headline').text

In [None]:
soup_test = BeautifulSoup(response.content, 'html.parser')

In [None]:
soup_test.title.string

In [None]:
soup.prettify()

In [None]:
# Inshorts category pages to scrape, one url per news topic.

source_urls = ['https://inshorts.com/en/read/' + topic
               for topic in ('technology', 'sports', 'business', 'entertainment')]

def build_news_dataset(source_urls):
    '''
    Scrape inshorts category pages into a DataFrame of news articles.

    Parameters
    ----------
    source_urls : iterable of str
        Category page urls. The last '/'-separated segment of each url is
        used as the article category (e.g. '.../read/sports' -> 'sports').

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'content' and 'category', one row per article.
        The columns are present even when nothing was scraped.
    '''
    columns = ['title', 'content', 'category']
    # Send the same User-Agent as the other requests in this notebook.
    headers = {'User-Agent': 'Codeup Data Science'}

    news_data = []
    for url in source_urls:
        news_category = url.split('/')[-1]
        data = get(url, headers=headers)
        soup = BeautifulSoup(data.content, 'html.parser')

        # Title cards and content cards are collected separately and paired
        # positionally.
        # NOTE(review): this assumes both lists are in the same order and of
        # equal length; zip silently stops at the shorter one — confirm.
        title_cards = soup.find_all('div',
                                    class_=["news-card-title news-right-box"])
        content_cards = soup.find_all('div',
                                      class_=["news-card-content news-right-box"])

        for title_card, content_card in zip(title_cards, content_cards):
            headline = title_card.find('span', attrs={"itemprop": "headline"})
            body = content_card.find('div', attrs={"itemprop": "articleBody"})
            # get_text() returns the full text even when the tag contains
            # nested markup, where .string would be None.
            news_data.append({'title': headline.get_text(),
                              'content': body.get_text(),
                              'category': news_category})

    # Passing columns fixes the column order and keeps the schema when
    # news_data is empty (selecting the columns afterwards would raise KeyError).
    return pd.DataFrame(news_data, columns=columns)

In [None]:
news_df = build_news_dataset(source_urls)
news_df.head(10)

In [None]:
# Complete. I'm going to look at the shape of my text:

news_df.shape

In [None]:
news_df.category.value_counts()