# Acquire (Web Scraping)

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [8]:
url = 'https://www.example.com'

In [6]:
import requests

In [9]:
response = requests.get(url)

In [10]:
response

<Response [200]>

In [11]:
response.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [12]:
soup = BeautifulSoup(response.content, 'html.parser')

In [13]:
soup

<!DOCTYPE html>

<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative example

In [14]:
soup.h1

<h1>Example Domain</h1>

In [15]:
soup.h1.text

'Example Domain'

In [16]:
soup.p

<p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>

In [17]:
soup.find('p')

<p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>

In [19]:
list_of_p_elements = soup.find_all('p')

In [20]:
list_of_p_elements

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

In [21]:
for element in list_of_p_elements:
    print(element.a)

None
<a href="https://www.iana.org/domains/example">More information...</a>


### How to see permissions
Append `/robots.txt` to the end of a site's base URL (for example `https://www.example.com/robots.txt`) to see its permissions: which parts of the site you are allowed to scrape and which you are not.

In [22]:
import time

In [23]:
print('Hello')
time.sleep(5)
print('Yvette!')

Hello
Yvette!


## NY Times Website

In [30]:
response = requests.get('https://www.nytimes.com')

In [31]:
response

<Response [200]>

In [32]:
soup = BeautifulSoup(response.content, 'html.parser')

In [33]:
soup

<!DOCTYPE html>

<html class="nytapp-vi-homepage" lang="en" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<meta charset="utf-8"/>
<title data-rh="true">The New York Times - Breaking News, US News, World News and Videos</title>
<meta content="Live news, investigations, opinion, photos and video by the journalists of The New York Times from more than 150 countries around the world. Subscribe for coverage of U.S. and international news, politics, business, technology, science, health, arts, sports and more." data-rh="true" name="description"/><meta content="https://www.nytimes.com" data-rh="true" property="og:url"/><meta content="website" data-rh="true" property="og:type"/><meta content="The New York Times - Breaking News, US News, World News and Videos" data-rh="true" property="og:title"/><meta content="Live news, investigations, opinion, photos and video by the journalists of The New York Times from more than 150 countries around the world. Subscribe for coverage of U.S. and i

In [34]:
soup.find_all('h1')

[<h1 class="story-wrapper indicate-hover css-18cyl96">McCarthy Loses 12th Vote for Speaker but Gains Supporters</h1>]

In [37]:
soup.find_all('h3')

[<h3 class="indicate-hover css-vnb41v">House Speaker Vote</h3>,
 <h3 class="indicate-hover css-vnb41v">Russia-Ukraine War</h3>,
 <h3 class="indicate-hover css-vnb41v">U.S. Jobs Report</h3>,
 <h3 class="indicate-hover css-1ekzmg">House Republicans Try Again After 14 Holdouts Flip</h3>,
 <h3 class="indicate-hover css-66vf3i">Analysis: Kevin McCarthy is letting the far right disrupt the House — and hold him hostage to their demands.</h3>,
 <h3 class="indicate-hover css-66vf3i">We’re tracking every lawmaker’s vote here.</h3>,
 <h3 class="css-le4k3i">Advertisement</h3>,
 <h3 class="indicate-hover css-on97le">U.S. Job Growth Slows but Remains Solid</h3>,
 <h3 class="indicate-hover css-66vf3i">The hospitality, health care and construction sectors generated job growth in December.</h3>,
 <h3 class="indicate-hover css-66vf3i">Stocks rose as traders bet that a slowdown in job growth could reduce pressure on prices.</h3>,
 <h3 class="indicate-hover css-on97le">Damar Hamlin Has Breathing Tube Remo

In [38]:
soup.find_all('a')

[<a class="css-1f8er69" href="#site-content">Skip to content</a>,
 <a class="css-1f8er69" href="#site-index">Skip to site index</a>,
 <a aria-label="New York Times Logo. Click to visit the homepage" class="css-nhjhh0 ell52qj1" data-testid="masthead-mobile-logo" href="/"><svg fill="#000" viewbox="0 0 184 25"><path d="M13.8 2.9c0-2-1.9-2.5-3.4-2.5v.3c.9 0 1.6.3 1.6 1 0 .4-.3 1-1.2 1-.7 0-2.2-.4-3.3-.8C6.2 1.4 5 1 4 1 2 1 .6 2.5.6 4.2c0 1.5 1.1 2 1.5 2.2l.1-.2c-.2-.2-.5-.4-.5-1 0-.4.4-1.1 1.4-1.1.9 0 2.1.4 3.7.9 1.4.4 2.9.7 3.7.8v3.1L9 10.2v.1l1.5 1.3v4.3c-.8.5-1.7.6-2.5.6-1.5 0-2.8-.4-3.9-1.6l4.1-2V6l-5 2.2C3.6 6.9 4.7 6 5.8 5.4l-.1-.3c-3 .8-5.7 3.6-5.7 7 0 4 3.3 7 7 7 4 0 6.6-3.2 6.6-6.5h-.2c-.6 1.3-1.5 2.5-2.6 3.1v-4.1l1.6-1.3v-.1l-1.6-1.3V5.8c1.5 0 3-1 3-2.9zm-8.7 11l-1.2.6c-.7-.9-1.1-2.1-1.1-3.8 0-.7 0-1.5.2-2.1l2.1-.9v6.2zm10.6 2.3l-1.3 1 .2.2.6-.5 2.2 2 3-2-.1-.2-.8.5-1-1V9.4l.8-.6 1.7 1.4v6.1c0 3.8-.8 4.4-2.5 5v.3c2.8.1 5.4-.8 5.4-5.7V9.3l.9-.7-.2-.2-.8.6-2.5-2.1L18.5 9V.8h-.2l-3.

In [41]:
article_links = soup.find_all('a', class_='css-9mylee')

In [42]:
article_links[0]

<a aria-hidden="false" class="css-9mylee" data-uri="nyt://legacycollection/53caa53a-ae2b-54a8-b7a3-32e9eab577ae" href="https://www.nytimes.com/live/2023/01/06/us/house-speaker-vote-mccarthy"><h3 class="indicate-hover css-vnb41v">House Speaker Vote</h3><div class="css-1xlo3nw"><time aria-hidden="true" class="css-12i6afj" datetime="2023-01-06T19:51:25.992Z"><div class="css-ki347z"><span class="css-1stvlmo" data-time="abs">Jan. 6, 2023, 2:51 p.m. ET</span><span class="css-kpxlkr" data-time="rel"></span></div></time><span aria-live="polite" class="css-1dv1kvn"></span></div></a>

In [44]:
article_links[0]

<a aria-hidden="false" class="css-9mylee" data-uri="nyt://legacycollection/53caa53a-ae2b-54a8-b7a3-32e9eab577ae" href="https://www.nytimes.com/live/2023/01/06/us/house-speaker-vote-mccarthy"><h3 class="indicate-hover css-vnb41v">House Speaker Vote</h3><div class="css-1xlo3nw"><time aria-hidden="true" class="css-12i6afj" datetime="2023-01-06T19:51:25.992Z"><div class="css-ki347z"><span class="css-1stvlmo" data-time="abs">Jan. 6, 2023, 2:51 p.m. ET</span><span class="css-kpxlkr" data-time="rel"></span></div></time><span aria-live="polite" class="css-1dv1kvn"></span></div></a>

In [45]:
article_links[0]['href']

'https://www.nytimes.com/live/2023/01/06/us/house-speaker-vote-mccarthy'

In [47]:
article_links[0].text

'House Speaker VoteJan. 6, 2023, 2:51 p.m. ET'

In [48]:
for element in article_links:
    print(element['href'])

https://www.nytimes.com/live/2023/01/06/us/house-speaker-vote-mccarthy
https://www.nytimes.com/live/2023/01/06/world/russia-ukraine-news
https://www.nytimes.com/live/2023/01/06/business/jobs-report-december-economy
https://www.nytimes.com/live/2023/01/06/us/house-speaker-vote-mccarthy
https://www.nytimes.com/live/2023/01/06/us/house-speaker-vote-mccarthy
https://www.nytimes.com/2023/01/06/us/politics/mccarthy-house-speaker-republicans.html
https://www.nytimes.com/interactive/2023/01/04/us/politics/house-speaker-vote-tally.html
https://www.nytimes.com/live/2023/01/06/business/jobs-report-december-economy
https://www.nytimes.com/live/2023/01/06/business/jobs-report-december-economy/jobs-december-sectors
https://www.nytimes.com/2023/01/06/business/markets-stocks-jobs-inflation.html
https://www.nytimes.com/2023/01/06/sports/football/damar-hamlin-bills-update.html
https://www.nytimes.com/2023/01/05/sports/football/damar-hamlin-response-audio-recording.html
https://theathletic.com/4062586/20

### Exercises 

##### 1. Codeup Blog Articles

Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named `get_blog_articles` that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:
```python
{
    'title': 'the title of the article',
    'content': 'the full text content of the article'
}
```

In [50]:
url = 'https://codeup.com/blog/'
headers = {'User-Agent': 'Codeup Data Science'}
response = requests.get(url, headers=headers)

In [51]:
response

<Response [200]>

In [52]:
soup = BeautifulSoup(response.content, 'html.parser')

In [61]:
h2 = soup.find_all('h2')

In [66]:
h2

[<h2 class="entry-title"><a href="https://codeup.com/data-science/become-a-data-scientist/">Become a Data Scientist in 6 Months!</a></h2>,
 <h2 class="entry-title"><a href="https://codeup.com/employers/hiring-tech-talent/">Hiring Tech Talent Around the Holidays</a></h2>,
 <h2 class="entry-title"><a href="https://codeup.com/cloud-administration/cap-funding-options/">Cloud Administration Program New Funding Options</a></h2>,
 <h2 class="entry-title"><a href="https://codeup.com/dallas-info/it-professionals-dallas/">Why Dallas is a Great Location for IT Professionals</a></h2>,
 <h2 class="entry-title"><a href="https://codeup.com/codeup-news/codeup-voted-1-technical-school-in-dfw/">Codeup is ranked #1 Best in DFW 2022</a></h2>,
 <h2 class="entry-title"><a href="https://codeup.com/tips-for-prospective-students/financing/codeups-scholarships/">Codeup’s Scholarship Offerings</a></h2>,
 <h2 style="text-align: center;">Git Codeupdates</h2>]

In [143]:
h2[0].a.text

'Become a Data Scientist in 6 Months!'

In [126]:
s = BeautifulSoup(requests.get(h2[0].a['href'], headers=headers).content, 'html.parser')

In [144]:
# Every <p> inside the article body container of the page parsed into `s`.
texts = s.find('div', class_='entry-content').find_all('p')
# Keep paragraph texts up to (not including) the first one that starts with
# '*Codeup', then glue them together with no separator.
kept_paragraphs = []
for paragraph in texts:
    if paragraph.text.startswith('*Codeup'):
        break
    kept_paragraphs.append(paragraph.text)
text = ''.join(kept_paragraphs)

In [145]:
print(text)

Are you feeling unfulfilled in your work but want to avoid returning to the traditional educational route? Codeup can help! Starting over as a professional is daunting and not always ideal. Codeup can help you go from a career you are bored with, to a job that excites you in just 6 months!Here’s how…During our 20-week program, you will have the opportunity to take your career to new heights with data science being one of the most needed jobs in tech.You’ll gather data, then clean it, explore it for trends, and apply machine learning models to make predictions.Upon completing this program, you will know how to turn insights into actionable recommendations. You’ll be a huge asset to any company, having all the technical skills to become a data scientist with projects upon projects of experience under your belt.A common reason individuals opt not to change their careers is fear it is too late. Codeup has crafted a program that will guide you through your career transition and prove that y

__Put everything into a function__

In [227]:
def get_blog_articles():
    '''
    Scrape the title and text of every post listed on the Codeup blog page.

    Requests https://codeup.com/blog/, takes each <h2 class="entry-title">
    heading found on that page, follows the link inside the heading and
    concatenates the text of the <p> elements inside the article's
    <div class="entry-content">, stopping at the first paragraph whose text
    starts with '*Codeup'.

    Returns:
        dict: {'title': [...], 'content': [...]} with one entry per article,
        in the order the headings appear on the blog page. The two lists
        always have the same length, so the result can be passed straight
        to pd.DataFrame. An article page without an 'entry-content' div
        gets an empty string as its content.
    '''
    url = 'https://codeup.com/blog/'
    headers = {'User-Agent': 'Codeup Data Science'}
    response = requests.get(url, headers=headers)

    # parse the blog landing page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Take the headings from the page fetched above rather than from a
    # notebook-level variable, so the function works on its own.
    # Filtering on the 'entry-title' class keeps only article headings.
    headings = soup.find_all('h2', class_='entry-title')

    # dictionary to hold title and content of each article
    blog_articles = {
        'title': [],
        'content': []
    }
    for heading in headings:
        anchor = heading.a
        # a heading without a link cannot be followed to an article
        if anchor is None:
            continue
        # request the article page using the link from the heading
        article_soup = BeautifulSoup(
            requests.get(anchor['href'], headers=headers).content, 'html.parser')
        # inside <div class='entry-content' ...> find all paragraphs
        body = article_soup.find('div', class_='entry-content')
        paragraphs = body.find_all('p') if body is not None else []
        kept = []
        for paragraph in paragraphs:
            # stop at the paragraph starting with '*Codeup' and leave it
            # (and everything after it) out of the content
            if paragraph.text.startswith('*Codeup'):
                break
            kept.append(paragraph.text)
        # append title and content together so both lists stay aligned
        blog_articles['title'].append(anchor.text)
        blog_articles['content'].append(''.join(kept))
    return blog_articles

In [228]:
pd.DataFrame(get_blog_articles())

Unnamed: 0,title,content
0,Become a Data Scientist in 6 Months!,Are you feeling unfulfilled in your work but w...
1,Hiring Tech Talent Around the Holidays,Are you a hiring manager having trouble fillin...
2,Cloud Administration Program New Funding Options,Finding resources to fund your educational goa...
3,Why Dallas is a Great Location for IT Professi...,"When breaking into a new career, it is importa..."
4,Codeup is ranked #1 Best in DFW 2022,We are excited to announce that Codeup ranked ...
5,Codeup’s Scholarship Offerings,In honor of November being National Scholarshi...


##### 2. News Articles

We will now be scraping text data from `inshorts`, a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:
- Business
- Sports
- Technology
- Entertainment
The end product of this should be a function named `get_news_articles` that returns a list of dictionaries, where each dictionary has this shape:
```python
{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business' # for example
}

```

In [168]:
url = 'https://inshorts.com/en/read'
response = requests.get(url)
response

<Response [200]>

In [169]:
soup = BeautifulSoup(response.content, 'html.parser')

In [185]:
categories = soup.find('ul', class_='category-list').find_all('a')[1:]

In [192]:
categories[0]['href']

'/en/read/national'

In [195]:
categories_links = {}
link = 'https://inshorts.com'

In [196]:
for c in categories:
    categories_links[c.text.strip()] = link + c['href']

In [194]:
link + categories[0]['href']

'https://inshorts.com/en/read/national'

In [197]:
categories_links

{'India': 'https://inshorts.com/en/read/national',
 'Business': 'https://inshorts.com/en/read/business',
 'Sports': 'https://inshorts.com/en/read/sports',
 'World': 'https://inshorts.com/en/read/world',
 'Politics': 'https://inshorts.com/en/read/politics',
 'Technology': 'https://inshorts.com/en/read/technology',
 'Startup': 'https://inshorts.com/en/read/startup',
 'Entertainment': 'https://inshorts.com/en/read/entertainment',
 'Miscellaneous': 'https://inshorts.com/en/read/miscellaneous',
 'Hatke': 'https://inshorts.com/en/read/hatke',
 'Science': 'https://inshorts.com/en/read/science',
 'Automobile': 'https://inshorts.com/en/read/automobile'}

In [226]:
l = categories_links['India']
s = BeautifulSoup(requests.get(l).content, 'html.parser')
#link + s.find_all('div', class_='news-card')[0].find('a')['href']
#s.find_all('div', class_='news-card')[0].find('a').text.strip()
s.find_all('div', class_='news-card')[0].find('div', itemprop="articleBody").text

"Rajasthan's Anti-Corruption Bureau has withdrawn its order prohibiting the release of photos and names of the accused involved in cases of bribery after criticism. The bureau on Wednesday asked its officials not to reveal the names and photos of bribery case accused and suspects until they are convicted by court. Opposition BJP had targeted the state government over the order."

In [230]:
news_cards = s.find_all('div', class_='news-card')

In [231]:
len(news_cards)

25

In [234]:
# Column-oriented container: one list per field, filled in parallel.
shorts = {'title': [], 'content': [], 'category': []}
for category_name, category_url in categories_links.items():
    # fetch and parse the page of this category
    category_page = BeautifulSoup(requests.get(category_url).content, 'html.parser')
    for card in category_page.find_all('div', class_='news-card'):
        # title: stripped text of the first link in the card
        shorts['title'].append(card.find('a').text.strip())
        # content: text of the element marked itemprop="articleBody"
        shorts['content'].append(card.find('div', itemprop="articleBody").text)
        shorts['category'].append(category_name)

In [235]:
pd.DataFrame(shorts)

Unnamed: 0,title,content,category
0,Rajasthan ACB takes back order prohibiting rel...,Rajasthan's Anti-Corruption Bureau has withdra...,India
1,Why is Uttarakhand's Joshimath sinking?,Several factors have led to the subsidence of ...,India
2,She might be blackmailing my son: Father of ma...,"Shyam Mishra, father of Shankar Mishra who uri...",India
3,"India to host Voice of Global South Summit, ov...",India will host the Voice of Global South Summ...,India
4,Man beats woman with helmet on busy road for r...,A CCTV footage of a man beating up a woman wit...,India
...,...,...,...
292,BluSmart secures $100 mn credit line to purcha...,Ride-hailing startup BluSmart has secured a $1...,Automobile
293,"Tesla cuts Model 3, Model Y prices in China fo...",Tesla on Friday reduced prices for its Model 3...,Automobile
294,Tata Motors to complete purchase of Ford's San...,Tata Motors on Friday said the acquisition of ...,Automobile
295,Domestic passenger vehicle sales in 2022 highe...,Domestic passenger vehicle sales rose by 23% t...,Automobile


In [238]:
def get_news_articles():
    '''
    Scrape the news cards of every category listed on the inshorts page.

    Reads the category links from the <ul class="category-list"> element of
    https://inshorts.com/en/read (the first link is skipped), requests each
    category page and, for every <div class="news-card"> on it, records the
    stripped text of the card's first link as the title, the text of the
    element with itemprop="articleBody" as the content, and the category
    name taken from the link text.

    Returns:
        dict: {'title': [...], 'content': [...], 'category': [...]} with
        one entry per news card, in the order the categories and cards
        were read.
    '''
    url = 'https://inshorts.com/en/read'
    landing_page = BeautifulSoup(requests.get(url).content, 'html.parser')

    # first link is All News, so it is left out
    category_anchors = landing_page.find('ul', class_='category-list').find_all('a')[1:]

    # category name -> absolute link (site root + relative path from href)
    link = 'https://inshorts.com'
    categories_links = {
        anchor.text.strip(): link + anchor['href'] for anchor in category_anchors
    }

    titles, contents, category_names = [], [], []
    # visit every category page and scrape its news cards
    for category_name, category_url in categories_links.items():
        category_page = BeautifulSoup(requests.get(category_url).content, 'html.parser')
        for card in category_page.find_all('div', class_='news-card'):
            # title of the card
            titles.append(card.find('a').text.strip())
            # text of the article
            contents.append(card.find('div', itemprop="articleBody").text)
            # category the card was found under
            category_names.append(category_name)

    return {
        'title': titles,
        'content': contents,
        'category': category_names
    }

In [239]:
pd.DataFrame(get_news_articles())

Unnamed: 0,title,content,category
0,Rajasthan ACB takes back order prohibiting rel...,Rajasthan's Anti-Corruption Bureau has withdra...,India
1,Why is Uttarakhand's Joshimath sinking?,Several factors have led to the subsidence of ...,India
2,"India to host Voice of Global South Summit, ov...",India will host the Voice of Global South Summ...,India
3,Air India crew didn't show good judgement rega...,"The woman passenger, in her complaint regardin...",India
4,Man beats woman with helmet on busy road for r...,A CCTV footage of a man beating up a woman wit...,India
...,...,...,...
292,Tesla shares fall further after firm misses 20...,"Tesla's shares, which dipped roughly 65% last ...",Automobile
293,GM beats Toyota to reclaim top US automaker sp...,General Motors (GM) reclaimed the top US autom...,Automobile
294,India becomes world's 3rd largest auto market ...,India surpassed Japan to become the third-larg...,Automobile
295,Ola Electric to launch multiple e-motorcycles ...,Ola Electric CEO Bhavish Aggarwal on Wednesday...,Automobile
