In [70]:
#importing the necessary libraries
import requests 
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

In [3]:
# Entering the required URL for scrapping.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

In [4]:
#Converting to soup.
soup = bs(r.content)

In [5]:
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Find and find_all()

In [6]:

first_header = soup.find('h2')
first_header

<h2>A Header</h2>

In [7]:
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [8]:
#Passing in a list of elements to look for.
first_header = soup.find_all(["h1","h2"])
first_header


[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [9]:
#passing attributes to the find/find all function

paragraph = soup.find_all("p", attrs={"id":"paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [10]:
# Nesting find/find_all calls

body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [11]:
# Searching specific strings in the find/find_all()
para = soup.find_all("p", string=re.compile("Some"))
para

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [12]:
headers = soup.find_all('h2' , string = re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

## Select(CSS selector)

In [13]:
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [14]:
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [15]:
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [16]:
soup.select("p#paragraph-id b")


[<b>Some bold text</b>]

In [17]:
soup.select("body>p")

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Get different properties of the html

In [18]:
header = soup.find("h2")
header.string

div = soup.find("div")
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [19]:
# Get a specific property from an element

link = soup.find("a")
print(link['href'])

paragraphs = soup.select('p#paragraph-id')
paragraphs[0]['id']

https://keithgalli.github.io/web-scraping/webpage.html


'paragraph-id'

## Code navigation

In [20]:
# Path syntax

soup.body.p.a['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [21]:
# Terms: Parent, sibling, child
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [22]:
## starting and importing another url

# Entering the required URL for scrapping.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")
#Converting to soup.
website = bs(r.content)
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [23]:
links = website.select('ul.socials a')
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [31]:
link_2 = website.find_all('ul')
link_2

[<ul class="fun-facts">
 <li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>
 <li>Middle name is Ronald</li>
 <li>Never had been on a plane until college</li>
 <li>Dunkin Donuts coffee is better than Starbucks</li>
 <li>A favorite book series of mine is <i>Ender's Game</i></li>
 <li>Current video game of choice is <i>Rocket League</i></li>
 <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>
 </ul>,
 <ul class="socials">
 <li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
 <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
 <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
 <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.

In [37]:
links = website.find("ul", attrs={'class':'socials'})
links.find_all('a')

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

# New Project Using the concepts [ CleverQazi]

In [40]:
## starting and importing another url

# Entering the required URL for scrapping.
r = requests.get('https://forecast.weather.gov/MapClick.php?lat=40.71455000000003&lon=-74.00713999999994#.X2SU8z_is5g')
#Converting to soup.
soup = bs(r.content,'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js">
 <head>
  <!-- Meta -->
  <meta content="width=device-width" name="viewport"/>
  <link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/>
  <title>
   National Weather Service
  </title>
  <meta content="National Weather Service" name="DC.title">
   <meta content="NOAA National Weather Service National Weather Service" name="DC.description"/>
   <meta content="US Department of Commerce, NOAA, National Weather Service" name="DC.creator"/>
   <meta content="" name="DC.date.created" scheme="ISO8601"/>
   <meta content="EN-US" name="DC.language" scheme="DCTERMS.RFC1766"/>
   <meta content="weather, National Weather Service" name="DC.keywords"/>
   <meta content="NOAA's National Weather Service" name="DC.publisher"/>
   <meta content="National Weather Service" name="DC.contributor"/>
   <meta content="http://www.weather.gov/disclaimer.php" name="DC.rights"/>
   <meta content="General" name="rating"/>
   <meta content="index,follow" name="robots"/>

In [47]:
week = soup.find(id='seven-day-forecast-body')
print(week)

<div class="panel-body" id="seven-day-forecast-body">
<div id="seven-day-forecast-container"><ul class="list-unstyled" id="seven-day-forecast-list"><li class="forecast-tombstone">
<div class="tombstone-container">
<p class="period-name">Today<br/><br/></p>
<p><img alt="Today: A 20 percent chance of showers before 8am.  Cloudy, then gradually becoming mostly sunny, with a high near 71. Breezy, with a north wind 13 to 20 mph. " class="forecast-icon" src="DualImage.php?i=shra&amp;j=wind_bkn&amp;ip=20" title="Today: A 20 percent chance of showers before 8am.  Cloudy, then gradually becoming mostly sunny, with a high near 71. Breezy, with a north wind 13 to 20 mph. "/></p><p class="short-desc">Breezy.<br/>Slight Chance<br/>Showers then<br/>Partly Sunny</p><p class="temp temp-high">High: 71 °F</p></div></li><li class="forecast-tombstone">
<div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>
<p><img alt="Tonight: Mostly clear, with a low around 47. North wind 16 to 1

In [62]:
# Finding the tombstone container. (The information contained container.)

sub_list = week.find_all(class_='tombstone-container')
sub_list1 = sub_list[0]

In [63]:
print(sub_list1.find(class_ = 'period-name').get_text())
print(sub_list1.find(class_ = 'short-desc').get_text())
print(sub_list1.find(class_ = 'temp').get_text())

Today
Breezy.Slight ChanceShowers thenPartly Sunny
High: 71 °F


In [67]:
period_names = [sub_list1.find(class_='period-name').get_text() for sub_list1 in sub_list]
print(period_names)
short_description = [sub_list1.find(class_='short-desc').get_text() for sub_list1 in sub_list]
print(short_description)
total_temp = [sub_list1.find(class_='temp').get_text() for sub_list1 in sub_list]
print(total_temp)

['Today', 'Tonight', 'Saturday', 'SaturdayNight', 'Sunday', 'SundayNight', 'Monday', 'MondayNight', 'Tuesday']
['Breezy.Slight ChanceShowers thenPartly Sunny', 'Mostly Clear', 'Sunny', 'Clear', 'Sunny', 'Clear', 'Sunny', 'Mostly Clear', 'Sunny']
['High: 71 °F', 'Low: 47 °F', 'High: 64 °F', 'Low: 49 °F', 'High: 63 °F', 'Low: 49 °F', 'High: 64 °F', 'Low: 52 °F', 'High: 69 °F']


In [75]:
weather_stuff = pd.DataFrame(
    {'period':period_names,
    'description':short_description,
    'temperature':total_temp,
    }
)
weather_stuff

Unnamed: 0,period,description,temperature
0,Today,Breezy.Slight ChanceShowers thenPartly Sunny,High: 71 °F
1,Tonight,Mostly Clear,Low: 47 °F
2,Saturday,Sunny,High: 64 °F
3,SaturdayNight,Clear,Low: 49 °F
4,Sunday,Sunny,High: 63 °F
5,SundayNight,Clear,Low: 49 °F
6,Monday,Sunny,High: 64 °F
7,MondayNight,Mostly Clear,Low: 52 °F
8,Tuesday,Sunny,High: 69 °F
