Extracting Data Using Web Scraping

Steps for extracting the data
1- Send an HTTP request to the web page using the requests library.
2- Parse the HTML content of the web page using BeautifulSoup.
3- Identify the HTML tags that contain the data you want to extract.
4- Use BeautifulSoup methods to extract the data from the HTML tags.
5- Print the extracted data.

In [1]:
# Import the libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
import warnings

# Silence FutureWarning messages only; every other warning category is
# still reported as usual.
warnings.simplefilter("ignore", category=FutureWarning)

In [None]:
# Send an HTTP request to the web page using the requests library.

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/netflix_data_webpage.html"

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page as data.
response.raise_for_status()

# `page` holds the decoded HTML text of the response body.
page = response.text

In [5]:
# Build a BeautifulSoup tree from the downloaded HTML text, using Python's
# built-in "html.parser" backend.

soup = BeautifulSoup(markup=page, features="html.parser")

In [None]:
# Identify the HTML tags that contain the data you want to extract.

# Locate the first <table> whose class attribute is "W(100%) M(0)".
# NOTE: despite its plural name, `tables` holds that single table tag
# (the name is kept so later cells that reference it keep working).
tables = soup.find("table", class_="W(100%) M(0)")
if tables is None or tables.find("tbody") is None:
    # Without this check a changed page layout surfaces as an opaque
    # "'NoneType' object has no attribute 'find'" error.
    raise ValueError("Could not find the expected data table in the page")
rows = tables.find("tbody").find_all("tr")

# Column labels, in the same order as the <td> cells of each table row.
columns = ["Date", "Open", "High", "Low", "Close", "Adj_Close", "Volume"]

# Collect the stripped text of the first len(columns) cells of every row.
# Values are kept as strings exactly as they appear in the page.
netflix_data = []
for row in rows:
    cells = row.find_all("td")
    if len(cells) < len(columns):
        # Skip rows that do not carry a full set of columns; indexing them
        # would raise IndexError.
        continue
    netflix_data.append([cell.text.strip() for cell in cells[: len(columns)]])

netflix_data_df = pd.DataFrame(netflix_data, columns=columns)


# Only the last expression of a notebook cell is rendered, so the output
# below shows the tail of the frame.
netflix_data_df.head()
netflix_data_df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj_Close,Volume
65,"Jan 01, 2016",109.0,122.18,90.11,91.84,91.84,488193200
66,"Dec 01, 2015",124.47,133.27,113.85,114.38,114.38,319939200
67,"Nov 01, 2015",109.2,126.6,101.86,123.33,123.33,320321800
68,"Oct 01, 2015",102.91,115.83,96.26,108.38,108.38,446204400
69,"Sep 01, 2015",109.35,111.24,93.55,103.26,103.26,497401200


# Extracting data using pandas library

We can also use the read_html function from the pandas library, passing it the URL directly, to extract the tables from the page.

In [15]:
import pandas as pd
import requests
from bs4 import BeautifulSoup


# Let pandas download the page and parse its HTML tables itself; no explicit
# requests/BeautifulSoup step is needed here.

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/netflix_data_webpage.html"

# read_html returns a list of DataFrames.
read_html_pandas_data = pd.read_html(io=url)

# Take the first DataFrame in the list; it is the price table previewed below.
netflix_dataframe = read_html_pandas_data[0]

netflix_dataframe.head()

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,"Jun 01, 2021",504.01,536.13,482.14,528.21,528.21,78560600
1,"May 01, 2021",512.65,518.95,478.54,502.81,502.81,66927600
2,"Apr 01, 2021",529.93,563.56,499.0,513.47,513.47,111573300
3,"Mar 01, 2021",545.57,556.99,492.85,521.66,521.66,90183900
4,"Feb 01, 2021",536.79,566.65,518.28,538.85,538.85,61902300
