# Beautifulsoup for web scraping

In [15]:
!pip install requests beautifulsoup4



In [16]:
import requests
from bs4 import BeautifulSoup

# Tabula for pdf scraping

In [17]:
pip install tabula-py

Note: you may need to restart the kernel to use updated packages.


# PDF Data Extraction using Scraping API

In [18]:
import tabula

def extract_tables(pdf_path):
    try:
        tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
        return tables
    except Exception as e:
        print(f"Error during PDF scraping: {str(e)}")
        return None
pdf_path = 'C:/Users/Maira/Downloads/PDF and Web Data Scraping for Machine Learning/sample.pdf'
extracted_tables = extract_tables(pdf_path)
if extracted_tables:
    for i, table in enumerate(extracted_tables):
        print(f"Table {i + 1}:\n{table}\n")
else:
    print("PDF scraping failed.")

Table 1:
             Name  Age Occupation
0        John Doe   30   Engineer
1      Jane Smith   25   Designer
2     Bob Johnson   40    Teacher
3     Alice Brown   35     Doctor
4  Charlie Wilson   28  Developer
5       Eva Davis   45     Writer
6     Frank White   32     Artist
7    Grace Miller   38  Scientist
8     Henry Moore   50    Manager



# Web Scraping for Additional Data

In [19]:
import requests
from bs4 import BeautifulSoup
req = requests.get("https://www.python.org/")
soup = BeautifulSoup(req.content, "html.parser")
res = soup.title
print(res.get_text())
print(res.prettify())
print(soup.prettify())

Welcome to Python.org
<title>
 Welcome to Python.org
</title>

<!DOCTYPE html>
<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->
<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->
<!--[if IE 8]>      <html class="no-js ie8 lt-ie9">                 <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" dir="ltr" lang="en">
 <!--<![endif]-->
 <head>
  <!-- Google tag (gtag.js) -->
  <script async="" src="https://www.googletagmanager.com/gtag/js?id=G-TF35YF9CVH">
  </script>
  <script>
   window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());
      gtag('config', 'G-TF35YF9CVH');
  </script>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <link href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js" rel="prefetch"/>
  <link href="//ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/jquery-ui.min.js" rel="prefetch"/>
  <

In [20]:
import requests
from bs4 import BeautifulSoup
url = 'https://www.python.org/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
news_headlines = []
for headline in soup.find_all('div', class_='shrubbery')[0].find_all('li'):
    news_headlines.append(headline.text.strip())
for headline in news_headlines:
    print(headline)

2024-01-18
Announcing Python Software Foundation Fellow Members for Q3 2023! 🎉
2024-01-18
Announcing the Deputy Developer in Residence and the Supporting Developer in Residence
2024-01-18
Python 3.13.0 alpha 3 is now available.
2024-01-12
EU’s Cyber Resilience Act Passes with Wins for Open Source
2023-12-15
Python Software Foundation - December 2023 Newsletter


# Data Transformation

In [21]:
# Extracted PDF data
pdf_data = [
    {"Name": "John Doe", "Age": 30, "Occupation": "Engineer"},
    {"Name": "Jane Smith", "Age": 25, "Occupation": "Designer"},
    {"Name": "Bob Johnson", "Age": 40, "Occupation": "Teacher"},
    {"Name": "Alice Brown", "Age": 35, "Occupation": "Doctor"},
    {"Name": "Charlie Wilson", "Age": 28, "Occupation": "Developer"},
    {"Name": "Eva Davis", "Age": 45, "Occupation": "Writer"},
    {"Name": "Frank White", "Age": 32, "Occupation": "Artist"},
    {"Name": "Grace Miller", "Age": 38, "Occupation": "Scientist"},
    {"Name": "Henry Moore", "Age": 50, "Occupation": "Manager"}
]

# Extracted web data
web_data = [
    {"date": "2024-01-18", "title": "Announcing Python Software Foundation Fellow Members for Q3 2023! 🎉"},
    {"date": "2024-01-18", "title": "Announcing the Deputy Developer in Residence and the Supporting Developer in Residence"},
    {"date": "2024-01-18", "title": "Python 3.13.0 alpha 3 is now available."},
    {"date": "2024-01-12", "title": "EU’s Cyber Resilience Act Passes with Wins for Open Source"},
    {"date": "2023-12-15", "title": "Python Software Foundation - December 2023 Newsletter"}
]

# Combining PDF and web data into a unified format
combined_data = []

for pdf_record, web_record in zip(pdf_data, web_data):
    combined_record = {
        "Name": pdf_record["Name"],
        "Age": pdf_record["Age"],
        "Occupation": pdf_record["Occupation"],
        "Date": web_record["date"],
        "Title": web_record["title"]
    }
    combined_data.append(combined_record)

# Displaying the combined data
for record in combined_data:
    print(record)

{'Name': 'John Doe', 'Age': 30, 'Occupation': 'Engineer', 'Date': '2024-01-18', 'Title': 'Announcing Python Software Foundation Fellow Members for Q3 2023! 🎉'}
{'Name': 'Jane Smith', 'Age': 25, 'Occupation': 'Designer', 'Date': '2024-01-18', 'Title': 'Announcing the Deputy Developer in Residence and the Supporting Developer in Residence'}
{'Name': 'Bob Johnson', 'Age': 40, 'Occupation': 'Teacher', 'Date': '2024-01-18', 'Title': 'Python 3.13.0 alpha 3 is now available.'}
{'Name': 'Alice Brown', 'Age': 35, 'Occupation': 'Doctor', 'Date': '2024-01-12', 'Title': 'EU’s Cyber Resilience Act Passes with Wins for Open Source'}
{'Name': 'Charlie Wilson', 'Age': 28, 'Occupation': 'Developer', 'Date': '2023-12-15', 'Title': 'Python Software Foundation - December 2023 Newsletter'}


# Database API Interaction

In [27]:
import sqlite3
conn = sqlite3.connect('data.db')
# cursor object to execute SQL commands
cursor = conn.cursor()
# Executing SQL command to create a table
cursor.execute('''
    CREATE TABLE IF NOT EXISTS combined_data (
        id INTEGER PRIMARY KEY,
        Name TEXT,
        Age INTEGER,
        Occupation TEXT,
        Date TEXT,
        Title TEXT
    )
''')
# Commiting the transaction
conn.commit()
# Inserting provided data into the combined_data table
cursor.execute('''
    INSERT INTO combined_data (Name, Age, Occupation, Date, Title)
    VALUES (?, ?, ?, ?, ?)
''', ('John Doe', 30, 'Engineer', '2024-01-18', 'Announcing Python Software Foundation Fellow Members for Q3 2023! 🎉'))
cursor.execute('''
    INSERT INTO combined_data (Name, Age, Occupation, Date, Title)
    VALUES (?, ?, ?, ?, ?)
''', ('Jane Smith', 25, 'Designer', '2024-01-18', 'Announcing the Deputy Developer in Residence and the Supporting Developer in Residence'))
cursor.execute('''
    INSERT INTO combined_data (Name, Age, Occupation, Date, Title)
    VALUES (?, ?, ?, ?, ?)
''', ('Bob Johnson', 40, 'Teacher', '2024-01-18', 'Python 3.13.0 alpha 3 is now available.'))
cursor.execute('''
    INSERT INTO combined_data (Name, Age, Occupation, Date, Title)
    VALUES (?, ?, ?, ?, ?)
''', ('Alice Brown', 35, 'Doctor', '2024-01-12', 'EU’s Cyber Resilience Act Passes with Wins for Open Source'))
cursor.execute('''
    INSERT INTO combined_data (Name, Age, Occupation, Date, Title)
    VALUES (?, ?, ?, ?, ?)
''', ('Charlie Wilson', 28, 'Developer', '2023-12-15', 'Python Software Foundation - December 2023 Newsletter'))

conn.commit()
conn.close()

In [25]:
import sqlite3
conn = sqlite3.connect('data.db')
cursor = conn.cursor()
cursor.execute('SELECT * FROM combined_data')
rows = cursor.fetchall()
for row in rows:
    print(row)
cursor.close()
conn.close()

(1, 'John Doe', 30, 'Engineer', '2024-01-18', 'Announcing Python Software Foundation Fellow Members for Q3 2023! 🎉')
(2, 'Jane Smith', 25, 'Designer', '2024-01-18', 'Announcing the Deputy Developer in Residence and the Supporting Developer in Residence')
(3, 'Bob Johnson', 40, 'Teacher', '2024-01-18', 'Python 3.13.0 alpha 3 is now available.')
(4, 'Alice Brown', 35, 'Doctor', '2024-01-12', 'EU’s Cyber Resilience Act Passes with Wins for Open Source')
(5, 'Charlie Wilson', 28, 'Developer', '2023-12-15', 'Python Software Foundation - December 2023 Newsletter')
(6, 'John Doe', 30, 'Engineer', '2024-01-18', 'Announcing Python Software Foundation Fellow Members for Q3 2023! 🎉')
(7, 'Jane Smith', 25, 'Designer', '2024-01-18', 'Announcing the Deputy Developer in Residence and the Supporting Developer in Residence')
(8, 'Bob Johnson', 40, 'Teacher', '2024-01-18', 'Python 3.13.0 alpha 3 is now available.')
(9, 'Alice Brown', 35, 'Doctor', '2024-01-12', 'EU’s Cyber Resilience Act Passes with Win