# Session 1: HTML Structure and HTML Parsing

https://www.youtube.com/watch?v=salY_Sm6mv4

Sample HTML

```html
<!DOCTYPE html>
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <h1 id="main-heading">Welcome to Web Scraping with Python</h1>
    <p class="intro">This is a sample web page for demonstration.</p>
    <div class="content">
        <p>This is a paragraph inside a div with class "content".</p>
    </div>
    <ul>
    <ul>
        <li>Item 1 - <a href="https://www.example1.com">Example 1</a></li>
        <li>Item 2 - <a href="https://www.example2.com">Example 2</a></li>
        <li>Item 3 - <a href="https://www.example3.com">Example 3</a></li>
    </ul>
    </ul>
    <table>
        <tr>
            <th>Name</th>
            <th>Age</th>
        </tr>
        <tr>
            <td>John</td>
            <td>30</td>
        </tr>
        <tr>
            <td>Alice</td>
            <td>25</td>
        </tr>
    </table>
</body>
</html>
```

```javascript
var pageTitleElements = document.querySelectorAll(".page-title");

if (pageTitleElements.length > 0) {
    pageTitleElements.forEach(function(element) {
        console.log(element.textContent);
    });
} else {
    console.log("No elements with the class 'page-title' found.");
}
```

<!DOCTYPE html>
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <h1 id="main-heading">Welcome to Web Scraping with Python</h1>
    <p class="intro">This is a sample web page for demonstration.</p>
    <div class="content">
        <p>This is a paragraph inside a div with class "content".</p>
    </div>
    <ul>
    <ul>
        <li>Item 1 - <a href="https://www.example1.com">Example 1</a></li>
        <li>Item 2 - <a href="https://www.example2.com">Example 2</a></li>
        <li>Item 3 - <a href="https://www.example3.com">Example 3</a></li>
    </ul>
    </ul>
    <table>
        <tr>
            <th>Name</th>
            <th>Age</th>
        </tr>
        <tr>
            <td>John</td>
            <td>30</td>
        </tr>
        <tr>
            <td>Alice</td>
            <td>25</td>
        </tr>
    </table>
</body>
</html>


Read HTML from string



In [None]:
import requests
from bs4 import BeautifulSoup

# Sample page kept inline as a string so it can be parsed without any
# network or file access.
html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <h1 id="main-heading">Welcome to Web Scraping with Python</h1>
    <p class="intro">This is a sample web page for demonstration.</p>
    <div class="content">
        <p>This is a paragraph inside a div with class "content".</p>
    </div>
    <ul>
    <ul>
        <li>Item 1 - <a href="https://www.example1.com">Example 1</a></li>
        <li>Item 2 - <a href="https://www.example2.com">Example 2</a></li>
        <li>Item 3 - <a href="https://www.example3.com">Example 3</a></li>
    </ul>
    </ul>
    <table>
        <tr>
            <th>Name</th>
            <th>Age</th>
        </tr>
        <tr>
            <td>John</td>
            <td>30</td>
        </tr>
        <tr>
            <td>Alice</td>
            <td>25</td>
        </tr>
    </table>
</body>
</html>
"""

# Parse the markup with Python's built-in HTML parser; ending the cell on the
# bare name displays the parsed document as the cell output.
soup = BeautifulSoup(html_content, features="html.parser")
soup



<!DOCTYPE html>

<html>
<head>
<title>Sample Page</title>
</head>
<body>
<h1 id="main-heading">Welcome to Web Scraping with Python</h1>
<p class="intro">This is a sample web page for demonstration.</p>
<div class="content">
<p>This is a paragraph inside a div with class "content".</p>
</div>
<ul>
<ul>
<li>Item 1 - <a href="https://www.example1.com">Example 1</a></li>
<li>Item 2 - <a href="https://www.example2.com">Example 2</a></li>
<li>Item 3 - <a href="https://www.example3.com">Example 3</a></li>
</ul>
</ul>
<table>
<tr>
<th>Name</th>
<th>Age</th>
</tr>
<tr>
<td>John</td>
<td>30</td>
</tr>
<tr>
<td>Alice</td>
<td>25</td>
</tr>
</table>
</body>
</html>

# Read HTML from local file
```python
# Open and read the HTML file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding; change it if your file is not UTF-8.
with open('your_file.html', 'r', encoding='utf-8') as file:
    html = file.read()

# Parse the file contents and display the parsed document.
soup = BeautifulSoup(html, 'html.parser')
soup
```

In [None]:
import requests
from bs4 import BeautifulSoup

# Download the sample page. Without a timeout, requests waits indefinitely
# for an unresponsive server, which would leave the cell hanging.
page = requests.get('https://raw.githubusercontent.com/misrori/rdata/main/sample.html', timeout=10)

In [None]:
import requests
from bs4 import BeautifulSoup

# Download the sample page. Without a timeout, requests waits indefinitely
# for an unresponsive server, which would leave the cell hanging.
page = requests.get('https://raw.githubusercontent.com/misrori/rdata/main/sample.html', timeout=10)

print(page.status_code)  # HTTP status code of the response
print(page.content)      # response body as raw bytes
print(page.text)         # response body decoded to str

# Create a BeautifulSoup object
soup = BeautifulSoup(page.text, 'html.parser')

# Select elements by ID
main_heading = soup.find(id='main-heading')
print(main_heading.text)

# Select elements by class (the keyword is class_ because `class` is reserved in Python)
intro_paragraph = soup.find(class_='intro')
print(intro_paragraph.text)


# Select elements by tag
paragraphs = soup.find_all('p')
for paragraph in paragraphs:
    print("Paragraph:", paragraph.text)


# Select with select and select_one (CSS selectors)
# select_one() returns a single element, so .text can be read directly.
print(soup.select_one('#main-heading').text)
# select() returns a list of all matching elements; the list itself has no
# .text attribute, so read .text from each element.
print([tag.text for tag in soup.select('.intro')])
print([tag.text for tag in soup.select('p')])



# Select the table and its data cells (<td>); header cells (<th>) are not matched
table = soup.find('table')
rows = table.find_all('td')
for row in rows:
    print(row.text)
# .string and .text shown side by side for comparison
print([x.string for x in rows])
print([x.text for x in rows])


# Select list items
list_items = soup.find_all('li')
for item in list_items:
    print("List Item:", item.text)


200
b'<!DOCTYPE html>\n<html>\n<head>\n    <title>Sample Page</title>\n</head>\n<body>\n    <h1 id="main-heading">Welcome to Web Scraping with Python</h1>\n    <p class="intro">This is a sample web page for demonstration.</p>\n    <div class="content">\n        <p>This is a paragraph inside a div with class "content".</p>\n    </div>\n    <ul>\n        <li>Item 1 - <a href="https://www.example1.com">Example 1</a></li>\n        <li>Item 2 - <a href="https://www.example2.com">Example 2</a></li>\n        <li>Item 3 - <a href="https://www.example3.com">Example 3</a></li>\n    </ul>\n    <table>\n        <tr>\n            <th>Name</th>\n            <th>Age</th>\n        </tr>\n        <tr>\n            <td>John</td>\n            <td>30</td>\n        </tr>\n        <tr>\n            <td>Alice</td>\n            <td>25</td>\n        </tr>\n    </table>\n</body>\n</html>\n'
<!DOCTYPE html>
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <h1 id="main-heading">Welcome to Web Scrap

In [None]:
# For every <li>, show the tag itself, its text, and the target of its link.
list_items = soup.find_all('li')
for item in list_items:
    print(item, item.text, sep='\n')
    # item.find('a') is the first <a> inside the item; item.a is the shorthand.
    print(item.find('a')['href'], '-------', sep='\n')

<li>Item 1 - <a href="https://www.example1.com">Example 1</a></li>
Item 1 - Example 1
https://www.example1.com
-------
<li>Item 2 - <a href="https://www.example2.com">Example 2</a></li>
Item 2 - Example 2
https://www.example2.com
-------
<li>Item 3 - <a href="https://www.example3.com">Example 3</a></li>
Item 3 - Example 3
https://www.example3.com
-------


In [None]:
# If pd.read_html(url) does not work directly, download the page with
# requests and pass the downloaded HTML to pd.read_html instead.
from io import StringIO

import pandas as pd

url = "https://raw.githubusercontent.com/misrori/rdata/main/sample.html"

# Without a timeout, requests waits indefinitely for an unresponsive server.
r = requests.get(url, timeout=10)

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas versions, which expect a path, URL or file-like object.
df_list = pd.read_html(StringIO(r.text))  # one DataFrame per table found in the page
df = df_list[0]
df.head()

Unnamed: 0,Name,Age
0,John,30
1,Alice,25


# Task
Scrape this site
https://www.scrapethissite.com/pages/



In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Download the index page of the practice site. Without a timeout, requests
# waits indefinitely for an unresponsive server, which would hang the cell.
page = requests.get('https://www.scrapethissite.com/pages/', timeout=10)

print(page.status_code)  # HTTP status code of the response
print(page.content)      # response body as raw bytes
print(page.text)         # response body decoded to str

# Create a BeautifulSoup object; the following cells query it through `t`
t = BeautifulSoup(page.text, 'html.parser')


200
b'<!doctype html>\n<html lang="en">\n  <head>\n    <meta charset="utf-8">\n    <title>Learn Web Scraping | Scrape This Site | A public sandbox for learning web scraping</title>\n    <link rel="icon" type="image/png" href="/static/images/scraper-icon.png" />\n\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <meta name="description" content="Here are some practice pages you can scrape.">\n\n    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" rel="stylesheet" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" crossorigin="anonymous">\n    <link href=\'https://fonts.googleapis.com/css?family=Lato:400,700\' rel=\'stylesheet\' type=\'text/css\'>\n    <link rel="stylesheet" type="text/css" href="/static/css/styles.css">\n\n    \n\n  </head>\n\n  <body>\n    <nav id="site-nav">\n            <div class="container">\n

In [None]:
# Collect the whitespace-trimmed text of every element with class "page-title".
titles = [heading.text.strip() for heading in t.find_all(class_='page-title')]

In [None]:
# The href of the <a> inside each "page-title" element is appended to the
# site root to form the full link.
links = [
    f"https://www.scrapethissite.com{heading.find('a')['href']}"
    for heading in t.find_all(class_='page-title')
]


In [None]:
# Collect the whitespace-trimmed text of every element with class "lead".
summary = [blurb.text.strip() for blurb in t.find_all(class_='lead')]


In [None]:
# Combine the three parallel lists into one table: one column per list,
# one row per scraped entry. Displayed as the cell output.
pd.DataFrame(dict(title=titles, link=links, summary=summary))

Unnamed: 0,title,link,summary
0,Countries of the World: A Simple Example,https://www.scrapethissite.com/pages/simple/,A single page that lists information about all...
1,"Hockey Teams: Forms, Searching and Pagination",https://www.scrapethissite.com/pages/forms/,Browse through a database of NHL team stats si...
2,Oscar Winning Films: AJAX and Javascript,https://www.scrapethissite.com/pages/ajax-java...,Click through a bunch of great films. Learn ho...
3,Turtles All the Way Down: Frames & iFrames,https://www.scrapethissite.com/pages/frames/,Some older sites might still use frames to bre...
4,Advanced Topics: Real World Challenges You'll ...,https://www.scrapethissite.com/pages/advanced/,"Scraping real websites, you're likely run into..."
