# Web Scraping:

###### **Beautiful Soup**: Library for pulling data out of HTML and XML files.
###### **Scrapy**: Framework for web crawling and scraping.

## Beautiful Soup

from bs4 import BeautifulSoup
import requests

###### Define the URL to scrape
url = "https://www.example.com"

###### Send a GET request to the URL and get the HTML content
# An explicit timeout keeps the script from waiting indefinitely on an
# unresponsive server.
response = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx responses instead of parsing an error page as if it
# were the requested content.
response.raise_for_status()
html_content = response.text

###### Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

###### 1. Extract Title of the Page
# soup.title is None when the document has no <title> tag, so guard the
# attribute access instead of letting it raise AttributeError.
title_tag = soup.title
title = title_tag.string if title_tag is not None else None
if title is not None:
    print("Title:", title)
else:
    print("Title not found")

###### 2. Find and Print All Links
# Collect every <a> tag, then report each one's href attribute
# (None is printed for anchors that carry no href).
links = soup.find_all("a")
for anchor in links:
    href = anchor.get("href")
    print("Link:", href)

###### 3. Find and Print All Paragraphs
# Gather every <p> tag and print the text each one contains.
paragraphs = soup.find_all("p")
for p_tag in paragraphs:
    print("Paragraph:", p_tag.get_text())

###### 4. Find and Print Specific Element by Class
# Look up the first <div> carrying the class "specific-class"; find()
# yields None when nothing matches, so handle that case first.
specific_element = soup.find("div", class_="specific-class")
if specific_element is None:
    print("Specific Element not found")
else:
    print("Specific Element:", specific_element.get_text())

###### 5. Extract Image Source
# Only the first <img> in the document is inspected.
image = soup.find("img")
if image is None:
    print("Image not found")
else:
    # .get() returns None rather than raising when the tag has no src.
    image_source = image.get("src")
    print("Image Source:", image_source)

###### 6. Extract Data from Tables
# Only the first <table> in the document is examined.
table = soup.find("table")
if table:
    rows = table.find_all("tr")
    for row in rows:
        # Header rows use <th> rather than <td>; match both so header rows
        # are not reported as empty lists.
        cells = row.find_all(["td", "th"])
        row_data = [cell.text for cell in cells]
        print("Table Row:", row_data)
else:
    print("Table not found")


## Scrapy

import scrapy

class MySpider(scrapy.Spider):
    """Spider that logs the title, links, paragraphs, one element selected
    by class, the first image source and all table rows of each start URL.
    """

    name = 'my_spider'
    start_urls = ['https://www.example.com']

    @staticmethod
    def _full_text(selector):
        """Return the text of *selector* including text inside nested tags.

        ``::text`` only yields an element's direct text nodes, which splits
        or drops content wrapped in inline tags; XPath ``string(.)``
        concatenates every descendant text node instead.
        """
        return selector.xpath('string(.)').get()

    def parse(self, response):
        """Log the extracted pieces of *response*; yields no items."""
        # 1. Extract Title of the Page
        title = response.css('title::text').get()
        self.log(f"Title: {title}")

        # 2. Find and Print All Links
        links = response.css('a::attr(href)').getall()
        for link in links:
            self.log(f"Link: {link}")

        # 3. Find and Print All Paragraphs
        # One log line per <p>, with nested inline markup included.
        for paragraph in response.css('p'):
            self.log(f"Paragraph: {self._full_text(paragraph)}")

        # 4. Find and Print Specific Element by Class
        # Test for the element itself, not for its first direct text node,
        # so an element whose text sits inside a child tag is still found.
        matches = response.css('.specific-class')
        if matches:
            specific_element = self._full_text(matches[0])
            self.log(f"Specific Element: {specific_element}")
        else:
            self.log("Specific Element not found")

        # 5. Extract Image Source
        image_source = response.css('img::attr(src)').get()
        if image_source:
            self.log(f"Image Source: {image_source}")
        else:
            self.log("Image not found")

        # 6. Extract Data from Tables
        if response.css('table'):
            for row in response.css('table tr'):
                # Include <th> cells so header rows are not logged as empty.
                row_data = [
                    self._full_text(cell) for cell in row.css('td, th')
                ]
                self.log(f"Table Row: {row_data}")
        else:
            self.log("Table not found")

##### Run the spider
# Runs the crawl from this script: create a crawler process, register
# MySpider with it, then start it.
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
process.crawl(MySpider)
process.start()
