## Web scraping using lxml

In [18]:
import lxml.html as web
from lxml.etree import XPath
import math
import csv

In [19]:
baseUrl="http://books.toscrape.com/"
bookUrl=baseUrl+"catalogue/category/books/childrens_11/index.html"
pageUrl=baseUrl+"catalogue/category/books/childrens_11/page-" #page-1,pag
columns=['title','price','stock','imageUrl','rating','url'] #for CSV head

## Empty dataSet and default page values

In [20]:
dataSet=[]
page=1
totalPages=1

## Save dataSet to CSV file

In [21]:
def writeto_csv(data,filename,columns):
     with open(filename,'w+',newline='',encoding="UTF-8") as file:
          writer = csv.DictWriter(file,fieldnames=columns)
          writer.writeheader()
          writer = csv.writer(file)
          for element in data:
               writer.writerows([element])

In [22]:
# Web scraping loop
while page <= totalPages:
    source = web.parse(pageUrl + str(page) + ".html").getroot()  # Read and parse the page

    # Pagination handling
    if page == 1:
        perpageArticles = source.xpath("//form[@class='form-horizontal']//input[@name='perpage']/@value")
        totalArticles = source.xpath("//form[@class='form-horizontal']//input[@name='total']/@value")

        if perpageArticles and totalArticles:
            totalPages = math.ceil(int(totalArticles[0]) / int(perpageArticles[0]))

        print("TotalPages found:", totalPages)

    print(f"Processing Page {page} from {totalPages}")

    # Paths for individual elements
    articles = source.xpath("//ol[contains(@class,'row')]/li[position()>0]")
    titlePath = ".//article[contains(@class,'product_pod')]/h3/a/@title"
    linkPath = ".//article[contains(@class,'product_pod')]/h3/a/@href"
    pricePath = ".//article/div[2]/p[contains(@class,'price_color')]/text()"
    stockPath = ".//article/div[2]/p[contains(@class,'availability')]/text()"
    imagePath = ".//article/div[1][contains(@class,'image_container')]/img/@src"
    ratingPath = ".//article/p[contains(@class,'star-rating')]/@class"

    # Iterate through all articles
    for row in articles:
        title = row.xpath(titlePath)[0].strip() if row.xpath(titlePath) else ""
        link = row.xpath(linkPath)[0].replace('../../../', baseUrl + 'catalogue/') if row.xpath(linkPath) else ""
        price = row.xpath(pricePath)[0] if row.xpath(pricePath) else ""
        availability = row.xpath(stockPath)[0].strip() if row.xpath(stockPath) else ""
        image = row.xpath(imagePath)[0].replace('../../../../', baseUrl).strip() if row.xpath(imagePath) else ""
        rating = row.xpath(ratingPath)[0].replace('star-rating', '').strip() if row.xpath(ratingPath) else ""

        # Add to dataset if title is not missing
        if title:
            dataSet.append([title, price, availability, image, rating, link])

    print("Rows in Dataset:", len(dataSet))
    page += 1  # Increment page for loop

# Print total number of elements collected
print(f"Total items collected: {len(dataSet)}")

# Save the dataset to a CSV file
writeto_csv(dataSet, 'books.csv', columns)


TotalPages found: 1
Processing Page 1 from 1
Rows in Dataset: 20
Total items collected: 20
