In [1]:
import nest_asyncio
nest_asyncio.apply()

# Install the asyncio reactor BEFORE importing scrapy
import twisted.internet.asyncioreactor
twisted.internet.asyncioreactor.install()

import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd

# Module-level accumulator: BooksSpider.parse appends one dict per book here
scraped_data = list()

class BooksSpider(scrapy.Spider):
    """Collect basic book records from the books.toscrape.com catalogue.

    Every ``article.product_pod`` on a listing page becomes one dict
    (``title``, ``price``, ``availability``, ``url``) appended to the
    module-level ``scraped_data`` list. The "next" link is followed until
    ``max_items`` records have been stored or there is no further page.
    """

    name = "Books"
    start_urls = ["https://books.toscrape.com/catalogue/page-1.html"]
    # Upper bound on the number of records stored in scraped_data
    max_items = 250
    # Running total of stored records; the first `+=` in parse() creates an
    # instance attribute that shadows this class-level default.
    item_count = 0

    def parse(self, response):
        """Store the books listed on ``response`` and queue the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            A follow-up request for the next listing page, when a "next" link
            exists and fewer than ``max_items`` records have been stored.
        """
        for book in response.css("article.product_pod"):
            if self.item_count >= self.max_items:
                return

            # The availability text is taken from the last text node of the
            # element; a missing element yields None instead of an IndexError
            # that would abort the rest of the page.
            availability_parts = book.css("p.instock.availability::text").getall()
            availability = availability_parts[-1].strip() if availability_parts else None

            self.item_count += 1
            scraped_data.append({
                "title": book.css("h3 a::attr(title)").get(),
                "price": book.css("p.price_color::text").get(),
                "availability": availability,
                "url": response.urljoin(book.css("h3 a::attr(href)").get())
            })

        # Pagination
        next_page = response.css("li.next a::attr(href)").get()
        if next_page and self.item_count < self.max_items:
            yield response.follow(next_page, self.parse)

# Launch the crawl with Scrapy logging limited to errors
crawl_settings = {"LOG_LEVEL": "ERROR"}
process = CrawlerProcess(settings=crawl_settings)
process.crawl(BooksSpider)
# start() does not return until the spider has finished
process.start()

# Save to CSV
# A single name for the output file keeps the path written and the path
# reported in the message in agreement.
csv_path = "Books.csv"
if scraped_data:
    df = pd.DataFrame(scraped_data)
    df.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"âœ… Scraped {len(df)} books and saved to {csv_path}")
    display(df.head())
else:
    # Make the empty case visible: no file is written and `df` is not created
    # here, so the analysis cells below cannot run on this crawl's results.
    print(f"No books were scraped; {csv_path} was not written.")


âœ… Scraped 250 books and saved to books.csv


Unnamed: 0,title,price,availability,url
0,A Light in the Attic,Â£51.77,In stock,https://books.toscrape.com/catalogue/a-light-i...
1,Tipping the Velvet,Â£53.74,In stock,https://books.toscrape.com/catalogue/tipping-t...
2,Soumission,Â£50.10,In stock,https://books.toscrape.com/catalogue/soumissio...
3,Sharp Objects,Â£47.82,In stock,https://books.toscrape.com/catalogue/sharp-obj...
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,https://books.toscrape.com/catalogue/sapiens-a...


In [2]:
# Convert price to numeric
# Keep only digits and the decimal point rather than removing one hard-coded
# currency prefix: the result is then the same however the currency symbol was
# decoded, and stray whitespace is removed by the same pass.
df["price_num"] = (
    df["price"]
    .astype(str)                               # Ensure string type
    .str.replace(r"[^0-9.]", "", regex=True)   # Drop symbol, spaces, anything non-numeric
    .astype(float)                             # Convert to float (raises on unparsable values)
)

In [3]:
# Headline numbers for the scraped catalogue
prices = df["price_num"]
print("ðŸ“Š Dataset shape:", df.shape)
print("ðŸ’° Average price:", prices.mean())

# idxmax / idxmin give the row label of the extreme price; .loc fetches that row
priciest_row = df.loc[prices.idxmax()]
cheapest_row = df.loc[prices.idxmin()]
print("ðŸ’Ž Most expensive book:\n", priciest_row)
print("ðŸ“‰ Cheapest book:\n", cheapest_row)

ðŸ“Š Dataset shape: (250, 5)
ðŸ’° Average price: 34.500080000000004
ðŸ’Ž Most expensive book:
 title           Thomas Jefferson and the Tripoli Pirates: The ...
price                                                      Â£59.64
availability                                             In stock
url             https://books.toscrape.com/catalogue/thomas-je...
price_num                                                   59.64
Name: 133, dtype: object
ðŸ“‰ Cheapest book:
 title                                                    Patience
price                                                      Â£10.16
availability                                             In stock
url             https://books.toscrape.com/catalogue/patience_...
price_num                                                   10.16
Name: 84, dtype: object


In [4]:
# Tally how many books carry each availability label
availability_column = df["availability"]
availability_counts = availability_column.value_counts()
print("ðŸ“¦ Availability counts:\n", availability_counts)

ðŸ“¦ Availability counts:
 availability
In stock    250
Name: count, dtype: int64


In [5]:
# Order the books from most to least expensive, then keep the first ten rows
by_price_desc = df.sort_values("price_num", ascending=False)
top10 = by_price_desc.head(10)
print("ðŸ”¥ Top 10 Most Expensive Books:\n", top10[["title", "price_num"]])

ðŸ”¥ Top 10 Most Expensive Books:
                                                  title  price_num
133  Thomas Jefferson and the Tripoli Pirates: The ...      59.64
68        The Death of Humanity: and the Case for Life      58.11
135  The White Cat and the Monk: A Retelling of the...      58.08
186  I Had a Nice Time And Other Lies...: How to fi...      57.36
100  Immunity: How Elie Metchnikoff Changed the Cou...      57.36
40                      Slow States of Collapse: Poems      57.31
15   Our Band Could Be Your Life: Scenes from the A...      57.25
231                                The Wright Brothers      56.80
122  A Piece of Sky, a Grain of Rice: A Memoir in F...      56.76
58                                 The Past Never Ends      56.50
