# Multithreaded Web Scraping & Data Analysis

Book listings are scraped concurrently from **books.toscrape.com**, saved to JSON, and then analyzed with pandas.

This notebook demonstrates:
- Web scraping
- Multithreading
- JSON file handling
- Data preprocessing & analysis with Pandas

In [1]:

import requests
from bs4 import BeautifulSoup
import threading
import json
import pandas as pd
import time


In [2]:

# ---------------- CONFIG ----------------
# URL template for the paginated catalogue; "{}" receives the page number.
BASE_URL = "http://books.toscrape.com/catalogue/page-{}.html"
# Number of catalogue pages to fetch (pages 1 through PAGES_TO_SCRAPE).
PAGES_TO_SCRAPE = 5
# Destination file for the raw scraped records.
OUTPUT_JSON_FILE = "scraped_books.json"

# Serializes writes to `scraped_data` coming from the worker threads.
data_lock = threading.Lock()
# Accumulator shared by all scraping threads; one dict per book.
scraped_data = []


In [3]:

# ---------------- PART 1: SCRAPING ----------------
def _parse_book(book, page_number):
    """Build one record dict from a ``product_pod`` article tag.

    Args:
        book: BeautifulSoup tag for a single book listing.
        page_number: Catalogue page the listing came from; stored in the record.

    Returns:
        dict with keys ``title``, ``price_raw``, ``availability``, ``rating``
        and ``page``. ``rating`` is ``None`` when the rating tag carries no
        class other than ``star-rating``.

    Raises:
        AttributeError, KeyError, TypeError: when an expected child tag or
            attribute is missing from the listing.
    """
    title = book.h3.a.get("title", "").strip()
    price_raw = book.find("p", class_="price_color").text.strip()
    # Match on the single "availability" class so the tag is found whatever
    # other classes (e.g. "instock") accompany it.
    availability = book.find("p", class_="availability").text.strip()
    # The rating word ("One".."Five") is the class that is not "star-rating";
    # pick it by value rather than by position in the class list.
    rating_classes = book.find("p", class_="star-rating")["class"]
    rating = next((cls for cls in rating_classes if cls != "star-rating"), None)

    return {
        "title": title,
        "price_raw": price_raw,
        "availability": availability,
        "rating": rating,
        "page": page_number,
    }


def scrape_page(page_number):
    """Fetch one catalogue page and append its books to ``scraped_data``.

    Intended to run as a thread target: it returns nothing and reports
    problems by printing instead of raising. A listing that cannot be parsed
    is skipped on its own, so one malformed entry no longer discards every
    other book on the page.

    Args:
        page_number: Page index substituted into ``BASE_URL``.
    """
    url = BASE_URL.format(page_number)
    print(f"scraping page {page_number}...")

    try:
        response = requests.get(url, timeout=10)
        # Decode the body as UTF-8 regardless of what the headers advertise
        # (presumably so the currency symbol in prices survives -- confirm).
        response.encoding = "utf-8"

        if response.status_code != 200:
            print(f"failed page {page_number}")
            return

        soup = BeautifulSoup(response.text, "html.parser")

        page_data = []
        for book in soup.find_all("article", class_="product_pod"):
            try:
                page_data.append(_parse_book(book, page_number))
            except (AttributeError, KeyError, TypeError) as e:
                print(f"skipping malformed book on page {page_number}: {e}")

        # Publish the whole page in one locked step so other threads never
        # interleave with a partially appended page.
        with data_lock:
            scraped_data.extend(page_data)

        print(f"page {page_number} done ({len(page_data)} books)")

    except Exception as e:
        # Deliberately best-effort: a failure on one page (network error,
        # timeout, unexpected markup) is reported and must not affect the
        # threads handling other pages.
        print(f"error on page {page_number}: {e}")


In [4]:

# ---------------- PART 2: MULTITHREADING ----------------
def run_multithreaded_scraping():
    """Scrape pages 1..PAGES_TO_SCRAPE concurrently, one thread per page.

    Results are collected in the module-level ``scraped_data`` list. The list
    is emptied in place before the threads start, so re-running the cell that
    calls this function yields one copy of the data rather than appending
    duplicates. After all threads finish, the records are ordered by page so
    the output does not depend on which thread happened to finish first.
    """
    # Clear in place (not rebind) so existing references to the list stay valid.
    with data_lock:
        scraped_data.clear()

    threads = [
        threading.Thread(target=scrape_page, args=(page,))
        for page in range(1, PAGES_TO_SCRAPE + 1)
    ]

    for worker in threads:
        worker.start()

    for worker in threads:
        worker.join()

    # list.sort is stable, so books keep their on-page order within each page.
    with data_lock:
        scraped_data.sort(key=lambda record: record["page"])

    print("scraping finished")


In [5]:

# ---------------- PART 3: SAVE JSON ----------------
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 JSON and report the record count.

    The output is indented by four spaces, and non-ASCII characters are
    written literally instead of being escaped.

    Args:
        data: JSON-serializable collection; its length is reported.
        filename: Path of the file to create or overwrite.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}

    with open(filename, mode="w", encoding="utf-8") as json_file:
        json.dump(data, json_file, **dump_options)

    record_count = len(data)
    print(f"saved {record_count} records to {filename}")


In [6]:

# ---------------- PART 4: DATA ANALYSIS ----------------
def analyze_data(filename, output_csv="processed_books_data.csv"):
    """Load scraped records, derive numeric columns, print stats, save a CSV.

    Two columns are added: ``price_numeric`` (``price_raw`` with everything
    except digits and dots stripped, parsed as a number) and ``rating_score``
    (the rating word mapped to 1-5). Rows where either value could not be
    derived are dropped before the statistics are printed and the CSV written.

    Args:
        filename: Path to the JSON file of scraped records.
        output_csv: Path of the CSV to write; defaults to the file name the
            notebook has always used.

    Returns:
        None. If the JSON file contains no records, a message is printed and
        no CSV is written.
    """
    print("starting analysis...")

    books_raw = pd.read_json(filename)

    # An empty scrape (e.g. every page failed) yields a frame with no columns;
    # stop here instead of failing with a KeyError on "price_raw".
    if books_raw.empty:
        print("no records to analyze")
        return

    rating_map = {
        "One": 1,
        "Two": 2,
        "Three": 3,
        "Four": 4,
        "Five": 5
    }

    books_clean = (
        books_raw
        .assign(
            # Strip currency symbols and any other non-numeric characters;
            # values that still fail to parse become NaN and are dropped below.
            price_numeric=lambda frame: pd.to_numeric(
                frame["price_raw"]
                .astype(str)
                .str.replace(r"[^0-9.]", "", regex=True),
                errors="coerce",
            ),
            # Unknown rating words map to NaN and are dropped below.
            rating_score=lambda frame: frame["rating"].map(rating_map),
        )
        .dropna(subset=["price_numeric", "rating_score"])
    )

    print("total books:", len(books_clean))
    print("average price:", round(books_clean["price_numeric"].mean(), 2))
    print("5-star books:", int((books_clean["rating_score"] == 5).sum()))

    books_clean.to_csv(output_csv, index=False)
    print("processed data saved")


In [7]:

# ---------------- MAIN ----------------
# Run the whole pipeline: scrape -> save raw JSON -> analyze and export CSV.
# Elapsed time is measured with perf_counter(), a monotonic high-resolution
# clock, rather than time.time(), which follows the adjustable wall clock.
start_time = time.perf_counter()

run_multithreaded_scraping()
save_to_json(scraped_data, OUTPUT_JSON_FILE)
analyze_data(OUTPUT_JSON_FILE)

print("execution time:", round(time.perf_counter() - start_time, 2), "seconds")


scraping page 1...
scraping page 2...
scraping page 3...
scraping page 4...
scraping page 5...
page 4 done (20 books)
page 5 done (20 books)
page 3 done (20 books)
page 1 done (20 books)
page 2 done (20 books)
scraping finished
saved 100 records to scraped_books.json
starting analysis...
total books: 100
average price: 34.56
5-star books: 19
processed data saved
execution time: 0.86 seconds
