Web Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# URL template for the paginated catalogue; "{}" is filled in with the page number via str.format().
base_url = "https://books.toscrape.com/catalogue/page-{}.html"
# Path of the CSV file the scraped rows are written to.
filename = "scraped_books.csv"

def get_rating(rating_str):
    """Convert a spelled-out star rating to its integer value.

    Args:
        rating_str: One of "One", "Two", "Three", "Four" or "Five"
            (matched exactly, case-sensitive).

    Returns:
        The matching integer 1-5, or 0 when the value is not one of the
        five recognised words.
    """
    rating_words = ("One", "Two", "Three", "Four", "Five")
    # The position in the tuple (counted from 1) is the star count.
    for stars, word in enumerate(rating_words, start=1):
        if rating_str == word:
            return stars
    return 0

# Start a fresh CSV: mode "w" creates the file or truncates an existing one,
# so running this cell always leaves a file containing only the header row.
header_row = ["Title", "Price", "Rating", "Availability", "Genre"]
with open(filename, "w", newline="", encoding="utf-8") as csv_file:
    csv.writer(csv_file).writerow(header_row)

In [None]:
# Crawl the catalogue page by page, appending one CSV row per book, until a
# page is missing (non-200 status) or contains no book entries.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell indefinitely

page = 1
# One HTTP session (connection reuse) and one open file handle for the whole
# crawl, instead of a new connection per request and a re-opened file per row.
with requests.Session() as session, \
        open(filename, "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)

    while True:
        url = base_url.format(page)
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            break  # Stop if no more pages

        # Decode explicitly as UTF-8. This notebook's recorded output shows
        # prices such as "Â51.77": the UTF-8 "£" was decoded with the wrong
        # charset, so the replace("£", "") below left a stray "Â" behind.
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")
        books = soup.find_all("article", class_="product_pod")

        if not books:
            break

        for book in books:
            title = book.h3.a["title"]
            price = book.find("p", class_="price_color").text.replace("£", "")
            # The first <p> carries two CSS classes; the second one is the
            # spelled-out rating word that get_rating() converts to an int.
            rating = get_rating(book.p["class"][1])
            availability = book.find("p", class_="instock availability").text.strip()

            # Visit book page to get genre
            book_link = book.h3.a["href"]
            book_url = "https://books.toscrape.com/catalogue/" + book_link
            book_resp = session.get(book_url, timeout=REQUEST_TIMEOUT)
            # Fail with a clear HTTP error rather than an AttributeError when
            # the detail page could not be fetched.
            book_resp.raise_for_status()
            book_resp.encoding = "utf-8"
            book_soup = BeautifulSoup(book_resp.text, "html.parser")
            # Third breadcrumb entry is taken as the genre
            # (presumably Home > Books > <genre> — verify against the page markup).
            genre = book_soup.find("ul", class_="breadcrumb").find_all("li")[2].text.strip()

            # Write data to CSV
            writer.writerow([title, price, rating, availability, genre])

        # Push the finished page to disk so an interrupted crawl keeps its rows.
        f.flush()
        print(f"✅ Page {page} scraped successfully.")
        page += 1

print(f"\nData saved to '{filename}' successfully!")

✅ Page 1 scraped successfully.
✅ Page 2 scraped successfully.
✅ Page 3 scraped successfully.
✅ Page 4 scraped successfully.
✅ Page 5 scraped successfully.
✅ Page 6 scraped successfully.
✅ Page 7 scraped successfully.
✅ Page 8 scraped successfully.
✅ Page 9 scraped successfully.
✅ Page 10 scraped successfully.
✅ Page 11 scraped successfully.
✅ Page 12 scraped successfully.
✅ Page 13 scraped successfully.
✅ Page 14 scraped successfully.
✅ Page 15 scraped successfully.
✅ Page 16 scraped successfully.
✅ Page 17 scraped successfully.
✅ Page 18 scraped successfully.
✅ Page 19 scraped successfully.
✅ Page 20 scraped successfully.
✅ Page 21 scraped successfully.
✅ Page 22 scraped successfully.
✅ Page 23 scraped successfully.
✅ Page 24 scraped successfully.
✅ Page 25 scraped successfully.
✅ Page 26 scraped successfully.
✅ Page 27 scraped successfully.
✅ Page 28 scraped successfully.
✅ Page 29 scraped successfully.
✅ Page 30 scraped successfully.
✅ Page 31 scraped successfully.
✅ Page 32 scraped

# PySpark

In [None]:
# Step 2
# Install PySpark

In [None]:
pip install pyspark #Installation



In [None]:
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
from pyspark.sql import SparkSession # Add this import

# Reuse the running SparkSession when there is one, otherwise start it.
# getOrCreate() handles both cases on its own; getActiveSession() is only
# consulted to decide which status message to print.
spark_was_active = SparkSession.getActiveSession() is not None
spark = SparkSession.builder.appName("BookDataProcessing").getOrCreate()
if spark_was_active:
    print("SparkSession already active.")
else:
    print("SparkSession initialized successfully within this cell.")

# Define the schema based on the scraped data: title, price, rating, availability, genre.
# 'price' is read as a string first so it can be cleaned before the numeric cast.
schema = StructType([
    StructField("title", StringType(), True),
    StructField("price", StringType(), True),
    StructField("rating", IntegerType(), True),
    StructField("availability", StringType(), True),
    StructField("genre", StringType(), True)
])

# Load the data into a PySpark DataFrame.
# escape='"': the scraper writes the file with Python's csv.writer, which
# escapes an embedded double quote by doubling it, whereas Spark's default
# escape character is a backslash. Without this option, titles that contain
# quotes are not parsed as written.
books_df = spark.read.csv("scraped_books.csv", header=True, schema=schema, escape='"')

# Clean the 'price' column: remove non-numeric characters (like '£' or 'Â') and cast to float
books_df = books_df.withColumn("price",
                             regexp_replace(col("price"), "[^0-9\\.]", "").cast(FloatType()))

# Display the updated schema and show the first few rows to verify
books_df.printSchema()
books_df.show()

When `cell_id: a42cf8b7` is executed, it will first check if a SparkSession is already active. Since `cell_id: 70006402` has already initialized Spark, it will output:

```
SparkSession already active.
```

Then, it will print the schema of the `books_df` after the price cleaning, which will show the `price` column as `float`:

```
root
 |-- title: string (nullable = true)
 |-- price: float (nullable = true)
 |-- rating: integer (nullable = true)
 |-- availability: string (nullable = true)
 |-- genre: string (nullable = true)
```

Finally, it will display the first 20 rows of the DataFrame, where the `price` column values will be cleaned and converted to floating-point numbers (e.g., `Â51.77` becomes `51.77`):

```
+--------------------+-----+------+------------+------------------+
|               title|price|rating|availability|             genre|
+--------------------+-----+------+------------+------------------+
|A Light in the Attic|51.77|     3|    In stock|            Poetry|
|  Tipping the Velvet|53.74|     1|    In stock|Historical Fiction|
|          Soumission| 50.1|     1|    In stock|           Fiction|
|       Sharp Objects|47.82|     4|    In stock|           Mystery|
|Sapiens: A Brief ...|54.23|     5|    In stock|           History|
|     The Requiem Red|22.65|     1|    In stock|       Young Adult|
|The Dirty Little ...|33.34|     4|    In stock|          Business|
|The Coming Woman:...|17.93|     3|    In stock|           Default|
|The Boys in the B...| 22.6|     4|    In stock|           Default|
|     The Black Maria|52.15|     1|    In stock|            Poetry|
|Starving Hearts (...|13.99|     2|    In stock|           Default|
|Shakespeare's Son...|20.66|     4|    In stock|            Poetry|
|         Set Me Free|17.46|     5|    In stock|       Young Adult|
|Scott Pilgrim's P...|52.29|     5|    In stock|    Sequential Art|
|Rip it Up and Sta...|35.02|     5|    In stock|             Music|
|Our Band Could Be...|57.25|     3|    In stock|             Music|
|                Olio|23.88|     1|    In stock|            Poetry|
|Mesaerion: The Be...|37.59|     1|    In stock|   Science Fiction|
|Libertarianism fo...|51.33|     2|    In stock|          Politics|
|It's Only the Him...|45.17|     2|    In stock|            Travel|
+--------------------+-----+------+------------+------------------+
only showing top 20 rows
```

In [None]:
from pyspark.sql import SparkSession

# Configure the application name, then fetch the existing SparkSession or
# start a new one if none is running.
session_builder = SparkSession.builder.appName("BookScrapingAnalysis")
spark = session_builder.getOrCreate()
print("SparkSession initialized successfully.")

SparkSession initialized successfully.


In [None]:
# Load the data into a PySpark DataFrame
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

# Define the schema based on the scraped data: title, price, rating, availability, genre
# Load 'price' as StringType first to allow cleaning
schema = StructType([
    StructField("title", StringType(), True),
    StructField("price", StringType(), True),
    StructField("rating", IntegerType(), True),
    StructField("availability", StringType(), True),
    StructField("genre", StringType(), True)
])

# Use the correct filename and specify header=True.
# escape='"': the file is produced by Python's csv.writer, which escapes an
# embedded double quote by doubling it, whereas Spark's default escape
# character is a backslash. Without this option, titles that contain quotes
# are not parsed as written.
books_df = spark.read.csv("scraped_books.csv", header=True, schema=schema, escape='"')

# Display the schema and the first few rows of the DataFrame
books_df.printSchema()
books_df.show()

root
 |-- title: string (nullable = true)
 |-- price: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- availability: string (nullable = true)
 |-- genre: string (nullable = true)

+--------------------+------+------+------------+------------------+
|               title| price|rating|availability|             genre|
+--------------------+------+------+------------+------------------+
|A Light in the Attic|Â51.77|     3|    In stock|            Poetry|
|  Tipping the Velvet|Â53.74|     1|    In stock|Historical Fiction|
|          Soumission|Â50.10|     1|    In stock|           Fiction|
|       Sharp Objects|Â47.82|     4|    In stock|           Mystery|
|Sapiens: A Brief ...|Â54.23|     5|    In stock|           History|
|     The Requiem Red|Â22.65|     1|    In stock|       Young Adult|
|The Dirty Little ...|Â33.34|     4|    In stock|          Business|
|The Coming Woman:...|Â17.93|     3|    In stock|           Default|
|The Boys in the B...|Â22.60|     4|    I

In [None]:
# Refresh the apt package index so the install below sees current package lists.
!apt-get update
# Install the headless OpenJDK 8 JDK; -qq quiets apt and stdout is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.82)] [Connecting to security.ub                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.82)] [                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://developer.download.nvidia.com/compute/c

In [None]:
import os
# Export JAVA_HOME for this process (and any child processes it starts).
# The path is presumably where the OpenJDK 8 package installs — confirm on the target machine.
JAVA_HOME_DIR = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["JAVA_HOME"] = JAVA_HOME_DIR

In [None]:
#Verify Installation
from pyspark.sql import SparkSession

# Smoke test: obtain a SparkSession and print the Spark version it reports.
setup_builder = SparkSession.builder.appName("PySparkSetupTest")
spark = setup_builder.getOrCreate()
print(spark.version)

3.5.1


In [None]:
## Check the schema

In [None]:
# Bind the shorter name `df` to the same DataFrame object as `books_df`
# (plain assignment: no copy is made and no new DataFrame is built).
df = books_df

print("DataFrame 'df' created from 'books_df'.")
# Print the column types, then the first rows, to confirm what `df` holds.
df.printSchema()
df.show()

DataFrame 'df' created from 'books_df'.
root
 |-- title: string (nullable = true)
 |-- price: float (nullable = true)
 |-- rating: integer (nullable = true)
 |-- availability: string (nullable = true)
 |-- genre: string (nullable = true)

+--------------------+-----+------+------------+------------------+
|               title|price|rating|availability|             genre|
+--------------------+-----+------+------------+------------------+
|A Light in the Attic|51.77|     3|    In stock|            Poetry|
|  Tipping the Velvet|53.74|     1|    In stock|Historical Fiction|
|          Soumission| 50.1|     1|    In stock|           Fiction|
|       Sharp Objects|47.82|     4|    In stock|           Mystery|
|Sapiens: A Brief ...|54.23|     5|    In stock|           History|
|     The Requiem Red|22.65|     1|    In stock|       Young Adult|
|The Dirty Little ...|33.34|     4|    In stock|          Business|
|The Coming Woman:...|17.93|     3|    In stock|           Default|
|The Boys in 

In [None]:
import os

# Sanity check that the scraped CSV is present in the working directory.
# "scraped_books.csv" is the file this notebook writes and later loads with
# Spark; checking any other name only ever reports a missing file.
file_name = "scraped_books.csv"
if os.path.exists(file_name):
    print(f"The file '{file_name}' exists in the current directory.")
else:
    print(f"The file '{file_name}' does NOT exist in the current directory.")

The file 'books.csv' does NOT exist in the current directory.


In [None]:
df.show(10) # View the first 10 rows

+--------------------+-----+------+------------+------------------+
|               title|price|rating|availability|             genre|
+--------------------+-----+------+------------+------------------+
|A Light in the Attic|51.77|     3|    In stock|            Poetry|
|  Tipping the Velvet|53.74|     1|    In stock|Historical Fiction|
|          Soumission| 50.1|     1|    In stock|           Fiction|
|       Sharp Objects|47.82|     4|    In stock|           Mystery|
|Sapiens: A Brief ...|54.23|     5|    In stock|           History|
|     The Requiem Red|22.65|     1|    In stock|       Young Adult|
|The Dirty Little ...|33.34|     4|    In stock|          Business|
|The Coming Woman:...|17.93|     3|    In stock|           Default|
|The Boys in the B...| 22.6|     4|    In stock|           Default|
|     The Black Maria|52.15|     1|    In stock|            Poetry|
+--------------------+-----+------+------------+------------------+
only showing top 10 rows



In [None]:
# The schema's column names are lowercase; the capitalised names passed here
# still resolve (Spark's default column resolution is case-insensitive) and
# are the labels that appear in the printed table.
df.describe(["Price", "Rating"]).show() # Summary statistics (numerical columns only)

+-------+------------------+------------------+
|summary|             Price|            Rating|
+-------+------------------+------------------+
|  count|              1000|              1000|
|   mean|  35.0703499917984|             2.923|
| stddev|14.446689721037984|1.4349669439154795|
|    min|              10.0|                 1|
|    max|             59.99|                 5|
+-------+------------------+------------------+



In [None]:
## Step 3: Filtering Data

In [None]:
# Keep only the books priced above 20, then preview the first 10 matches.
price_above_20 = df["price"] > 20
expensive_books = df.where(price_above_20)
expensive_books.show(10)

+--------------------+-----+------+------------+------------------+
|               title|price|rating|availability|             genre|
+--------------------+-----+------+------------+------------------+
|A Light in the Attic|51.77|     3|    In stock|            Poetry|
|  Tipping the Velvet|53.74|     1|    In stock|Historical Fiction|
|          Soumission| 50.1|     1|    In stock|           Fiction|
|       Sharp Objects|47.82|     4|    In stock|           Mystery|
|Sapiens: A Brief ...|54.23|     5|    In stock|           History|
|     The Requiem Red|22.65|     1|    In stock|       Young Adult|
|The Dirty Little ...|33.34|     4|    In stock|          Business|
|The Boys in the B...| 22.6|     4|    In stock|           Default|
|     The Black Maria|52.15|     1|    In stock|            Poetry|
|Shakespeare's Son...|20.66|     4|    In stock|            Poetry|
+--------------------+-----+------+------------+------------------+
only showing top 10 rows



In [None]:
# Keep only the books rated 4 or higher, then preview the first 10 matches.
rated_four_plus = df["rating"] >= 4
high_rated_books = df.where(rated_four_plus)
high_rated_books.show(10)

+--------------------+-----+------+------------+--------------+
|               title|price|rating|availability|         genre|
+--------------------+-----+------+------------+--------------+
|       Sharp Objects|47.82|     4|    In stock|       Mystery|
|Sapiens: A Brief ...|54.23|     5|    In stock|       History|
|The Dirty Little ...|33.34|     4|    In stock|      Business|
|The Boys in the B...| 22.6|     4|    In stock|       Default|
|Shakespeare's Son...|20.66|     4|    In stock|        Poetry|
|         Set Me Free|17.46|     5|    In stock|   Young Adult|
|Scott Pilgrim's P...|52.29|     5|    In stock|Sequential Art|
|Rip it Up and Sta...|35.02|     5|    In stock|         Music|
|Chase Me (Paris N...|25.27|     5|    In stock|       Romance|
|          Black Dust|34.53|     5|    In stock|       Romance|
+--------------------+-----+------+------------+--------------+
only showing top 10 rows



In [None]:
# "Premium" books satisfy both conditions: priced above 20 AND rated 4 or higher.
is_pricey = df["price"] > 20
is_well_rated = df["rating"] >= 4
premium_books = df.where(is_pricey & is_well_rated)
premium_books.show(5)

+--------------------+-----+------+------------+--------+
|               title|price|rating|availability|   genre|
+--------------------+-----+------+------------+--------+
|       Sharp Objects|47.82|     4|    In stock| Mystery|
|Sapiens: A Brief ...|54.23|     5|    In stock| History|
|The Dirty Little ...|33.34|     4|    In stock|Business|
|The Boys in the B...| 22.6|     4|    In stock| Default|
|Shakespeare's Son...|20.66|     4|    In stock|  Poetry|
+--------------------+-----+------+------------+--------+
only showing top 5 rows



In [None]:
# Save the premium selection as CSV with a header row, replacing any previous
# output at the same path. Spark writes a directory of part files at this
# path rather than a single file.
premium_writer = premium_books.write.mode("overwrite").option("header", True)
premium_writer.csv("premium_books.csv")