In [0]:
# =========================================
# bronze_to_silver.py
# Description: Clean bronze data by using Bing Web Search to fill null descriptions and categories
# =========================================

import os

import pandas
import requests
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import StringType

# Storage account that hosts both the bronze and silver containers.
# The account-key setting below must name the same account as the abfss://
# paths; previously the key was registered for "storagemo" while the paths
# pointed at "kkstoragemo", so the key never applied to the data being read.
# NOTE(review): "kkstoragemo" is taken from the paths — confirm it is the
# intended account.
STORAGE_ACCOUNT = "kkstoragemo"
STORAGE_HOST = f"{STORAGE_ACCOUNT}.dfs.core.windows.net"

# Set Spark config for Azure Data Lake Gen2.
# The key is read from the environment so it does not have to be stored in the
# notebook; it falls back to the original empty placeholder when unset.
spark.conf.set(
    f"fs.azure.account.key.{STORAGE_HOST}",
    os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", ""),
)

# Paths
bronze_path = f"abfss://bronze@{STORAGE_HOST}/BooksDataset50Values.csv"
silver_path = f"abfss://silver@{STORAGE_HOST}/BooksDatasetCleaned.parquet"

In [0]:
# Bing Web Search subscription key, sent in the Ocp-Apim-Subscription-Key header.
# Read from the environment so the secret is not stored in the notebook; falls
# back to an empty string (the previous placeholder) when the variable is unset.
BING_API_KEY = os.environ.get("BING_API_KEY", "")

In [0]:
# Load the bronze CSV, treating its first row as the column names
df_bronze = spark.read.csv(bronze_path, header=True)

# Define function to fetch description and category from Bing
def bing_fetch_info(title, timeout=10):
    """Look up a book title on Bing Web Search and return ``(description, category)``.

    The query is built from the title alone, so the top hit may describe a
    different book that happens to share a similar title.

    Args:
        title: Book title to search for. ``None``, empty, or whitespace-only
            titles are not sent to the API.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the calling
            Spark task indefinitely.

    Returns:
        A 2-tuple ``(snippet, possible_category)``:
        * ``snippet`` is the text snippet of the first web result, or ``None``
          when there is no usable result.
        * ``possible_category`` is the first comma-separated piece of the
          snippet that mentions "genre" or "category" (whitespace-stripped),
          or ``None`` when no piece does.
        ``(None, None)`` is returned for blank titles, empty results, and on
        any error; this function never raises, so one failed lookup cannot
        abort the surrounding job.
    """
    if not title or not str(title).strip():
        return None, None
    try:
        headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
        params = {"q": f"{title} book summary and genre", "count": 1, "mkt": "en-IN"}
        response = requests.get(
            "https://api.bing.microsoft.com/v7.0/search",
            headers=headers,
            params=params,
            timeout=timeout,
        )
        # Surface auth/quota/server failures in the log instead of silently
        # treating an error payload as "no results".
        response.raise_for_status()
        hits = (response.json().get("webPages") or {}).get("value") or []
        if not hits:
            return None, None
        snippet = hits[0].get("snippet")
        if not snippet:
            return None, None
        # Try to infer a possible category from the snippet
        possible_category = next(
            (
                piece.strip()
                for piece in snippet.split(",")
                if "genre" in piece.lower() or "category" in piece.lower()
            ),
            None,
        )
        return snippet, possible_category
    except Exception as e:
        # Deliberately best-effort: log and fall back to "no enrichment".
        print(f"Error fetching Bing info for '{title}': {e}")
        return None, None

# UDF returning struct<Description:string, Category:string>
@udf("struct<Description:string, Category:string>")
def bing_info_udf(title, description=None, category=None):
    """Fetch Bing info for a row, skipping rows that need no enrichment.

    Spark does not guarantee that a Python UDF wrapped in ``when(...)`` is only
    evaluated for rows matching the condition, so the "is anything missing?"
    check is repeated here to avoid calling the external API for rows
    that are already complete.

    Args:
        title: Value of the Title column.
        description: Current Description value; ``None`` when missing.
        category: Current Category value; ``None`` when missing.
            Both default to ``None`` so a call with only ``title`` always
            performs the lookup, as before.

    Returns:
        ``None`` (a null struct) when both description and category are
        already present; otherwise a dict with keys ``Description`` and
        ``Category`` holding the values returned by ``bing_fetch_info``.
    """
    if description is not None and category is not None:
        return None
    desc, found_category = bing_fetch_info(title)
    return {"Description": desc, "Category": found_category}


# The UDF performs a network call, so it is not deterministic. Declaring that
# stops the optimizer from duplicating the call when several fields of the
# returned struct are read downstream.
bing_info_udf = bing_info_udf.asNondeterministic()

# Apply UDF only where Description or Category is null
needs_enrichment = col("Description").isNull() | col("Category").isNull()
df_enriched = df_bronze.withColumn(
    "bing_info",
    when(
        needs_enrichment,
        bing_info_udf(col("Title"), col("Description"), col("Category")),
    ),
)

# Backfill each target column from its matching bing_info field, but only on
# rows where the bronze value is null; then discard the helper struct column.
df_silver = df_enriched
for target_name, fallback_name in (
    ("Description", "bing_info.Description"),
    ("Category", "bing_info.Category"),
):
    df_silver = df_silver.withColumn(
        target_name,
        when(col(target_name).isNull(), col(fallback_name)).otherwise(col(target_name)),
    )
df_silver = df_silver.drop("bing_info")

# Persist the enriched frame to the silver layer, replacing any previous output
df_silver.write.parquet(silver_path, mode="overwrite")

In [0]:
# Render the enriched DataFrame in the cell output for a visual check.
# NOTE(review): DataFrame.display() is not part of open-source PySpark;
# presumably it is provided by the notebook runtime (e.g. Databricks) — confirm.
df_silver.display()

Title,Authors,Description,Category,Publisher,Publish Date,Price
Goat Brothers,"By Colton, Larry","Genres Nonfiction Memoir College Biography. Hardcover. First published January 1, 1993. Book details & editions. ... thus his 1960s biography held my interest. Although when he was in a frat house with his Goat Brothers, I was in the barracks with my card playing, book reading and bowling buddies. ... his story and the Brothers K book overlap. 1960s baseball. (Duncan played for the high school closest to the one I attended.) The Brothers K","History , General",Doubleday,"Friday, January 1, 1993",Price Starting at $8.79
The Missing Person,"By Grumbach, Doris","Book Details. Title: Missing Person Author: Patrick Modiano Translator: Daniel Weissbort Pages: 168 Published: September 5, 1978 Genres: Fiction, Mystery, Literary Fiction. Synopsis of Missing Person Introduction to Guy Roland. Set against the haunting backdrop of Paris, “Missing Person” opens with Guy Roland. For ten years, he has lived in ...","Fiction , General",Putnam Pub Group,"Sunday, March 1, 1981",Price Starting at $4.99
Don't Eat Your Heart Out Cookbook,"By Piscatella, Joseph C.","With over 939,000 copies in print, used and recommended by more than 5,500 hospitals, and now completely revised and updated, Don't Eat Your Heart Out Cookbook is the bible for anyone seeking a heart-healthy diet. Incorporating the latest scientific and nutritional studies, lay expert Joe Piscatella outlines an effective plan for life-long heart health and explains the science behind it in plain-speaking language we all can understand. Packed with 400 healthy, low-fat recipes-soups, salads ...","Cooking , Reference",Workman Pub Co,"Thursday, September 1, 1983",Price Starting at $4.99
When Your Corporate Umbrella Begins to Leak: A Handbook for White Collar Re-Employment,"By Davis, Paul D.",Amazon.in - Buy When Your Corporate Umbrella Begins to Leak: A Handbook for White Collar Re-Employment book online at best prices in India on Amazon.in. Read When Your Corporate Umbrella Begins to Leak: A Handbook for White Collar Re-Employment book reviews & author details and more at Amazon.in. Free delivery on qualified orders.,,Natl Pr Books,"Monday, April 1, 1991",Price Starting at $4.99
Amy Spangler's Breastfeeding : A Parent's Guide,"By Spangler, Amy","An illustration of an open book. Texts. An illustration of two cells of a film strip. Video. An illustration of an audio speaker. Audio An illustration of a 3.5"" floppy disk. ... Breastfeeding : a parent's guide by Spangler, Amy. Publication date 2006 Topics Breastfeeding, Breast Feeding Publisher Atlanta : Amy Spangler Collection internetarchivebooks; americana; inlibrary; printdisabled Contributor Internet Archive Language English Item Size 208.8M . Cover title",,Amy Spangler,"Saturday, February 1, 1997",Price Starting at $5.32
The Foundation of Leadership: Enduring Principles to Govern Our Lives,"By Short, Bo","As a result, those who read this book will find themselves aspiring to new levels of leadership and success. The author explores five specific qualities -- vision, courage, perseverance, responsibility and character -- that serve as the foundation of true leadership. These are the hallmark qualities of our Founding Fathers.",,Excalibur Press,"Wednesday, January 1, 1997",Price Starting at $6.06
Chicken Soup for the Soul: 101 Stories to Open the Heart and Rekindle the Spirit,"By Canfield, Jack (COM) and Hansen, Mark Victor (COM)","An illustration of an open book. Texts. An illustration of two cells of a film strip. Video. An illustration of an audio speaker. Audio An illustration of a 3.5"" floppy disk. ... Chicken soup for the soul : 101 stories to open the heart & rekindle the spirit ... 101 stories to open the heart & rekindle the spirit by Canfield, Jack, 1944-; Hansen, Mark Victor. Publication date 1993 Topics Spiritual life Publisher","Self-help , Personal Growth , Self-Esteem",Health Communications Inc,"Saturday, May 1, 1993",Price Starting at $4.99
Journey Through Heartsongs,"By Stepanek, Mattie J. T.","Collects poems written by the eleven-year-old muscular dystrophy patient, sharing his feelings and thoughts about his life, the deaths of his siblings, nature, faith, and hope.","Poetry , General",VSP Books,"Saturday, September 1, 2001",Price Starting at $19.96
In Search of Melancholy Baby,"By Aksyonov, Vassily, Heim, Michael Henry, and Bouis, Antonina W.","The Russian author offers an affectionate chronicle of life in the United States, with discussions of such topics as the European charm of Washington, D.C., and the American immigration bureaucracy","Biography & Autobiography , General",Random House,"Monday, June 1, 1987",Price Starting at $4.99
Christmas Cookies,"By Eakin, Katherine M. and Deaman, Joane (EDT)",The Gift of the Christmas Cookie Story Summary. The Gift of the Christmas Cookie by Dandi Daley Mackall is a realistic Christian fiction story set during the Depression-Era 1930s. Jack and his mother face a pained separation from a beloved husband and father when he hops a train west to find work to support his family. ... Bite-Size Holiday Lessons by Amy Krouse Rosenthal and illustrated by Jane Dyer is a charmingly-illustrated book that uses baking Christmas cookies as a metaphor for a well ...,"Cooking , General",Oxmoor House,"Sunday, June 1, 1986",Price Starting at $12.98
