# Ingestion

## Define kaggle credentials path

In [0]:
# Dotted three-part name of the table holding the Kaggle credentials; it is
# passed to spark.table() to read the "username" and "key" columns.
# NOTE(review): "authentificaton" is misspelled, but it is a runtime identifier
# and presumably matches the real table name — confirm before renaming.
kaggle_credentials_path = "dev_bronze.kaggle_auth.kaggle_authentificaton_parameters"

## Extract kaggle credentials

In [0]:
# Fetch the first (username, key) row from the credentials table.
row = (
    spark.table(kaggle_credentials_path)
    .select("username", "key")
    .first()
)

# Fail fast when no row came back or either field is empty/NULL.
if not (row and row['username'] and row['key']):
    raise ValueError("Kaggle credentials not found or incomplete in the table.")

# Both values are present: expose them for the authentication step.
KAGGLE_USERNAME = row['username']
KAGGLE_KEY = row['key']

## Create Kaggle authentication function

In [0]:
def authentificate_kaggle(KAGGLE_USERNAME: str, KAGGLE_KEY: str):
    """Export the Kaggle credentials and return an authenticated API client.

    Args:
        KAGGLE_USERNAME: Kaggle account name; written to the
            ``KAGGLE_USERNAME`` environment variable.
        KAGGLE_KEY: Kaggle API key; written to the ``KAGGLE_KEY``
            environment variable.

    Returns:
        The ``KaggleApi`` instance on which ``authenticate()`` was called.

    Raises:
        ImportError: If the ``kaggle`` package cannot be imported.
    """
    import os

    # The credentials are exported BEFORE the kaggle import on purpose;
    # presumably the package reads them from the environment — keep this order.
    os.environ.update(
        {
            'KAGGLE_USERNAME': KAGGLE_USERNAME,
            'KAGGLE_KEY': KAGGLE_KEY,
        }
    )

    from kaggle.api.kaggle_api_extended import KaggleApi

    client = KaggleApi()
    client.authenticate()
    print('Authentification is successful')
    return client

## Call Kaggle authentication function

In [0]:
# Authenticate against Kaggle. If the `kaggle` package is not importable,
# install it with pip into the running interpreter and retry once.
try:
    api = authentificate_kaggle(KAGGLE_USERNAME, KAGGLE_KEY)
except ImportError as e:
    # Only a missing package can be fixed by installing it. Any other failure
    # (e.g. rejected credentials) now propagates immediately instead of
    # triggering a pointless pip install followed by the same error.
    print(e)
    print("Intalling the module: pip install kaggle")

    import importlib
    import subprocess
    import sys

    # sys.executable ensures pip targets the interpreter running this notebook.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "kaggle"])

    # Packages installed while the process is running may be hidden by the
    # import system's finder caches; clear them so the retry can import it.
    importlib.invalidate_caches()

    api = authentificate_kaggle(KAGGLE_USERNAME, KAGGLE_KEY)

## Define kaggle file parameters

In [0]:
# Kaggle dataset identifier, passed as `dataset=` to the download call.
# NOTE(review): looks like the "<owner>/<dataset-slug>" form — confirm.
file_name = 'arashnic/book-recommendation-dataset'
# Target directory for the downloaded files (passed as `path=`); the CSV reads
# for the bronze tables use paths under this same directory.
file_path = '/Volumes/dev_bronze/books/books_raw_data'

## Download Kaggle files

In [0]:
# Download the dataset's files into the target directory, unzipped.
api.dataset_download_files(
    dataset=file_name,
    path=file_path,
    unzip=True,
)

# Bronze tables creation

## Import of libraries

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, DecimalType

## Creation of `books_bronze` table

In [0]:
# Read the raw books CSV: first line is the header, column types are inferred.
books_bronze = spark.read.csv(
    "/Volumes/dev_bronze/books/books_raw_data/Books.csv",
    header=True,
    inferSchema=True,
)

# Persist as a table, replacing any previous contents.
(
    books_bronze.write
    .mode("overwrite")
    .saveAsTable("dev_bronze.books.books_bronze")
)

## Creation of `ratings_bronze` table

In [0]:

# Read the raw ratings CSV: first line is the header, column types are inferred.
ratings_bronze = spark.read.csv(
    "/Volumes/dev_bronze/books/books_raw_data/Ratings.csv",
    header=True,
    inferSchema=True,
)

# Persist as a table, replacing any previous contents.
(
    ratings_bronze.write
    .mode("overwrite")
    .saveAsTable("dev_bronze.books.ratings_bronze")
)

## Creation of `users_bronze` table

In [0]:

# Read the raw users CSV: first line is the header, column types are inferred.
users_bronze = spark.read.csv(
    "/Volumes/dev_bronze/books/books_raw_data/Users.csv",
    header=True,
    inferSchema=True,
)

# Persist as a table, replacing any previous contents.
(
    users_bronze.write
    .mode("overwrite")
    .saveAsTable("dev_bronze.books.users_bronze")
)