In [0]:
%pip install kaggle

In [0]:
# Restart the Python process; presumably so the package installed by the
# preceding %pip cell is picked up by the cells that follow.
dbutils.library.restartPython()

In [0]:
from pyspark.sql import functions as f 
from pyspark.sql.types import *

from datetime import datetime
import pytz

import os
import shutil

In [0]:
# Ingestion date (YYYY-MM-DD) evaluated in the America/Sao_Paulo timezone.
sao_paulo_now = datetime.now(pytz.timezone('America/Sao_Paulo'))
particao = sao_paulo_now.strftime('%Y-%m-%d')
print(f"particao: {particao}")

In [0]:
# Per-user directory that will receive the Kaggle credentials file.
kaggle_dir = os.path.expanduser("~/.kaggle")
print(f"kaggle_dir_raiz:      {kaggle_dir}")

# Create the directory when missing; exist_ok keeps re-runs of this cell harmless.
os.makedirs(kaggle_dir, exist_ok=True)
print(f"kaggle_dir_destino:   {kaggle_dir}")

In [0]:
# Where the credentials file is read from (landing-zone volume) and where it
# is copied to (inside the Kaggle config directory).
source_path = "/Volumes/workspace/default/landing_zone/kaggle.json"
destination_path = os.path.join(kaggle_dir, "kaggle.json")

print(f" > source_path:       {source_path}")
print(f" > destination_path:  {destination_path}")

In [0]:
# Copy the Kaggle credentials file into the Kaggle config directory and
# restrict it to owner read/write.
#
# Failures are logged and then re-raised: the cells that follow cannot
# authenticate without this file, so continuing would only surface a less
# helpful error later in the run.
try:
    shutil.copy(source_path, destination_path)
    print(f"Arquivo '{source_path}' \ncopiado para '{destination_path}' com sucesso.")
    # Tighten permissions for safety: read/write for the owner only.
    os.chmod(destination_path, 0o600)
    print(f"Permissões de '{destination_path}' ajustadas para 0o600.")
except FileNotFoundError:
    print(f"Erro: O arquivo de origem '{source_path}' não foi encontrado.")
    raise
except Exception as e:
    print(f"Ocorreu um erro ao copiar o arquivo: {e}")
    raise

In [0]:
# Build the Kaggle API client and authenticate it.
# NOTE(review): this import sits here rather than in the imports cell. The
# kaggle package appears to look for credentials when it is imported, so the
# kaggle.json copy must have run first — confirm before moving it.
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

In [0]:
# Download the dataset into the landing-zone volume, unzipping it in place.
# Alternative dataset kept for reference:
# dataset = "thedevastator/unlock-profits-with-e-commerce-sales-data"
dataset = "rush4ratio/video-game-sales-with-ratings"
api.dataset_download_files(
    dataset,
    path="/Volumes/workspace/default/landing_zone/",
    unzip=True,
)

In [0]:
# List the landing zone to check what the download produced.
landing_zone_files = dbutils.fs.ls('/Volumes/workspace/default/landing_zone/')
display(landing_zone_files)

In [0]:
# Locate the downloaded file in the landing zone by a fragment of its name.
# Paths are collected with the "dbfs:" scheme prefix stripped.
file_name_contains = 'Video_Games_Sales'
matching_files = [
    file.path.replace("dbfs:", "")
    for file in dbutils.fs.ls('/Volumes/workspace/default/landing_zone/')
    if file_name_contains in file.name
]

# Fail fast when nothing matches: otherwise source_file would be undefined
# (or, in a reused session, still hold a path from a previous run) and the
# read / delete cells further down would act on the wrong thing.
if not matching_files:
    raise FileNotFoundError(
        f"Nenhum arquivo contendo '{file_name_contains}' foi encontrado em "
        "'/Volumes/workspace/default/landing_zone/'."
    )

for matching_file in matching_files:
    print(matching_file)

# When several files match, the last one listed wins (same as before).
source_file = matching_files[-1]

In [0]:
# Explicit schema for the CSV: every column is declared as a nullable string,
# in the order listed here.
struct = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'Name',
        'Platform',
        'Year_of_Release',
        'Genre',
        'Publisher',
        'NA_Sales',
        'EU_Sales',
        'JP_Sales',
        'Other_Sales',
        'Global_Sales',
        'Critic_Score',
        'Critic_Count',
        'User_Score',
        'User_Count',
        'Developer',
        'Rating',
    )
])

In [0]:
# Read the located CSV using the explicit schema; the file is comma-separated
# and its first row is treated as the header.
csv_reader = (
    spark.read
    .format("csv")
    .options(sep=',', header='true')
    .schema(struct)
)
df = csv_reader.load(source_file)

display(df.limit(10))

In [0]:
# Stamp every row with the ingestion date computed earlier in the notebook.
df = df.withColumn(
    "data_ingestao",
    f.lit(particao),
)

In [0]:
# Preview the first rows of the DataFrame.
display(df.limit(10))

In [0]:
# Create the bronze catalog if it does not exist yet.
spark.sql("CREATE CATALOG if NOT EXISTS bronze")

In [0]:
# Create the ingestion schema inside the bronze catalog if it does not exist.
# The name is fully qualified on purpose: no USE CATALOG is in effect in this
# notebook, so an unqualified name would be created in the session's current
# catalog instead of the one the table is written to (bronze.ingestion).
spark.sql("CREATE DATABASE IF NOT EXISTS bronze.ingestion")
#spark.sql("drop table if exists bronze.default.video_games_sales")

In [0]:
# spark.sql("USE CATALOG bronze")

In [0]:
# Append the DataFrame to the bronze Delta table, partitioned by data_ingestao.
delta_writer = (
    df.write
    .format("delta")
    .mode("append")
    .partitionBy("data_ingestao")
)

# NOTE(review): saveAsTable is not expected to return a useful value, so
# `query` most likely ends up as None; the name is kept for compatibility.
query = delta_writer.saveAsTable("bronze.ingestion.video_games_sales")


In [0]:
# Read a few rows back from the table to check the write.
bronze_sample = spark.table("bronze.ingestion.video_games_sales").limit(10)
display(bronze_sample)

In [0]:
# Remove the file that was just read and ingested.

dbutils.fs.rm(source_file)


In [0]:
%sql
-- Show the table's detail metadata.
describe detail bronze.ingestion.video_games_sales

In [0]:
%sql
-- Show the table's history.
describe history bronze.ingestion.video_games_sales

In [0]:
#spark.sql( 'DROP TABLE IF EXISTS bronze.ingestion.video_games_sales' )