# Silver Layer - Data Transformation

## 1) Create Connection to Azure Storage account

In [0]:
# Identify the ADLS Gen2 account and the Azure AD service principal used to reach it.
storage_account = "team04sa"
application_id = "7ab46e7b-cc68-4f3f-9903-9a6bae8e347a"
directory_id = "b7a954b3-aa07-453e-b8a3-97101aeffcad"

# SECURITY: the client secret must not live in notebook source. It is fetched at
# run time from a Databricks secret scope so it never appears in the notebook or
# in version control.
# TODO(review): confirm the scope/key names below match the workspace, and rotate
# the secret that was previously hard-coded in this cell.
client_secret = dbutils.secrets.get(scope="team04-secrets", key="sp-client-secret")

# All ABFS OAuth settings are keyed by the account's DFS host name.
account_host = f"{storage_account}.dfs.core.windows.net"

spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
spark.conf.set(
    f"fs.azure.account.oauth.provider.type.{account_host}",
    "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
)
spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", application_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
spark.conf.set(
    f"fs.azure.account.oauth2.client.endpoint.{account_host}",
    f"https://login.microsoftonline.com/{directory_id}/oauth2/token",
)

## 2) Read Data From Bronze Layer

In [0]:
# Root locations of the bronze (input) and silver (output) layers.
bronze_path = "abfss://fooddata@team04sa.dfs.core.windows.net/bronze/"
silver_base = "abfss://fooddata@team04sa.dfs.core.windows.net/silver/"

# Read the bronze data. A Delta table stores its schema in its transaction log,
# so no explicit schema is passed (Delta rejects a user-specified schema on read)
# and the CSV-only "header" option is not needed.
bronze_df = spark.read.format("delta").load(bronze_path)

# Print the schema of the DataFrame that was just loaded.
bronze_df.printSchema()

## 3) Preparing Data for Enrichment

### 3.1) Import the Kaggle API Credentials File (kaggle.json)

In [0]:
# File location and type of the uploaded kaggle.json.
# NOTE(review): this file is expected to hold Kaggle API credentials — treat its
# contents as secret.
file_location = "/FileStore/tables/kaggle.json"
file_type = "json"

# CSV reader options (header, sep, inferSchema) have no effect on a JSON read,
# so none are passed here.
user_info = spark.read.format(file_type).load(file_location)

# Show only the column layout. Displaying the rows would print the credential
# values into the notebook output.
user_info.printSchema()

### 3.2) Create a View or Table

In [0]:
# Register the loaded kaggle.json DataFrame as a session-scoped temporary view
# so it can be queried by name from Spark SQL.
temp_table_name = "kaggle_json"
user_info.createOrReplaceTempView(name=temp_table_name)

### 3.3) Extra Safety

In [0]:
import os

# Fetch exactly one row with the two credential columns, without printing it.
# The credentials live in `user_info` (the DataFrame loaded from kaggle.json).
row = user_info.select("username", "key").first()

# `first()` returns None for an empty DataFrame; fail with a clear message
# instead of an opaque indexing error.
if row is None:
    raise ValueError("kaggle.json contained no rows; cannot set Kaggle credentials")

# Export the values as environment variables for the Kaggle client.
os.environ["KAGGLE_USERNAME"] = row["username"]
os.environ["KAGGLE_KEY"] = row["key"]

### 3.4) Save Data

In [0]:
!pip install kaggle
import kaggle, os

target_dir = "/dbfs/FileStore/kaggle"
os.makedirs(target_dir, exist_ok=True)

kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    "asaniczka/forex-exchange-rate-since-2004-updated-daily",
    path=target_dir,
    unzip=True
)

print("Download ok ->", target_dir)

## 4) Preprocessing Exchange Data

### 4.1) Read Data from storage

In [0]:
# Path of the downloaded daily forex rates CSV, as seen by Spark.
file_location = "/FileStore/kaggle/daily_forex_rates.csv"

# Load the CSV with its first row as column names and let Spark infer the types.
csv_reader = spark.read.options(header="true", inferSchema="true")
forex_df = csv_reader.csv(file_location)

# Quick sanity check: column types and the first five untruncated rows.
forex_df.printSchema()
forex_df.show(5, truncate=False)