## OpenDict DEMO

### 1. Install client libraries

In [None]:
%pip install pyspark-opendic==0.4.0
%pip install snowflake-opendic==0.1.21

### 2. Configure spark-iceberg session

#### 2.1 read_secret()

In [None]:
from pyspark.sql import SparkSession  # type: ignore

def read_secret(secret_name):
    """Return the docker-compose secret `secret_name`, or None when it is absent.

    The value is read from ``/run/secrets/<secret_name>`` and surrounding
    whitespace (such as a trailing newline) is stripped. A missing file is
    reported on stdout instead of raising.
    """
    value = None
    try:
        with open(f"/run/secrets/{secret_name}", "r") as secret_file:
            value = secret_file.read().strip()
    except FileNotFoundError:
        print(f"Secret {secret_name} not found.")
    return value

#### 2.2 Define session variables

In [None]:
# Polaris principal credentials, read from the docker-compose secret store.
# read_secret() returns None when a secret file is missing.
ENGINEER_CLIENT_ID = read_secret("engineer_client_id")
ENGINEER_CLIENT_SECRET =  read_secret("engineer_client_secret")

# Settings for the local Polaris instance; create_session() reads CATALOG_NAME
# and CATALOG_URI from module scope, the other two are passed in as arguments.
CATALOG_NAME = 'polaris'
FILE_IO = "org.apache.iceberg.io.ResolvingFileIO"
CATALOG_URI = "http://polaris:8181/api/catalog"
# NOTE(review): "OATH" looks like a typo for "OAUTH"; the name is kept because
# other cells reference it.
OATH_SERVER_URI = "http://polaris:8181/api/catalog/v1/oauth/tokens"

# Alternative settings (these appear to target a hosted, Azure-backed catalog);
# uncomment to use them instead of the local values above.
# OATH_SERVER_URI = "https://opendict.duckdns.org/api/catalog/v1/oauth/tokens"
# CATALOG_NAME = 'AZURE_CATALOG'
# ADLS_IO="org.apache.iceberg.azure.adlsv2.ADLSFileIO"
# CATALOG_URI="https://opendict.duckdns.org/api/catalog"


#### 2.3 Configure spark session with variables

In [None]:
def create_session(client_id, client_secret, scope, fileio_impl, oath_server_uri):
    """Build (or reuse) a SparkSession wired to the Polaris Iceberg REST catalog.

    The catalog is registered under the name ``polaris`` and made the default
    catalog. ``CATALOG_NAME`` and ``CATALOG_URI`` are read from module scope.

    Args:
        client_id: OAuth2 client id of the Polaris principal.
        client_secret: OAuth2 client secret of the Polaris principal.
        scope: OAuth2 scope requested for the token, e.g. ``PRINCIPAL_ROLE:ALL``.
        fileio_impl: Fully qualified Iceberg FileIO class name used as ``io-impl``.
        oath_server_uri: OAuth2 token endpoint the REST catalog should use.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    spark = (
        SparkSession.builder
        .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.0,software.amazon.awssdk:bundle:2.28.17,software.amazon.awssdk:url-connection-client:2.28.17")
        .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
        .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
        .config("spark.sql.catalog.polaris", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.polaris.type", "rest")
        .config("spark.sql.catalog.polaris.warehouse", CATALOG_NAME)
        .config("spark.sql.catalog.polaris.uri", CATALOG_URI)
        .config("spark.sql.catalog.polaris.credential", f"{client_id}:{client_secret}")
        .config("spark.sql.catalog.polaris.scope", scope)
        .config("spark.sql.catalog.polaris.auth.type", "OAUTH2")
        .config("spark.sql.defaultCatalog", "polaris")
        # Catalog properties only reach Iceberg when prefixed with
        # "spark.sql.catalog.<name>."; a bare "oauth2-server-uri" key is just an
        # unrelated Spark conf entry and the token endpoint would be ignored.
        .config("spark.sql.catalog.polaris.oauth2-server-uri", oath_server_uri)
        # Keep the Ivy cache under /tmp so resolving the packages above does not
        # depend on a writable home directory.
        .config("spark.driver.extraJavaOptions", "-Divy.cache.dir=/tmp -Divy.home=/tmp")
        .config("spark.sql.catalog.polaris.token-refresh-enabled", "true")
        .config("spark.sql.catalog.polaris.header.X-Iceberg-Access-Delegation", 'vended-credentials')
        .config("spark.sql.catalog.polaris.io-impl", fileio_impl)
        .config("spark.history.fs.logDirectory", "/home/iceberg/spark-events")
        .getOrCreate()
    )

    print("Spark Running")
    return spark


## Start Spark Session
# Azure variant (requires the commented-out ADLS_IO / OATH_SERVER_URI / CATALOG_* settings to be enabled first):
# spark = create_session(client_id=ENGINEER_CLIENT_ID, client_secret=ENGINEER_CLIENT_SECRET, scope='PRINCIPAL_ROLE:ALL',fileio_impl=ADLS_IO, oath_server_uri=OATH_SERVER_URI )
spark = create_session(client_id=ENGINEER_CLIENT_ID, client_secret=ENGINEER_CLIENT_SECRET, scope='PRINCIPAL_ROLE:ALL',fileio_impl=FILE_IO, oath_server_uri=OATH_SERVER_URI )
# Bare expression: shows the session as the cell output.
spark

#### 2.4 Wrap spark session with the opendict-spark library

In [None]:
from pyspark_opendic.catalog import OpenDicCatalog  # type: ignore

# Init opendict client library
# Hosted alternative:
# API_URI= "https://opendict.duckdns.org/api"

# Base URL of the local Polaris API (no "/catalog" suffix, unlike CATALOG_URI).
POLARIS_BASE_URL = "http://polaris:8181/api"

# Wrap the Spark session; later cells issue OpenDict statements via catalog.sql(...).
catalog = OpenDicCatalog(spark, POLARIS_BASE_URL)
print("Catalog initialized")

### 3. Set up opendict-snowflake

In [None]:
from snowflake_opendic.snow_opendic import snowflake_connect # type: ignore

def read_secret(secret_name):
    """Look up `secret_name` in the docker-compose secret store.

    Reads ``/run/secrets/<secret_name>`` and returns its text with leading and
    trailing whitespace removed. If the file does not exist, a message is
    printed and None is returned.
    """
    try:
        with open(f"/run/secrets/{secret_name}", "r") as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"Secret {secret_name} not found.")
        return None
    return contents.strip()

def snowflake_init_db(conn):
    """Ensure the OPENDIC database and its EXPERIMENT schema exist.

    Runs three statements, in order, on a single cursor obtained from `conn`:
    create the database if missing, switch to it, then create the schema if
    missing. The cursor is closed by its context manager.
    """
    bootstrap_statements = (
        "CREATE DATABASE IF NOT EXISTS OPENDIC;",
        "use OPENDIC;",
        "CREATE SCHEMA IF NOT EXISTS EXPERIMENT;",
    )
    with conn.cursor() as cursor:
        for statement in bootstrap_statements:
            cursor.execute(statement)

# Polaris principal credentials; read_secret() yields None for a missing secret.
ENGINEER_CLIENT_ID = read_secret("engineer_client_id")
ENGINEER_CLIENT_SECRET = read_secret("engineer_client_secret")

print("Secrets read ✔️")

# Path of the Snowflake connection config handed to snowflake_connect().
# Plain literal: there is nothing to interpolate, so no f-string is needed.
config_path = "/run/secrets/snowflake-conf"
SNOWFLAKE_CONN = snowflake_connect(config_path)
# Make sure the OPENDIC database and EXPERIMENT schema exist before syncing.
snowflake_init_db(SNOWFLAKE_CONN)

print("Snowflake conn initialized ✔️")


from snowflake_opendic.catalog import OpenDicSnowflakeCatalog


# NOTE(review): this is the hosted Polaris endpoint, while the Spark-side
# OpenDicCatalog is created against http://polaris:8181/api — confirm that the
# two clients are really meant to talk to different Polaris instances.
POLARIS_URI = "https://opendict.duckdns.org/api"

snowflake_catalog = OpenDicSnowflakeCatalog(SNOWFLAKE_CONN, POLARIS_URI, ENGINEER_CLIENT_ID, ENGINEER_CLIENT_SECRET)
print("Catalog initialized ✔️")

### 4. Scenario and datalake overview

#### 4.1 The AZURE DATALAKE


```
warehouse/
├── SYSTEM/
└── nyc/taxis
```

In [None]:
# List the namespaces of the session's default catalog (set to "polaris" in create_session).
spark.sql("SHOW NAMESPACES").toPandas()

#### 4.2 The taxis dataset

In [None]:
# Preview ten rows of nyc.taxis with the unconverted fare_amount column.
spark.sql("""
          SELECT tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, fare_amount 
          FROM nyc.taxis limit 10
          """).toPandas()

In [None]:
# Same preview, but with fare_amount passed through usd_to_dkk().
# NOTE: usd_to_dkk is only created (4.5) and synced to Spark (4.7) further down
# in this notebook; run those cells first, otherwise Spark cannot resolve it.
spark.sql("""
          SELECT tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, usd_to_dkk(fare_amount) 
          FROM nyc.taxis limit 10
          """).toPandas()

#### 4.3 Task definition

Convert fare_amount from USD to DKK and add the 25% Danish VAT (MOMS) rate.

**Problem**
- Access same table in Snowflake and Spark
- Want to define once
- Want updates to propagate

**Solution**: 
- OpenDict user-defined object
- Define, create, map, and sync function between multiple engines.

In [None]:
-- Code example
CREATE FUNCTION usd_to_dkk(amount FLOAT)
  RETURNS FLOAT
  AS
  $$
    amount * 6.52 * 1.25 
  $$

#### 4.4. Define the schema for an OpenDict function object

In [None]:
# Declare the OpenDict object type `function` and the property schema
# (name -> type) that every object of this type carries.
catalog.sql(
    """
    DEFINE OPEN function
    props {
        "args": "map",
        "language": "string",
        "def": "string",
        "comment": "string",
        "return_type": "string"
    }
    """
)

#### 4.5. Create a new function

In [None]:
# Create the `usd_to_dkk` object of type `function`, filling in the properties
# declared by DEFINE OPEN function. The body multiplies by 6.52 (presumably the
# USD->DKK rate — confirm) and by 1.25 for the 25% VAT named in the task.
catalog.sql(
 """
 CREATE OPEN function usd_to_dkk
    PROPS {
            "args": {
                "amount": "DOUBLE"
                },
            "language": "SQL",
            "def": "amount * 6.52 * 1.25",
            "comment": "Conversion function for USD to DKK including VAT",
            "return_type": "DOUBLE"
        }
"""
)

#### 4.6. Create Mappings for spark and snowflake

In [None]:
# Register the Spark DDL template for `function` objects. <placeholders> in
# SYNTAX are filled from the object's properties; PROPS describes how the
# "args" map is rendered ("<key> <value>" pairs joined with ", ").
catalog.sql(
"""
ADD OPEN MAPPING function PLATFORM spark
SYNTAX {
    CREATE <type> <name>(<args>)
    RETURNS <return_type>
    LANGUAGE <language>
    AS 'RETURN <def>';
}
PROPS {
    "args": {
            "propType": "map",
            "format": "<key> <value>",
            "delimiter": ", "
        }
}
"""
)

In [None]:
# Register the Snowflake DDL template for `function` objects. Unlike the Spark
# template it uses CREATE OR REPLACE and wraps the body in $$ ... $$.
catalog.sql(
"""
ADD OPEN MAPPING function PLATFORM snowflake
SYNTAX {
CREATE OR REPLACE <type> <name>(<args>)
RETURNS <return_type>
LANGUAGE <language>
AS 
$$
<def>
$$;
}
PROPS {
    "args": {
            "propType": "map",
            "format": "<key> <value>",
            "delimiter": ", "
        }
}
"""
)

#### 4.7 Sync to engines

In [None]:
# Sync the OpenDict objects to Spark through the Spark-side client.
catalog.sql(
    """
    SYNC OPEN OBJECTS for spark
    """
)

In [None]:
# Sync the OpenDict objects to Snowflake through the Snowflake-side client
# (snowflake_catalog, which holds SNOWFLAKE_CONN).
snowflake_catalog.sql(
    """
    SYNC OPEN OBJECTS for snowflake
    """
)

### Utils

#### List objects

In [None]:
# Ask the catalog for the OpenDict object types it knows about.
catalog.sql(
    """
    SHOW OPEN TYPES
    """
)

In [None]:
# Ask the catalog for the objects of type `function`.
catalog.sql(
    """
    SHOW OPEN function
    """
)

In [None]:
# Show mapping for <object> to <platform>. Example: [Platform_mapping(function_v2 -> snowflake)]
# NOTE(review): `function_v2` is never defined in this notebook (section 4.4
# defines `function`); confirm the type exists in the catalog or use `function`.
catalog.sql(
    """
    SHOW OPEN MAPPING function_v2 PLATFORM snowflake
    """
)

In [None]:
# Show all platforms that have a mapping for <object>. Example: [snowflake,spark]
catalog.sql(
    """
    SHOW OPEN PLATFORMS FOR function
    """
)

In [None]:
# Ask the catalog for every platform it knows about, regardless of object type.
catalog.sql(
    """
    SHOW OPEN PLATFORMS
    """
)

In [None]:
# Ask the catalog for the mappings registered for the snowflake platform.
catalog.sql(
    """
    SHOW OPEN MAPPINGS FOR snowflake
    """
)

In [None]:
# Sync a single object type for the snowflake platform.
# NOTE(review): `function_v2` is never defined in this notebook, and this goes
# through the Spark-side `catalog` whereas section 4.7 uses `snowflake_catalog`
# for the Snowflake sync — confirm both are intended.
catalog.sql(
    """
    SYNC OPEN function_v2 for snowflake
    """
)

In [None]:
# Sync all objects for the snowflake platform.
# NOTE(review): issued through the Spark-side `catalog`; section 4.7 runs the
# same statement through `snowflake_catalog` — confirm which client is intended.
catalog.sql(
    """
    SYNC OPEN OBJECTS for snowflake
    """
)

#### Drop objects

In [None]:
# Destructive: drop the `function` object type.
# NOTE(review): presumably this also removes its objects (e.g. usd_to_dkk) — confirm.
catalog.sql(
    """
    DROP OPEN function
    """
)


In [None]:
# Destructive: drop the mappings registered for the snowflake platform.
catalog.sql(
    """
    DROP OPEN MAPPINGS for snowflake
    """
)


In [None]:
# Destructive: drop the mappings registered for the spark platform.
catalog.sql(
    """
    DROP OPEN MAPPINGS for spark
    """
)

#### Visualize opendic tables

In [None]:
%%sql
show tables in SYSTEM

In [49]:
%%sql
show namespaces in SYSTEM

namespace
SYSTEM.PLATFORM_MAPPINGS


In [None]:
%%sql
USE SYSTEM

In [None]:
%%sql
select * from SYSTEM.function

In [None]:
%%sql

CREATE DATABASE IF NOT EXISTS nyc

In [None]:
%%sql

DROP TABLE IF EXISTS nyc.taxis

In [None]:
# Load the 2021-04 yellow-taxi parquet file and persist it as table nyc.taxis.
# saveAsTable is called without a mode, so it fails if the table already
# exists; run the DROP TABLE IF EXISTS cell above first when re-loading.
df = spark.read.parquet("/home/iceberg/data/yellow_tripdata_2021-04.parquet")
df.write.saveAsTable("nyc.taxis")

In [None]:
%%sql

SELECT *
FROM nyc.taxis limit 10