# Getting started

In [None]:
# Before you start, you should read the **README** file carefully. 
# There, we walk you through setting up your local environment - a crucial step to ensure proper execution of the code in the sections below.
# Happy coding! 🚀

# Default imports

In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from delta.tables import DeltaTable
from pyspark.sql import DataFrame

# Insert Access Token to get access to AWS environment

In [None]:
import requests

# Exchange the Jupyter access token for the AWS credentials used by the Spark session.
# Replace the placeholder below with your own token, and avoid committing the real value.
resp = requests.get(
    "https://api.nekt.ai/api/v1/jupyter-credentials/",
    headers={"X-Jupyter-Token": "INSERT_ACCESS_TOKEN_HERE"},
    timeout=30,  # fail instead of hanging indefinitely if the API is unreachable
)
# Stop here on an HTTP error (e.g. a rejected token) instead of continuing with an
# error payload that lacks the credential keys read when building the Spark config.
resp.raise_for_status()
credentials = resp.json()

# Create session

In [None]:
# Build the Spark configuration (including your AWS credentials) and start the session.
# The inline comments point out the values you may want to adjust.

conf = SparkConf()

# Application identity and execution mode
conf.setAppName("Nekt-Transformation")  # replace with your desired name
conf.setMaster("local[*]")  # replace the * with your desired number of cores. * for use all.

# Delta Lake + S3 support
conf.set("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0,org.apache.hadoop:hadoop-aws:3.3.4")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")

# AWS credentials for the S3A filesystem
conf.set("spark.hadoop.fs.s3a.access.key", credentials["aws_access_key_id"])
conf.set("spark.hadoop.fs.s3a.secret.key", credentials["aws_secret_access_key"])
conf.set("spark.hadoop.fs.s3a.session.token", credentials["aws_session_token"])  # optional

# Local tuning
conf.set("spark.sql.shuffle.partitions", "4")  # default is 200 partitions which is too many for local

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Load input tables

In [None]:
INPUT_TABLES = []  # replace with your input tables code generated on the platform

### Load input tables
# delta_dict maps layer name -> table name -> DeltaTable (or None when the table failed to load).
delta_dict = {}
for table in INPUT_TABLES:
    table_layer = table.get("layer")
    table_name = table.get("name")
    delta_dict.setdefault(table_layer, {})

    try:
        delta_dict[table_layer][table_name] = DeltaTable.forPath(spark, table.get("path"))
    except Exception as exc:
        # Best effort: record the failure and keep loading the remaining tables.
        # `Exception` (rather than a bare `except`) lets Ctrl+C / kernel interrupts through,
        # and the reason is printed so the failure can be diagnosed.
        delta_dict[table_layer][table_name] = None
        print(f'Failed to load delta table "{table_name}" from layer "{table_layer}": {exc}')
    else:
        print(f'Delta table "{table_name}" loaded from layer "{table_layer}".')

# User transformation

## Custom imports

In [None]:
# [OPTIONAL] Add more dependencies here as needed to perform your transformation
# Since we're using poetry, you should run `poetry add <package-name>` in order to install it in your virtual environment
# A list of existing dependencies can be found in the poetry.lock file

## Load dataframes

In [None]:
# For each data frame you want to access in your notebook, run the following code:

# delta_table: DeltaTable = delta_dict.get("layer_name", {}).get("table_name")
# df: DataFrame = delta_table.toDF()

# Where `layer_name` is a string with the name of the layer that contains the specific table you want. For example: 'service' or 'trusted'
# And `table_name` is a string with the name of the table you want to access
# The table names and layers are all listed in the INPUT_TABLES object from the 'Load input tables' section above

delta_table: DeltaTable = delta_dict.get("INSERT_LAYER_NAME_HERE", {}).get("INSERT_TABLE_NAME_HERE")
if delta_table is None:
    # Either the layer/table name is not in delta_dict, or the table failed to load
    # (in which case it is stored as None). Fail with a clear message instead of an
    # AttributeError on NoneType.
    raise ValueError(
        "Table not available: check the layer and table names against INPUT_TABLES "
        "and the output of the 'Load input tables' section."
    )
df: DataFrame = delta_table.toDF() # rename your data frame as you wish!

## Helpers

In [None]:
# [OPTIONAL] Add custom functions to be used later in your code here

## Transformation script

In [None]:
# Use this section (and feel free to add more sections & sub-sections!) to run your own code related to your transformation.
# The sections below expect your final dataframe to be named 'new_df'.

## Transformation test

In [None]:
# Run some (or all) of the commands below to validate your final dataframe and make sure it is what you need.
# These examples assume the final dataframe is named 'new_df'.

print(new_df.columns)
new_df.show(n=5, truncate=False)
new_df.count()

## Final result

In [None]:
# Now, wrap all the code needed to build your final dataframe in the function below.
# Copy only the statements that are part of the transformation itself - leave out anything that prints, counts or otherwise just verifies the data.
# Don't forget to test it in the next block of code!

def user_transformation(delta_dict):
    """Build the final dataframe from the loaded input tables.

    Args:
        delta_dict: nested mapping of layer name -> table name -> DeltaTable,
            as built in the 'Load input tables' section.

    Returns:
        The transformed dataframe (`new_df`).
    """
    # Load dataframe section
    delta_table: DeltaTable = delta_dict.get("INSERT_LAYER_NAME_HERE").get("INSERT_TABLE_NAME_HERE")
    df: DataFrame = delta_table.toDF()

    # Custom imports section

    # Helpers section

    # Transformation scripts section
    # REPLACE the placeholder line below with your transformation code, ending with `new_df = ...`.
    # It keeps the function self-contained: without a local assignment, `return new_df` would
    # silently pick up the notebook-level `new_df` (or raise NameError) and hide missing code.
    new_df: DataFrame = df

    return new_df

## Function test

In [None]:
# Run this block of code to ensure you haven't missed any piece of code and that your final function returns the desired dataframe.
# You DON'T need to copy it to the platform, it's just a verification step.

dataframe = user_transformation(delta_dict)
dataframe.show(n=5)

# Add your transformation

Now that you have tested your final function, go to [Add transformation](https://app.nekt.ai/transformations/add-transformation), select your input tables, give your new table a name, and paste the `user_transformation(delta_dict)` function into the code section. 

If your transformation needs additional dependencies, make sure to add them in the 'Define your dependencies' section as well. Proceed with the flow and you'll have the new table in your Lakehouse, ready to be used!