# {TPL_PROJECT}

This is the `app` notebook.

## Pre-requisites

### For Python


In [None]:
import sys

# install additional libraries here.
!{sys.executable} -m pip install delta-spark minio pysftp pandas retrying

## Execution

### Dependencies


In [None]:
import io
import math
import json
import os
import random
import requests
import secrets
import socket
import string

from pprint import pprint, pformat
from datetime import datetime


In [None]:
import pandas as pd

from delta import *
from minio import Minio
from pyspark import SparkConf
from pyspark.sql import functions as F, DataFrame, SparkSession
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructType
from retrying import retry


In [None]:
import logging

# Notebook-wide logger, named after the executing module.
logger = logging.getLogger(__name__)

# logging.getLogger() hands back the same logger object for the same name, so
# handlers attached by a previous run of this cell are still present. Drop them
# first; otherwise every re-run adds two more handlers and each record is
# printed once per stale handler.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

# One format shared by both handlers: timestamp, level, logger name and the
# module/function that issued the record.
log_formatter = logging.Formatter('%(asctime)s %(levelname)s [Notebook][%(name)s] [%(module)s.%(funcName)s] %(message)s')

# Handler bound to whatever sys.stdout is when this cell runs.
# NOTE(review): inside a Jupyter kernel this is presumably the cell output
# stream - confirm for the target runtime.
default_handler = logging.StreamHandler(sys.stdout)
default_handler.setLevel(logging.DEBUG)
default_handler.setFormatter(log_formatter)

# Handler bound to sys.__stdout__, the interpreter's original standard output,
# so records also reach the process console when sys.stdout has been replaced.
console_handler = logging.StreamHandler(sys.__stdout__)
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(log_formatter)

# Attach both handlers to the logger.
logger.addHandler(default_handler)
logger.addHandler(console_handler)

# Let everything from DEBUG upwards through; the handlers use the same level.
logger.setLevel(logging.DEBUG)

### Configure (parameters injected by Papermill)


In [None]:
# Defaults for the run parameters. Both start empty (falsy): a later cell loads
# config.json from the working directory while `config` is still empty, and the
# current directory stays the working directory while `workingDir` is empty.
# NOTE(review): presumably Papermill overrides these through an injected
# parameters cell - confirm this cell carries the `parameters` tag.
config = {}
# NOTE(review): a supplied workingDir is appended to sys.path later on, so it
# is presumably a path string even though the default here is an empty dict.
workingDir = {}


### Set system variables


In [None]:
# Resolve the directories this run works from: the process's current directory
# and the user's home directory.
working_dir = os.getcwd()
home_dir = os.path.expanduser("~")

# A non-empty workingDir parameter replaces the current directory and is also
# put on the import search path.
if workingDir:
    sys.path.append(workingDir)
    working_dir = workingDir

logger.info("home_dir: %s", home_dir)
logger.info("working_dir: %s", working_dir)

# Record the effective import search path.
import_search_path = pformat(sys.path)
logger.info(import_search_path)


In [None]:
from core import *

# When no configuration was supplied as a parameter, read it from config.json
# in the working directory.
if not config:
    config_path = working_dir + "/config.json"
    with open(config_path, mode="r", encoding="utf-8") as config_file:
        config = json.load(config_file)

# Turn the plain dictionary into the configuration object used from here on.
config = configuration_from_dict(config)


In [None]:
from core.enums import *
from core.spark import *

from app import *
from app.config import *


### Variables


In [None]:
# Operation object built from the loaded configuration and the notebook logger;
# the later cells call it to start Spark and print debug variables.
operation = Operation(config=config, logger=logger)

# One wall-clock instant (naive, local time) that every value below derives from.
current_datetime = datetime.now()

# Epoch milliseconds of that same instant. datetime.timestamp() treats a naive
# datetime as local time, which is exactly what datetime.now() returned. Going
# through utcnow() instead yields a naive UTC value that timestamp() then
# misreads as local time, shifting the result by the host's UTC offset.
current_timestamp = int(current_datetime.timestamp() * 1000)

# Compact local-time stamp, e.g. 2024-01-31_235959, used when naming the run.
execute_timestamp = current_datetime.strftime("%Y-%m-%d_%H%M%S")


In [None]:
# Name components come from the `app` section of the configuration.
# NOTE(review): app_name_suffix is not used by any cell visible here.
app_name_prefix, app_name_suffix = config.app.name, config.app.suffix

# Application name for this run: the configured prefix, an underscore
# separator and the execution timestamp, handed to gen_name.
app_name = gen_name(
    app_name_prefix,
    "_",
    execute_timestamp,
)

### Initializing spark


In [None]:
# Maven coordinates handed to operation.initialize_spark as extra packages.
# The commented-out entries are optional connectors kept for reference.
spark_packages = [
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
    "org.apache.hadoop:hadoop-aws:3.3.4",
    # "com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21",
    # "org.mongodb.spark:mongo-spark-connector:10.0.3",
    "com.solytic:spark-mssql-connector_2.12:1.4.0",
    "org.apache.hadoop:hadoop-azure:3.3.4",
    "com.azure:azure-storage-blob:12.20.2",
]

# Start Spark under the generated application name.
operation.initialize_spark(
    name=app_name,
    packages=spark_packages,
)

# Keep the SparkContext of the current session at hand.
sc = operation.get_current_spark_session().sparkContext
logger.info("SparkContext: %s", sc)

# Ask the JVM side which Hadoop version is on the classpath.
hadoop_version = sc._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion()
logger.info("Hadoop version: %s", hadoop_version)


#### Debugging global variables


In [None]:
# Debugging aid: log the reference instant captured for this run, both as a
# datetime and as the derived integer timestamp.
logger.info("current datetime: %s", current_datetime)
logger.info("current timestamp: %s", current_timestamp)

# Let the operation object report its own debug variables.
operation.print_debug_vars()


### Execute


In [None]:
# add more functions here.