In [None]:
# Import app configuration
from typing import Dict
from config import APP_CONFIG

# Pull each configuration section out of APP_CONFIG; a missing section
# falls back to an empty dict so the later .get(...) defaults still apply.
spark_config_dict: Dict[str, str] = APP_CONFIG.get('spark', {})
input_config_dict: Dict[str, str] = APP_CONFIG.get('input', {})
output_config_dict: Dict[str, str] = APP_CONFIG.get('output', {})

# Spark application name, defaulting to 'spark-app' when not configured.
SPARK_APP_NAME = spark_config_dict.get('name', 'spark-app')

In [None]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.column import Column
from pyspark.sql.dataframe import DataFrame

# Export the configured memory setting through the SPARK_MEM environment variable.
os.environ['SPARK_MEM'] = spark_config_dict.get('memory', '24g')

# Each Spark property is read from the 'spark' config section under the key
# without the 'spark.' prefix, with the fallback shown when it is absent.
spark_conf = SparkConf().setAll([
    ('spark.driver.memory', spark_config_dict.get('driver.memory', '4g')),
    ('spark.executor.memory', spark_config_dict.get('executor.memory', '5g')),
    ('spark.executor.cores', spark_config_dict.get('executor.cores', '3')),
    ('spark.executor.instances', spark_config_dict.get('executor.instances', '4')),
    ('spark.dynamicAllocation.enabled', spark_config_dict.get('dynamicAllocation.enabled', 'false')),
])

# Build the session (or reuse an already running one) with the settings above.
spark_session = (
    SparkSession.builder
    .appName(SPARK_APP_NAME)
    .master(spark_config_dict.get('master', 'local'))
    .config(conf=spark_conf)
    .getOrCreate()
)

# Log verbosity comes from the config as well; 'WARN' when not set.
spark_session.sparkContext.setLogLevel(spark_config_dict.get('logLevel', 'WARN'))

In [None]:
# Load the two request datasets (JSON) and the tender cost table (CSV).
request_with_pages_df: DataFrame = spark_session.read.json('resources/request_with_pages.json')
request_with_cap_df: DataFrame = spark_session.read.json('resources/request_with_cap.json')
# The CSV is read without header or schema options, so later cells address
# its columns by the positional names _c0 ... _c9.
tender_costs_df: DataFrame = spark_session.read.csv('resources/tender_costs.csv')

In [None]:
# Attach the cap of each request to its page/product data, matching rows on
# the shared 'request' column, and keep only the fields needed for pricing.
request_with_cap_and_pages_df: DataFrame = (
    request_with_pages_df
    .join(request_with_cap_df, on=['request'])
    .select(
        request_with_pages_df['iun'],
        request_with_pages_df['request'],
        request_with_pages_df['pages'],
        request_with_pages_df['product_type'],
        request_with_cap_df['cap'],
    )
)

In [None]:
# Positional tender-cost columns and the names they get in the result.
# _c7 and _c9 are used only as join keys and are not carried over.
tender_price_columns = {
    '_c0': 'basePrice',
    '_c1': 'basePrice100',
    '_c2': 'basePrice1000',
    '_c3': 'basePrice2000',
    '_c4': 'basePrice250',
    '_c5': 'basePrice350',
    '_c6': 'basePrice50',
    '_c8': 'pagePrice',
}

# A tender row applies to a request when its _c7 value contains the request's
# cap as a substring and its _c9 value equals the request's product type.
request_with_cost_df: DataFrame = (
    request_with_cap_and_pages_df
    .join(
        tender_costs_df,
        on=[
            tender_costs_df['_c7'].contains(request_with_cap_and_pages_df['cap']),
            tender_costs_df['_c9'] == request_with_cap_and_pages_df['product_type'],
        ],
    )
    .select(
        request_with_cap_and_pages_df['iun'],
        request_with_cap_and_pages_df['request'],
        request_with_cap_and_pages_df['pages'],
        request_with_cap_and_pages_df['product_type'],
        *[
            tender_costs_df[source_name].alias(price_name)
            for source_name, price_name in tender_price_columns.items()
        ],
    )
)

In [None]:
# Factor the completed price is multiplied by to obtain the "without VAT" figure.
vat_scale_const: float = 0.9304056

# Weight grows by 5 per page on top of a fixed 5 (unit not stated here — presumably grams; TODO confirm).
weight_column: Column = 5 + (5 * request_with_cost_df.pages)

# Pick the base price of the first weight bracket the item fits in.
# There is no otherwise(): weights above 2000 yield NULL, and so does every price derived from it.
base_price_for_weight: Column = (
    when(weight_column <= 20, request_with_cost_df.basePrice)
    .when(weight_column <= 50, request_with_cost_df.basePrice50)
    .when(weight_column <= 100, request_with_cost_df.basePrice100)
    .when(weight_column <= 250, request_with_cost_df.basePrice250)
    .when(weight_column <= 350, request_with_cost_df.basePrice350)
    .when(weight_column <= 1000, request_with_cost_df.basePrice1000)
    .when(weight_column <= 2000, request_with_cost_df.basePrice2000)
)

# Numeric price: bracket base price plus the per-page price for every page after the first.
# NOTE(review): the tender columns come from a CSV read without a schema, so this
# relies on Spark coercing them to numbers — confirm the file holds plain decimals.
raw_price_column: Column = base_price_for_weight + (request_with_cost_df.pagePrice * (request_with_cost_df.pages - 1))

# format_number() returns a *string* such as '1,234.56'. Doing arithmetic on that
# string breaks for amounts of 1,000 or more, because the thousands separator
# cannot be cast back to a number. Keep the maths numeric and format only for output.
price_column: Column = format_number(raw_price_column, 2).alias('completedPrice')

# The VAT factor is still applied to the price rounded to two decimals;
# bround() uses the same HALF_EVEN rounding mode as format_number().
price_column_without_vat: Column = format_number(
    bround(raw_price_column, 2) * vat_scale_const, 2
).alias('completedPriceWithoutVat')

request_with_price_df: DataFrame = request_with_cost_df.select(
    request_with_cost_df.iun,
    request_with_cost_df.request,
    price_column,
    price_column_without_vat,
)
request_with_price_df.show(truncate=False)

In [None]:
# Persist the per-request prices as ';'-delimited CSV with a header row,
# replacing any previous output. Spark writes a directory of part files at this path.
request_with_price_df.write.mode('overwrite').options(header='True', delimiter=';').csv('resources/prices.csv')

# completedPrice / completedPriceWithoutVat hold format_number() strings, which carry
# ',' thousands separators for amounts of 1,000 or more. Summing them directly would
# drop those rows from the total (the implicit cast yields NULL), so strip the
# separators and cast to double first. `sum` here is pyspark.sql.functions.sum
# (brought in by the star import), not the Python builtin.
total_price: Column = sum(
    regexp_replace(request_with_price_df.completedPrice.cast('string'), ',', '').cast('double')
)
total_price_without_vat: Column = sum(
    regexp_replace(request_with_price_df.completedPriceWithoutVat.cast('string'), ',', '').cast('double')
)

request_with_price_df.select(
    format_number(total_price, 2).alias('totalCompletedPrice'),
    format_number(total_price_without_vat, 2).alias('totalCompletedPriceWithoutVat'),
).show(truncate=False)