### Importing the required libraries

In [1]:
import re
from pyspark.sql.types import *
import pyspark.sql.functions as sf
from pyspark.sql import Window
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, max
from datetime import date, datetime

### Start Spark Session

In [2]:
# Create (or reuse) the local, Hive-enabled Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("DataTransformation")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .config("spark.sql.warehouse.dir", "/storage_layer/gold/")
    # Disabled option, kept for reference:
    # .config("spark.sql.hive.convertMetastoreOrc", "false")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [3]:
# Take a single clock reading so that the date and the hour used in the paths
# always describe the same instant. Two separate reads could straddle midnight
# and produce a date/hour combination that never existed.
current_time = datetime.now()
today_date = current_time.date().isoformat()

# Silver layer directory of the current batch, partitioned by date and batch hour
silver_layer_directory_path = f'/user/itversity/q-retail-company/silver/{today_date}/hour-{current_time.hour}'

# Gold layer directory of the current batch, using the same partitioning
gold_layer_directory_path = f'/user/itversity/q-retail-company/gold/{today_date}/hour-{current_time.hour}'

## Utility Functions
---

### Functions for interacting with Hive tables

In [4]:
# Name of the Hive database that the dimension tables are read from and written to
database_name = "qcompany"

# a function that reads a df from hive table
def get_table_data(database_name, table_name):
    """
    Read a Hive table into a DataFrame.

    Args:
        database_name: name of the database that contains the table.
        table_name: name of the table to read.

    Returns:
        DataFrame backed by `database_name.table_name`.
    """
    qualified_name = f"{database_name}.{table_name}"
    # Spark SQL's table refresh statement is `REFRESH TABLE <name>`; the previous
    # `REFRESH TABLES` did not target any table. Refresh exactly the table we are
    # about to read so that cached metadata/file listings are not stale.
    spark.catalog.refreshTable(qualified_name)
    return spark.table(qualified_name)

# a function that saves a df as a hive table, replacing what is already there
def write_df_to_table(database_name, table_name, df):
    """
    Save `df` as the table `database_name.table_name` in overwrite mode.

    Args:
        database_name: name of the target database.
        table_name: name of the target table.
        df: DataFrame to persist.
    """
    # NOTE(review): Spark SQL documents `REFRESH TABLE <name>`; confirm that
    # `REFRESH TABLES` has the intended effect.
    spark.sql("REFRESH TABLES")
    target = f"{database_name}.{table_name}"
    writer = df.write.mode("overwrite")
    writer.saveAsTable(target)


# a function that lists the names of the tables of a database
def get_current_tables(database_name):
    """
    Return the table names reported by `SHOW TABLES IN <database_name>`.

    Args:
        database_name: name of the database to inspect.

    Returns:
        list of table name strings.
    """
    # NOTE(review): Spark SQL documents `REFRESH TABLE <name>`; confirm that
    # `REFRESH TABLES` has the intended effect.
    spark.sql("REFRESH TABLES")
    listing = spark.sql(f"SHOW TABLES IN {database_name}").collect()
    names = []
    for entry in listing:
        names.append(entry.tableName)
    return names

# a function that tells whether a table is present in a database
def table_exists(table_name, database_name):
    """
    Check whether `table_name` is among the tables listed for `database_name`.

    Args:
        table_name: name of the table to look for (compared as-is).
        database_name: name of the database to search in.

    Returns:
        True when the table is listed, False otherwise.
    """
    return table_name in get_current_tables(database_name)
    
# a function that writes the current table to a temp table and then insert overwrite the original table
def overwriteCurrentTable(final_df_to_write, final_table_name, temp_table_name, database_name):
    """
    Replace the contents of a table by going through a staging table.

    The DataFrame is first saved as `database_name.temp_table_name`, then the
    target table is overwritten from that staging table. In this notebook the
    DataFrame is derived from the target table itself, which is why it is
    materialised elsewhere before the target is overwritten.

    Args:
        final_df_to_write: DataFrame holding the new contents of the target table.
        final_table_name: name of the table to overwrite.
        temp_table_name: name of the staging table to create and drop.
        database_name: database that contains both tables.
    """
    staging_table = f"{database_name}.{temp_table_name}"
    target_table = f"{database_name}.{final_table_name}"
    try:
        final_df_to_write.write.mode("overwrite").saveAsTable(staging_table)
        # Overwrite the original table with the staged data
        spark.sql(f"INSERT OVERWRITE TABLE {target_table} SELECT * FROM {staging_table}")
    finally:
        # Always clean up the staging table, even when the overwrite fails, so a
        # failed run does not leave it behind. IF EXISTS covers the case where
        # the staging write itself failed.
        spark.sql(f"DROP TABLE IF EXISTS {staging_table}")

### Some Useful Functions

In [5]:
# a function that numbers the rows of a df to build a surrogate key column
def add_surrogate_key(df, sk_name, id):
    """
    Add (or replace) the column `sk_name` with a 1-based row number.

    Args:
        df: DataFrame to extend.
        sk_name: name of the surrogate key column to write.
        id: column name the rows are ordered by before being numbered.

    Returns:
        DataFrame with the surrogate key column set.
    """
    # A window without partitioning: the numbering runs over the whole DataFrame.
    ordering = Window.orderBy(id)
    return df.withColumn(sk_name, sf.row_number().over(ordering))

In [6]:
# a function that takes 2 dfs and upsert them into 1 df
def upsert_dfs(current_df, new_df, common_column="id", surrogate_key=0):
    """
    Upsert `new_df` into `current_df` and return the merged DataFrame.

    Rows are matched on `common_column`:
        * rows found only in `current_df` are kept unchanged,
        * rows found in both take their values from `new_df` and keep the
          surrogate key they already have,
        * rows found only in `new_df` get new surrogate keys that continue after
          the highest key already in use (starting at 1 when there is none).

    The upsertion algorithm:
        1- convert the dfs to temp views to use sql on
        2- get the currentAsIs
        3- get the currentUpdate
        4- get the newDataToAdd
        5- union the currentAsIs and currentUpdate to get the currentUpdated
        6- find the max surrogate key of the current data
        7- number the newDataToAdd rows in the surrogate key column
        8- shift those numbers by the max surrogate key
        9- union by name
        10- reorder the columns to make the surrogate key first

    Args:
        current_df: DataFrame with the existing data, including the surrogate
            key column.
        new_df: DataFrame with the incoming rows; expected not to contain the
            surrogate key column.
        common_column: name of the column used to match the two DataFrames.
        surrogate_key: name of the surrogate key column. The default `0` is only
            a placeholder; callers are expected to pass the real column name.

    Returns:
        DataFrame with the surrogate key as first column, ordered by it.

    Side effects:
        Registers (or replaces) the temp views `current_data` and `new_data`
        and prints the result with `show()`.
    """
    sk_column = f"{surrogate_key}"
    id_column = f"{common_column}"

    # 1- convert the dfs to temp views to use sql on
    current_df.createOrReplaceTempView("current_data")
    new_df.createOrReplaceTempView("new_data")

    # 2- get the currentAsIs part: current rows without a match in the new data
    currentAsIsQuery = f"""
        SELECT current.*  
        FROM current_data current
        LEFT JOIN new_data AS new
        ON current.{id_column} = new.{id_column}
        WHERE new.{id_column} IS NULL
    """
    currentAsIsDF = spark.sql(currentAsIsQuery)

    # 3- get the currentUpdate part: new values, existing surrogate key
    currentUpdateQuery = f"""
        SELECT new.*, current.{sk_column}
        FROM current_data AS current
        JOIN new_data AS new
            ON current.{id_column} = new.{id_column}
    """
    currentUpdateDF = spark.sql(currentUpdateQuery)

    # 4- get the newDataToAdd part: new rows without a match in the current data
    #    (the selected surrogate key is NULL here; it is filled in by step 7)
    newDataToAddQuery = f"""
        SELECT new.*, current.{sk_column}
        FROM current_data AS current
        RIGHT JOIN new_data AS new
            ON current.{id_column} = new.{id_column}
        WHERE current.{id_column} IS NULL
    """
    newDataToAddDF = spark.sql(newDataToAddQuery)

    # 5- union the currentAsIs and currentUpdate to get the currentUpdated
    current_updated_df = currentAsIsDF.unionByName(currentUpdateDF)

    # 6- find the max surrogate key of the current data.
    #    The aggregate is NULL/None when there are no current rows; fall back to 0
    #    so that step 8 does not turn every new key into NULL.
    max_surrogate_key = current_updated_df.select(sf.max(sf.col(sk_column))).collect()[0][0]
    if max_surrogate_key is None:
        max_surrogate_key = 0

    # 7- number the newDataToAdd rows (1..n, ordered by the common column)
    newDataToAddDF = add_surrogate_key(newDataToAddDF, sk_column, id_column)

    # 8- shift the new numbers so they continue after the keys already in use
    newDataToAddDF = newDataToAddDF.withColumn(sk_column, sf.col(sk_column) + max_surrogate_key)

    # 9- union of the current and the new rows
    full_data = current_updated_df.unionByName(newDataToAddDF)

    # 10- reorder the columns to make the surrogate key first
    other_columns = [name for name in full_data.columns if name != sk_column]
    final_df = full_data.select(sk_column, *other_columns).orderBy(sk_column)
    final_df.show()
    return final_df


## The Actual Transformation
---
### The Transformation Main Algorithm:
1. define the schema
2. read the file using the schema
3. check if the table exists [and read it if it exists]
4. load the final df to hive

## Branches File

#### Branch Dimension

In [7]:
# Settings of the branch dimension, all derived from the table name
table = "branch"
database_name = "qcompany"
schema = f"{table}_schema"  # placeholder name; rebound to the StructType in the load cell
dim_name = f"{table}_dim"
dim_key = f"{table}_key"  # surrogate key column
dim_id = f"{table}_id"  # id column used to match rows
stagging_dim_name = f"{table}_dim_stagging"
current_dim_df_name = f"current_{table}_df"  # placeholder; rebound to the df read from hive
final_df_to_write = f"final_{table}_df"  # placeholder; rebound to the upserted df

# Source files: df1 feeds the first load, df2 feeds the upsert
# (both currently point to the same cleaned file)
df1_path = f'{silver_layer_directory_path}/{table}es_cleaned.csv'
df2_path = f'{silver_layer_directory_path}/{table}es_cleaned.csv'

"\n# Define the directory of the files, depending on the date and batch's hour\nsilver_layer_directory_path = f'/user/itversity/q-retail-company/silver/{today_date}/hour-{current_time.hour}'\n\n# Define the sliver layer directory\ngold_layer_directory_path = f'/user/itversity/q-retail-company/gold/{today_date}/hour-{current_time.hour}'"

In [8]:
# Expected layout of the cleaned branches file
schema = (
    StructType()
    .add("branch_id", IntegerType(), nullable=False)
    .add("branch_location", StringType(), nullable=False)
    .add("branch_establish_date", DateType(), nullable=False)
    .add("branch_class", StringType(), nullable=False)
)

# df1 is used for the first load, df2 for the upsert
df1 = spark.read.csv(df1_path, schema=schema, header=True)
df2 = spark.read.csv(df2_path, schema=schema, header=True)

if not table_exists(dim_name, database_name):
    # First load: number the rows, put the surrogate key first and create the table
    df1 = add_surrogate_key(df1, dim_key, dim_id)
    df1 = df1.select(dim_key, *[name for name in df1.columns if name != dim_key])
    write_df_to_table(database_name, dim_name, df1)

    print("done 😎👌")

else:
    # The dimension already exists: merge the new rows into it
    current_dim_df_name = get_table_data(database_name, dim_name)
    final_df_to_write = upsert_dfs(current_dim_df_name, df2, dim_id, dim_key)
    overwriteCurrentTable(final_df_to_write, dim_name, stagging_dim_name, database_name)
    print("done, with upsertion 😎👌")

+----------+---------+---------------+---------------------+------------+
|branch_key|branch_id|branch_location|branch_establish_date|branch_class|
+----------+---------+---------------+---------------------+------------+
|         1|        1|       New York|           2017-01-15|           A|
|         2|        2|    Los Angeles|           2016-07-28|           B|
|         3|        3|        Chicago|           2015-03-10|           A|
|         4|        4|        Houston|           2016-11-05|           D|
|         5|        5|        Phoenix|           2017-09-20|           C|
|         6|        6|       Oklahoma|           2016-09-20|           A|
+----------+---------+---------------+---------------------+------------+



done, with upsertion 😎👌


## Agents File


#### Agent Dimension

In [9]:
# Settings of the agent dimension, all derived from the table name
table = "agent"
database_name = "qcompany"
# placeholder name; `schema` is rebound to the StructType in the load cell
schema = f"{table}_schema"
dim_name = f"{table}_dim"
# surrogate key column
dim_key = f"{table}_key"
# id column used to match rows
dim_id = f"{table}_id"
stagging_dim_name = f"{table}_dim_stagging"
# the two names below are placeholders; the load cell rebinds them to DataFrames
current_dim_df_name = f"current_{table}_df" # df read from hive
final_df_to_write = f"final_{table}_df"

# Source files: df1 feeds the first load, df2 feeds the upsert
# (both currently point to the same cleaned file)
df1_path = f'{silver_layer_directory_path}/salesAgents_cleaned.csv'
df2_path = f'{silver_layer_directory_path}/salesAgents_cleaned.csv'

In [10]:
# Expected layout of the cleaned sales agents file
schema = (
    StructType()
    .add("agent_id", IntegerType(), nullable=False)
    .add("agent_name", StringType(), nullable=False)
    .add("agent_hire_date", DateType(), nullable=False)
)

# df1 is used for the first load, df2 for the upsert
df1 = spark.read.csv(df1_path, schema=schema, header=True)
df2 = spark.read.csv(df2_path, schema=schema, header=True)

if not table_exists(dim_name, database_name):
    # First load: number the rows, put the surrogate key first and create the table
    df1 = add_surrogate_key(df1, dim_key, dim_id)
    df1 = df1.select(dim_key, *[name for name in df1.columns if name != dim_key])
    write_df_to_table(database_name, dim_name, df1)

    print("done 😎👌")

else:
    # The dimension already exists: merge the new rows into it
    current_dim_df_name = get_table_data(database_name, dim_name)
    final_df_to_write = upsert_dfs(current_dim_df_name, df2, dim_id, dim_key)
    overwriteCurrentTable(final_df_to_write, dim_name, stagging_dim_name, database_name)
    print("done, with upsertion 😎👌")

+---------+--------+------------------+---------------+
|agent_key|agent_id|        agent_name|agent_hire_date|
+---------+--------+------------------+---------------+
|        1|       1|          John Doe|     2020-06-03|
|        2|       2|        Jane Smith|     2018-05-13|
|        3|       3|   Michael Johnson|     2021-10-03|
|        4|       4|       Emily Brown|     2020-10-25|
|        5|       5|      David Wilson|     2021-04-08|
|        6|       6|       Emma Taylor|     2019-03-28|
|        7|       7|Christopher Miller|     2020-01-11|
|        8|       8|      Olivia Davis|     2021-10-24|
|        9|       9|   Daniel Martinez|     2018-10-08|
|       10|      10|      Sophia Moore|     2019-05-25|
|       11|      11|         john wick|     2018-07-10|
+---------+--------+------------------+---------------+



done, with upsertion 😎👌


## Transactions File
---
This section is a little different, because a single file feeds several dimensions.
First we define the expected file schema and read the file into a DataFrame; then we extract the relevant columns for each dimension, apply the transformations, and save the results to Hive.

The Main df:

In [11]:
# Expected layout of the cleaned transactions file: (column name, type, nullable)
# NOTE(review): unit_price is read as FloatType here while the other schemas in
# this notebook declare prices as DoubleType — confirm which one is intended.
main_transactions_schema = StructType([
    StructField(field_name, field_type, nullable=is_nullable)
    for field_name, field_type, is_nullable in [
        ("transaction_date", DateType(), False),
        ("transaction_id", StringType(), False),
        ("customer_id", IntegerType(), False),
        ("customer_fname", StringType(), False),
        ("customer_lname", StringType(), False),
        ("customer_email", StringType(), False),
        ("agent_id", IntegerType(), False),
        ("branch_id", IntegerType(), False),
        ("product_id", IntegerType(), False),
        ("product_name", StringType(), False),
        ("product_category", StringType(), False),
        ("offer_1", BooleanType(), True),
        ("offer_2", BooleanType(), True),
        ("offer_3", BooleanType(), True),
        ("offer_4", BooleanType(), True),
        ("offer_5", BooleanType(), True),
        ("units", IntegerType(), False),
        ("unit_price", FloatType(), False),
        ("is_online", StringType(), False),
        ("payment_method", StringType(), False),
        ("shipping_address", StringType(), True),
    ]
])

In [12]:
# Read the cleaned transactions file with the expected schema.
# main_transactions is the working copy; main_transactions4 is a second read of
# the same file that feeds the dimension upserts.
main_transactions = spark.read.csv(
    f'{silver_layer_directory_path}/transactions_cleaned.csv',
    schema=main_transactions_schema,
    header=True,
)
main_transactions4 = spark.read.csv(
    f'{silver_layer_directory_path}/transactions_cleaned.csv',
    schema=main_transactions_schema,
    header=True,
)
main_transactions.show(2)


+----------------+----------------+-----------+--------------+--------------+--------------------+--------+---------+----------+------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|transaction_date|  transaction_id|customer_id|customer_fname|customer_lname|      customer_email|agent_id|branch_id|product_id|product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|
+----------------+----------------+-----------+--------------+--------------+--------------------+--------+---------+----------+------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|      2023-05-20|trx-152546429674|      85469|     Alexander|         Brown|alexander.brown@g...|       1|        2|        22|Coffee Maker|      Appliances|   null|   null|   null|   null|   null|   10|     79.99|       no| 

#### Customer Dimension

In [13]:
# Columns that make up the customer dimension
customer_schema = (
    StructType()
    .add("customer_id", IntegerType(), nullable=False)
    .add("customer_fname", StringType(), nullable=False)
    .add("customer_lname", StringType(), nullable=False)
    .add("customer_email", StringType(), nullable=False)
)

# Distinct customers found in the transactions; the selected columns are exactly
# the field names declared in customer_schema, in the same order
customer = main_transactions.select(*customer_schema.fieldNames()).distinct()
customer4 = main_transactions4.select(*customer_schema.fieldNames()).distinct()
# customer.show()


In [14]:
# Settings of the customer dimension, all derived from the table name
table = "customer"
database_name = "qcompany"
# placeholder name only
schema = f"{table}_schema"
dim_name = f"{table}_dim"
# surrogate key column
dim_key = f"{table}_key"
# id column used to match rows
dim_id = f"{table}_id"
stagging_dim_name = f"{table}_dim_stagging"
# the two names below are placeholders; the load cell rebinds them to DataFrames
current_dim_df_name = f"current_{table}_df" # df read from hive
final_df_to_write = f"final_{table}_df"
# df1 feeds the first load, df2 feeds the upsert
df1 = customer
df2 = customer4

In [15]:
if not table_exists(dim_name, database_name):
    # First load: number the rows, put the surrogate key first and create the table
    df1 = add_surrogate_key(df1, dim_key, dim_id)
    df1 = df1.select(dim_key, *[name for name in df1.columns if name != dim_key])
    write_df_to_table(database_name, dim_name, df1)

    print("done 😎👌")

else:
    # The dimension already exists: merge the new rows into it
    current_dim_df_name = get_table_data(database_name, dim_name)
    final_df_to_write = upsert_dfs(current_dim_df_name, df2, dim_id, dim_key)
    overwriteCurrentTable(final_df_to_write, dim_name, stagging_dim_name, database_name)
    print("done, with upsertion 😎👌")

+------------+-----------+--------------+--------------+--------------------+
|customer_key|customer_id|customer_fname|customer_lname|      customer_email|
+------------+-----------+--------------+--------------+--------------------+
|           1|      85462|        Olivia|         Brown|olivia.brown@yaho...|
|           2|      85463|           Mia|      Williams|mia.williams@gmai...|
|           3|      85464|          Emma|      Williams|emma.williams@out...|
|           4|      85465|         James|        Taylor|james.taylor@gmai...|
|           5|      85466|       Michael|         Brown|michael.brown@yah...|
|           6|      85467|     Alexander|         Jones|alexander.jones@y...|
|           7|      85468|       William|         Davis|william.davis@yah...|
|           8|      85469|     Alexander|         Brown|alexander.brown@g...|
|           9|      85470|           Ava|        Wilson|ava.wilson@hotmai...|
|          10|      85471|           Ava|      Williams|ava.will

done, with upsertion 😎👌


###

#### Product Dimension

In [16]:
# Declared layout of the product dimension
# NOTE(review): this schema names the price column product_price, while the
# DataFrames below keep the source name unit_price — confirm the intended name.
product_schema = (
    StructType()
    .add("product_id", IntegerType(), nullable=False)
    .add("product_name", StringType(), nullable=False)
    .add("product_price", DoubleType(), nullable=False)
    .add("product_category", StringType(), nullable=False)
)

# Distinct products found in the transactions, ordered by product id
product = (
    main_transactions
    .select("product_id", "product_name", "unit_price", "product_category")
    .distinct()
    .orderBy("product_id")
)
product4 = (
    main_transactions4
    .select("product_id", "product_name", "unit_price", "product_category")
    .distinct()
    .orderBy("product_id")
)
product4.show(30)
product4.count()


+----------+-----------------+----------+----------------+
|product_id|     product_name|unit_price|product_category|
+----------+-----------------+----------+----------------+
|         1|           Laptop|    999.99|     Electronics|
|         2|       Smartphone|    699.99|     Electronics|
|         3|           Tablet|    299.99|     Electronics|
|         4|       Headphones|     99.99|     Electronics|
|         5|          T-Shirt|     19.99|        Clothing|
|         6|            Jeans|     49.99|        Clothing|
|         7|            Dress|     59.99|        Clothing|
|         8|         Sneakers|     79.99|        Footwear|
|         9|            Boots|    129.99|        Footwear|
|        10|          Sandals|     39.99|        Footwear|
|        11|               TV|    899.99|     Electronics|
|        12|          Monitor|    299.99|     Electronics|
|        13|          Printer|    149.99|     Electronics|
|        14|           Camera|    399.99|     Electronic

30

In [17]:
# Settings of the product dimension, all derived from the table name
table = "product"
database_name = "qcompany"
# placeholder name only
schema = f"{table}_schema"
dim_name = f"{table}_dim"
# surrogate key column
dim_key = f"{table}_key"
# id column used to match rows
dim_id = f"{table}_id"
stagging_dim_name = f"{table}_dim_stagging"
# the two names below are placeholders; the load cell rebinds them to DataFrames
current_dim_df_name = f"current_{table}_df" # df read from hive
final_df_to_write = f"final_{table}_df"
# df1 feeds the first load, df2 feeds the upsert
df1 = product
df2 = product4

In [18]:
if not table_exists(dim_name, database_name):
    # First load: number the rows, put the surrogate key first and create the table
    df1 = add_surrogate_key(df1, dim_key, dim_id)
    df1 = df1.select(dim_key, *[name for name in df1.columns if name != dim_key])
    write_df_to_table(database_name, dim_name, df1)

    print("done 😎👌")

else:
    # The dimension already exists: merge the new rows into it
    current_dim_df_name = get_table_data(database_name, dim_name)
    final_df_to_write = upsert_dfs(current_dim_df_name, df2, dim_id, dim_key)
    overwriteCurrentTable(final_df_to_write, dim_name, stagging_dim_name, database_name)
    print("done, with upsertion 😎👌")

+-----------+----------+------------+----------+----------------+
|product_key|product_id|product_name|unit_price|product_category|
+-----------+----------+------------+----------+----------------+
|          1|         1|      Laptop|    999.99|     Electronics|
|          2|         2|  Smartphone|    699.99|     Electronics|
|          3|         3|      Tablet|    299.99|     Electronics|
|          4|         4|  Headphones|     99.99|     Electronics|
|          5|         5|     T-Shirt|     19.99|        Clothing|
|          6|         6|       Jeans|     49.99|        Clothing|
|          7|         7|       Dress|     59.99|        Clothing|
|          8|         8|    Sneakers|     79.99|        Footwear|
|          9|         9|       Boots|    129.99|        Footwear|
|         10|        10|     Sandals|     39.99|        Footwear|
|         11|        11|          TV|    899.99|     Electronics|
|         12|        12|     Monitor|    299.99|     Electronics|
|         

done, with upsertion 😎👌


#### Transactions Fact

In [19]:
# Preview the first two transactions before deriving the fact columns
main_transactions.show(2)

+----------------+----------------+-----------+--------------+--------------+--------------------+--------+---------+----------+------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|transaction_date|  transaction_id|customer_id|customer_fname|customer_lname|      customer_email|agent_id|branch_id|product_id|product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|
+----------------+----------------+-----------+--------------+--------------+--------------------+--------+---------+----------+------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|      2023-05-20|trx-152546429674|      85469|     Alexander|         Brown|alexander.brown@g...|       1|        2|        22|Coffee Maker|      Appliances|   null|   null|   null|   null|   null|   10|     79.99|       no| 

#### Feature Engineering 
---

In [20]:
# Drop the customer/product attributes that now live in their dimensions.
# unit_price is kept: the price columns below are computed from it. The
# previously listed "product_price" was removed because main_transactions has
# no such column, so dropping it had no effect.
main_transactions = main_transactions.drop(
    "customer_fname",
    "customer_lname",
    "customer_email",
    "product_name",
    "product_category",
)

#### Offers

In [21]:
# offer_redeemed: name of the first offer flag (offer_1..offer_5) that is True,
# or "NA" when none of them is
main_transactions = main_transactions.withColumn(
    "offer_redeemed",
    (
        sf.when(sf.col("offer_1") == True, "offer_1")
        .when(sf.col("offer_2") == True, "offer_2")
        .when(sf.col("offer_3") == True, "offer_3")
        .when(sf.col("offer_4") == True, "offer_4")
        .when(sf.col("offer_5") == True, "offer_5")
        .otherwise("NA")
    ),
)

In [22]:

# discount_pct: discount percentage of the redeemed offer
# (offer_1..offer_5 -> 5, 10, 15, 20, 25; 0 when no offer was redeemed)
main_transactions = main_transactions.withColumn(
    "discount_pct",
    (
        sf.when(sf.col("offer_redeemed") == "offer_1", 5)
        .when(sf.col("offer_redeemed") == "offer_2", 10)
        .when(sf.col("offer_redeemed") == "offer_3", 15)
        .when(sf.col("offer_redeemed") == "offer_4", 20)
        .when(sf.col("offer_redeemed") == "offer_5", 25)
        .otherwise(0)
    ),
)

# main_transactions.show(5)

In [23]:
# The offer flags offer_1..offer_5 are now summarised by offer_redeemed; drop them
main_transactions = main_transactions.drop(*[f"offer_{number}" for number in range(1, 6)])


#### Price

In [24]:
# total_price: price of the transaction before any discount (unit price x units)
main_transactions = main_transactions.withColumn(
    "total_price",
    sf.col("unit_price") * sf.col("units"),
)

In [25]:
# final_price: total price after applying the discount percentage, rounded to 3 decimals
main_transactions = main_transactions.withColumn(
    "final_price",
    sf.round(sf.col("total_price") * (1 - sf.col("discount_pct") / 100), 3),
)

Now, separate the transactions into online and offline.

In [26]:
# Split on the is_online flag; rows whose flag is neither "yes" nor "no" end up in neither set
online_transactions = main_transactions.where(sf.col("is_online") == "yes")
offline_transactions = main_transactions.where(sf.col("is_online") == "no")

In [27]:
# The shipping address is not kept for offline transactions
offline_transactions = offline_transactions.drop("shipping_address")
# offline_transactions.show(2)

In [28]:
# The branch and agent ids are not kept for online transactions
online_transactions = online_transactions.drop("branch_id", "agent_id")
# online_transactions.show(2)

#### Address

In [29]:
# Split shipping_address on "/" into its four parts:
# address / city / state / postal code (items 0..3)
online_transactions = (
    online_transactions
    .withColumn("address", sf.split(sf.col("shipping_address"), "/").getItem(0))
    .withColumn("shipping_city", sf.split(sf.col("shipping_address"), "/").getItem(1))
    .withColumn("shipping_state", sf.split(sf.col("shipping_address"), "/").getItem(2))
    .withColumn("shipping_postal_code", sf.split(sf.col("shipping_address"), "/").getItem(3))
)
# online_transactions.show()


In [30]:
# Target layout of the transactions fact table.
# Three column types were corrected to match the data built above:
#   - transaction_id is read as a string (values look like "trx-152546429674"),
#   - offer_redeemed holds "offer_1".."offer_5" or "NA",
#   - payment_method is read as a string.
# NOTE(review): "quantity" and "Total_Price" do not match the DataFrame column
# names "units" and "total_price", and branch_key/agent_key are declared
# non-nullable although online transactions carry no branch or agent — confirm
# the intended fact table layout before applying this schema.
final_transactions_schema = StructType([
    StructField("transaction_date", DateType(), nullable=False),
    StructField("transaction_id", StringType(), nullable=False),
    StructField("customer_key", IntegerType(), nullable=False),
    StructField("product_key", IntegerType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False),
    StructField("quantity", IntegerType(), nullable=False),
    StructField("Total_Price", DoubleType(), nullable=False),
    StructField("offer_redeemed", StringType(), nullable=False),
    StructField("discount_pct", DoubleType(), nullable=False),
    StructField("final_price", DoubleType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("branch_key", IntegerType(), nullable=False),
    StructField("agent_key", IntegerType(), nullable=False)
])

In [31]:
# Marker printed once every cell above has run
print("GG")

GG


In [32]:
# Stop the Spark session; cells that use `spark` cannot run after this
spark.stop()