# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the active SparkContext if there is one; otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# SparkSession handle; later cells use it for spark.read and spark.sql.
spark = glueContext.spark_session
# Glue Job object bound to this context (not referenced again in the visible cells).
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.3 
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 3.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 5
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::653538097121:role/Glue
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: 09e07858-fd0f-41e4-b3fc-fc3949f1f0d9
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.3
--enable-glue-datacatalog true
Waiting for session 09e07858-fd0f-41e4-b3fc-fc

# 1. Extracting Data

In [17]:
# Run date in "YYYYMMDD" form. It selects which orders folder is read and, later,
# which partition directories are written.
# The value is hard-coded here; the dynamic alternative would be
# datetime.date.today().strftime("%Y%m%d") (needs `import datetime`).
today = "19980506"

# Landing-zone locations: orders are read from a per-date folder, order details
# from a single folder with no date component.
orders_path = f"s3://ijdhad-mydemo/landing_zone/orders/{today}"
orders_details_path = "s3://ijdhad-mydemo/landing_zone/orders_details"

# Both sources are CSV files whose first row holds the column names.
df_orders = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(orders_path)
)
df_orders_details = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(orders_details_path)
)




# 2. Loading Data to S3 Raw Zone

In [13]:


# Split the YYYYMMDD run date into the components used as partition values.
year = today[0:4]
month = today[4:6]
day = today[6:8]
print(year+month+day)

# Write the day's orders as Parquet into its own year=/month=/day= directory.
# mode("overwrite") replaces whatever is already in that day's directory, so the
# cell can be re-run for the same date; with the default save mode the write
# raises as soon as the path exists.
df_orders.write.mode("overwrite").parquet(f"s3://ijdhad-mydemo/raw_zone_pyspark/orders/partitioned_data/year={year}/month={month}/day={day}")

19980506


### Update Partition

In [14]:
# Catalog table to refresh.
# NOTE(review): presumably demo.orders is backed by the raw-zone path written above — confirm.
db_name = "demo"
table_name = "orders"

# MSCK REPAIR TABLE makes the metastore pick up partition directories that were
# added directly on storage.
repair_stmt = f"MSCK REPAIR TABLE {db_name}.{table_name}"
spark.sql(repair_stmt)

DataFrame[]


# 3. Transforming data

### 3.1 Join Table

In [21]:
# Give the detail table's key a distinct name so the two join keys can be told
# apart and the duplicate dropped after the join.
df_orders_details = df_orders_details.withColumnRenamed("orderid", "orderid_2")

# Right join with df_orders on the right-hand side: every order row is kept, and
# the detail columns are null for orders that have no matching detail rows.
same_order = df_orders["orderid"] == df_orders_details["orderid_2"]
df_fact_orders_items = (
    df_orders_details
    .join(df_orders, on=same_order, how="right")
    .drop("orderid_2")
)
df_fact_orders_items.show()

+---------+---------+--------+--------+-------+----------+----------+----------+------------+-----------+-------+-------+--------------------+--------------------+-----------+----------+--------------+-----------+
|productid|unitprice|quantity|discount|orderid|customerid|employeeid| orderdate|requireddate|shippeddate|shipvia|freight|            shipname|         shipaddress|   shipcity|shipregion|shippostalcode|shipcountry|
+---------+---------+--------+--------+-------+----------+----------+----------+------------+-----------+-------+-------+--------------------+--------------------+-----------+----------+--------------+-----------+
|       16|    17.45|      14|    0.05|  11074|     SIMOB|         7|1998-05-06|  1998-06-03|       null|      2|  18.44|       Simons bistro|        Vinbæltet 34|  Kobenhavn|      null|          1734|    Denmark|
|       76|       18|       2|    0.15|  11075|     RICSU|         8|1998-05-06|  1998-06-03|       null|      2|   6.19|  Richter Supermarkt|  

In [24]:
from pyspark.sql.functions import expr, concat, substring, col,to_date

# Shift orderdate forward by 543 years while keeping month and day unchanged.
#
# NOTE(review): this cell rebuilds df_fact_orders_items from itself, so running it
# a second time would apply the shift again; re-run the join cell first.
# NOTE(review): a Feb 29 source date would land in a year that is not a Gregorian
# leap year — confirm how to_date should treat such rows.
df_fact_orders_items = (
    df_fact_orders_items
    # Year + 543 kept as a separate column.
    .withColumn("new_year", expr("substring(orderdate, 1, 4) + 543"))
    # Rebuild the date string: shifted year followed by the original "-MM-dd" tail.
    .withColumn("orderdate", expr("concat(cast(substring(orderdate, 1, 4) + 543 as string), substring(orderdate, 5))"))
    # Strip the '.0' left in the string; presumably the string + int sum is
    # rendered like "2541.0" when cast back to string — confirm.
    .withColumn("orderdate", expr("replace(cast(orderdate as string), '.0', '')"))
    # Parse the cleaned string into a date column.
    .withColumn("orderdate", to_date("orderdate", "yyyy-MM-dd"))
)





### 3.2 Load Transformed Data To Serving Zone

In [31]:
# Partition year for the serving zone: the run date's year plus 543, the same
# offset that was added to orderdate.
# It is derived from `today` rather than from the current value of `year`, so
# re-running this cell does not keep adding 543 and writing to a wrong partition.
year = int(today[0:4]) + 543

# mode("overwrite") replaces the contents of this day's directory, so the load can
# be repeated for the same date; the default save mode raises once the path exists.
df_fact_orders_items.write.mode("overwrite").parquet(f"s3://ijdhad-mydemo/serving_zone_pyspark/fact_orders_items/partitioned_data/year={year}/month={month}/day={day}")




In [None]:
# Catalog table to refresh.
# NOTE(review): presumably demo.fact_orders_items is backed by the serving-zone path — confirm.
db_name, table_name = "demo", "fact_orders_items"

# Have the metastore register any partition directories added on storage.
spark.sql(f"MSCK REPAIR TABLE {db_name}.{table_name}")