### Ingest the sessions.csv file (website_sessions data)

#### Pass the data source name as a parameter - it can also be supplied by Azure Data Factory

In [0]:
# Declare a text widget named "p_data_source" (empty default) and read its current value.
dbutils.widgets.text("p_data_source", "")
# NOTE(review): v_data_source is not referenced by any later cell visible in this notebook - confirm it is still needed.
v_data_source = dbutils.widgets.get("p_data_source")

#### Pass the file (ingestion) date as a parameter - it can also be supplied by Azure Data Factory

In [0]:
# Declare a text widget named "p_file_date"; "2012-03-19" is used when no value is supplied.
dbutils.widgets.text("p_file_date", "2012-03-19")
# v_file_date drives the rest of the notebook: it is part of the input path, the value rows are
# filtered on (date of created_at), the value of the file_date column, and the delete predicate.
v_file_date = dbutils.widgets.get("p_file_date")

Let's run the configuration_ecom file to use the ingestion directory paths

In [0]:
%run "./configuration_ecom"

Let's run the common_functions_ecom file to use its functions [details can be found in next few steps]

In [0]:
%run "./common_functions_ecom"

##### Step 1 - Read the CSV file using the spark dataframe reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType,TimestampType

In [0]:
# Explicit schema applied when reading sessions.csv (column name, Spark type, nullable flag).
# NOTE(review): website_session_id is declared as DoubleType although it looks like an
# identifier - confirm this is intentional before changing it, since the staging table
# already stores the column with this type.
sessions_schema = (
    StructType()
    .add("website_session_id", DoubleType(), False)
    .add("created_at", TimestampType(), True)
    .add("user_id", IntegerType(), True)
    .add("is_repeat_session", IntegerType(), True)
    .add("utm_source", StringType(), True)
    .add("utm_campaign", StringType(), True)
    .add("utm_content", StringType(), True)
    .add("device_type", StringType(), True)
    .add("http_referer", StringType(), True)
)
							

In [0]:
# Input file for this run: <raw_folder_path>/website_sessions/<file date>/sessions.csv
# (raw_folder_path is presumably defined by the configuration_ecom notebook run earlier - confirm there).
sessions_csv_path = f"{raw_folder_path}/website_sessions/{v_file_date}/sessions.csv"

# The file's first line is a header row; column types come from sessions_schema.
sessions_df = spark.read.csv(sessions_csv_path, schema=sessions_schema, header=True)

##### Step 2 - Select the required columns and keep only the records whose 'created_at' date matches the file date

In [0]:
from pyspark.sql.functions import col
# The wildcard import supplies to_date here and is also what brings concat/year/month
# into scope for the partition-column cell further down, so it has to stay.
from pyspark.sql.functions import *

# Columns carried forward from the raw file, in output order.
session_columns = [
    "website_session_id",
    "created_at",
    "user_id",
    "is_repeat_session",
    "utm_source",
    "utm_campaign",
    "utm_content",
    "device_type",
    "http_referer",
]

# Keep only rows whose created_at falls on the date being ingested.
created_on_file_date = to_date(sessions_df["created_at"], 'yyyy-MM-dd HH:mm:ss') == v_file_date

sessions_selected_df = sessions_df.select(*session_columns).filter(created_on_file_date)

##### Step 3 - Add the partition column (year_month) and the file_date column

In [0]:
from pyspark.sql.functions import lit
from pyspark.sql import functions as sf

In [0]:
# year_month = four-digit year followed by the zero-padded month of created_at (e.g. 201203).
created_at_col = sessions_selected_df["created_at"]
year_month_col = sf.concat(
    sf.year(created_at_col),
    sf.format_string("%02d", sf.month(created_at_col)),
)

sessions_renamed_df = (
    sessions_selected_df
    .withColumn("year_month", year_month_col)
    # Record which file date this batch was loaded for.
    .withColumn("file_date", sf.lit(v_file_date))
)

##### Step 4 - Add ingestion date to the dataframe

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it comes from the
# common_functions_ecom notebook run earlier and appends an ingestion timestamp column - confirm there.
sessions_final_df = add_ingestion_date(sessions_renamed_df)

##### Delete any existing records with the same 'created_at' date from the destination table, so that a re-triggered job does not load duplicates

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col, to_date

# Same location the write step appends to.
sessions_staging_path = "/mnt/mavendataguy/output/ecom_growth/sessions_staging"

# Make the load re-runnable: remove rows already stored for this file date before appending.
# DeltaTable.forPath raises if the path is not (yet) a Delta table, which would fail the very
# first run before the write step has had a chance to create it - so only delete when it exists.
if DeltaTable.isDeltaTable(spark, sessions_staging_path):
    deltaTable = DeltaTable.forPath(spark, sessions_staging_path)
    deltaTable.delete(to_date(col("created_at"), 'yyyy-MM-dd HH:mm:ss') == v_file_date)

##### Step 5 - Write data to the data lake in Delta format
###### Records are ingested by 'created_at' date, but the table is partitioned by the 'year_month' column so that each partition meets the 'minimum number of records in a partition' criterion

In [0]:
# Append this batch to the staging Delta location, partitioned by year_month.
(
    sessions_final_df.write
    .format("delta")
    .mode("append")
    .partitionBy("year_month")
    .save("/mnt/mavendataguy/output/ecom_growth/sessions_staging")
)
# Alternative kept for reference (full overwrite into the metastore table):
#sessions_final_df.write.mode("overwrite").format("delta").partitionBy("year_month").saveAsTable("ecom_growth.sessions_staging")

In [0]:
%sql
-- Sanity check: number of rows in the staging table per created_at calendar day.
SELECT
  to_date(created_at) AS created_at,
  COUNT(*)            AS daily_count
FROM ecom_growth.sessions_staging
GROUP BY to_date(created_at);

created_at,daily_count
2012-03-20,161
2012-03-19,136


In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")

Success