### Import Relevant Libraries and start Spark Session

In [1]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, avg as _avg, count, when
from pyspark.sql.functions import col, from_unixtime
import os
from datetime import datetime, timedelta



# Obtain the Spark session. Fabric notebooks already expose `spark`, so there
# getOrCreate() just hands back the existing session; elsewhere it builds one.
spark = (
    SparkSession.builder
    .appName("TaskLogAnalysis")
    .getOrCreate()
)

StatementMeta(, 5668d765-bccc-441c-b058-bc3192e413aa, 3, Finished, Available, Finished)

### Read Today's Task Log from Lakehouse Filepath

In [2]:
# Folder (relative to the Lakehouse root) holding the daily Parquet task logs.
# NOTE(review): the year is hard-coded in the folder name while the date below
# is computed at run time — confirm the folder convention before running this
# across a year boundary.
lakehouse_path = "Files/task_logs/task_logs_2024"

# Number of days before the current date that the log to process is dated.
# NOTE(review): 8 was a bare literal in the original cell even though the
# variables are named "today"; set this to 0 to process the run day's own log.
DAYS_BACK = 8

# Date of the log file to process (current date minus DAYS_BACK days).
# The `today_*` names are kept because later cells read `today_date_str`.
today_date = datetime.now() - timedelta(days=DAYS_BACK)
today_date_str = today_date.strftime("%Y-%m-%d")

# File name pattern: task_log_<YYYY-MM-DD>_file_<day-of-month>.parquet
file_name = f"task_log_{today_date_str}_file_{today_date.day}.parquet"

# Lakehouse paths use "/" regardless of the host OS, so join explicitly rather
# than through os.path.join, whose separator is platform-specific.
file_path = f"{lakehouse_path}/{file_name}"

# Read the Parquet file for that day into a DataFrame
task_logs_df = spark.read.parquet(file_path)

# Preview the first rows
task_logs_df.show()

StatementMeta(, 5668d765-bccc-441c-b058-bc3192e413aa, 4, Finished, Available, Finished)

+--------------------+--------------------+--------------------+-------------+-------------+-----------+--------+------------+
|             task_id|             user_id|    task_description|   start_time|     end_time|     status|priority|hours_logged|
+--------------------+--------------------+--------------------+-------------+-------------+-----------+--------+------------+
|05875d3d-0a76-414...|8eabfce6-81c3-460...|Toward main recen...|1734181588000|1734252057000|In Progress|    High|        4.45|
|df611d86-a28e-461...|7ffd07b0-a9fd-42e...|Cultural down hug...|1734197538000|1734304174000|     Failed|     Low|        9.45|
|cd3ef4b9-a806-412...|a4137abe-e5f9-493...|Then particularly...|1734210719000|1734274089000|In Progress|  Medium|        4.76|
|d70336a3-03dd-415...|c8495c11-e418-405...|Focus player wate...|1734199039000|1734236980000|     Failed|  Medium|        4.81|
|3c5277d9-4fb9-47f...|5d4ddd1f-fdef-41e...|Season move bette...|1734208404000|1734298777000|In Progress|    Hig

#### Total number of rows in Today's Task Log

In [5]:
# Number of rows in the raw task log loaded above (runs a Spark job).
task_logs_df.count()


StatementMeta(, 5668d765-bccc-441c-b058-bc3192e413aa, 7, Finished, Available, Finished)

1000000

### Perform Transformations

#### Transformations:
1. Adjust Timestamp columns to Datetime columns
2. Keep only completed tasks (drops Failed and In Progress tasks)
3. Aggregate total hours, average hours, and task count per priority and day

In [6]:
# 1. Convert the epoch-millisecond columns (start_time, end_time) into timestamps.
#
# Each value is divided by 1000, truncated to whole seconds and cast directly
# to a timestamp (Spark interprets an integral value cast to timestamp as
# seconds since the epoch). The earlier from_unixtime(...).cast("timestamp")
# form formatted the instant as a session-local string and parsed it back,
# which is ambiguous during a DST fall-back hour; the direct cast avoids that
# round trip while keeping the same second-level precision.
transformed_task_logs_df = task_logs_df
for time_column in ("start_time", "end_time"):
    transformed_task_logs_df = transformed_task_logs_df.withColumn(
        time_column,
        (col(time_column) / 1000).cast("long").cast("timestamp"),
    )

# View Result:
transformed_task_logs_df.show()

StatementMeta(, 5668d765-bccc-441c-b058-bc3192e413aa, 8, Finished, Available, Finished)

+--------------------+--------------------+--------------------+-------------------+-------------------+-----------+--------+------------+
|             task_id|             user_id|    task_description|         start_time|           end_time|     status|priority|hours_logged|
+--------------------+--------------------+--------------------+-------------------+-------------------+-----------+--------+------------+
|05875d3d-0a76-414...|8eabfce6-81c3-460...|Toward main recen...|2024-12-14 13:06:28|2024-12-15 08:40:57|In Progress|    High|        4.45|
|df611d86-a28e-461...|7ffd07b0-a9fd-42e...|Cultural down hug...|2024-12-14 17:32:18|2024-12-15 23:09:34|     Failed|     Low|        9.45|
|cd3ef4b9-a806-412...|a4137abe-e5f9-493...|Then particularly...|2024-12-14 21:11:59|2024-12-15 14:48:09|In Progress|  Medium|        4.76|
|d70336a3-03dd-415...|c8495c11-e418-405...|Focus player wate...|2024-12-14 17:57:19|2024-12-15 04:29:40|     Failed|  Medium|        4.81|
|3c5277d9-4fb9-47f...|5d4dd

In [7]:
# 2. Keep only completed tasks (this drops "Failed" as well as "In Progress" rows).
# The predicate uses col("status") so it is resolved against the DataFrame being
# filtered, instead of borrowing the column from its parent `task_logs_df`.
completed_tasks_df = transformed_task_logs_df.filter(col("status") == "Completed")

# View Result
completed_tasks_df.show()


StatementMeta(, 5668d765-bccc-441c-b058-bc3192e413aa, 9, Finished, Available, Finished)

+--------------------+--------------------+--------------------+-------------------+-------------------+---------+--------+------------+
|             task_id|             user_id|    task_description|         start_time|           end_time|   status|priority|hours_logged|
+--------------------+--------------------+--------------------+-------------------+-------------------+---------+--------+------------+
|54ee0a77-0c9d-4d4...|5e15f524-a18b-484...|Best anyone off d...|2024-12-14 17:03:30|2024-12-15 02:36:33|Completed|    High|        9.47|
|bd97d6bf-cc2b-42f...|0440ff89-ce30-4f6...|Authority financi...|2024-12-14 00:23:27|2024-12-15 02:49:22|Completed|    High|        4.16|
|8a10137c-f421-4d1...|f45a041e-cd41-43a...|Add collection wi...|2024-12-14 15:49:42|2024-12-15 03:40:24|Completed|    High|        10.3|
|3b8613fc-c2dd-427...|bba4fe2f-cc1a-4fd...|Role according sk...|2024-12-14 16:28:52|2024-12-15 11:46:35|Completed|    High|        7.95|
|01adc3a5-be47-406...|c822e494-eb3a-437..

In [8]:
# 3. Summarise completed tasks per (priority, task_date):
#    total hours logged, mean hours logged and number of tasks.

# Calendar date on which the task started, used as a grouping key
task_date_key = col("start_time").cast("date").alias("task_date")

summary_metrics = [
    _sum("hours_logged").alias("total_hours_logged"),
    _avg("hours_logged").alias("avg_hours_logged"),
    count("task_id").alias("task_count"),
]

aggregated_df = (
    completed_tasks_df
    .groupBy(col("priority"), task_date_key)
    .agg(*summary_metrics)
)

# Display up to 10 rows of the summary
aggregated_df.show(10)

StatementMeta(, 5668d765-bccc-441c-b058-bc3192e413aa, 10, Finished, Available, Finished)

+--------+----------+------------------+------------------+----------+
|priority| task_date|total_hours_logged|  avg_hours_logged|task_count|
+--------+----------+------------------+------------------+----------+
|  Medium|2024-12-14| 697381.0199999975| 6.256333835719645|    111468|
|     Low|2024-12-14| 693151.2499999921|  6.24792682597049|    110941|
|    High|2024-12-14| 694759.1999999937|6.2561610776932755|    111052|
+--------+----------+------------------+------------------+----------+



### Save file as `CSV`

In [9]:
# Destination folder for the processed day's aggregated CSV output
output_csv_path = f"Files/processed_logs/log_{today_date_str}/"

# Collapse to a single partition so one CSV part file is produced, and replace
# whatever a previous run left at this path.
(
    aggregated_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv(output_csv_path)
)

StatementMeta(, 5668d765-bccc-441c-b058-bc3192e413aa, 11, Finished, Available, Finished)