In [None]:
# IMPORTANT: This notebook manipulates sample data to guarantee that the Power BI report includes data for the current date, the last two days, and the last seven days. 
# It is OPTIONAL and is only used to ensure the Power BI report can display data during each deployment.

In [None]:
import pandas as pd
from pyspark.sql.functions import col, date_sub, expr, to_date, date_add, explode, split
import random
import uuid

In [None]:
# Load every row of the ckm_conv_processed table into a Spark DataFrame.
df = spark.sql("SELECT * FROM ckm_conv_processed")
# display(df)

In [None]:
# Count the rows in the loaded DataFrame (count() is a Spark action, so this runs the query).
record_count = df.count()

# Print the number of records
print(f"Total number of records in the DataFrame: {record_count}")

In [None]:
# Re-date the sample conversations so the Power BI report always has recent data.
# Every row gets ONE uniform random draw, which places it in exactly one bucket:
#   bucket 1 (~40%): today
#   bucket 2 (~25%): yesterday
#   bucket 3 (~15%): a random day from today back to 6 days ago
#   bucket 4 (~20%): a random day between 8 and 37 days ago
# The original time-of-day of StartTime / ConversationDate is preserved and EndTime is
# recomputed as StartTime + Duration minutes.


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as spark_max, current_timestamp, unix_timestamp, from_unixtime, expr,lit
from pyspark.sql.types import TimestampType

# Share of rows assigned to each bucket; whatever is left over falls into bucket 4.
TODAY_FRACTION = 0.40
YESTERDAY_FRACTION = 0.25
LAST_7_DAYS_FRACTION = 0.15

# Cast the date/time columns to timestamp (a no-op for columns that already are timestamps)
df = df.withColumn("StartTime", col("StartTime").cast("timestamp"))
df = df.withColumn("EndTime", col("EndTime").cast("timestamp"))
df = df.withColumn("ConversationDate", col("ConversationDate").cast("timestamp"))

# Calculate the maximum StartTime
max_date_df = df.select(spark_max(col("StartTime")).alias("max_date"))
max_date = max_date_df.collect()[0]["max_date"]

# Get current timestamp
current_ts = spark.sql("SELECT current_timestamp() as current_ts").collect()[0]["current_ts"]

print("max_date: ", max_date)
print("current time: ", current_ts)

# Informational only: how stale the newest record is.
# max() returns NULL (None) when there are no non-null StartTime values, so guard the subtraction.
if max_date is not None:
    time_diff_seconds = (current_ts - max_date).total_seconds()

    # Convert the time difference to days, hours, minutes, and seconds
    days = int(time_diff_seconds // (24 * 3600))
    hours = int((time_diff_seconds % (24 * 3600)) // 3600)
    minutes = int((time_diff_seconds % 3600) // 60)
    seconds = int(time_diff_seconds % 60)
    print(f"The difference between the current time and the maximum date is {days} days, {hours} hours, {minutes} minutes, and {seconds} seconds.")

# Total number of records
total_records = df.count()

# Approximate number of rows expected in each bucket (for reference only; the actual
# assignment below is random, so real counts will vary slightly).
today_count = int(total_records * TODAY_FRACTION)
yesterday_today_count = int(total_records * YESTERDAY_FRACTION)
last_7_days_count = int(total_records * LAST_7_DAYS_FRACTION)

# Cumulative thresholds compared against a single uniform draw in [0, 1).
# Using the fractions directly (instead of count / total_records) avoids a
# ZeroDivisionError when the table is empty.
today_threshold = TODAY_FRACTION
yesterday_threshold = today_threshold + YESTERDAY_FRACTION
last_7_days_threshold = yesterday_threshold + LAST_7_DAYS_FRACTION

# Draw ONE random number per row and reuse it in every WHEN branch. Calling rand()
# separately in each branch yields independent draws, which makes the cumulative
# thresholds meaningless and skews the bucket sizes.
df_temp = df.withColumn("_bucket_rand", expr("rand()"))
df_temp = df_temp.withColumn("row_num", expr(
    f"""
    CASE
        WHEN _bucket_rand < {today_threshold} THEN 1
        WHEN _bucket_rand < {yesterday_threshold} THEN 2
        WHEN _bucket_rand < {last_7_days_threshold} THEN 3
        ELSE 4
    END
    """
))

# Generate new dates based on row_num
df_temp = df_temp.withColumn("NewStartTime", expr(
    """
    CASE
        WHEN row_num = 1 THEN current_date()
        WHEN row_num = 2 THEN date_add(current_date(), -1)
        WHEN row_num = 3 THEN date_add(current_date(), -cast(rand() * 7 as int))
        ELSE date_add(date_add(current_date(), -7), -30 + cast(rand() * 30 as int))
    END
    """
).cast('timestamp'))


# Combine the new date with the original time part of StartTime
df_temp = df_temp.withColumn("StartTime", expr("to_timestamp(concat(date_format(NewStartTime, 'yyyy-MM-dd'), ' ', date_format(StartTime, 'HH:mm:ss.SSS')))"))


# Recompute EndTime from the new StartTime; Duration is passed as the minutes
# argument of make_interval(years, months, weeks, days, hours, mins, secs).
df_temp = df_temp.withColumn("EndTime", expr("StartTime + make_interval(0, 0, 0, 0, 0, Duration, 0)"))


# Combine the new date with the original time part of ConversationDate
df_temp = df_temp.withColumn("ConversationDate", expr("concat(date_format(StartTime, 'yyyy-MM-dd'), ' ', date_format(ConversationDate, 'HH:mm:ss.SSS'))"))
df_temp = df_temp.withColumn("ConversationDate", col("ConversationDate").cast("timestamp"))


# Drop helper columns
df_temp = df_temp.drop("_bucket_rand", "row_num", "NewStartTime")

# display(df_temp)



In [None]:
# Stage the re-dated rows in the ckm_conv_processed_temp Delta table,
# replacing both its contents and its schema.
(
    df_temp.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ckm_conv_processed_temp")
)

In [None]:
# Reload the staged rows; from here on `df` reads from ckm_conv_processed_temp.
# NOTE(review): presumably done so the overwrite of ckm_conv_processed does not
# read from the very table it is replacing — confirm.
df = spark.sql("SELECT * FROM ckm_conv_processed_temp ")

In [None]:
# Replace the contents of ckm_conv_processed with the current `df`,
# with schema overwrite explicitly disabled.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "false")
    .saveAsTable("ckm_conv_processed")
)