# 1. Objective
   The objective of this notebook is to insert the dates that are missing between the start and end date of each time series. For every inserted date, the dependent variable (DV) and the independent variables (IDVs) are left null so that they can be handled in the missing value treatment phase.

# 2. Imports

In [0]:
import time
import os

import numpy as np
import pandas as pd

from datetime import datetime
import shutil

# Snowpark entry points: `F` (column functions) and `T` (data types) are used by the cells below.
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F, types as T
# Active Snowpark session; the cells below read and write through `session`.
session = get_active_session()

# 3. Setup environment

## 3.1. Load Config

In [None]:
import yaml

# Stage location of the YAML configuration that drives this notebook.
stage_path = "@ORANGE_ZONE_SBX_TA.PUBLIC.CONNECTIONS/config_new_PROD.yaml"

# Read the staged file and always release the stream, even if reading or decoding fails.
stream = session.file.get_stream(stage_path)
try:
    yaml_text = stream.read().decode()
finally:
    stream.close()

# safe_load only builds plain Python objects (no arbitrary object construction).
app_config = yaml.safe_load(yaml_text)

# An empty or malformed file parses to None / a scalar / a list; fail here with a clear
# message instead of an opaque TypeError in a later cell that indexes app_config.
if not isinstance(app_config, dict):
    raise ValueError(
        f"Expected a YAML mapping in {stage_path}, got {type(app_config).__name__}"
    )

## 3.2. Update output database, schema, and table

In [None]:
# Target database and schema for this notebook's output, read from the
# "general_inputs" section of the loaded configuration.
output_database, output_schema = (
    app_config["general_inputs"][key]
    for key in ("output_database", "output_schema")
)
print(output_database, output_schema)

In [None]:
# Table that receives the date-completed dataset at the end of the notebook.
output_table_name = "PROD_MISSING_DATE_TREATMENT_OUTPUT"

# Make the configured database/schema the session defaults so that the
# unqualified table name above resolves inside them.
session.use_database(output_database)
session.use_schema(output_schema)

In [None]:
# Optional sanity check: report which database and schema the session now points at.
print(
    "✅ Snowpark Session Initialized Successfully!",
    f"Current Database: {session.get_current_database()}",
    f"Current Schema: {session.get_current_schema()}",
    sep="\n",
)

## 3.3. Capturing necessary variables

In [0]:
# Pull the three settings this notebook needs out of "general_inputs":
#   modeling_granularity_conf - columns identifying one time series (unpacked as a list below)
#   dv_config                 - dependent-variable setting
#   ds_config                 - name of the date column
modeling_granularity_conf, dv_config, ds_config = (
    app_config["general_inputs"][key]
    for key in ("modeling_granularity", "dependent_variable", "date_var")
)

# Algorithms

# Future Forecast

# Creating `app_config`

No manual intervention is needed from this point onward.

# 4. Utility Functions

In [0]:
# Aliases for the date column and the modeling granularity, plus the
# missing-value-treatment parameters from the "data_processing" config section.
broadcast_date_col, broadcast_granularity = ds_config, modeling_granularity_conf
broadcast_algo_params = app_config["data_processing"]["missing_value_treatment"]

# 5. Load Data

In [None]:
# Load the source table and normalise the date column to a timestamp so that it
# can later be joined against the generated date grid (which is cast the same way).
df = session.table("ORANGE_ZONE_SBX_TA.PUBLIC.PROD_ADS_STABLE_V4").with_column(
    ds_config,
    F.col(ds_config).cast("timestamp"),
)

# 6. Identify missing dates

The aim is to create a dataframe that contains every date that is supposed to be present. For any date that was missing, the other columns hold nulls. This dataframe is then passed to the missing value treatment phase, which ensures that the missing-dates issue is fully handled.

## 6.1. Generate granularity-wise start and end dates

In [0]:
# One row per modeling granularity with the first and last date observed for it.
earliest_date = F.min(ds_config).alias('start_date')
latest_date = F.max(ds_config).alias('end_date')

start_end_date_df = df.group_by(modeling_granularity_conf).agg(earliest_date, latest_date)

## 6.2. Infer frequency of dates in data

In [0]:
# Bring the distinct dates client-side and sort them. NULL dates are dropped first:
# pandas sorts NaT last, so they would otherwise land in the sampled tail and break
# the frequency inference.
distinct_dates = (
    df
    .select(ds_config)
    .distinct()
    .toPandas()[ds_config]
    .dropna()
    .sort_values(ascending=True)
)

# The five most recent distinct dates are the sample the frequency is inferred from.
few_dates = distinct_dates.to_numpy()[-5:]

date_frequency = pd.infer_freq(few_dates)

print(f"Inferred frequency: {date_frequency}")

# pd.infer_freq returns None when the sampled dates are not evenly spaced. Stop here
# with a clear message; otherwise the None would be passed to pd.date_range inside
# the UDF below and fail there with a much less obvious error.
if date_frequency is None:
    raise ValueError(
        "Could not infer a regular date frequency from the most recent dates: "
        f"{list(few_dates)}"
    )

Inferred frequency: W-MON


## 6.3. Generate output dataset with all dates

This dataset would have nulls for the other columns wherever there is a missing date.

In [None]:


# Step 1: plain-Python helper that expands a start/end pair into a list of dates
def generate_dates(start_date, end_date, freq='D'):
    """Return every date from start_date to end_date (inclusive) at the given frequency.

    Args:
        start_date: First date of the range, or None.
        end_date: Last date of the range, or None.
        freq: pandas frequency alias passed to ``pd.date_range`` (default daily).

    Returns:
        A list of ``datetime.date`` objects; an empty list when either bound is None.
    """
    if start_date is not None and end_date is not None:
        calendar = pd.date_range(start=start_date, end=end_date, freq=freq)
        # DatetimeIndex.date yields the datetime.date part of every timestamp.
        return list(calendar.date)
    return []

# Step 2: register the helper as a Snowpark UDF. The lambda closes over the inferred
# `date_frequency`, so the UDF itself only takes the start and end date.
# NOTE(review): the UDF body uses pandas; confirm pandas is available to UDFs in this
# environment (e.g. via session packages).
# NOTE(review): start_date/end_date are timestamps (the date column is cast to timestamp
# at load) but the UDF is declared with DATE inputs, so any time-of-day part would be
# dropped - confirm the data has no intraday component.
generate_dates_udf = F.udf(
    func=lambda start, end: generate_dates(start, end, date_frequency),
    return_type=T.ArrayType(T.DateType()),
    input_types=[T.DateType(), T.DateType()]
)

# Step 3: attach the full list of expected dates to every granularity row.
start_end_date_df = start_end_date_df.with_column(
    "dates_range",
    generate_dates_udf(F.col("start_date"), F.col("end_date"))
)

# Step 4: explode the list so there is one row per (granularity, expected date).
df_dates = (
    start_end_date_df
    .with_column(ds_config, F.explode(F.col("dates_range")))
    .select(*modeling_granularity_conf, F.col(ds_config))
)
# Cast back to timestamp so the column type matches the date column of `df` for the join.
df_dates = df_dates.withColumn(ds_config, F.col(ds_config).cast("timestamp"))

# Compare the number of expected (granularity, date) keys with the number of distinct
# keys actually present. Counting raw rows of `df` would let duplicate rows hide missing
# dates (or report missing dates that do not exist), and NULL dates are never expected.
expected_no_of_dates = df_dates.count()
actual_no_of_dates = (
    df
    .filter(F.col(ds_config).is_not_null())
    .select(*modeling_granularity_conf, F.col(ds_config))
    .distinct()
    .count()
)
if expected_no_of_dates != actual_no_of_dates:
    print(f"Expected no of dates: {expected_no_of_dates}")
    print(f"Actual no of dates: {actual_no_of_dates}")
    print("Dates in data are not as per the expectation. So, Imputation of missing dates is performed!!!")
else:
    print("There are no missing dates in between start and end date for each combination. Dates in data as expected")

In [None]:
# Inspect the per-granularity start/end dates together with the generated `dates_range` column.
start_end_date_df

In [None]:
# Inspect the expected (granularity, date) grid produced by exploding `dates_range`.
df_dates

In [0]:
# Right join onto the expected date grid: every (granularity, date) of df_dates is kept,
# and dates absent from the data come through with nulls in all other columns.
df = df.join(
    df_dates,
    on=list(modeling_granularity_conf) + [ds_config],
    how="right",
)

In [None]:
# Row count of the date-completed dataframe produced by the join above.
df.count()

# 7. Store output

In [0]:
# Stamp every row with the load time, then replace the output table with this run's result.
results_s_with_ts = df.with_column("LOAD_TS", F.current_timestamp())
results_s_with_ts.write.mode("overwrite").save_as_table(output_table_name)

# Report the number of rows that actually landed in the table. Snowpark DataFrames are
# lazy, so results_s_with_ts.count() would re-run the whole UDF + join pipeline just to
# produce this number, and would describe the recomputed frame rather than the table.
print(session.table(output_table_name).count())

File stored successfully.


In [None]:
# Read the stored table back and report its total row count.
test_df = session.table(output_table_name)
print("All data ->", test_df.count())

# Find the most recent load timestamp and count the rows that carry it.
latest_ts_rows = test_df.select(F.max("LOAD_TS")).collect()
latest_ts = latest_ts_rows[0][0]
from_latest_load = F.col("LOAD_TS") == F.lit(latest_ts)
latest_data = test_df.filter(from_latest_load)
print("Latest data ->", latest_data.count())