# 1. Objective
  The objective of this notebook is to generate new features from the raw data, based on the inputs provided in the config. It creates the following types of features:
  * Lag / Lead features
  * Log transformed features
  * Trend features
  * Seasonality features

# 2. Imports

In [None]:
from datetime import datetime
import os
import copy
import numpy as np
import pandas as pd
import sys
import traceback
import shutil
import yaml

# --- Snowpark Imports ---
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F
from snowflake.snowpark.functions import col
from snowflake.snowpark.window import Window
from snowflake.snowpark.types import *

# ======================================================
# Initialize Snowpark Session
# ======================================================

# Reuse the session of the environment this notebook runs in
# (e.g., Snowflake Worksheet, Snowsight, or Streamlit for Snowflake).
session = get_active_session()

# NOTE: no local-connection snippet is included in this notebook; when running
# locally, a session has to be created and configured separately.

# ======================================================
# `session` is the entry point used by every cell below
# ======================================================

# Optional sanity check: show which database / schema the session points at.
print("✅ Snowpark Session Initialized Successfully!")
print("Current Database:", session.get_current_database())
print("Current Schema:", session.get_current_schema())

# 3. Setup environment

## 3.1. Load Config

In [None]:
import yaml
stage_path = "@ORANGE_ZONE_SBX_TA.PUBLIC.CONNECTIONS/config_new_PROD.yaml"
stream = session.file.get_stream(stage_path)
yaml_text = stream.read().decode()
app_config = yaml.safe_load(yaml_text)

## 3.2. Update output database, schema, and table

In [None]:
output_database = app_config["general_inputs"]["output_database"]
output_schema = app_config["general_inputs"]["output_schema"]
print(output_database, output_schema)

In [None]:
session.use_database(output_database)
session.use_schema(output_schema)
output_table_name = "PROD_FEATURE_ENGINEERING_OUTPUT"
intermediate_table_name = "PROD_FEATURE_ENGINEERING_INTERMEDIATE_RESULTS"

In [None]:
# Example check (optional)
print("✅ Snowpark Session Initialized Successfully!")
print("Current Database:", session.get_current_database())
print("Current Schema:", session.get_current_schema())

## 3.3. Capturing necessary variables

In [None]:
date_variable = app_config["general_inputs"]["date_var"]
date_format = app_config["general_inputs"]["date_format_pyspark"]
date_format_pandas = app_config["general_inputs"]["date_format_pandas"]

ds_config = app_config["general_inputs"]["date_var"]
modeling_granularity = app_config["general_inputs"]["modeling_granularity"]

categorical_columns_ads = app_config["feature_engineering_details"]["categorical_columns_ads"]

# Function to broadcast variables

In [0]:
def broadcast_variable_conf(x):
    """
    Pass a configuration value through unchanged.

    Stand-in for a Spark-style broadcast helper: no broadcast object is
    created here, the argument itself is handed straight back.

    Parameters
    ----------
    x : any
        Configuration variable or constant.

    Returns
    -------
    any
        The very same object that was passed in.
    """
    passthrough = x
    return passthrough

# Broadcast generic parameters

In [0]:
broadcast_date_variable = broadcast_variable_conf(date_variable)

# Load data

In [None]:
df = session.table("PROD_MISSING_VALUE_TREATMENT_OUTPUT")

In [None]:
df

In [None]:
#from snowflake.snowpark.functions import col, to_date, lit

date_col = app_config["general_inputs"]["date_var"]

# Convert to date (Snowflake handles parsing; invalid → NULL automatically)
df = df.with_column(
    date_col,
    F.to_date(F.col(date_col))
)

In [None]:
df

# Metrics calculated at modeling_granularity level (within timeseries)
  The following metrics are calculated for each unique combination of `modeling_granularity` using the historical data available in a parallelized execution.
  1. Lag/Lead features
  2. Log Transformation

## 1. Feature Lag/Lead Generation

### 1.1. Config parameters

In [None]:
# Normalise the configured lag/lead spec into
#     {feature: [signed_offsets, third_entry]}
# "lag" keeps the offsets as configured; any other direction is treated as a
# lead and its offsets are negated.
# NOTE(review): later cells unpack the second element of each value as
# `fill_value` for the shift, while the fallback used here is the string
# "unknown" — confirm what the third config entry is meant to hold.
featurewise_lag_lead_dict = {}
lag_lead_config = app_config["feature_engineering_details"]["featurewise_lag_lead_dict"]

for feature, lag_lead_value in lag_lead_config.items():
    try:
        direction = lag_lead_value[0]
        # Entries missing from the spec fall back to "no offsets" / "unknown".
        raw_offsets = [] if len(lag_lead_value) <= 1 else lag_lead_value[1]
        third_entry = "unknown" if len(lag_lead_value) <= 2 else lag_lead_value[2]

        signed_offsets = raw_offsets if direction == "lag" else [-i for i in raw_offsets]
        featurewise_lag_lead_dict[feature] = [signed_offsets, third_entry]

    except Exception as e:
        # Best effort: report the malformed entry and continue with the rest.
        print(f"⚠️ Error processing {feature}: {lag_lead_value} — {e}")

In [0]:
lag_lead_needed = app_config["feature_engineering_details"]["lag_lead_needed"]

In [None]:
lag_lead_needed

### 1.2. Broadcast variables for use in lag/lead creation

In [0]:
broadcast_lag_lead_needed = broadcast_variable_conf(lag_lead_needed)
broadcast_featurewise_lag_lead_dict = broadcast_variable_conf(featurewise_lag_lead_dict)

In [None]:
# Names of the lag/lead columns derived from the configured shifts: a positive
# shift is a lag, anything else a lead (the signed shift value is embedded in
# the name as-is).
lag_cols = [
    f'{feature}_LAG{cur_shift_val}' if cur_shift_val > 0 else f'{feature}_LEAD{cur_shift_val}'
    for feature, (shift_value, fill_value) in broadcast_featurewise_lag_lead_dict.items()
    for cur_shift_val in shift_value
]

### 1.3. API for Lag/Lead creation

In [0]:
def create_featurewise_lag_lead_versions(
    df: pd.DataFrame,
    featurewise_lag_lead_dict: dict,
) -> pd.DataFrame:
    """Add shifted (lag/lead) copies of the configured features to ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the source features; it is modified in place.
    featurewise_lag_lead_dict : dict
        Maps a feature name to ``(offsets, fill_value)``. Each offset is passed
        to ``Series.shift``; ``fill_value`` fills the positions the shift
        leaves empty.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with one ``<feature>_LAG<offset>`` column per
        positive offset and one ``<feature>_LEAD<offset>`` column per
        non-positive offset (the sign stays part of the name).
    """
    for feature, (offsets, fill_value) in featurewise_lag_lead_dict.items():
        for offset in offsets:
            kind = 'LAG' if offset > 0 else 'LEAD'
            df[f'{feature}_{kind}{offset}'] = df[feature].shift(offset, fill_value=fill_value)

    return df

## 2. Log-transformation

### 2.1. Config parameters

In [0]:
log_transformation_needed = app_config["feature_engineering_details"]["log_transformation_needed"]

log_transformation_features = app_config["feature_engineering_details"]["log_transformation_features"]

### 2.2. Broadcast variables for use in log-transformation

In [0]:
broadcast_log_transformation_needed = broadcast_variable_conf(log_transformation_needed)
broadcast_log_transformation_features = broadcast_variable_conf(log_transformation_features)

In [None]:
log_cols = broadcast_log_transformation_features

### 2.3. API for log-transformation

In [0]:
def create_log_transformed_features(
    df: pd.DataFrame,
    features: list[str],
) -> pd.DataFrame:
    """Add ``LOG_<feature>`` columns holding ``log1p`` of each listed feature.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the source features; it is modified in place.
    features : list[str]
        Names of the columns to transform.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with the added ``LOG_<feature>`` columns.

    Raises
    ------
    ValueError
        If any listed feature contains a negative value. Zeros are accepted
        because the transform is ``log1p``.
    """
    # Validate every feature up front so that a failure leaves ``df`` untouched
    # instead of carrying LOG_ columns for only some of the features.
    for feature in features:
        if (df[feature] < 0).any():
            raise ValueError(
                f'{feature} has negative values; log-transformation (log1p) '
                'requires all values to be non-negative.'
            )

    for feature in features:
        df[f'LOG_{feature}'] = np.log1p(df[feature])

    return df

In [None]:
lag_cols

In [0]:
def _generated_feature_columns() -> list:
    """Return the lag/lead and log column names feat_udf emits, in output order.

    Only the feature families whose ``*_needed`` flag is set are included, so
    the list lines up with the conditionally built output schema.
    """
    columns = []
    if broadcast_lag_lead_needed:
        for feature, (shift_values, _) in broadcast_featurewise_lag_lead_dict.items():
            columns.extend(
                f'{feature}_LAG{shift}' if shift > 0 else f'{feature}_LEAD{shift}'
                for shift in shift_values
            )
    if broadcast_log_transformation_needed:
        columns.extend(f'LOG_{feature}' for feature in broadcast_log_transformation_features)
    return columns


def feat_udf(udf_input_data: pd.DataFrame) -> pd.DataFrame:
    """Create the lag/lead and log features for one group of rows.

    Parameters
    ----------
    udf_input_data : pd.DataFrame
        Rows of a single ``modeling_granularity`` group. Must contain the
        granularity columns, the date column and the raw feature columns.

    Returns
    -------
    pd.DataFrame
        Granularity columns, date column, the generated feature columns and a
        ``status`` column. ``status`` is ``"success"`` on the normal path. If
        feature creation raises, the generated columns are filled with ``-1.0``
        and ``status`` carries the formatted traceback.
    """
    date_column = broadcast_date_variable
    generated_columns = _generated_feature_columns()
    req_cols = modeling_granularity + [date_column] + generated_columns + ["status"]

    try:
        # shift() works on row position, so the group is ordered by date first;
        # the rows of a group are not assumed to arrive in date order.
        # sort_values returns a new frame, so the input is left unmodified.
        udf_output_data = udf_input_data.sort_values(date_column, kind="stable")

        # Lag/Lead transformations
        if broadcast_lag_lead_needed:
            udf_output_data = create_featurewise_lag_lead_versions(
                df=udf_output_data,
                featurewise_lag_lead_dict=broadcast_featurewise_lag_lead_dict,
            )

        # Log transformations
        if broadcast_log_transformation_needed:
            udf_output_data = create_log_transformed_features(
                df=udf_output_data,
                features=broadcast_log_transformation_features,
            )

        udf_output_data["status"] = "success"

    except Exception:
        # Deliberate best effort: one failing group must not abort the whole
        # run. Emit sentinel values and report the traceback through "status".
        udf_output_data = udf_input_data.copy()
        for column in generated_columns:
            udf_output_data[column] = -1.0
        udf_output_data["status"] = str(traceback.format_exc())

    return udf_output_data[req_cols]


### 5.2. Schema of the output

In [None]:
input_data = df

In [0]:


# Key columns of the UDF output: every modeling_granularity column is declared
# as a string, followed by the date column.
input_data_schema = [StructField(x,StringType()) for x in modeling_granularity] + [StructField(x,DateType()) for x in [date_col]]
output_data_schema = [] + input_data_schema

# NOTE(review): the field names built below are lower-case and drop the "-" of
# lead offsets, whereas feat_udf names its pandas columns "<F>_LAG<n>" /
# "<F>_LEAD-<n>" / "LOG_<F>". Presumably the UDF output is matched to this
# schema by position — TODO confirm.
if lag_lead_needed:

    # One inner list of column names per configured feature.
    lag_lead_columns = [
            [f'{feature}_lag{cur_shift}' if cur_shift > 0 else f'{feature}_lead{cur_shift}' for cur_shift in shift_value]
            for feature, (shift_value, _) in featurewise_lag_lead_dict.items()
    ]
    print(lag_lead_columns)
    # Flatten unconditionally: guarding this with `len(...) > 1` dropped every
    # lag/lead field whenever exactly one feature was configured.
    final_lead_lag_cols = [item for sublist in lag_lead_columns for item in sublist]
    lag_lead_columns_schema = [StructField(x.replace("-",""),FloatType()) for x in final_lead_lag_cols]
    output_data_schema = output_data_schema + lag_lead_columns_schema

if log_transformation_needed:
    log_transformation_columns = [
        f'log_{feature}' for feature in log_transformation_features
    ]
    log_transformation_columns_schema = [StructField(x,FloatType()) for x in log_transformation_columns]
    output_data_schema = output_data_schema + log_transformation_columns_schema

# The status column always comes last.
output_data_schema = output_data_schema + [StructField("status", StringType())]
output_data_schema = StructType(output_data_schema)

print(output_data_schema)

### 5.3. Apply UDF

# Testing the UDF

In [None]:
# Smoke test: run the UDF locally on the rows of a single F_CODE ("0307").
t_df = input_data.to_pandas().loc[lambda frame: frame["F_CODE"] == "0307"]
print(t_df.shape)
feat_udf(t_df)

In [None]:
raw_cols_lag = list(app_config["feature_engineering_details"]["featurewise_lag_lead_dict"].keys())
raw_cols_log = log_cols

In [None]:
# Columns handed to the UDF. A feature configured for both lag/lead and log
# would otherwise be selected twice, so duplicates are removed while keeping
# the first occurrence and the original order.
req_cols = list(dict.fromkeys(modeling_granularity + [ds_config] + raw_cols_lag + raw_cols_log))

In [0]:
# Apply feat_udf once per modeling_granularity group, on the required columns
# only; the result follows output_data_schema.
grouped_input = input_data.select(req_cols).groupBy(modeling_granularity)
feature_engineering_output = grouped_input.applyInPandas(
    feat_udf,
    output_schema=output_data_schema,
)

In [0]:
feature_engineering_output

In [0]:
feature_engineering_output.columns

In [0]:
categorical_columns_ads

In [0]:
#feature_engineering_output.filter(F.col("status")!="success").display()

In [0]:
list(feature_engineering_output.columns)

### 5.5. Export intermediate results
This is needed to force evaluation of the lazily built Snowpark query plan: the intermediate results are stored in a table and read back later, which speeds up the downstream tasks.

In [None]:
feature_engineering_output_all_cols = input_data.join(feature_engineering_output, on = [*modeling_granularity,date_col], how = "left")

In [None]:
feature_engineering_output_all_cols

In [None]:
feature_engineering_output_all_cols.count()

In [None]:
# Persist the joined feature frame, stamped with the load time.
results_s_with_ts = feature_engineering_output_all_cols.with_column("LOAD_TS", F.current_timestamp())
results_s_with_ts.write.mode("overwrite").save_as_table(intermediate_table_name)
# Count the rows of the table that was just written. Calling .count() on the
# lazy frame would run the join + UDF plan a second time.
print(session.table(intermediate_table_name).count())

In [None]:
test_df = session.table(intermediate_table_name)
print("All data ->", test_df.count())
latest_ts = test_df.select(F.max("LOAD_TS")).collect()[0][0]
feature_engineering_output = test_df.filter(F.col("LOAD_TS") == F.lit(latest_ts))
print("Latest data ->", feature_engineering_output.count())

# Metrics calculated at higher levels (across timeseries)
 The following metrics are created at different hierarchical levels (or dimensions) of the data and then merged back into the granular data (`modeling_granularity` level), e.g. brand level, category level, etc.
  1. SI Weekly
  2. SI Monthly
  3. SI Quarterly

## 1. Seasonality-index Generation

### 1.1. Derive frequency of data

In [None]:
# Distinct values of the date column, pulled into pandas.
date_col = date_variable

date_column_content = input_data.select(F.col(date_col)).dropDuplicates().to_pandas()

# Parse to pandas datetimes; values that cannot be parsed become NaT.
date_column_content[date_col] = pd.to_datetime(date_column_content[date_col], errors="coerce")

# Unique, non-null dates in ascending order.
unique_dates = date_column_content[date_col].dropna().unique()
history_dates = pd.Series(unique_dates).sort_values(ignore_index=True)

# The frequency is inferred from the three most recent dates only.
frequency = pd.infer_freq(history_dates.tail(3))


In [None]:
frequency

In [None]:
history_dates

### 1.2. Config parameters

In [None]:
app_config["feature_engineering_details"]["si_needed"]

In [0]:
si_needed = app_config["feature_engineering_details"]["si_needed"]
si_target_column = app_config["feature_engineering_details"]["si_target_column"]

In [None]:
si_needed

In [None]:
si_target_column

### 1.3. Broadcast variables for use in SI creation

In [0]:
broadcast_si_needed = broadcast_variable_conf(si_needed)
broadcast_frequency = broadcast_variable_conf(frequency)
broadcast_si_target_column = broadcast_variable_conf(si_target_column)

In [None]:
broadcast_si_target_column

In [None]:
broadcast_si_needed

In [None]:
broadcast_frequency

In [None]:
app_config["general_inputs"]['date_var']

### 1.4. API for SI creation

In [None]:
def create_seasonality_index(df, coln, si_target_column, granularity:str=None):
    """
    Attach a seasonality index (SI) of ``si_target_column`` to ``df``.

    For every calendar year found in the data, the target is summed per
    (``coln``, month, quarter, week); the SI of a period is the mean of those
    sums within the period divided by the mean over the whole year. The
    per-year SIs are then averaged across years and joined back onto ``df``.

    Args:
        df: Input Snowpark DataFrame. Its columns are renamed to upper case and
            REPORT_WEEK / REPORT_MONTH / REPORT_QUARTER / REPORT_YEAR are
            derived from the configured date column.
        coln: List of column names defining the level the SI is computed at.
        si_target_column: Name of the column whose seasonality is measured.
        granularity: ``'month'`` for a monthly SI, ``'qtr'`` for a quarterly
            SI; any other value (including ``None``) yields the weekly SI.

    Returns:
        ``df`` (upper-cased, with the REPORT_* columns) left-joined with one
        extra column named ``<coln joined by "_">_SI_MONTHLY``,
        ``..._SI_QUARTERLY`` or ``..._SI_WEEKLY``.

    Raises:
        ValueError: If the data contains no non-null year, i.e. there is
            nothing to compute an SI from.
    """
    # Alias for the summed target inside the intermediate aggregates only.
    quantity_var = "NO_OF_NEW_JOINEES"

    # granularity -> (period column, alias of the period mean, SI name suffix)
    period_specs = {
        'month': ('REPORT_MONTH', 'MONTH_AVG', 'SI_MONTHLY'),
        'qtr': ('REPORT_QUARTER', 'QTR_AVG', 'SI_QUARTERLY'),
    }
    period_col, period_avg_col, si_suffix = period_specs.get(
        granularity, ('REPORT_WEEK', 'WEEK_AVG', 'SI_WEEKLY')
    )
    si_cols = [period_col]
    si_value = f"{'_'.join(coln)}_{si_suffix}"

    # Convert all columns to uppercase
    df = df.toDF(*[c.upper() for c in df.columns])

    # Derive the calendar parts from the configured date column.
    date_expr = F.col(app_config["general_inputs"]['date_var'].upper())
    df = df.withColumn('REPORT_WEEK', F.weekofyear(date_expr)) \
           .withColumn('REPORT_MONTH', F.month(date_expr)) \
           .withColumn('REPORT_QUARTER', F.quarter(date_expr)) \
           .withColumn('REPORT_YEAR', F.year(date_expr))

    # Years present in the data; a null year cannot match the equality filter
    # below, so it is skipped outright.
    years_list = [
        row['REPORT_YEAR']
        for row in df.select('REPORT_YEAR').distinct().collect()
        if row['REPORT_YEAR'] is not None
    ]
    if not years_list:
        raise ValueError("Cannot compute a seasonality index: no dated rows in the input.")

    all_years_si = None
    for year in years_list:
        year_totals = df.filter(F.col('REPORT_YEAR') == year) \
                        .groupBy(coln + ['REPORT_MONTH', 'REPORT_QUARTER', 'REPORT_WEEK']) \
                        .agg(F.sum(F.col(si_target_column.upper())).alias(quantity_var))

        period_avg = year_totals.groupBy(coln + si_cols) \
                                .agg(F.mean(quantity_var).alias(period_avg_col))
        year_avg = year_totals.groupBy(coln).agg(F.mean(quantity_var).alias('YEAR_AVG'))

        # NOTE(review): a YEAR_AVG of 0 makes this division fail or produce an
        # undefined SI — confirm the target cannot sum to zero for a whole year.
        si_df = period_avg.join(year_avg, on=coln, how='inner') \
                          .withColumn(si_value, F.col(period_avg_col) / F.col('YEAR_AVG'))

        # union_all keeps duplicate rows. Snowpark's union() removes them, so
        # two years with identical SI rows would count once in the mean below.
        all_years_si = si_df if all_years_si is None else all_years_si.union_all(si_df)

    # Average SI across years
    avg_si = all_years_si.groupBy(coln + si_cols).agg(F.mean(si_value).alias(si_value))

    # Join back to main DF
    return df.join(avg_si, on=coln + si_cols, how='left')


In [0]:
# Build weekly, monthly and quarterly SI frames for every configured level and
# publish each one as a module-level variable named
# si_week_<level> / si_month_<level> / si_qtr_<level>.
if si_needed:
    si_date_var = app_config["general_inputs"]['date_var']
    # (variable prefix, granularity argument, SI column suffix, log label)
    si_variants = (
        ("si_week", None, "si_weekly", "weekly"),
        ("si_month", 'month', "si_monthly", "monthly"),
        ("si_qtr", 'qtr', "si_quarterly", "quaterly"),
    )

    for cat in categorical_columns_ads:
        # A level is either one column name or a list of them; the joined name
        # identifies the level in variable and column names.
        if isinstance(cat,list):
            cat_ls = cat
            cat = "_".join(cat_ls)
        elif isinstance(cat,str):
            cat_ls = [cat]

        print(cat,"level SI creation process started...")

        for var_prefix, si_granularity, si_suffix, si_label in si_variants:
            si_frame = create_seasonality_index(
                feature_engineering_output,
                coln=cat_ls,
                si_target_column = si_target_column,
                granularity=si_granularity,
            )
            # Keep only the join keys and the SI column itself.
            globals()[f"{var_prefix}_{cat}"] = si_frame.select(
                si_date_var, *modeling_granularity, f"{cat}_{si_suffix}"
            )
            print(f"\t===> {si_label} level SI generated...")


### 1.5. Merging back with granular results

In [0]:
# Left-join the per-level SI frames (module-level variables named
# si_week_<level>, si_month_<level>, si_qtr_<level>) onto the granular output.
if si_needed:
    si_join_keys = [app_config["general_inputs"]["date_var"], *modeling_granularity]

    for cat in categorical_columns_ads:
        if isinstance(cat,list):
            cat_ls = cat
            cat = "_".join(cat_ls)
        elif isinstance(cat,str):
            cat_ls = [cat]

        print(f"Integrating {cat} level SI data to granular outputs")

        si_frame_names = [f"{var_prefix}_{cat}" for var_prefix in ("si_week", "si_month", "si_qtr")]

        # Drop duplicates (if any) in all three frames before merging.
        for si_frame_name in si_frame_names:
            globals()[si_frame_name] = globals()[si_frame_name].dropDuplicates()

        # Join weekly, then monthly, then quarterly SI.
        for si_frame_name, si_label in zip(si_frame_names, ("weekly", "monthly", "quarterly")):
            feature_engineering_output = feature_engineering_output.join(
                globals()[si_frame_name], on = si_join_keys, how = "left"
            )
            print(f"\t===>Joining {si_label} SI data")

In [None]:
feature_engineering_output

## Generating trend variables

In [None]:
# Calendar parts of the date column, used below as trend keys.
calendar_parts = {
    "report_week": F.weekofyear(F.col(date_col)),
    "report_month": F.month(F.col(date_col)),
    "report_quarter": F.quarter(F.col(date_col)),
    "report_year": F.year(F.col(date_col)),
}
feature_engineering_output = feature_engineering_output.with_columns(
    list(calendar_parts.keys()),
    list(calendar_parts.values()),
)

def add_linear_trend(df, coln: list, granularity: str):
    """
    Number the distinct ``coln`` combinations in ascending order and attach
    that running number to ``df`` as ``trend_<granularity>``.

    Args:
        df: Input DataFrame
        coln: List of column names that identify one period
              (e.g., ['report_year', 'report_month'])
        granularity: One of ['month', 'quarter', 'year']; only used to name
                     the new column

    Returns:
        DataFrame with the trend column added

    Raises:
        ValueError: If ``granularity`` is not one of the supported values.
    """
    valid_granularities = {'month', 'quarter', 'year'}
    if granularity not in valid_granularities:
        raise ValueError(f"Invalid granularity: {granularity}. Must be one of {valid_granularities}")

    # 1, 2, 3, ... following the sort order of the period columns.
    period_rank = F.row_number().over(Window.orderBy(coln))
    period_index = df.select(*coln).distinct().withColumn(f"trend_{granularity}", period_rank)

    return df.join(period_index, on=coln, how='left')

# Add monthly trend column
feature_engineering_output = add_linear_trend(feature_engineering_output, coln=['report_year', 'report_month'], granularity='month')

# Add quarterly trend column
feature_engineering_output = add_linear_trend(feature_engineering_output, coln=['report_year', 'report_quarter'], granularity='quarter')

# Add yearly trend column
feature_engineering_output = add_linear_trend(feature_engineering_output, coln=['report_year'], granularity='year')


In [None]:
feature_engineering_output

In [None]:
print("Feature Engineering output -> SHAPE")
print("Number of rows --->",feature_engineering_output.count())
print("Number of columns --->",len(feature_engineering_output.columns))

In [None]:
# Persist the final feature set, stamped with the load time.
results_s_with_ts = feature_engineering_output.with_column("LOAD_TS", F.current_timestamp())
results_s_with_ts.write.mode("overwrite").save_as_table(output_table_name)
# Count the rows of the table that was just written. Calling .count() on the
# lazy frame would run the whole SI / trend join plan a second time.
print(session.table(output_table_name).count())

In [None]:
test_df = session.table(output_table_name)
print("All data ->", test_df.count())
latest_ts = test_df.select(F.max("LOAD_TS")).collect()[0][0]
feature_engineering_output = test_df.filter(F.col("LOAD_TS") == F.lit(latest_ts))
print("Latest data ->", feature_engineering_output.count())

# Trend and SI features for Future Periods

## Trend features

In [None]:
from snowflake.snowpark import functions as F

from snowflake.snowpark.functions import col, current_date, dateadd, lit

# Number of future weekly periods to generate, taken from the config.
future_periods = int(app_config["future_forecast"]["no_of_future_periods"])

# History only: rows dated at least 7 days before today. The configured date
# column (`date_col`) is used instead of a hard-coded "START_OF_WEEK" so this
# cell agrees with the following cell, which derives the calendar parts from
# `date_col`.
df_hist = (
    feature_engineering_output
    .filter(F.col(date_col) <= F.dateadd("day", F.lit(-7), F.current_date()))
)

# 1. Latest date in the history
max_week_df = df_hist.select(F.max(date_col).alias("max_week"))
max_week = max_week_df.collect()[0]["MAX_WEEK"]
if max_week is None:
    raise ValueError("No historical rows found; cannot derive future periods.")

# 2. Create the next `future_periods` weeks after the latest historical date
future_weeks = (
    df_hist.session
      .range(1, future_periods+1)  # generates 1..future_periods
      .select(
          F.dateadd("week", F.col("ID"), F.lit(max_week)).alias(date_col)
      )
)

# 3. Append the future dates to the historical dates
future_df = df_hist.select(date_col).union_by_name(future_weeks)
future_df

In [None]:
future_df = (
    future_df
    .with_columns(["report_week","report_month","report_quarter","report_year"],[F.weekofyear(F.col(date_col)),F.month(F.col(date_col)),F.quarter(F.col(date_col)),F.year(F.col(date_col))]
        
    )
)
future_df

In [None]:
# Add monthly trend column
future_df = add_linear_trend(future_df, coln=['report_year', 'report_month'], granularity='month')

# Add quarterly trend column
future_df = add_linear_trend(future_df, coln=['report_year', 'report_quarter'], granularity='quarter')

# Add yearly trend column
future_df = add_linear_trend(future_df, coln=['report_year'], granularity='year')
future_df

In [None]:
results_s_with_ts = future_df.with_column("LOAD_TS", F.current_timestamp())
results_s_with_ts.write.mode("overwrite").save_as_table("TRENDS_FUTURE_DATA")
print(results_s_with_ts.count())

## SI features

In [None]:
#weekly_si = create_seasonality_index(feature_engineering_output, coln=cat_ls, si_target_column = si_target_column, granularity=None)
weekly_si = feature_engineering_output.select(["REGIONNAME","REPORT_WEEK","REGIONNAME_SI_WEEKLY"]).dropDuplicates()
weekly_si

In [None]:
results_s_with_ts = weekly_si.with_column("LOAD_TS", F.current_timestamp())
results_s_with_ts.write.mode("overwrite").save_as_table("SI_FUTURE_DATA")
print(results_s_with_ts.count())