In [7]:
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod
import json
from datetime import datetime
import warnings

from enum import Enum

# Severity levels that can be attached to an Alert
class ALERT(Enum):
    """Classification of an alert raised by a pipeline stage."""

    WARNING = 1
    ERROR = 2

# Record describing a single warning/error raised while a stage runs
class Alert:
    """One alert emitted by a pipeline stage.

    The constructor stores its arguments unchanged; ``str()`` renders them as
    a single tab-separated line.
    """

    def __init__(self, timestamp, stage, substage=None, message="", alert_type=ALERT.WARNING):
        self.timestamp, self.stage, self.substage = timestamp, stage, substage
        self.message, self.alert_type = message, alert_type

    def __str__(self):
        # Column order: timestamp, alert type, stage, substage, message
        columns = (
            self.timestamp,
            self.alert_type,
            self.stage,
            self.substage,
            self.message,
        )
        return "\t".join(f"{column}" for column in columns)

class Stage(ABC):
    """Abstract base class for a pipeline stage driven by a JSON config file.

    Subclasses must implement :meth:`run_stage`. Inheriting from ``ABC`` is
    what makes the ``@abstractmethod`` marker effective: the base class (or a
    subclass that forgets to implement ``run_stage``) can no longer be
    instantiated by mistake.

    Attributes:
        config: The parsed content of the JSON configuration file.
    """

    def __init__(self, config_path) -> None:
        """Load the stage configuration.

        Args:
            config_path: Path of the JSON configuration file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        # JSON is UTF-8 by specification; do not depend on the platform default
        with open(config_path, "r", encoding="utf-8") as config_file:
            self.config = json.load(config_file)

    @abstractmethod
    def run_stage(self):
        """Execute the stage; every concrete subclass must implement this."""

    def create_alert(self, substage=None, message="", alert_type=ALERT.WARNING):
        """Create an alert stamped with the current time and print it.

        Args:
            substage: Optional name of the sub-step that raised the alert.
            message: Human-readable description of the problem.
            alert_type: Severity of the alert (defaults to ``ALERT.WARNING``).

        Returns:
            The ``Alert`` that was printed.
        """
        # Fall back to the class name so that a config without 'stage_name'
        # cannot make alert reporting itself fail with a KeyError.
        stage_name = self.config.get('stage_name', type(self).__name__)
        alert = Alert(datetime.now(), stage_name, substage, message, alert_type)
        print(str(alert))
        return alert


# Data validation class
class DataValidation(Stage):
    """Stage that loads the configured CSV inputs, validates them and saves them.

    Configuration keys read by this class:

    * ``input.dir``, ``input.fileFormat`` and ``input.file``. Each entry of
      ``input.file`` is a dict that names the file under either the
      ``'calendar'`` or the ``'time series'`` key, with optional
      ``'functions'`` (names of the checks to run) and ``'params'``.
    * ``function``: list of dicts with a ``'name'`` and optional ``'params'``.
    * ``output.dir``, ``output.fileFormat`` and ``output.file``. Entries of
      ``output.file`` use the same ``'calendar'`` / ``'time series'`` keys.

    NOTE(review): inputs are always parsed with ``pd.read_csv`` and outputs are
    always written with ``DataFrame.to_parquet``; the configured
    ``fileFormat`` values only determine the file extension.
    """

    # Keys that identify which kind of dataset a file entry describes
    _DATASET_KINDS = ('calendar', 'time series')
    _TIME_SERIES_KIND = 'time series'
    # Checks that are only applied to time series datasets
    _TIME_SERIES_FUNCTIONS = (
        'creating_unique_key',
        'find_missing_dates',
        'validate_minimum_timeperiod_value',
    )

    def run_stage(self):
        """Load every configured input, run its checks, then save it once.

        The dataset kind (``'calendar'`` or ``'time series'``) is taken from
        the key used in the file entry instead of from hard-coded file names,
        so the time series checks and the save step work for any file name.
        """
        input_config = self.config['input']

        for file_config in input_config['file']:
            dataset_kind = next(
                (kind for kind in self._DATASET_KINDS if kind in file_config), None
            )
            if dataset_kind is None:
                # Without a recognised key there is no file name to load
                self.create_alert(
                    message=(
                        "Input file entry has neither a 'calendar' nor a "
                        f"'time series' key and was skipped: {file_config}"
                    ),
                    alert_type=ALERT.ERROR,
                )
                continue

            file_name = file_config[dataset_kind]
            file_path = f"{input_config['dir']}/{file_name}.{input_config['fileFormat']}"
            dataframe = pd.read_csv(file_path, index_col=0)

            for function_name in file_config.get('functions', []):
                params = self._function_params(function_name, file_config)
                dataframe = self._apply_function(
                    dataframe, dataset_kind, function_name, params
                )

            # Save once per input file, after all of its checks have run
            self._save_output(dataframe, dataset_kind)

    def _function_params(self, function_name, file_config):
        """Merge the parameters configured for *function_name*.

        Parameters from the top-level ``function`` section are used as a base
        and the ``'params'`` of the file entry override them.
        """
        function_config = next(
            (
                func
                for func in self.config.get('function', [])
                if func.get('name') == function_name
            ),
            {},
        )
        return {**function_config.get('params', {}), **file_config.get('params', {})}

    def _apply_function(self, dataframe, dataset_kind, function_name, params):
        """Run one configured check and return the (possibly updated) frame."""
        if function_name == 'validate_date_format':
            return self.validate_date_format(dataframe)

        if function_name not in self._TIME_SERIES_FUNCTIONS:
            self.create_alert(
                message=f"Unknown validation function '{function_name}' was skipped."
            )
            return dataframe

        if dataset_kind != self._TIME_SERIES_KIND:
            # The remaining checks are only defined for time series data
            return dataframe

        if function_name == 'creating_unique_key':
            return self.creating_unique_key(dataframe, params.get('key_columns', []))
        if function_name == 'find_missing_dates':
            self.find_missing_dates(dataframe, params.get('date_freq', 'D'))
        else:
            self.validate_minimum_timeperiod_value(
                dataframe, params.get('min_time_period', 100)
            )
        return dataframe

    def _save_output(self, dataframe, dataset_kind):
        """Write *dataframe* to every output entry of the same dataset kind."""
        output_config = self.config.get('output')
        if not output_config:
            self.create_alert(
                message="No 'output' section configured; validated data was not saved."
            )
            return

        for output_file in output_config['file']:
            if dataset_kind in output_file:
                file_path = (
                    f"{output_config['dir']}/{output_file[dataset_kind]}"
                    f".{output_config['fileFormat']}"
                )
                dataframe.to_parquet(file_path, index=False)

    # Function to create a unique key in the dataframe
    def creating_unique_key(self, dataframe, key_columns):
        """Add a ``'unique_id'`` column built from *key_columns*.

        A single key column is copied as is. Several key columns are converted
        to strings column by column and joined with ``'_'``. Converting per
        column (rather than per row) keeps integer keys from being rendered as
        floats when another key column holds floats.

        Args:
            dataframe: Frame to extend; it is modified in place.
            key_columns: Names of the columns that form the key.

        Returns:
            The same ``dataframe`` object. It is returned unchanged (after an
            error alert) when *key_columns* is empty.
        """
        print("Creating unique key with columns:", key_columns)
        if not key_columns:
            self.create_alert(
                message="No key columns configured; 'unique_id' was not created.",
                alert_type=ALERT.ERROR,
            )
            return dataframe

        if len(key_columns) == 1:
            # Create new column as 'unique_id'
            dataframe['unique_id'] = dataframe[key_columns[0]]
        else:
            # Combine selected columns into a new 'unique_id' column
            parts = [dataframe[column].astype(str) for column in key_columns]
            unique_id = parts[0]
            for part in parts[1:]:
                unique_id = unique_id + '_' + part
            dataframe['unique_id'] = unique_id

        return dataframe

    # Function to validate the date format in the dataframe
    def validate_date_format(self, dataframe):
        """Raise an alert for every ``'date'`` value that is not YYYY-MM-DD.

        Columns that already have a datetime dtype are accepted without
        inspection.

        Returns:
            The unchanged ``dataframe``.
        """
        date_column = dataframe['date']
        if pd.api.types.is_datetime64_any_dtype(date_column):
            return dataframe

        for date_value in date_column:
            try:
                datetime.strptime(date_value, '%Y-%m-%d')
            except (ValueError, TypeError):
                # TypeError covers non-string values such as NaN from empty cells
                self.create_alert(
                    message=f"Invalid date format: {date_value}. Please use the format YYYY-MM-DD."
                )

        return dataframe

    # Function to find missing dates in the dataframe
    def find_missing_dates(self, dataframe, date_freq='D'):
        """Raise an alert for every expected date absent from ``'date'``.

        The expected dates run from the earliest to the latest date present,
        at frequency *date_freq*.

        Side effect: the ``'date'`` column of *dataframe* is converted to
        ``datetime64[ns]`` in place.

        Returns:
            A ``DatetimeIndex`` holding the missing dates (empty when the
            column contains no valid date).
        """
        dataframe['date'] = pd.to_datetime(dataframe['date']).astype('datetime64[ns]')
        dates = dataframe['date'].dropna()
        if dates.empty:
            # pd.date_range cannot be built from NaT bounds
            return pd.DatetimeIndex([])

        # Generate a range of dates covering the observed time period
        date_range = pd.date_range(start=dates.min(), end=dates.max(), freq=date_freq)

        # Find missing dates
        missing_dates = date_range[~date_range.isin(dates)]

        # Generate warnings for missing dates
        for missing_date in missing_dates:
            self.create_alert(message=f"Warning: Data is missing for date {missing_date.date()}")

        return missing_dates

    # Function to validate the minimum time period in the dataframe
    def validate_minimum_timeperiod_value(self, dataframe, min_time_period=100):
        """Raise an alert when the data spans fewer than *min_time_period* days.

        The ``'date'`` column is parsed locally, so this check no longer
        depends on ``find_missing_dates`` having converted it beforehand.

        Returns:
            ``True`` when the minimum time period is met, ``False`` otherwise
            (including when the column contains no valid date).
        """
        dates = pd.to_datetime(dataframe['date'])
        time_span = dates.max() - dates.min()

        if pd.isna(time_span) or time_span < pd.Timedelta(days=min_time_period):
            self.create_alert(
                message=f"Warning: Data does not have a minimum time period of {min_time_period} days."
            )
            return False
        return True

if __name__ == "__main__":
    # Build the validation stage from the local config file, then execute it
    config_file = 'D:/Forecastra Accelerator/dataset/.vscode/read_config.json'
    stage = DataValidation(config_path=config_file)
    stage.run_stage()


File config: {'calendar': 'calendar_sample', 'functions': ['validate_date_format']}
Params section: {}
Params section for function validate_date_format : {}
before
          date           event
0   06-02-2011       SuperBowl
1   14-02-2011   ValentinesDay
2   21-02-2011   PresidentsDay
3   09-03-2011       LentStart
4   16-03-2011       LentWeek2
5   17-03-2011   StPatricksDay
6   20-03-2011       Purim End
7   24-04-2011  OrthodoxEaster
8   26-04-2011      Pesach End
9   05-05-2011   Cinco De Mayo
10  08-05-2011    Mother's day
after
          date           event
0   06-02-2011       SuperBowl
1   14-02-2011   ValentinesDay
2   21-02-2011   PresidentsDay
3   09-03-2011       LentStart
4   16-03-2011       LentWeek2
5   17-03-2011   StPatricksDay
6   20-03-2011       Purim End
7   24-04-2011  OrthodoxEaster
8   26-04-2011      Pesach End
9   05-05-2011   Cinco De Mayo
10  08-05-2011    Mother's day
File config: {'time series': 'forecastra_sample', 'functions': ['creating_unique_key',

  dataframe['date'] = pd.to_datetime(dataframe['date']).astype('datetime64[ns]')
