In [None]:
import pandas as pd
import numpy as np

class Utils:
    """Placeholder for shared helper utilities; defines no members yet."""

class ConfigManager:
    """Skeleton for the component that manages model and data configs.

    Both methods are stubs. They take ``self`` so that the class can be
    instantiated and the methods can be called on an instance; without it,
    ``ConfigManager()`` fails with a ``TypeError``.
    """

    def __init__(self):
        """Create a manager; no state is initialised yet."""

    def update_config(self):
        """Stub for updating the managed configuration.

        Returns:
            None. Not implemented yet.
        """

class IOHandler:
    """Skeleton I/O component that keeps its settings as instance attributes.

    ``data_directory`` is stored on the instance, and every additional keyword
    argument is attached as an attribute of the same name. The read/write
    methods are stubs: they accept arbitrary keyword arguments and return
    ``None``.
    """

    def __init__(self, data_directory="data", **kwargs):
        self.data_directory = data_directory
        # Mirror each extra keyword argument onto the instance.
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

    def read_data(self, **kwargs):
        """Stub for reading data; not implemented yet."""

    def read_config(self, **kwargs):
        """Stub for reading a config; not implemented yet."""

    def write_data(self, **kwargs):
        """Stub for writing data; not implemented yet."""

    def write_config(self, **kwargs):
        """Stub for writing a config; not implemented yet."""
    
class Validator:
    """Skeleton for the validation component; no logic implemented yet."""

    def __init__(self) -> None:
        """Create a validator; no state is initialised."""

    def validate(self, data, config):
        """Stub that takes ``data`` and ``config`` and currently returns ``None``."""

class Extractor:
    """Skeleton for the extraction component; no logic implemented yet."""

    def __init__(self) -> None:
        """Create an extractor; no state is initialised."""

    def extract(self, data, config):
        """Stub that takes ``data`` and ``config`` and currently returns ``None``."""

class Parser:
    """Skeleton for the parsing component; no logic implemented yet."""

    def __init__(self) -> None:
        """Create a parser; no state is initialised."""

    def parse_data(self, data, config):
        """Stub that takes ``data`` and ``config`` and currently returns ``None``."""

    def parse_config(self, config):
        """Stub that takes ``config`` and currently returns ``None``."""

class Processor:
    """Skeleton for the processing component; no logic implemented yet."""

    def __init__(self) -> None:
        """Create a processor; no state is initialised."""

    def process(self, data, config):
        """Stub that takes ``data`` and ``config`` and currently returns ``None``."""

class ModelTransformer:
    """Placeholder for the model transformer; defines no members yet."""

class Model:
    """Placeholder for the model; defines no members yet."""

__ConfigManager:__

1. Read configs (`ConfigManager`)
2. Validate configs (`Parser`)
3. return `model_config` and `data_config`

__DataLoader:__

1. Read data  (`IOHandler`)
2. Parse data in right format (`Parser`)
3. Extract data info  (`MetadataExtractor`)
4. Validate data (`Validator`)
5. return `model_input_data` (__`metadata` NOT required now__)

__Logging in steps 1 to 4 should be limited to critical and warning messages. Any other info logs have to be shown in step 5.__

__ModelTransformer:__

1. Initialize using the configs (`ModelTransformer`)

__Model:__

1. Initialize using the configs (`Model`)

__Pipeline:__

1. `Pipeline = ModelTransformer + Model`
2. Pipeline.fit(data)
3. Pipeline.predict(data)

__Logger:__

1. Log info related to data/model/other metadata (`Logger`): we need to decide how to set up the `file handler` (info) and the `streaming handler` (critical or warning).

### __GC V1 - APIs__

```python
from lola_generator_components.so.models import HoltWinters
model_input_data = pd.read_csv("model_input_data.csv")

model = HoltWinters(country = "tz", model_config = model_config, data_config = data_config)
prepared_data = model.prepare(model_input_data)
X_train, y_train, X_test, y_test = model.split(prepared_data)
model.fit(X_train, y_train)
model.recommend(X_test)
model.score(X_test, y_test)
```


### __GC V2 - APIs__

```python
from lola_generator_components.so.models import HoltWinters
from lola_generator_components.so.models import HoltWintersTransformer
from lola_generator_components.utils import pipeline, time_series_split, DataLoader, ConfigManager

model_config, data_config = ConfigManager().parse_and_validate().load_configs()
# model_config, data_config = ConfigManager(model_config_path = "", data_config_path = "").parse_and_validate().load_configs()
data = DataLoader(**data_config["io_config"]).load_data()
pipeline = pipeline([
                        ("ModelTransformer", HoltWintersTransformer(**data_config["processor_config"])),
                        ("Model", HoltWinters(**model_config))
                    ])
X_train, y_train, X_test, y_test = time_series_split(data, **data_config["split_config"])
pipeline.fit(X_train, y_train)
pipeline.predict(X_test)
pipeline.score(X_test, y_test)
```

### __This is a one-time (or infrequent) task. Once you have the directory structure, `data` and `configs`, you don't need to run `get_config` or `get_data` again unless you want to update them.__

```python
from lola_generator_components.utils import project_init
from lola_generator_components.utils import get_data
from lola_generator_components.utils import get_config
from lola_generator_components.utils import describe_models

project_init()
get_data()
get_config()
describe_models()
```

```bash
lola_generator_components init_project  # folder structure
lola_generator_components get_data      # get data
lola_generator_components get_config    # get config
```

In [None]:
# Sample of the full default data config: one section per pipeline stage.
sample_data_config_initial = {
    "io_config": {
        "data_directory": "data",
        "config_directory": "configs",
        "model_directory": "models",
        "log_directory": "logs",
        "compression": None,
        "data_format": "csv",
        "select_columns": ["poc_id", "sku_id", "quantity", "date"],
        "data_type_dict": {
            "number": ["quantity"],
            "category": ["poc_id", "sku_id"],
            "date": ["date"],
            "boolean": None,
        },
        # Only the first four columns are mapped by default; the rest are None.
        "column_mapper": {
            "poc_id": "poc_id",
            "sku_id": "sku_id",
            "quantity": "quantity",
            "date": "date",
            "region_id": None,
            "country_id": None,
            "city_id": None,
            "order_id": None,
            "channel_id": None,
            "brand_id": None,
            "subsegment_id": None,
            "state_id": None,
            "segment_id": None,
            "route_id": None,
            "delivery_center_id": None,
            "deliver_region_id": None,
            "sales_route_id": None,
        },
    },
    "parsing_config": {
        "date_format_configs": {"date": "%Y-%m-%d"},
        "input_data_precision": None,
        "output_data_precision": None,
        "optimized_data_schema_file_name": "optimized_data_schema.json",
    },
    "validator_config": {
        "column_check": None,
        "range_check": {"date": None, "quantity": None},
        "unique_check": {"poc_id": None, "sku_id": None},
        "null_check": ["poc_id", "sku_id", "date"],
        "duplicate_check": ["poc_id", "sku_id", "date"],
    },
    "processor_config": {
        "model_id": "model_id",
        "model_id_constructor": ["poc_id", "sku_id"],
        "model_split_character": "|||",
        "aggregation_date_freq_configs": {"date": "MS"},
        "aggregation_category_level": ["poc_id", "sku_id", "brand_id"],
        "aggregation_function": {
            "quantity": ["sum", "mean"],
        },
        "numerical_features": ["brand_id", "some_float_variable"],
        "category_features": ["brand_id", "some_float_variable"],
        "target": "quantity",
        # transformer-related settings can be added here
    },
    "split_config": {
        "split_type": "time",
        "split_column": "date",
        "split_date": None,
    },
}

# split_date = None means the maximum date in the data is used
# features = None means all columns except the target
# Apart from the date key, all other elements are optional

In [None]:
# Sample of a partial, user-supplied data config; it lists only some of the keys.
sample_data_config_user_input = {
    "io_config": {
        "select_columns": ["POC_ID", "sku_id", "quantity", "date"],
        "data_type_dict": {
            "date": ["date"],
        },
        "column_mapper": {
            "poc_id": "POC_ID",
        },
    },
    "parsing_config": {
        "date_format_configs": {"date": "%Y-%m-%d"},
    },
    "validator_config": {
        "column_check": ["poc_id", "sku_id", "quantity", "date"],
        "range_check": {"date": ["2020-01-01", "2020-01-05"], "quantity": [0, 2]},
        "unique_check": {"poc_id": [45, 66, 48], "sku_id": [48, 40, 65, 75]},
        "null_check": ["poc_id", "sku_id", "date"],
        "duplicate_check": ["poc_id", "sku_id", "date"],
    },
    "processor_config": {
        "model_id": "model_id",
        "model_id_constructor": ["poc_id", "sku_id"],
        "aggregation_date_freq_configs": {"date": "MS"},
        "aggregation_category_level": ["poc_id", "sku_id"],
        "aggregation_function": {
            "quantity": ["sum"],
        },
        "numerical_features": ["brand_id", "some_float_variable"],
        "category_features": ["brand_id", "some_float_variable"],
        "target": "quantity",
        # transformer-related settings can be added here
    },
    "split_config": {
        "split_type": "time",
        "split_column": "date",
        # NOTE(review): this value is day/month/year, unlike the "%Y-%m-%d"
        # format declared under parsing_config — confirm the expected format.
        "split_date": "15/12/2023",
    },
}

In [10]:
# Default mapping: each of the four columns maps to itself.
col_mapper_default = {
    "poc_id": "poc_id",
    "sku_id": "sku_id",
    "quantity": "quantity",
    "date": "date",
}
# User overrides: one column is renamed and one is set to None.
col_mapper_user = {
    "poc_id": "POC_ID",
    "sku_id": None,
}

def update_config(dict_1, dict_2, deep=False):
    """Return a new dict with the entries of ``dict_2`` laid over ``dict_1``.

    Neither input is modified. Keys present in both take the value from
    ``dict_2``, including an explicit ``None``.

    Args:
        dict_1: Base mapping (for example, the default config).
        dict_2: Overriding mapping (for example, the user config).
        deep: When ``False`` (the default) the merge is shallow: a nested dict
            in ``dict_2`` replaces the nested dict in ``dict_1`` wholesale.
            When ``True``, values that are dicts on both sides are merged
            recursively, so keys missing from the override keep their base
            value.

    Returns:
        A new ``dict`` holding the merged result.
    """
    merged = {**dict_1, **dict_2}
    if not deep:
        return merged

    # Only keys carrying a dict on BOTH sides need recursion; everything else
    # is already correct after the shallow merge above.
    for key, override in dict_2.items():
        base = dict_1.get(key)
        if isinstance(base, dict) and isinstance(override, dict):
            merged[key] = update_config(base, override, deep=True)
    return merged

In [11]:
# Lay the user mapping over the defaults; the merged dict is the cell output.
update_config(dict_1=col_mapper_default, dict_2=col_mapper_user)

{'poc_id': 'POC_ID', 'sku_id': None, 'quantity': 'quantity', 'date': 'date'}

In [19]:
# import pandas as pd
# (pd.DataFrame([col_mapper_default]).T).rename(columns={0: 'default'}).join(pd.DataFrame([col_mapper_user]).T.rename(columns={0: 'user'}))

Unnamed: 0,default,user
poc_id,poc_id,POC_ID
sku_id,sku_id,
quantity,quantity,
date,date,


Unnamed: 0,0
poc_id,POC_ID
sku_id,
