# mloda demo: How can we make feature engineering shareable?

### Define dummy data as plugin

In [None]:
import numpy as np
from mloda.provider import FeatureGroup, DataCreator


class DummyData(FeatureGroup):
    """Feature group that fabricates a small random dataset for the demo.

    It provides the columns "age", "weight", "state" and "gender".
    """

    @classmethod
    def input_data(cls):
        """Return a DataCreator built from the four column names of this group."""
        return DataCreator({"age", "weight", "state", "gender"})

    @classmethod
    def calculate_feature(cls, data, features):
        """Generate the random demo columns.

        Args:
            data: Unused by this implementation.
            features: Object exposing ``get_options_key``; the "n_samples"
                option sets the number of rows.

        Returns:
            A dict mapping each column name to a NumPy array of ``n_samples``
            random values.
        """
        row_count = features.get_options_key("n_samples")
        if not row_count:
            # A missing (or otherwise falsy) option falls back to 100 rows.
            row_count = 100

        # Draw in a fixed order so the global NumPy RNG stream is consumed
        # the same way on every call.
        ages = np.random.randint(18, 80, row_count)
        weights = np.random.normal(70, 15, row_count)
        states = np.random.choice(["CA", "NY", "TX", "FL"], row_count)
        genders = np.random.choice(["M", "F"], row_count)

        return {
            "age": ages,
            "weight": weights,
            "state": states,
            "gender": genders,
        }

### Request mlodaAPI to create features

In [None]:
# We load dependencies.
import mloda

# Load plugins into namespace
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
from mloda_plugins.compute_framework.base_implementations.pyarrow.table import PyArrowTable

# from mloda.user import PluginLoader
# plugin_loader = PluginLoader.all()

# Request the four columns by name and list two compute frameworks by class name.
# NOTE(review): presumably mloda resolves these names against the plugins imported
# above, so those imports must run first — confirm against the mloda docs.
result = mloda.run_all(["age", "weight", "state", "gender"], compute_frameworks=["PyArrowTable", "PandasDataFrame"])
print(result)

### Alternative options to consume data

- API data
- Files
- DBs
- Streams
- ...

This is not the heart of mloda.

### Chain features - automatic dependency resolution

In [None]:
# Load plugin into namespace again
from mloda_plugins.compute_framework.base_implementations.polars.lazy_dataframe import PolarsLazyDataFrame
from mloda_plugins.feature_group.experimental.aggregated_feature_group.polars_lazy import (
    PolarsLazyAggregatedFeatureGroup,
)


# Request a derived feature purely by name: "age__sum_aggr".
# NOTE(review): presumably the name is matched by the aggregated feature group
# imported above, which then pulls in "age" as its input — confirm.
result = mloda.run_all(
    ["age__sum_aggr"],
    compute_frameworks=["PolarsLazyDataFrame"],
)
print(result)

As long as the plugins exist, we can run any data transformation.

### What is behind the "age__sum_aggr" syntax?

In [None]:
from mloda.user import Feature, Options

# Explicit form of the request: instead of encoding the aggregation in the feature
# name, the aggregation type and the input feature are passed via the Options context.
# The nested "age" feature carries its own option, n_samples=5.
# NOTE(review): presumably this option reaches DummyData.calculate_feature, which
# reads "n_samples" — confirm how options propagate to input features.
feature = Feature(
    name="CustomConfiguration",
    options=Options(context={"aggregation_type": "sum", "in_features": Feature("age", options={"n_samples": 5})}),
)

# Feature objects are passed to run_all in place of plain name strings.
result = mloda.run_all(
    [feature],
    compute_frameworks=["PolarsLazyDataFrame"],
)
print(result)

### How the chaining essentially works 

```python
class FeatureGroup(ABC):

    def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Feature]]:
        
        # In principle, the resolver checks if the feature group depends on another input feature
        # -> then adds it to the chain of features which need to be resolved
        if feature_name contains "input_feature__sum_aggr":
            return input_feature

    # How does mloda know that a feature matches a feature group?
    # Customizable, but some good guesses
    @classmethod
    def match_feature_group_criteria(
        cls,
        feature_name: Union[FeatureName, str],
        options: Options,
        data_access_collection: Optional[DataAccessCollection] = None,
    ) -> bool:
```

### Now we have chaining and matching. Why do we do this?


```python
class FeatureGroup(ABC):

    @classmethod
    def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
        """
        This function should be used to calculate the feature.
        """
        
        # data is the incoming data from other feature dependencies or data via API

        # features is the configuration
```

### Business knowledge is in the data and in the configuration, but not in the plugin definition.

## Big idea

**Separate business logic from transformation logic:**

- Plugins = generic transformations (shareable across companies)
- Data + Config = your business knowledge (stays private)

→ Stop rewriting "sum of a column" at every company

→ Build a shared ecosystem of feature engineering plugins