In [None]:
# Notebook setup: code formatter extension, sklearn display mode, working
# directory, pandas display options and the Kedro IPython extension.
import os
%load_ext jupyter_black
from sklearn import set_config
# Show estimators and pipelines as diagrams when they are the cell output.
set_config(display='diagram')
# NOTE(review): this is a relative chdir, so it is not idempotent — re-running
# this cell without restarting the kernel moves two more directories up.
# Presumably the notebook lives two levels below the project root; confirm.
os.chdir("../../")

import pandas as pd

# Display up to 50 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 50)


# Load the Kedro extension and (re)load the project found in the current
# working directory, i.e. the directory selected by the chdir above.
%load_ext kedro.ipython
%reload_kedro .



# Titanic use case



## Transformers and model imports

In [None]:
from project.packages.preprocessing.transformers.raw import RawDataProcessor
from project.packages.preprocessing.transformers.intermediate import (
    IntermediateDataProcessor,
)
from project.packages.preprocessing.transformers.primary import PrimaryDataProcessor
from project.packages.preprocessing.transformers.feature import FeatureDataProcessor
from project.packages.modelling.models.unsupervised.clustering_features import (
    KMeansClusteringFeatures,
)
from project.packages.modelling.models.supervised.sklearn import (
    BinaryClassifierSklearnPipeline,
)

##  Titanic dataset


In [None]:
# Read the Titanic training CSV (path is relative to the current working
# directory) and display the resulting frame.
titanic_train_path = "data/01_raw/titanic_train.csv"
df = pd.read_csv(titanic_train_path)
df

## Data Engineering

### Raw data preprocessing


In [None]:
# Raw-layer configuration. Each row is (source column, dtype, new name); the
# new names are the ones referenced by the later processing layers.
# NOTE(review): "target" is given with the source column name ("Survived")
# while "index" uses a renamed column ("passenger_id") — confirm which naming
# RawDataProcessor expects for each key.
raw_schema_rows = [
    ("PassengerId", "int64", "passenger_id"),
    ("Survived", "int64", "survived"),
    ("Pclass", "int64", "passenger_class"),
    ("Name", "object", "name"),
    ("Sex", "object", "passenger_sex"),
    ("Age", "float64", "passenger_age"),
    ("Parch", "int64", "passenger_parch"),
    ("Ticket", "object", "passenger_ticket"),
    ("Fare", "float64", "passenger_fare"),
    ("Cabin", "object", "passenger_cabin"),
    ("Embarked", "object", "passenger_embarked_port"),
    ("SibSp", "int64", "passenger_siblings"),
]

raw_params = {
    "target": "Survived",
    "index": "passenger_id",
    "schemas": {
        source_column: {"dtype": dtype, "name": new_name}
        for source_column, dtype, new_name in raw_schema_rows
    },
}

# Fit the raw processor on the loaded frame and show its output.
raw_transformer = RawDataProcessor(raw_params)
df_raw = raw_transformer.fit_transform(df)
df_raw

### Intermediate data preprocessor


In [None]:
# Intermediate-layer configuration, assembled from named pieces.
# Quantiles and alpha handed to the processor under "outlier_params".
intermediate_outlier_params = {
    "iqr_alpha": 2.5,
    "q1_quantile": 0.25,
    "q3_quantile": 0.75,
}

# Columns declared as categorical for the intermediate processor.
intermediate_categorical_features = [
    "passenger_sex",
    "passenger_ticket",
    "passenger_cabin",
    "passenger_embarked_port",
]

intermediate_params = dict(
    target="survived",
    outlier_params=intermediate_outlier_params,
    drop_columns=["name"],
    categorical_features=intermediate_categorical_features,
)

# Fit on the raw-layer output; the result feeds the primary layer.
int_transformer = IntermediateDataProcessor(intermediate_params)
df_int = int_transformer.fit_transform(df_raw)

### Primary data preprocessing


In [None]:
# Primary-layer configuration.
primary_params = {
    # Target column name: must match the "survived" column used by the
    # intermediate, feature and model configurations in this notebook.
    "target": "survived",
    # Placeholder value used to fill missing entries in each listed
    # categorical column.
    "categorical_columns_fillna": {
        "passenger_cabin": "unknown",
        "passenger_embarked_port": "unknown",
    },
}

# Fit on the intermediate-layer output and show the result.
prm_transformer = PrimaryDataProcessor(primary_params)
df_prm = prm_transformer.fit_transform(df_int)
df_prm

### Feature engineering 

#### 1. Encoding and specific features data creation



In [None]:
# Feature-layer configuration: which columns go through each encoder.
one_hot_encoded_columns = [
    "passenger_cabin_level",
    "passenger_embarked_port",
    "passenger_sex",
]

feature_params = {
    "target": "survived",
    "encoding_transform": {
        "one_hot_encoder": one_hot_encoded_columns,
        # No columns are assigned to the similarity-based encoder.
        "similarity_based_encoder": None,
    },
}

# Fit on the primary-layer output and show the engineered features.
feat_transformer = FeatureDataProcessor(feature_params)
df_feat = feat_transformer.fit_transform(df_prm)
df_feat

In [None]:
def _class_spec(class_path, **kwargs):
    """Return a ``{"class": ..., "kwargs": {...}}`` spec for a dotted class path."""
    return {"class": class_path, "kwargs": kwargs}


# Specs for the clustering model, the scaler and the imputer handed to
# KMeansClusteringFeatures.
cluster_model_params = _class_spec(
    "project.packages.modelling.models.unsupervised.segmentation.KMeansElbowSelector",
    min_clusters=1,
    max_clusters=15,
)
cluster_scaler_params = _class_spec(
    "project.packages.modelling.transformers.scaler.ColumnsPreserverScaler",
    scaler_params=_class_spec("sklearn.preprocessing.MinMaxScaler"),
)
cluster_imputer_params = _class_spec(
    "project.packages.modelling.models.unsupervised.imputer.ColumnsPreserverImputer",
    imputer_params=_class_spec(
        "sklearn.impute.KNNImputer",
        n_neighbors=10,
        weights="distance",
    ),
)

# One-hot column names, generated from their suffixes.
cabin_level_columns = [
    f"passenger_cabin_level_{level}"
    for level in ("a", "b", "c", "d", "e", "f", "g", "t", "unknown")
]
embarked_port_columns = [
    f"passenger_embarked_port_{port}" for port in ("c", "q", "s", "unknown")
]

# Maps each new cluster-feature name to the input columns it is built from.
cluster_feature_params = {
    "passenger_cabin_cluster_feature": cabin_level_columns,
    "passenger_embarked_port_cluster_feature": embarked_port_columns,
    "passenger_ticket_number_cluster_feature": [
        "passenger_ticket_number",
        "passenger_ticket_unknown_base",
    ],
    "passenger_family_cluster_feature": [
        "passenger_siblings",
        "passenger_parch",
        "passenger_cabin_number",
        "passenger_number_of_family_onboard",
    ],
    "passenger_social_status_cluster_feature": [
        "passenger_class",
        "passenger_age",
        "passenger_sex_female",
    ],
}

# Fit the cluster-feature transformer on the feature-layer output.
cluster_transformer = KMeansClusteringFeatures(
    model_params=cluster_model_params,
    scaler_params=cluster_scaler_params,
    feature_params=cluster_feature_params,
    imputer_params=cluster_imputer_params,
)
data = cluster_transformer.fit_transform(df_feat)
data

### Data engineering in a single sklearn pipeline

In [None]:
from sklearn.pipeline import Pipeline

# Fresh (unfitted) instances of every data-engineering transformer, in the
# same order they were applied one by one in the cells above.
cluster_feature_step = KMeansClusteringFeatures(
    model_params=cluster_model_params,
    scaler_params=cluster_scaler_params,
    feature_params=cluster_feature_params,
    imputer_params=cluster_imputer_params,
)
data_engineering_steps = [
    ("raw_transformations", RawDataProcessor(raw_params)),
    ("intermediate_transformations", IntermediateDataProcessor(intermediate_params)),
    ("primary_transformations", PrimaryDataProcessor(primary_params)),
    ("feature_transformations", FeatureDataProcessor(feature_params)),
    ("cluster_feature_transformations", cluster_feature_step),
]

pipeline = Pipeline(steps=data_engineering_steps)

# Run the whole chain on the original CSV frame; `data` is rebound to the
# pipeline output.
data = pipeline.fit_transform(df)
data

### Model hypertune and train


In [None]:
# Configuration for hyperparameter search and training of the classifier.
# Values written as 'trial.suggest_...' are plain strings in this notebook;
# presumably BinaryClassifierSklearnPipeline resolves them against an Optuna
# trial — TODO confirm against the class implementation.
target = "survived"

# f1 / precision / recall, each plain and with micro/macro/weighted averaging.
averaged_metrics = [
    f"{metric}{averaging}"
    for metric in ("f1", "precision", "recall")
    for averaging in ("", "_micro", "_macro", "_weighted")
]
scoring_metrics = (
    ["accuracy", "balanced_accuracy"]
    + averaged_metrics
    + [
        "roc_auc",
        "roc_auc_ovr",
        "roc_auc_ovo",
        "roc_auc_ovr_weighted",
        "roc_auc_ovo_weighted",
    ]
)

# Optuna study, sampler and pruner settings.
optuna_params = {
    "kwargs_study": {
        "direction": "maximize",
        "study_name": "xgboost",
        "load_if_exists": False,
    },
    "kwargs_optimize": {"n_trials": 500},
    "sampler": {
        "class": "optuna.samplers.TPESampler",
        "kwargs": {"n_startup_trials": 0, "constant_liar": True, "seed": 42},
    },
    "pruner": {"class": "optuna.pruners.SuccessiveHalvingPruner", "kwargs": {}},
}

# Cross-validation splitter and the scoring call specification.
cv_strategy_params = {
    "class": "sklearn.model_selection.StratifiedKFold",
    "kwargs": {"n_splits": 5, "random_state": 42, "shuffle": True},
}
cv_score_params = {
    "scoring": "f1_weighted",
    "class": "sklearn.model_selection.cross_val_predict",
    # The None entries are left unset in this notebook.
    "kwargs": {
        "estimator": None,
        "X": None,
        "y": None,
        "cv": None,
        "n_jobs": -1,
        "method": "predict",
    },
}

# Columns listed under "features" in the model configuration.
model_features = [
    "passenger_class",
    "passenger_age",
    "passenger_siblings",
    "passenger_parch",
    "passenger_fare",
    "passenger_ticket_number",
    "passenger_ticket_unknown_base",
    "passenger_cabin_number",
    "passenger_number_of_family_onboard",
    "passenger_is_single",
    "passenger_has_childs",
    "passenger_cabin_level_a",
    "passenger_cabin_level_b",
    "passenger_cabin_level_c",
    "passenger_cabin_level_d",
    "passenger_cabin_level_e",
    "passenger_cabin_level_unknown",
    "passenger_embarked_port_c",
    "passenger_embarked_port_q",
    "passenger_embarked_port_s",
    "passenger_sex_female",
    "passenger_cabin_cluster_feature",
    "passenger_embarked_port_cluster_feature",
    "passenger_ticket_number_cluster_feature",
    "passenger_family_cluster_feature",
    "passenger_social_status_cluster_feature",
]

# Steps of the inner model pipeline: imputer -> scaler -> XGBoost classifier.
imputer_step = {
    "class": "project.packages.modelling.models.unsupervised.imputer.ColumnsPreserverImputer",
    "kwargs": {
        "imputer_params": {
            "class": "sklearn.impute.KNNImputer",
            "kwargs": {
                "n_neighbors": 'trial.suggest_int("knn_imputer__n_neighbors", 2, 20, step=1)',
                "weights": 'trial.suggest_categorical("knn_imputer__weights", ["distance", "uniform"])',
            },
        }
    },
}
scaler_step = {
    "class": "project.packages.modelling.transformers.scaler.ColumnsPreserverScaler",
    "kwargs": {
        "scaler_params": {
            "class": 'trial.suggest_categorical("scaler__transformer", ["project.packages.modelling.transformers.scaler.NotScalerTransformer", "sklearn.preprocessing.PowerTransformer", "sklearn.preprocessing.QuantileTransformer"])',
            "kwargs": {},
        }
    },
}
classifier_step = {
    "class": "xgboost.XGBClassifier",
    "kwargs": {
        "n_estimators": 'trial.suggest_int("xgboost__n_estimators", 10, 500, step=5)',
        "learning_rate": 'trial.suggest_float("xgboost__learning_rate", 0.0001, 1)',
        "min_child_weight": 'trial.suggest_int("xgboost__min_child_weight", 0, 500, step=1)',
        "max_depth": 'trial.suggest_int("xgboost__max_depth", 1, 8)',
        "subsample": 'trial.suggest_float("xgboost__subsample", 0.5, 1)',
        "reg_lambda": 'trial.suggest_float("xgboost__reg_lambda", 0, 5)',
        "reg_alpha": 'trial.suggest_float("xgboost__reg_alpha", 0, 1)',
        "random_state": 42,
    },
}

model_params = {
    "scoring_metrics": scoring_metrics,
    "optuna": optuna_params,
    "cv_strategy": cv_strategy_params,
    "cv_score": cv_score_params,
    "target": target,
    "features": model_features,
    "pipeline": {
        "imputer": imputer_step,
        "scaler": scaler_step,
        "model": classifier_step,
    },
}

model = BinaryClassifierSklearnPipeline(model_params)

# Split the engineered frame: the target column (kept as a one-column frame)
# versus every other column.
y_train = data[[target]]
non_target_columns = [column for column in data.columns if column != target]
X_train = data[non_target_columns]

model = model.fit(X_train, y_train)
model

In [None]:
# Show the `model` attribute of the fitted wrapper (presumably the underlying
# tuned estimator/pipeline — confirm against BinaryClassifierSklearnPipeline).
model.model

In [None]:
# Predicted probabilities on the training data.
# NOTE(review): `data` still contains the target column (X_train is the
# target-free view); presumably the wrapper selects its configured
# "features" internally — confirm.
y_probs = model.predict_proba(data)
y_probs

In [None]:
# Predicted labels on the training data (`data` includes the target column;
# see how X_train was derived from it when the model was fitted).
model.predict(data)

## The whole process in a single sklearn Pipeline



In [None]:
# End-to-end pipeline: the data-engineering transformers followed by the
# classifier wrapper as the final step. Every step is a fresh, unfitted
# instance built from the parameter dictionaries defined earlier.
end_to_end_cluster_step = KMeansClusteringFeatures(
    model_params=cluster_model_params,
    scaler_params=cluster_scaler_params,
    feature_params=cluster_feature_params,
    imputer_params=cluster_imputer_params,
)
end_to_end_steps = [
    ("raw_transformations", RawDataProcessor(raw_params)),
    ("intermediate_transformations", IntermediateDataProcessor(intermediate_params)),
    ("primary_transformations", PrimaryDataProcessor(primary_params)),
    ("feature_transformations", FeatureDataProcessor(feature_params)),
    ("cluster_feature_transformations", end_to_end_cluster_step),
    ("model", BinaryClassifierSklearnPipeline(model_params)),
]

pipeline = Pipeline(steps=end_to_end_steps)

pipeline

In [None]:
# Fit the end-to-end pipeline on the raw CSV frame.
# NOTE(review): `y_train` was taken from `data`, i.e. the output of the
# data-engineering pipeline, while `df` is the untransformed input. Confirm
# that both have the same rows in the same order (no rows dropped or
# reordered by the transformers), otherwise X and y will be misaligned.
pipeline.fit(df, y_train)

# Package CLI

## Overview

Welcome to the Package CLI readme. This comprehensive guide provides an in-depth understanding of the Package CLI, its structure, and its capabilities. The Package CLI is a powerful tool designed to encapsulate Kedro pipelines for a wide range of machine learning tasks, enabling streamlined data processing, model training, evaluation, and deployment.

## 1. Introduction<a name="introduction"></a>

The Package CLI simplifies complex machine learning workflows by breaking them down into modular pipelines. It seamlessly integrates with Kedro, a data engineering framework, to provide a structured approach to ML project development.

## 2. Package Structure<a name="package-structure"></a>

The package is organized as follows:

### Pipelines

- Pipelines are the backbone of the Package CLI. Each step in the ML workflow is broken down into separate pipelines. This modular approach improves code readability and maintainability.

### Logging and Tracking

- The Package CLI utilizes MLflow for logging and tracking experiments. This integration enables comprehensive monitoring of your ML projects, including model performance, data lineage, and hyperparameter optimization.

## 3. Logging and Tracking<a name="logging-and-tracking"></a>

MLflow is at the core of our logging and tracking system:

- **Data**: Input data, transformed data, and data splits are logged to ensure complete traceability.
- **Model Artifacts**: Serialized model files are logged, simplifying model replication and deployment.
- **Metrics**: Key performance metrics, such as accuracy, precision, recall, and F1-score, are tracked.
- **Hyperparameters**: Detailed information about the hyperparameters used during training is recorded.
- **Experiment Parameters**: Parameters set for each experiment run are logged for easy reproducibility.
- **Model reporting**: All HTML reports (performance reports, hyperparameter studies, model predictive control exploration, and global model optimization reports) are logged as artifacts.

## 4. Custom Pipelines<a name="custom-pipelines"></a>

The Package CLI provides a collection of custom Kedro pipelines, making it easy to integrate into your ML projects. These pipelines cover fundamental steps in the ML workflow, including:

- Data preprocessing and feature engineering.
- Model training and evaluation.
- Model deployment and serving.

These pipelines are designed to be highly modular, allowing you to extend or customize them to meet your specific project requirements.

## 5. Model Compatibility<a name="model-compatibility"></a>

The Package CLI includes the `BinaryClassifierSklearnPipeline` class, which is compatible with any machine learning model adhering to the scikit-learn API. This flexibility empowers you to experiment with a wide variety of models, including:

- Logistic Regression
- Random Forest
- Support Vector Machines
- Gradient Boosting
- Neural Networks
- Xgboost
- SVM
- k-NN

Any other model that follows the scikit-learn API is supported as well.

## 6. Hyperparameter Tuning<a name="hyperparameter-tuning"></a>

Using the Kedro CLI, you can explore different hyperparameter settings for different models to optimize their performance, using the **StratifiedKFold** cross-validation strategy. Hyperparameter tuning is seamlessly integrated into the pipeline, simplifying the search for the best model configurations.




## Data engineering CLI<a name="data-engineering"></a>

In [None]:
# Shell escape: run the Kedro pipeline named `data_engineering` via the Kedro CLI.
!kedro run --pipeline data_engineering

The data engineering pipeline runs every step from the raw transformer through the cluster transformer and saves train and test datasets that are ready to be used by the models.

# Data Science Pipelines

## Overview

The Data Science Pipelines project comprises 9 modular pipelines, each uniquely designed to optimize different machine learning models. These pipelines are thoughtfully constructed to leverage the full potential of your data and to achieve the highest possible model performance.

## Model Optimization

### Model Selection

Our pipelines explore a diverse range of models, including:

- Bagging Models
- Boosting Models
- Support Vector Machines (SVM)
- K-Nearest Neighbors (KNN)
- Neural Network Models

### Hyperparameter Tuning

To ensure that these models perform at their best, we employ hyperparameter tuning using cross-validation strategies. This rigorous process fine-tunes the model parameters to maximize predictive accuracy and generalization.

### Saving Trained Models

Once the models are optimized, they are trained on the entire dataset. These trained models are diligently saved for future use, facilitating easy inference on new data and seamless deployment in production environments.

## Out-of-Sample Predictions

To evaluate model performance, we generate out-of-sample predictions using the `cross_val_predict` function from scikit-learn. These predictions provide invaluable insights into how well the models generalize to unseen data.

## Metrics Reporting

The final segment of the pipelines involves the creation of a comprehensive metrics report. This report encompasses key performance metrics, enabling you to thoroughly assess the models' effectiveness. Commonly included metrics are accuracy, precision, recall, F1-score, and more.

## Production Deployment with MLflow

One of the standout features of these pipelines is their seamless integration with MLflow. The best-performing model is automatically registered in a production environment within MLflow. This production-ready model is primed for deployment and can be employed to perform inference on a test dataset. It plays a pivotal role in ranking models based on their performance, ensuring that only the top-performing models are deployed in production.

In conclusion, the Data Science Pipelines provided in this project offer a comprehensive and efficient means to optimize machine learning models and rigorously assess their performance. With modular pipelines, hyperparameter tuning, out-of-sample predictions, metrics reporting, and seamless integration with MLflow for production deployment, these pipelines are an invaluable asset for data scientists and machine learning practitioners. Utilize them to supercharge your data science projects and achieve exceptional model results.





In [None]:
# Shell escape: run the Kedro pipeline named `data_science` via the Kedro CLI.
!kedro run --pipeline data_science

## MLflow exploration


After running the data engineering and data science pipelines, you can open the MLflow UI with the `kedro mlflow ui` command (run in the cell below). Once it starts, the UI should be available at this URL:

- **http://127.0.0.1:3001** 

Open it in a browser and navigate to the artifacts, metrics, models and registered models folder.

**Artifacts**: This section houses a variety of assets, including input data, transformed data, model files, and other relevant files. It provides a comprehensive view of the artifacts generated during your experiments.

**Metrics**: In the "Metrics" section, you can review key performance metrics recorded during your experiments. These metrics are crucial for assessing the effectiveness of your models and pipelines. Commonly logged metrics include accuracy, precision, recall, F1-score, and more.

**Models**: Explore the models that have been trained and logged during your experiments in the "Models" section. You can access detailed information about each model, including its hyperparameters, performance metrics, and any associated artifacts.

**Registered Models**: The "Registered Models" folder contains the best-performing models that have been specifically registered for production deployment. These models have undergone rigorous evaluation and are deemed ready for use in real-world applications. This section provides a streamlined view of models that meet the highest quality standards.



In [None]:
# Shell escape: start the MLflow UI through the `kedro mlflow` CLI command.
# NOTE(review): presumably this keeps the cell running until the server is
# interrupted — confirm before using "Run All".
!kedro mlflow ui

## Model productionalization

After reviewing the MLflow models, the next step is to put the model into production through an API. The package therefore also contains an API that serves the production model together with the transformers used to create it.



## Put the API on production

Execute the following command to expose a POST endpoint that serves the production model.

!python src/project/api/model_serving/app.py

In [None]:
# Shell escape: run the model-serving API script with the Python interpreter.
# NOTE(review): if the script starts a server, this cell will presumably block
# until it is stopped — confirm.
!python src/project/api/model_serving/app.py

## Test model API

!python src/project/api/model_serving/ping_api.py


In [None]:
# Shell escape: run the API test script (the serving API presumably has to be
# running already — confirm).
!python src/project/api/model_serving/ping_api.py

After testing the API, you should receive this text in the terminal:


**Model response: Matheus Pinto Arratia has a survival probability of 30.87 [%]**

So, I would probably die ...



## Package testing 

You can test the whole package using the following command

!kedro test src/project/packages


You should be able to see the test results for the whole package, along with the modules that still lack tests.



## About CI/CD



# Application Deployment Architectures

When considering deploying these applications into production, there are several architectural options to choose from, each with its own set of advantages and disadvantages.

## Gitflow

**Pros:**
- Ensures a well-structured development process with clear branching strategies.
- Continuous Integration (CI) pipeline ensures code quality and functionality.
- Integration tests and unit tests provide end-to-end testing coverage.
- Code versioning and history tracking.

## Docker

The API, pipelines, and packages should be containerized using Docker in order to maintain reproducibility.

**Pros:**
- Enables containerization, ensuring reproducibility across different environments.
- Simplifies deployment and scaling as containers can run consistently in various environments.
- Supports microservices architecture, allowing for modularization of applications.
- Easier management of dependencies within containers.


## Deployment

Serverless options include Azure Functions or AWS Lambda functions exposed through an API Gateway.

### Serverless

**Pros:**
- Cost-effective as you only pay for actual usage.
- Auto-scaling and automatic resource management.
- Low operational overhead as the cloud provider handles infrastructure.
- Well-suited for event-driven applications and microservices.

**Cons:**
- Limited control over infrastructure, which may be a limitation for specific requirements.
- Cold start latency can impact response times for some functions.
- Debugging and monitoring can be more challenging in a serverless environment.


### Load Balancer and Server Deployment

**Pros:**
- Provides control over the underlying infrastructure.
- Suitable for applications with specific hardware requirements.
- Can be cost-effective for steady-state workloads.
- Greater flexibility in configuring load balancing algorithms.


**Cons:**
- Increased operational complexity compared to serverless or containerized approaches.
- Limited scalability during traffic spikes without proper automation.

