In [27]:
import mlflow
from datetime import datetime

In [6]:
from pathlib import Path
from kedro.framework.context import load_context

 
current_dir = Path.cwd()  
proj_path = current_dir.parent  
context = load_context(proj_path)


In [9]:
df = catalog.load("example_iris_data")

2021-06-19 22:15:01,632 - kedro.io.data_catalog - INFO - Loading data from `example_iris_data` (CSVDataSet)...


In [11]:
from typing import Any, Dict
import pandas as pd


In [10]:
def split_data(data: pd.DataFrame, example_test_data_ratio: float) -> Dict[str, Any]:
    data.columns = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "target",
    ]
    classes = sorted(data["target"].unique())
    # One-hot encoding for the target variable
    data = pd.get_dummies(data, columns=["target"], prefix="", prefix_sep="")

    # Shuffle all the data
    data = data.sample(frac=1).reset_index(drop=True)

    # Split to training and testing data
    n = data.shape[0]
    n_test = int(n * example_test_data_ratio)
    training_data = data.iloc[n_test:, :].reset_index(drop=True)
    test_data = data.iloc[:n_test, :].reset_index(drop=True)

    # Split the data to features and labels
    train_data_x = training_data.loc[:, "sepal_length":"petal_width"]
    train_data_y = training_data[classes]
    test_data_x = test_data.loc[:, "sepal_length":"petal_width"]
    test_data_y = test_data[classes]

    # When returning many variables, it is a good practice to give them names:
    return dict(
        train_x=train_data_x,
        train_y=train_data_y,
        test_x=test_data_x,
        test_y=test_data_y,
    )

In [32]:

import logging
from typing import Any, Dict

import numpy as np
import pandas as pd


def train_model(
    train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any]
) -> np.ndarray:
    """Node for training a simple multi-class logistic regression model. The
    number of training iterations as well as the learning rate are taken from
    conf/project/parameters.yml. All of the data as well as the parameters
    will be provided to this function at the time of execution.
    """
    num_iter = parameters["example_num_train_iter"]
    lr = parameters["example_learning_rate"]
    X = train_x.to_numpy()
    Y = train_y.to_numpy()

    # Add bias to the features
    bias = np.ones((X.shape[0], 1))
    X = np.concatenate((bias, X), axis=1)

    weights = []
    # Train one model for each class in Y
    for k in range(Y.shape[1]):
        # Initialise weights
        theta = np.zeros(X.shape[1])
        y = Y[:, k]
        for _ in range(num_iter):
            z = np.dot(X, theta)
            h = _sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            theta -= lr * gradient
        # Save the weights for each model
        weights.append(theta)

    # Return a joint multi-class model with weights for all classes
    return np.vstack(weights).transpose()


def predict(model: np.ndarray, test_x: pd.DataFrame) -> np.ndarray:
    """Node for making predictions given a pre-trained model and a test set."""
    X = test_x.to_numpy()

    # Add bias to the features
    bias = np.ones((X.shape[0], 1))
    X = np.concatenate((bias, X), axis=1)

    # Predict "probabilities" for each class
    result = _sigmoid(np.dot(X, model))

    # Return the index of the class with max probability for all samples
    return np.argmax(result, axis=1)


def report_accuracy(predictions: np.ndarray, test_y: pd.DataFrame, parameters) -> None:
    """Node for reporting the accuracy of the predictions performed by the
    previous node. Notice that this function has no outputs, except logging.
    """
    # Get true class index
    target = np.argmax(test_y.to_numpy(), axis=1)
    # Calculate accuracy of predictions
    accuracy = np.sum(predictions == target) / target.shape[0]
    # Log the accuracy of the model
    log = logging.getLogger(__name__)
    log.info("Model accuracy on test set: %0.2f%%", accuracy * 100)
    
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_param("time of predction", str(datetime.now()))
    mlflow.log_param("example_test_data_ratio", parameters['example_test_data_ratio'])
    mlflow.log_param("example_num_train_iter", parameters['example_num_train_iter'])
    mlflow.log_param("example_learning_rate", parameters['example_learning_rate'])
    

    


def _sigmoid(z):
    """A helper sigmoid function used by the training and the scoring nodes."""
    return 1 / (1 + np.exp(-z))

In [33]:
df = catalog.load("example_iris_data")
# df = drop_missing_value(df)
df_dict = split_data(df, context.params['example_test_data_ratio'])
model = train_model(df_dict['train_x'], df_dict['train_y'], context.params)
y_predict = predict(model, df_dict['test_x'])
report_accuracy(y_predict, df_dict['test_y'], context.params)

2021-06-19 22:24:17,176 - kedro.io.data_catalog - INFO - Loading data from `example_iris_data` (CSVDataSet)...
2021-06-19 22:24:17,574 - __main__ - INFO - Model accuracy on test set: 100.00%


MlflowException: Changing param values is not allowed. Param with key='time of predction' was already logged with value='2021-06-19 22:23:37.427912' for run ID='37f69ec9444e46ba9ccccffdc1dcf926'. Attempted logging new value '2021-06-19 22:24:17.577478'.