# Simple example - Synth Regression Dataset

In [None]:
import os
import math
import time
from datetime import datetime

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# IPython automagic: move the working directory one level up
# (presumably the repository root, so that the local `generators` package imported in the next cell is found)
cd ..

In [None]:
from generators.synth_regression_generator import SynthRegressionGenerator
from generators.utils import DriftType

from ml3_platform_sdk.client import ML3PlatformClient
from ml3_platform_sdk.enums import DatasetType, TaskType

In [None]:
# IPython automagic: move back into the `notebooks` folder;
# the relative paths used later (e.g. './data/') are resolved from the current working directory
cd notebooks

# Parameters and settings

In [None]:
# === ML3 PLATFORM SETTINGS ===

# Credentials and endpoint passed to ML3PlatformClient: fill them in before running the notebook
API_KEY = ''
URL = ''


COMPANY_ID = '' # OPTIONAL, only if you have already created a company

# === MODEL ===

# Estimator trained in this notebook, and the name/version under which it is registered
TARGET_MODEL = Ridge()
MODEL_NAME = 'Ridge'
MODEL_VERSION = 'V1.0'

# === DATA PARAMETERS ===

# Number of samples generated for each dataset
HISTORICAL_DATA_SAMPLES = 2000
REFERENCE_DATA_SAMPLES = 1000
PRODUCTION_DATA_SAMPLES = 2000

# === GENERATOR PARAMETERS ===

SEED = 42
# Number of feature columns (X_0 ... X_{N_FEATURES-1}); the target y_0 is an additional column
N_FEATURES = 7
# NOTE(review): COV_LOWER_VALUE and COV_UPPER_VALUE are not referenced anywhere else
# in the visible part of this notebook - confirm whether the generator should receive them
COV_LOWER_VALUE = 0.2
COV_UPPER_VALUE = 0.6

# === NON STATIONARITY ===

# type of drift to inject in production data
DRIFT_TYPE = DriftType.ABRUPT

# start position of the drift,
# expressed as a production sample number
# e.g. 1000 => the drift starts after 1000 production samples
DRIFT_POSITION = 1000

# duration of the drift (considered only for INCREMENTAL drifts)
DRIFT_DURATION = 100

# Magnitude of the change
# Must be a value in the open interval (0, 1):
# the lower the value, the smaller the impact on the task
# (it scales the rotation angle: theta = (pi / 2) * DRIFT_LEVEL)
DRIFT_LEVEL = 0.5


In [None]:
# Validate the drift settings before any data is generated.
# The drift must start strictly inside the production data ...
assert 0 < DRIFT_POSITION < PRODUCTION_DATA_SAMPLES, (
    f'DRIFT_POSITION must satisfy 0 < DRIFT_POSITION < {PRODUCTION_DATA_SAMPLES}, got {DRIFT_POSITION}'
)
# ... and its level must respect the range documented in the settings
assert 0 < DRIFT_LEVEL < 1, f'DRIFT_LEVEL must be a value in (0, 1), got {DRIFT_LEVEL}'

## Dataset Generation

This dataset is generated from a multivariate gaussian distribution of n dimensions with random initialization.

The regression task consists in predicting one of the n values that each sample contains, given the other n-1 as features (with the default `N_FEATURES = 7`, each sample contains n = 8 values). The drift is introduced by applying a rotation around one of the n axes of the covariance matrix. The rotation angle θ is subject to random noise and the rotation axis is chosen randomly.

In [None]:
# Folder where every CSV produced by this notebook is stored
data_folder = './data/'

# exist_ok=True makes the call idempotent and removes the check-then-create race
os.makedirs(data_folder, exist_ok=True)

In [None]:
# Build the data generator from the settings; the seed comes from SEED
# so that changing the setting actually changes the generated data
generator = SynthRegressionGenerator(n_features=N_FEATURES, seed=SEED)

In [None]:
# Generate, in this order, the historical data, the reference data and the first
# test set (a quarter of the reference size), saving each one as a CSV file
for file_name, n_samples in (
    ('historical.csv', HISTORICAL_DATA_SAMPLES),
    ('reference.csv', REFERENCE_DATA_SAMPLES),
    ('test_0.csv', int(REFERENCE_DATA_SAMPLES/4)),
):
    generator.generate(n_samples).to_csv(os.path.join(data_folder, file_name), index=False)

In [None]:
# Production Data

# Rotation angle shared by both drift types
drift_theta = (math.pi/2)*DRIFT_LEVEL

# Samples generated before the drift is injected
before_drift_df = generator.generate(DRIFT_POSITION)

# Inject the drift selected in the settings (any other DRIFT_TYPE injects nothing)
if DRIFT_TYPE == DriftType.INCREMENTAL:
    generator.add_incremental_drift(theta=drift_theta, duration=DRIFT_DURATION)
elif DRIFT_TYPE == DriftType.ABRUPT:
    generator.add_abrupt_drift(theta=drift_theta)

# Remaining production samples, generated after the drift injection
after_drift_df = generator.generate(PRODUCTION_DATA_SAMPLES-DRIFT_POSITION)

# Store both parts as a single production file
production_path = os.path.join(data_folder, 'production.csv')
pd.concat([before_drift_df,after_drift_df]).to_csv(production_path, index=False)

In [None]:
# Second test set: same size as the first one, generated after the production data
second_test_size = int(REFERENCE_DATA_SAMPLES/4)
second_test_path = os.path.join(data_folder, 'test_1.csv')
generator.generate(second_test_size).to_csv(second_test_path, index=False)

# Train a Model on generated data

In [None]:
# The model is trained on the reference dataset, indexed by its sample id
reference_path = os.path.join(data_folder, 'reference.csv')
training_df = pd.read_csv(reference_path, index_col='sample-id')

In [None]:
# Preview the first rows of the training data
training_df.head()

In [None]:
# Names of the feature columns (X_0 ... X_{N_FEATURES-1}) and of the target column
input_cols = [f'X_{i}' for i in range(N_FEATURES)]
output_cols = ['y_0']

## Model Development

In [None]:
# Standardise features and target with two independent scalers fitted on the training data
train_features = training_df[input_cols]
train_target = training_df[output_cols]

input_scaler, output_scaler = StandardScaler(), StandardScaler()
x_train = input_scaler.fit_transform(train_features)
y_train = output_scaler.fit_transform(train_target)

In [None]:
# Fit the estimator configured in the settings on the scaled training data
# (model refers to the same object as TARGET_MODEL, it is not a copy)
model = TARGET_MODEL
model.fit(x_train, y_train)

### Evaluate model performance

In [None]:
# First test set, generated before any drift is injected
first_test_path = os.path.join(data_folder, 'test_0.csv')
test_df = pd.read_csv(first_test_path)

In [None]:
# Scale the test features, predict, then map the predictions back to the original target scale
x_test = input_scaler.transform(test_df[input_cols])
predictions = output_scaler.inverse_transform(model.predict(x_test))

In [None]:
# Error metrics of the model on the first test set
test_mse = mean_squared_error(test_df[output_cols], predictions)
test_r2 = r2_score(test_df[output_cols], predictions)
print("MSE: ", test_mse)
print("R2: ", test_r2)

## Register the model into ML cube Platform

In [None]:
# Create the platform client from the credentials defined in the settings
client = ML3PlatformClient(api_key=API_KEY, url=URL)

In [None]:
# Create a company
# NOTE only if not already done: when COMPANY_ID is set in the settings,
# the existing company is reused and no new company is created
company_id = COMPANY_ID or client.create_company(name='my company', address='my address', vat='my vat')

In [None]:
# Create project and get the project identifier
# NOTE only if not already done
# The company is the one configured in COMPANY_ID or, when that is empty, the one
# created in the previous cell - never a hard-coded identifier from another account
project_id = client.create_project(company_id=COMPANY_ID or company_id, name='my project test 53')

In [None]:
# Create a regression task for our experiment inside the project;
# the returned value is kept as task_id and passed to the following client calls
task_id = client.create_task(
    project_id=project_id,
    name='my task',
    tags=['aperitech', 'example'],
    task_type=TaskType.REGRESSION
)

In [None]:
# Retrieve the data schema from the generator
data_schema = generator.get_dataschema()

In [None]:
# Send the data schema to the platform for our task
client.add_data_schema(task_id=task_id, data_schema=data_schema)

In [None]:
# Double check that the data schema shown for the task is correct
client.show_data_schema(task_id=task_id)

In [None]:
# Upload the historical dataset generated earlier to the task
historical_path = os.path.join(data_folder, 'historical.csv')
client.add_historical_data(
    task_id=task_id,
    dataset_type=DatasetType.TABULAR,
    data_path=historical_path,
)

In [None]:
# Check the operations running on the task
# (presumably the processing of the historical data sent in the previous cell)
client.show_running_operations(task_id=task_id)

In [None]:
# Create the model on the platform with the name and version defined in the settings;
# the returned value is kept as model_id and passed to the following client calls
model_id = client.create_model(
    project_id=project_id,
    task_id=task_id,
    name=MODEL_NAME,
    version=MODEL_VERSION
)

In [None]:
# Set the reference data of the model: the same file the model was trained on
reference_data_path = os.path.join(data_folder, 'reference.csv')
client.add_model_reference(
    project_id=project_id,
    task_id=task_id,
    model_id=model_id,
    dataset_type=DatasetType.TABULAR,
    data_path=reference_data_path,
)

In [None]:
# Check the operations running on the task
# (presumably the processing of the reference data sent in the previous cell)
client.show_running_operations(task_id=task_id)

## Simulate arrival of new production data

In [None]:
def make_predictions(model, model_name, model_version, input_scaler, output_scaler, production_batch, feature_cols=None) -> pd.DataFrame:
    """
    Apply the model to a production batch and return the scored batch.

    Parameters
    ----------
    model: fitted estimator exposing `predict`
    model_name, model_version: used to build the name of the predictions column
    input_scaler: fitted transformer exposing `transform`, applied to the features
    output_scaler: fitted transformer exposing `inverse_transform`, applied to the predictions
    production_batch: dataframe containing at least the feature columns
    feature_cols: names of the feature columns; when omitted, the notebook-level
        `input_cols` list is used

    Returns
    -------
    A copy of `production_batch` with one additional column, named
    MODEL_NAME@MODEL_VERSION, holding the predictions in the original target scale.
    The given dataframe is not modified.
    """
    if feature_cols is None:
        feature_cols = input_cols

    x_production = input_scaler.transform(production_batch[feature_cols])

    # depending on the estimator, predict returns a flat array (n,) or a column (n, 1):
    # the scaler only accepts the 2-D form
    predictions = np.asarray(model.predict(x_production))
    if predictions.ndim == 1:
        predictions = predictions.reshape(-1, 1)
    predictions = output_scaler.inverse_transform(predictions)

    # here we assign the right name to the column that will contain the predictions
    # the standard to follow is MODEL_NAME@MODEL_VERSION
    # example: Ridge@V1.0
    predictions_column_name = f'{model_name}@{model_version}'

    scored_production_dataframe = production_batch.copy()
    # flatten the single prediction column before storing it
    scored_production_dataframe[predictions_column_name] = np.asarray(predictions).ravel()
    return scored_production_dataframe

In [None]:
# Load the generated production data, indexed by its sample id
production_path = os.path.join(data_folder, 'production.csv')
production_df = pd.read_csv(production_path, index_col='sample-id')

In [None]:
# Score the whole production dataset with the model and the scalers fitted above
scored_production = make_predictions(model, MODEL_NAME, MODEL_VERSION, input_scaler, output_scaler, production_df)

In [None]:
# Store the production batch, predictions included, next to the other CSV files
scored_production_path = os.path.join(data_folder, 'scored_production.csv')
scored_production.to_csv(scored_production_path)

In [None]:
# Send the new production data to ML cube Platform

In [None]:
# Upload the scored production data to the task
scored_data_path = os.path.join(data_folder, 'scored_production.csv')
client.add_production_data(
    task_id=task_id,
    dataset_type=DatasetType.TABULAR,
    data_path=scored_data_path,
)

In [None]:
# Check the operations running on the task
# (presumably the processing of the production data sent in the previous cell)
client.show_running_operations(task_id=task_id)

## When finished, check if a drift has been identified

In [None]:
# Check the status of the models in your task
# (to be run once the operations on the production data have finished)
client.show_models(task_id)

In [None]:
# Residuals: true target minus the prediction stored in the MODEL_NAME@MODEL_VERSION column
predictions_column = '@'.join([MODEL_NAME, MODEL_VERSION])
prediction_errors = scored_production['y_0'] - scored_production[predictions_column]

In [None]:
# Let's plot the residuals to see how the drift impacted our model
# (the x axis is the sample id, since the production data is indexed by it)
plt.figure(figsize=(25, 5))
plt.plot(prediction_errors)
plt.show()

## Get the sample weights to retrain our Model

In [None]:
# Retrieve ids and weights to build a new training set:
# the ids are used below to select rows from the available data,
# the weights are passed to the model as sample weights
ids, weights = client.get_importance_weights(task_id=task_id, model_id=model_id, model_version=MODEL_VERSION)

In [None]:
# The ids and weights were already retrieved in the previous cell, so the API is not
# queried a second time; just verify that every suggested sample id has its own weight
if len(ids) != len(weights):
    raise ValueError(f'Expected one weight per sample id, got {len(ids)} ids and {len(weights)} weights')

In [None]:
# With the default settings the suggestion includes 5000 ids with sample weights
# (presumably one per historical, reference and production sample: 2000 + 1000 + 2000)
len(ids)

In [None]:
# Gather every labelled sample available so far: historical, reference (training) and production
historical_path = os.path.join(data_folder, 'historical.csv')
historical_df = pd.read_csv(historical_path, index_col='sample-id')

available_data = pd.concat([historical_df, training_df, production_df])

In [None]:
# Build the new training set from the suggested sample ids and fit fresh scalers on it
# (input_scaler and output_scaler are rebound, replacing the scalers of the first model)
retraining_df = available_data.loc[ids]

input_scaler, output_scaler = StandardScaler(), StandardScaler()
new_x_train = input_scaler.fit_transform(retraining_df[input_cols])
new_y_train = output_scaler.fit_transform(retraining_df[output_cols])

In [None]:
# Retrain an unfitted copy of the estimator configured in the settings, so that the new
# model keeps the same type and hyper-parameters as TARGET_MODEL even when it is not a Ridge
from sklearn.base import clone

new_model = clone(TARGET_MODEL)
# the importance weights tell the model how much each selected sample counts
new_model.fit(new_x_train, new_y_train, sample_weight=weights)

## Evaluate the retrained model

In [None]:
# Second test set, generated after the production data
second_test_path = os.path.join(data_folder, 'test_1.csv')
new_test_df = pd.read_csv(second_test_path)

In [None]:
# Scale the new test features, predict with the retrained model,
# then map the predictions back to the original target scale
x_test = input_scaler.transform(new_test_df[input_cols])
predictions = output_scaler.inverse_transform(new_model.predict(x_test))

In [None]:
# Error metrics of the retrained model on the second test set
new_test_mse = mean_squared_error(new_test_df[output_cols], predictions)
new_test_r2 = r2_score(new_test_df[output_cols], predictions)
print("MSE: ", new_test_mse)
print("R2: ", new_test_r2)