# Train a Scikit-Learn model in SageMaker and track with MLflow

## Problem

In this example, we will use the [California housing dataset](https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html).
The target variable is the median house value for California districts, expressed in hundreds of thousands of dollars (\$100,000).

## Setup Environment

In [None]:
!pip install -q --upgrade pip
!pip install -q --upgrade sagemaker==2.47.1

In [None]:
import sagemaker
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sagemaker.sklearn.estimator import SKLearn
from sklearn.model_selection import train_test_split

# Shared SageMaker handles reused by the upload and training cells below.
sess = sagemaker.Session()  # session used for the S3 uploads
role = sagemaker.get_execution_role()  # execution role handed to the estimator
bucket = sess.default_bucket()  # bucket that receives the train/test CSV files

In [None]:
# URI of your remote MLflow tracking server. Replace the placeholder before running:
# the value is forwarded to the training job through the 'tracking_uri' hyperparameter.
tracking_uri = '<YOUR MLFLOW SERVER URI>' 

## Prepare data
We load the California housing dataset from scikit-learn, split it into train and test sets, and upload both to S3.

In [None]:
# Fetch the California housing dataset through scikit-learn's loader.
data = fetch_california_housing()

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

# One frame per split: the named feature columns followed by a 'target' column.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

# Persist both splits as CSV files in the working directory.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default — confirm the training script expects (or ignores) that column.
testX.to_csv('california_test.csv')
trainX.to_csv('california_train.csv')

In [None]:
# Upload both CSV files to the default bucket under the 'sagemaker/sklearncontainer'
# key prefix. The values returned by upload_data (train_path / test_path) are passed
# to estimator.fit as the training job's input channels.
train_path = sess.upload_data(path='california_train.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')
test_path = sess.upload_data(path='california_test.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')

## Train

In [None]:
# Hyperparameters handed to the SKLearn estimator for the entry point script.
# NOTE(review): train.py is not shown here; the key spellings (note the mix of
# '_' and '-' separators) presumably have to match its argument parser — confirm.
hyperparameters = {
    'tracking_uri': tracking_uri,  # MLflow server configured in the setup section
    'experiment_name': 'california-housing',
    'n-estimators': 100,
    'min-samples-leaf': 3,
    # space-separated subset of the dataset's feature columns
    'features': 'MedInc HouseAge AveRooms AveBedrms Population AveOccup',
    'target': 'target'  # name of the label column added to the CSV files
}

# Metric definition: the regex captures the number following
# "AE-at-50th-percentile: " and reports it under the name 'median-AE'.
# NOTE(review): assumes train.py prints a line in exactly this format — confirm.
metric_definitions = [{'Name': 'median-AE', 'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"}]

# Scikit-learn estimator that runs source_dir/train.py.
# NOTE(review): instance_type='local' selects SageMaker local mode, which
# presumably needs Docker on the notebook host — switch to an 'ml.*' instance
# type to run a managed training job instead.
estimator = SKLearn(
    entry_point='train.py',
    source_dir='source_dir',
    role=role,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type='local',
    framework_version='0.23-1',
    base_job_name='mlflow',
)

In [None]:
# Start training. The dict keys name the input channels ('train' and 'test');
# each maps to the S3 location returned by the corresponding upload_data call.
estimator.fit({'train':train_path, 'test': test_path})