# Using W&B Artifacts in your ML Workflow

In [None]:
# Based on https://www.kaggle.com/chungyehwang/scikit-learn-classifiers-on-iris-dataset

import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import wandb
import pandas as pd
from joblib import dump, load
import os

### Set any environment variables. If running against a local W&B instance, provide the host URL and API key

In [None]:
#os.environ['WANDB_API_KEY'] = ''
#os.environ['WANDB_BASE_URL'] = ''
#os.environ['WANDB_NOTEBOOK_NAME'] = 'Iris Artifacts Demo'

### Set global configs

In [None]:
# Label attached to every run started below. Pick a new value for each batch
# of runs so their results can be compared group by group.
group_name = 'Base-Run'

# W&B project that every run in this notebook is logged to.
project_name = 'artifacts_demo'

# Settings shared by the data-split and training cells.
general_config = {
    'tsize': 0.3,  # test-set fraction for the train/test split (default 0.3)
    'gamma': 0.1,  # SVC gamma (default 0.1)
    'C': 1.0,      # SVC C (default 1.0)
    'seed': 0,     # random seed for the split and the SVC (default 0)
}


### Load Raw Dataset and register in Artifacts

In [None]:
# Start a 'data_load' run and register the raw Iris CSV as a dataset artifact.
run = wandb.init(job_type='data_load', group=group_name, project=project_name)

# Keep the origin of the file alongside the artifact.
raw_metadata = {'Source': 'https://datahub.io/machine-learning/iris'}
artifact = wandb.Artifact(name='raw_data', type='dataset', metadata=raw_metadata)
artifact.add_file('data/raw/iris_csv.csv')

run.log_artifact(artifact)

# End this run before the next cell starts a new one.
wandb.join()

### Get Raw Dataset from Artifacts, Create Train/Test Split and log back into Artifacts

In [None]:
# Data-prep run: fetch the raw CSV from Artifacts, split it into train/test
# sets, and log the split back to Artifacts as 'split_data'.
run = wandb.init(project=project_name, job_type='data_prep', group=group_name)

# Record the split settings and the input artifact on the run's config.
wandb.config.test_size = general_config['tsize']
wandb.config.seed = general_config['seed']
wandb.config.rawdata_artifact = 'raw_data:latest'

artifact = run.use_artifact(wandb.config.rawdata_artifact)
artifact_dir = artifact.download()

iris = pd.read_csv(os.path.join(artifact_dir, 'iris_csv.csv'))

# Train/Test Split
# The label is read from column 4, so the features are columns 0-3; a 0:3
# slice would stop one column early and silently drop the fourth feature.
# NOTE(review): assumes the CSV layout is four feature columns followed by the
# class label -- confirm against the raw file.
X = iris.iloc[:, 0:4]
y = iris.iloc[:, 4]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=wandb.config.test_size, random_state=wandb.config.seed
)

# to_pickle does not create missing parent directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs('data/split', exist_ok=True)

X_train.to_pickle('data/split/x_train.pkl')
X_test.to_pickle('data/split/x_test.pkl')
y_train.to_pickle('data/split/y_train.pkl')
y_test.to_pickle('data/split/y_test.pkl')

# The files are stored under 'train_test_split/' inside the artifact.
artifact = wandb.Artifact(
    'split_data',
    type='dataset',
    metadata={'Train Pct': 1 - wandb.config.test_size, 'Test Pct': wandb.config.test_size},
)
artifact.add_dir('data/split', name='train_test_split')
run.log_artifact(artifact)

# End this run before the next cell starts a new one.
wandb.join()

### Prepare Data for Modeling and register prepared binaries into Artifacts

In [None]:
# Data-prep run: fetch the train/test split from Artifacts, standardize the
# features, and log the prepared arrays back to Artifacts as 'prepped_data'.
run = wandb.init(project=project_name, job_type='data_prep', group=group_name)

wandb.config.splitdata_artifact = 'split_data:latest'

artifact = run.use_artifact(wandb.config.splitdata_artifact)
artifact_dir = artifact.download()

# The split files live under 'train_test_split/' inside the artifact.
X_train = pd.read_pickle(os.path.join(artifact_dir, 'train_test_split/x_train.pkl'))
X_test = pd.read_pickle(os.path.join(artifact_dir, 'train_test_split/x_test.pkl'))
y_train = pd.read_pickle(os.path.join(artifact_dir, 'train_test_split/y_train.pkl'))
y_test = pd.read_pickle(os.path.join(artifact_dir, 'train_test_split/y_test.pkl'))

# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)

X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Combined arrays are stacked train-first, then test.
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

# np.save does not create missing parent directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs('data/prepped', exist_ok=True)

np.save('data/prepped/x_train_std.npy', X_train_std)
np.save('data/prepped/x_test_std.npy', X_test_std)
np.save('data/prepped/x_combined_std.npy', X_combined_std)
np.save('data/prepped/y_combined.npy', y_combined)

artifact = wandb.Artifact('prepped_data', type='dataset')
artifact.add_dir('data/prepped')
run.log_artifact(artifact)

# End this run before the next cell starts a new one.
wandb.join()

### Load prepared datasets and train model. Register final model into Artifacts

In [None]:
# Training run: fetch the prepared arrays from Artifacts, fit an RBF-kernel
# SVC, log train/test accuracy, and register the fitted model in Artifacts.
run = wandb.init(project=project_name, job_type='train', group=group_name)

# Record the hyperparameters and the input artifact on the run's config.
wandb.config.seed = general_config['seed']
wandb.config.gamma = general_config['gamma']
wandb.config.C = general_config['C']
wandb.config.preppeddata_artifact = 'prepped_data:latest'

artifact = run.use_artifact(wandb.config.preppeddata_artifact)
artifact_dir = artifact.download()

X_train_std = np.load(os.path.join(artifact_dir, 'x_train_std.npy'))
X_test_std = np.load(os.path.join(artifact_dir, 'x_test_std.npy'))
# NOTE(review): allow_pickle=True is presumably required because the labels
# are stored as an object array -- only load artifacts from a trusted project.
X_combined_std = np.load(os.path.join(artifact_dir, 'x_combined_std.npy'), allow_pickle=True)
y_combined = np.load(os.path.join(artifact_dir, 'y_combined.npy'), allow_pickle=True)

# Recover the labels from the artifact instead of relying on y_train/y_test
# left over in the kernel by an earlier cell. y_combined was written as the
# training labels followed by the test labels, so the first
# len(X_train_std) entries are the training labels and the rest are the test
# labels.
n_train = X_train_std.shape[0]
y_train = y_combined[:n_train]
y_test = y_combined[n_train:]

svm = SVC(kernel='rbf', random_state=wandb.config.seed, gamma=wandb.config.gamma, C=wandb.config.C)
svm.fit(X_train_std, y_train)

wandb.log({"Train Accuracy": svm.score(X_train_std, y_train),
           "Test Accuracy": svm.score(X_test_std, y_test)})

# dump does not create missing parent directories, so make sure the output
# folder exists before writing (needed on a fresh checkout).
os.makedirs('models', exist_ok=True)
dump(svm, 'models/iris_model.joblib')

artifact = wandb.Artifact('iris_model', type='model')
artifact.add_file('models/iris_model.joblib')
run.log_artifact(artifact)

# End this run.
wandb.join()