In [None]:
from datetime import datetime
from os import environ, makedirs

from boto3 import client
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from model_registry import ModelRegistry
from model_registry.utils import S3Params
from numpy import load, save
import onnx
from pandas import read_csv
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from tf2onnx import convert

# Reading data connection settings

In [None]:
# S3 connection settings are read from the environment in one pass; each
# value is None when the corresponding variable is not set.
s3_endpoint_url, s3_access_key, s3_secret_key, s3_bucket_name = map(
    environ.get,
    (
        'AWS_S3_ENDPOINT',
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'AWS_S3_BUCKET',
    ),
)

# Literal value (not an environment lookup); later passed as the model
# registry server address.
# NOTE(review): looks like a placeholder to be replaced by hand — confirm.
model_registry_endpoint_url = 'MODEL_REGISTRY_ROUTE_URL'  # without port

# Data ingestion

In [None]:
# Download the raw training data set from S3-compatible object storage to
# ./data/raw_data.csv, using the connection settings read from the environment.
print(f'Downloading data "training-data.csv" '
      f'from bucket "{s3_bucket_name}" '
      f'from S3 storage at {s3_endpoint_url}')

# download_file() writes to the given local path and does not create missing
# parent directories, so make sure the target directory exists before fetching
# (no-op when it is already there).
makedirs('./data', exist_ok=True)

s3_client = client(
    's3', endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key, aws_secret_access_key=s3_secret_key
)

s3_client.download_file(
    s3_bucket_name,
    'data/training-data.csv',
    './data/raw_data.csv'
)

# Data preprocessing

In [None]:
# Load the raw CSV that the ingestion step stored locally.
df = read_csv('./data/raw_data.csv')

# Robust-scale 'Amount' and 'Time' into new columns. The same scaler instance
# is fitted twice, so afterwards it only holds the fit for 'Time'.
rob_scaler = RobustScaler()
df['scaled_amount'] = rob_scaler.fit_transform(df[['Amount']].to_numpy())
df['scaled_time'] = rob_scaler.fit_transform(df[['Time']].to_numpy())
df.drop(columns=['Time', 'Amount'], inplace=True)

# Move the two scaled columns to the front of the frame.
scaled_amount = df.pop('scaled_amount')
scaled_time = df.pop('scaled_time')
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

# Features are every column except the 'Class' label.
X = df.drop(columns='Class')
y = df['Class']
sss = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# Every fold's indices are logged; once the loop ends, the loop variables
# still hold the final fold, and only that fold's training part is kept.
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)

original_Xtrain = X.iloc[train_index].values
original_ytrain = y.iloc[train_index].values

# Oversample the minority class with SMOTE (fixed seed for repeatability).
sm = SMOTE(sampling_strategy='minority', random_state=42)
Xsm_train, ysm_train = sm.fit_resample(original_Xtrain, original_ytrain)

# Persist the resampled training set for the model training step.
save('./data/training_samples.npy', Xsm_train)
save('./data/training_labels.npy', ysm_train)

# Model training

In [None]:
# Hyperparameters; both are also recorded as metadata when the model is
# registered.
epoch_count = 20
learning_rate = 0.001

# Reload the resampled training set written by the preprocessing step.
Xsm_train = load('./data/training_samples.npy')
ysm_train = load('./data/training_labels.npy')
n_inputs = Xsm_train.shape[1]

# Small fully connected network: input-sized layer -> 32 units -> 2-way softmax.
oversample_model = Sequential()
oversample_model.add(
    Dense(n_inputs, input_shape=(n_inputs, ), activation='relu')
)
oversample_model.add(Dense(32, activation='relu'))
oversample_model.add(Dense(2, activation='softmax'))

oversample_model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# 20% of the samples are held out for validation during fitting.
training_metrics = oversample_model.fit(
    x=Xsm_train,
    y=ysm_train,
    epochs=epoch_count,
    batch_size=300,
    validation_split=0.2,
    shuffle=True,
    verbose=2,
)

# Value recorded under the 'accuracy' history key for the last epoch.
accuracy = training_metrics.history['accuracy'][-1]
print(f'finished training model with final accuracy score of {accuracy}')

# Convert the trained Keras model to ONNX and store it next to the notebook;
# the second element returned by the converter is not used.
onnx_model, _ = convert.from_keras(oversample_model)
onnx.save(onnx_model, 'model.onnx')

# Model upload

In [None]:
# The minute-resolution timestamp doubles as the model version and as the
# S3 key prefix under which the artifact is stored.
timestamp = datetime.now().strftime('%y%m%d%H%M')
s3_prefix = 'models/' + timestamp

s3_params = S3Params(
    bucket_name=s3_bucket_name,
    s3_prefix=s3_prefix,
    access_key_id=s3_access_key,
    secret_access_key=s3_secret_key,
    endpoint_url=s3_endpoint_url,
)

# Authenticate against the registry with the token read from the mounted
# Kubernetes service account file.
sa_token_file_path = '/var/run/secrets/kubernetes.io/serviceaccount/token'
with open(sa_token_file_path) as token_file:
    auth_token = token_file.read()

registry = ModelRegistry(
    server_address=model_registry_endpoint_url,
    port=443,
    author='user',
    user_token=auth_token,
)

model_description = '''
Shallow neural network trained on Credit Card Fraud Detector dataset 
(https://www.kaggle.com/code/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets).\n
Deployed model expects input vector of shape [1, 30] with FP32-type values, 
returns vector of shape [1, 2] with FP32-type values denoting predicted 
probabilities for non-fraud / fraud. See sample:
https://github.com/mamurak/os-mlops/blob/main/notebooks/fraud-detection-onnx/online-scoring.ipynb
'''

# Training parameters and result are stored as strings; the last two keys
# carry empty values.
# NOTE(review): the empty-valued keys look like labels/tags — confirm.
model_metadata = {
    'epoch_count': str(epoch_count),
    'learning_rate': str(learning_rate),
    'accuracy': str(accuracy),
    'fraud-detection': '',
    'onnx': '',
}

# Upload the ONNX file using the S3 parameters and register it as a new
# version of the 'fraud-detection' model.
registry.upload_artifact_and_register_model(
    name='fraud-detection',
    model_files_path='model.onnx',
    upload_params=s3_params,
    version=timestamp,
    description=model_description,
    model_format_name='onnx',
    model_format_version='1',
    storage_key='aws-connection-fraud-detection',
    metadata=model_metadata,
)
print(f'model uploaded to {s3_prefix} and registered as version {timestamp}')