# JupyterHub Notebook

### This notebook server is hosted on the OpenShift platform, which provides a separate server for each individual user. The platform takes care of provisioning the server and allocating the related storage.

In [None]:
!pip install minio

In [1]:
import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import train_test_split


from tensorboard.plugins.hparams import api as hp_api
import pandas as pd
import os

from minio import Minio
from minio.error import ResponseError

In [2]:
def get_s3_server():
    """Create a MinIO client for the object store used by this notebook.

    Connection settings are read from the environment so that real credentials
    do not have to be committed with the notebook:

    * ``MINIO_ENDPOINT``   (default ``'minio-ml-workshop:9000'``)
    * ``MINIO_ACCESS_KEY`` (default ``'minio'``)
    * ``MINIO_SECRET_KEY`` (default ``'minio123'``)

    The defaults are the values that were previously hard-coded here, so the
    notebook behaves exactly as before when none of the variables are set.

    Returns:
        Minio: a client configured with ``secure=False``.
    """
    return Minio(
        os.environ.get('MINIO_ENDPOINT', 'minio-ml-workshop:9000'),
        access_key=os.environ.get('MINIO_ACCESS_KEY', 'minio'),
        secret_key=os.environ.get('MINIO_SECRET_KEY', 'minio123'),
        secure=False,
    )

def download_all_files(bucket_name, dest_dir='/tmp'):
    """Download every object in ``bucket_name`` into ``dest_dir``.

    Objects are listed recursively and each one is saved under its base name,
    so two objects that share a base name in different prefixes will overwrite
    each other locally.

    Args:
        bucket_name: name of the bucket to download from.
        dest_dir: local directory that receives the files. Defaults to
            ``'/tmp'``, the location that was previously hard-coded.

    A ``ResponseError`` raised while fetching an object is printed and the
    loop continues with the next object (best-effort download).
    """
    minioClient = get_s3_server()
    objects = minioClient.list_objects_v2(bucket_name=bucket_name,
                                          recursive=True)
    for obj in objects:
        print(obj.bucket_name, obj.object_name.encode('utf-8'), obj.last_modified,
              obj.etag, obj.size, obj.content_type)
        if obj.is_dir:
            # A directory placeholder has no base name to save under.
            continue
        target_path = os.path.join(dest_dir, os.path.basename(obj.object_name))
        try:
            print(minioClient.fget_object(obj.bucket_name, obj.object_name,
                                          target_path))
        except ResponseError as err:
            print(err)

#%%

def load_card_data(file_path):
    """Read ``creditcard.csv`` from the directory ``file_path``.

    The first row of the file is used as the column header.

    Returns:
        pandas.DataFrame: the parsed CSV contents.
    """
    return pd.read_csv(os.path.join(file_path, "creditcard.csv"))

#%%

def get_run_logdir():
    """Return a timestamped run directory path beneath the global ``root_logdir``.

    The directory name has the form ``run_YYYY_MM_DD-HH_MM_SS`` using the
    current local time. ``root_logdir`` must be defined before this is called.
    """
    from time import strftime
    return os.path.join(root_logdir, strftime("run_%Y_%m_%d-%H_%M_%S"))

def upload_learning_stats_to_s3(folder_name, bucket_name='model-stats', prefix='tensordata'):
    """Upload every file below ``folder_name`` to the object store.

    Each file is stored under ``<prefix>/<normalised local path>``. The local
    path is normalised (``./`` removed, OS separators turned into ``/``, any
    leading ``/`` stripped) so the object key is clean whether
    ``folder_name`` is relative, ``./``-prefixed or absolute.

    Args:
        folder_name: directory to walk recursively.
        bucket_name: destination bucket. Defaults to ``'model-stats'``.
        prefix: key prefix inside the bucket. Defaults to ``'tensordata'``.
    """
    minioClient = get_s3_server()

    local_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(folder_name)
        for filename in filenames
    ]

    for local_path in local_paths:
        print(local_path)
        key_suffix = os.path.normpath(local_path).replace(os.sep, '/').lstrip('/')
        # Upload the path exactly as walked. Prepending './' (as was done
        # before) points at the wrong file when folder_name is absolute.
        minioClient.fput_object(bucket_name=bucket_name,
                                object_name=prefix + '/' + key_suffix,
                                file_path=local_path)


# Fetch Data from S3 Bucket - Hosted on OpenShift

In [3]:
# Pull the raw CSV out of the 'rawdata' bucket, then load it from the local copy.
file_path = "/tmp"
download_all_files('rawdata')

# 'Time' is dropped up front; 'Class' is the label, everything else is a feature.
full_card_data = load_card_data(file_path).drop(columns=['Time'])
y = full_card_data['Class'].values
X = full_card_data.drop(columns=['Class']).values


#%%

# Hold out 20% of the rows for testing, then take the first rows of the
# remaining training data as the validation set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

n_valid = 28000
X_valid, y_valid = X_train_full[:n_valid], y_train_full[:n_valid]
X_train, y_train = X_train_full[n_valid:], y_train_full[n_valid:]


rawdata b'creditcard.csv' 2020-04-30 03:48:55.835000+00:00 e90efcb83d69faf99fcab8b0255024de 150828752 None
<Object: bucket_name: rawdata object_name: b'creditcard.csv' last_modified: time.struct_time(tm_year=2020, tm_mon=4, tm_mday=30, tm_hour=3, tm_min=48, tm_sec=55, tm_wday=3, tm_yday=121, tm_isdst=0) etag: e90efcb83d69faf99fcab8b0255024de size: 150828752 content_type: text/csv, is_dir: False, metadata: {'Content-Type': 'text/csv'}>


# Define Hyper Parameters

In [6]:
# TensorBoard: each execution of this cell logs under a fresh timestamped
# directory below root_logdir.
root_logdir = "."
run_logdir = get_run_logdir()

# Hyper-parameter search space.
hp_units = hp_api.HParam('num_units', hp_api.Discrete([16, 32]))
hp_drop_out = hp_api.HParam('dropout', hp_api.RealInterval(0.1, 0.2))
hp_optimiser = hp_api.HParam('optimizer', hp_api.Discrete(['adam', 'sgd']))

# Register the search space and the reported metric with the HParams plugin.
tuning_writer = tf.summary.create_file_writer(run_logdir + '/hparam_tuning')
with tuning_writer.as_default():
    hp_api.hparams_config(
        hparams=[hp_units, hp_drop_out, hp_optimiser],
        metrics=[hp_api.Metric('accuracy', display_name='Accuracy')],
    )



# Train model with Hyper Parameters

In [7]:
def build_model(hparams, logdir):
    """Build, train and evaluate one network for a single hyper-parameter trial.

    The function reads the module-level arrays ``X_train``, ``y_train``,
    ``X_valid``, ``y_valid``, ``X_test`` and ``y_test``; they must exist
    before it is called.

    Args:
        hparams: mapping keyed by the ``hp_units``, ``hp_drop_out`` and
            ``hp_optimiser`` HParam objects, giving the values for this trial.
        logdir: directory handed to the TensorBoard and HParams callbacks.

    Returns:
        The accuracy reported by ``model.evaluate`` on the test split.
    """
    # Take the input width from the data instead of hard-coding the column count.
    n_features = X_train.shape[1]

    model = keras.models.Sequential()
    model.add(keras.layers.Dense(100, input_dim=n_features, activation="tanh"))
    model.add(keras.layers.Dense(hparams[hp_units], activation="tanh"))
    model.add(keras.layers.Dropout(rate=hparams[hp_drop_out]))
    model.add(keras.layers.Dense(1, activation="sigmoid"))

    # NOTE(review): 'binary_crossentropy' is the usual loss for a single
    # sigmoid output; mean squared error is kept so results stay comparable
    # with earlier runs.
    model.compile(optimizer=hparams[hp_optimiser],
                  loss='mean_squared_error', metrics=["accuracy"])

    tensorboard_cb = keras.callbacks.TensorBoard(logdir)
    model.fit(X_train, y_train, epochs=1,
              validation_data=(X_valid, y_valid),
              callbacks=[tensorboard_cb,
                         hp_api.KerasCallback(logdir, hparams)])

    _, accuracy = model.evaluate(X_test, y_test)
    return accuracy

    


#%%

def run(hparams, logdir):
    """Execute one trial and log its hyper-parameters and accuracy under ``logdir``.

    Args:
        hparams: mapping from HParam objects to the values used in this trial.
        logdir: directory for this trial's summary files.
    """
    trial_writer = tf.summary.create_file_writer(logdir)
    with trial_writer.as_default():
        # Record which hyper-parameter values this trial used.
        hp_api.hparams(hparams)
        trial_accuracy = build_model(hparams, logdir)
        tf.summary.scalar('accuracy', trial_accuracy, step=1)

#%%

# Enumerate the full grid first: every unit count x both ends of the dropout
# interval x every optimizer, in the same order as nested loops would give.
dropout_bounds = (hp_drop_out.domain.min_value, hp_drop_out.domain.max_value)
trial_grid = [
    {hp_units: num_units, hp_drop_out: dropout_rate, hp_optimiser: optimizer}
    for num_units in hp_units.domain.values
    for dropout_rate in dropout_bounds
    for optimizer in hp_optimiser.domain.values
]

# Run one training session per grid point, each in its own 'run-N' directory.
session_num = 0
for hparams in trial_grid:
    run_name = "run-%d" % session_num
    print('--- Starting trial: %s' % run_name)
    print({param.name: value for param, value in hparams.items()})
    run(hparams, run_logdir + '/hparam_tuning/' + run_name)
    session_num += 1

--- Starting trial: run-0
{'num_units': 16, 'dropout': 0.1, 'optimizer': 'adam'}
Train on 199845 samples, validate on 28000 samples
--- Starting trial: run-1
{'num_units': 16, 'dropout': 0.1, 'optimizer': 'sgd'}
Train on 199845 samples, validate on 28000 samples
--- Starting trial: run-2
{'num_units': 16, 'dropout': 0.2, 'optimizer': 'adam'}
Train on 199845 samples, validate on 28000 samples
--- Starting trial: run-3
{'num_units': 16, 'dropout': 0.2, 'optimizer': 'sgd'}
Train on 199845 samples, validate on 28000 samples
--- Starting trial: run-4
{'num_units': 32, 'dropout': 0.1, 'optimizer': 'adam'}
Train on 199845 samples, validate on 28000 samples
--- Starting trial: run-5
{'num_units': 32, 'dropout': 0.1, 'optimizer': 'sgd'}
Train on 199845 samples, validate on 28000 samples
--- Starting trial: run-6
{'num_units': 32, 'dropout': 0.2, 'optimizer': 'adam'}
Train on 199845 samples, validate on 28000 samples
--- Starting trial: run-7
{'num_units': 32, 'dropout': 0.2, 'optimizer': 'sgd'}

# Upload the training statistics to visualise the model's internals. The visualisation server is hosted on the OpenShift platform.

In [9]:
# Strip the leading "./" so the walked paths start at the run directory name.
stats_folder = run_logdir.replace("./", "", 1)
upload_learning_stats_to_s3(stats_folder)

run_2020_05_01-05_14_58/hparam_tuning/events.out.tfevents.1588310098.jupyterhub-nb-fmasood.506.5.v2
run_2020_05_01-05_14_58/hparam_tuning/run-5/events.out.tfevents.1588310193.jupyterhub-nb-fmasood.506.264235.v2
run_2020_05_01-05_14_58/hparam_tuning/run-5/events.out.tfevents.1588310193.jupyterhub-nb-fmasood.506.264063.v2
run_2020_05_01-05_14_58/hparam_tuning/run-5/validation/events.out.tfevents.1588310208.jupyterhub-nb-fmasood.506.311255.v2
run_2020_05_01-05_14_58/hparam_tuning/run-5/train/events.out.tfevents.1588310194.jupyterhub-nb-fmasood.profile-empty
run_2020_05_01-05_14_58/hparam_tuning/run-5/train/events.out.tfevents.1588310193.jupyterhub-nb-fmasood.506.264344.v2
run_2020_05_01-05_14_58/hparam_tuning/run-5/train/plugins/profile/2020-05-01_05-16-34/local.trace
run_2020_05_01-05_14_58/hparam_tuning/run-2/events.out.tfevents.1588310138.jupyterhub-nb-fmasood.506.105780.v2
run_2020_05_01-05_14_58/hparam_tuning/run-2/events.out.tfevents.1588310138.jupyterhub-nb-fmasood.506.105608.v2
ru