In [None]:
import json
import numpy as np
import pandas as pd
import pickle

import mrmr
import time

from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType, DoubleType 
import pyspark.sql.functions as F

from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2,VarianceThreshold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from boruta import BorutaPy
from random import sample
from functools import partial
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

import tensorflow as tf
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.utils import to_categorical

In [None]:
import sys
# Make the shared utilities directory importable so that `spark_setup` resolves.
# NOTE(review): the path contains a space ('deep learning') — confirm it matches the directory on disk.
sys.path.append('/opt/deep learning/utilities')

from spark_setup import SetupEnvironment
# Build the environment helper for the 'my_env' conda environment, then derive the
# three handles every step factory below receives: `spark` (used for
# spark.read.parquet), `dcRead` (called as dcRead(<env>).loadDataFrame(...)) and
# `s3` (used for open/get/put and its `project_home` prefix).
environment = SetupEnvironment (conda_env='my_env')
spark = environment.setup_spark()
dcRead = environment.setup_DataCatalog()
s3 = environment.setup_s3()

In [None]:
import warnings
# Suppress FutureWarning messages from here on so step output stays readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Extend the import path before the `push_training` imports in the next cell
# (presumably this is where that package lives — verify).
# NOTE(review): this path uses 'prototypes/V2' while the feature-metadata JSON
# opened later in this notebook uses 'prototypes/v2'; on a case-sensitive
# filesystem only one of them can be right — confirm the directory's actual case.
sys.path.append('/repos/mme010/DS_Artifacts/prototypes/V2')

In [None]:
from push_training.common import mrp_pipeline , preprocess

from push_training.steps import (
    construct_observations, feature_reduction, modified_mrmr_fit,
    boruta_fit, select_n_features, modified_mrmr_tune_n_features,
    feature_selection_transform, preprocess_fit, preprocess_transform,
    tune_train, evaluate, get_a_tsfresh_mapping, get_tsfresh_simplified_mapping,
    compile_metadata, stage 
)

In [None]:
# --- Feature-reduction settings (defaults of make_feature_reduction_step) ---
VARIANCE_THRESHOLD = 0.0001
FLOATING_POINT_ERROR = 0.000001
VIF_THRESHOLD = 10
CORRELATION_THRESHOLD = 0.95
CORRELATION_METHOD = "pearson"

# --- Search budgets ---
MAX_EVALS_TUNE_TRAIN = 300  # default max_evals of make_tune_train_step
MAX_EVALS = 50              # default max_evals of make_select_n_features_step

# 16 geometrically spaced candidate feature counts between 10 and 500, rounded to ints.
N_FEATURES_TO_TRY = np.round(np.geomspace(10, 500, 16)).astype(int)

# The original author noted "boruta" as the alternative value.
FEATURE_SELECTION_METHOD = "modified_mrmr"

# Columns handed to the feature-selection transform step as non-feature columns.
NON_FEATURE_COLS = [
    'id',
    'id_secondary',
    'LABEL',
    'tdate',
    'pttrn_trade_dt_all',
    'split',
]

# tsfresh aggregate name -> data type label ('discrete' or 'continuous').
SOURCE_TSFRESH_DATATYPE_DICTIONARY = {
    'length': 'discrete',
    'maximum': 'continuous',
    'mean': 'continuous',
    'median': 'continuous',
    'minimum': 'continuous',
    'quantile': 'continuous',
}

In [None]:
def pickle_put(data, name, s3):
    """Pickle ``data`` into the object ``name`` under ``s3.project_home``.

    Args:
        data: any picklable object.
        name: object name relative to ``s3.project_home``.
        s3: handle exposing ``project_home`` and a file-like ``open(path, mode)``.
    """
    destination = f'{s3.project_home}/{name}'
    with s3.open(destination, 'wb') as sink:
        pickle.dump(data, sink)

def pickle_get(name, s3):
    """Load and return the pickled object ``name`` from under ``s3.project_home``.

    Args:
        name: object name relative to ``s3.project_home``.
        s3: handle exposing ``project_home`` and a file-like ``open(path, mode)``.

    Returns:
        The unpickled object.
    """
    source = f'{s3.project_home}/{name}'
    with s3.open(source, 'rb') as stream:
        # SECURITY: pickle.load can execute arbitrary code; only read artifacts
        # that this pipeline itself wrote to the project home.
        restored = pickle.load(stream)
    return restored

def keras_put(model, name, s3):
    """Save ``model`` to a local scratch file and upload it as ``name``.

    Args:
        model: object exposing ``save(path)`` (a Keras model in this notebook).
        name: destination object name relative to ``s3.project_home``.
        s3: handle exposing ``project_home`` and ``put(local, remote)``.
    """
    # Fixed scratch location; the '.h5' suffix is part of the path given to model.save.
    scratch_file = '/tmp/tmp_model.h5'
    model.save(scratch_file)
    remote_key = f'{s3.project_home}/{name}'
    s3.put(scratch_file, remote_key)

def keras_get(name, s3):
    """Download the Keras model stored as ``name`` and load it.

    The object is copied from ``s3.project_home`` to a local scratch file and
    that file is then passed to ``tf.keras.models.load_model``.

    Args:
        name: object name relative to ``s3.project_home``.
        s3: handle exposing ``project_home`` and ``get(remote, local)``.

    Returns:
        Whatever ``tf.keras.models.load_model`` returns for the downloaded file.
    """
    # Same scratch file that keras_put writes. The previous value,
    # '/tmp/tmp_model/h5', used a slash instead of a dot: it pointed inside a
    # directory nothing creates and dropped the '.h5' suffix.
    tmp_path = '/tmp/tmp_model.h5'
    s3.get(f'{s3.project_home}/{name}', tmp_path)
    return tf.keras.models.load_model(tmp_path)

def json_put(data, name, s3):
    """Serialize ``data`` as JSON into the object ``name`` under ``s3.project_home``.

    Args:
        data: JSON-serializable object.
        name: object name relative to ``s3.project_home``.
        s3: handle exposing ``project_home`` and a file-like ``open(path, mode)``.
    """
    destination = f'{s3.project_home}/{name}'
    with s3.open(destination, 'w') as sink:
        json.dump(data, sink)

def json_get(name, s3):
    """Read the JSON object ``name`` from under ``s3.project_home`` and return it parsed.

    Args:
        name: object name relative to ``s3.project_home``.
        s3: handle exposing ``project_home`` and a file-like ``open(path, mode)``.

    Returns:
        The decoded JSON value.
    """
    source = f'{s3.project_home}/{name}'
    with s3.open(source, 'r') as stream:
        parsed = json.load(stream)
    return parsed

In [None]:
def make_tsfresh_simplified_mapping_step(dcRead, s3, spark):
    """Build the step that produces the simplified tsfresh mapping.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle forwarded to ``pickle_put`` when the step's output is stored.
        spark: unused here; accepted so all step factories share one signature.

    Returns:
        A ``get_tsfresh_simplified_mapping.GetTsfreshSimplifiedMapping`` instance
        wired with the loader and handler defined below.
    """
    def ts_mapping_load_inputs():
        # NOTE(review): this path uses 'prototypes/v2' while the sys.path entry
        # added earlier in the notebook uses 'prototypes/V2' — confirm the case.
        metadata_path = '/repos/mme010/DS_Artifacts/prototypes/v2/push_training/momentum_ignition_feature_metadata.json'
        # Context manager so the handle is closed deterministically; the
        # original json.load(open(...)) never closed it.
        with open(metadata_path, 'r') as metadata_file:
            mi_feature_metadata = json.load(metadata_file)
        return {
            'mi_feature_metadata': mi_feature_metadata
        }

    def ts_mapping_handle_outputs(full_simplified_tsfresh_map):
        pickle_put(
            full_simplified_tsfresh_map,
            'full_simplified_tsfresh_map.pickle',
            s3)

    step_ts_mapping = get_tsfresh_simplified_mapping.GetTsfreshSimplifiedMapping(
        input_loader=ts_mapping_load_inputs,
        output_handler=ts_mapping_handle_outputs)

    return step_ts_mapping

# Build the simplified-mapping step with the shared handles and run it right away.
tsfresh_simplified_mapping_step = make_tsfresh_simplified_mapping_step(dcRead, s3, spark)
tsfresh_simplified_mapping_step.run()

In [None]:
#pickle_get('full_simplified_tsfresh_map.pickle',s3)

In [None]:
def make_mapping_step(dcRead, s3, spark):
    """Build the step that derives the 'a'-to-tsfresh column mapping.

    Args:
        dcRead: callable taking an environment name and returning an object
            with ``loadDataFrame``.
        s3: handle forwarded to ``pickle_put`` when the mapping is stored.
        spark: unused here; accepted so all step factories share one signature.

    Returns:
        A ``get_a_tsfresh_mapping.GetMapping`` instance wired with the loader
        and handler defined below.
    """
    def _load_inputs():
        # The trailing '2021-08-05' is passed positionally; presumably a date
        # partition of the production table — TODO confirm.
        production_table = dcRead('prod').loadDataFrame(
            'MRP', 'MARKING_THE_CLOSE_PUSH_PRDCN', 'PRC', 'BZ', '2021-08-05')
        return {'mtc_prdcn_table': production_table}

    def _handle_outputs(a_to_tsfresh):
        pickle_put(a_to_tsfresh, 'a_to_tsfresh.pickle', s3)

    return get_a_tsfresh_mapping.GetMapping(
        input_loader=_load_inputs,
        output_handler=_handle_outputs,
    )

# Build the column-mapping step with the shared handles and run it right away.
mapping_step = make_mapping_step(dcRead, s3, spark)
mapping_step.run()

In [None]:
#pickle_get('a_to_tsfresh.pickle',s3)

In [None]:
def make_co_step(dcRead, s3, spark):
    """Build the step that constructs the observations dataset.

    Args:
        dcRead: callable taking an environment name and returning an object
            with ``loadDataFrame``.
        s3: handle providing ``project_home`` and forwarded to ``pickle_get``.
        spark: unused here; accepted so all step factories share one signature.

    Returns:
        A ``construct_observations.ConstructObservations`` instance wired with
        the loader and handler defined below.
    """
    def _load_inputs():
        inputs = {}
        # NOTE(review): 'prody-dev' differs from the 'prod' environment name used
        # by make_mapping_step — confirm this is the intended catalog environment.
        inputs['mtc_push_table'] = dcRead('prody-dev').loadDataFrame(
            "MRP", "MTC_tsfresh_data_training_data", "PRC", "ORC", None)
        inputs['a_column_mapping'] = pickle_get('a_to_tsfresh.pickle', s3)
        return inputs

    def _handle_outputs(observations):
        destination = f'{s3.project_home}/observations.parquet'
        observations.write.parquet(destination, mode='overwrite')

    return construct_observations.ConstructObservations(
        input_loader=_load_inputs,
        output_handler=_handle_outputs,
    )

# Build the construct-observations step with the shared handles and run it right away.
co_step = make_co_step(dcRead, s3, spark)
co_step.run()

In [None]:
def make_feature_reduction_step(dcRead,
    s3,
    spark,
    variance_threshold = VARIANCE_THRESHOLD,
    floating_point_error = FLOATING_POINT_ERROR,
    vif_threshold = VIF_THRESHOLD,
    correlation_threshold = CORRELATION_THRESHOLD,
    correlation_method = CORRELATION_METHOD,
    soure_tsfresh_datatype_dictionary = SOURCE_TSFRESH_DATATYPE_DICTIONARY):
    """Build the feature-reduction step.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle providing ``project_home`` and forwarded to ``pickle_put``.
        spark: session used to read the observations parquet.
        variance_threshold, floating_point_error, vif_threshold,
        correlation_threshold, correlation_method,
        soure_tsfresh_datatype_dictionary: forwarded unchanged to the step in
            its ``parameters`` dict.

    Returns:
        A ``feature_reduction.ReduceFeatureSet`` instance wired with the
        parameters, loader and handler defined below.
    """
    def feature_reduction_load_inputs():
        return {
            'split_data': spark.read.parquet(f'{s3.project_home}/observations.parquet')
        }

    def feature_reduction_handle_outputs(tsfresh_feature_reduction_dataframe):
        pickle_put(tsfresh_feature_reduction_dataframe, 'tsfresh_feature_reduction_dataframe.pickle', s3)

    parameters = {
        'variance_threshold': variance_threshold,
        # Fixed: the key was 'floating point_error' (with a space), unlike every
        # other key in this dict and unlike the keyword argument name.
        'floating_point_error': floating_point_error,
        'vif_threshold': vif_threshold,
        'correlation_threshold': correlation_threshold,
        'correlation_method': correlation_method,
        # NOTE(review): 'soure_' looks like a misspelling of 'source_'. It is
        # left unchanged because it is also this factory's public keyword name
        # and the key is passed to ReduceFeatureSet — confirm before renaming.
        'soure_tsfresh_datatype_dictionary': soure_tsfresh_datatype_dictionary,
    }

    step_feature_reduction = feature_reduction.ReduceFeatureSet(
        parameters = parameters,
        input_loader = feature_reduction_load_inputs,
        output_handler = feature_reduction_handle_outputs
    )
    return step_feature_reduction

# Build the feature-reduction step with the default thresholds and run it right away.
feature_reduction_step = make_feature_reduction_step(dcRead, s3, spark)
feature_reduction_step.run()

In [None]:
#pickle_get('tsfresh_feature_reduction_dataframe.pickle',s3)

In [None]:
def make_modified_mrmr_fit_step(dcRead, s3, spark):
    """Build the modified-mRMR fitting step.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle providing ``project_home`` and forwarded to the pickle helpers.
        spark: session used to read the observations parquet.

    Returns:
        A ``modified_mrmr_fit.ModifiedMRMRFit`` instance wired with the loader
        and handler defined below.
    """
    def modified_mrmr_fit_load_inputs():
        # Fixed: the path previously contained stray spaces
        # ('{home} /observations. parquet') and so did not match the location
        # the construct-observations step writes to.
        observations_path = f'{s3.project_home}/observations.parquet'
        return {
            'split_data': spark.read.parquet(observations_path),
            'tsfresh_feature_reduction_dataframe': pickle_get('tsfresh_feature_reduction_dataframe.pickle', s3)}

    def modified_mrmr_fit_handle_outputs(tsfresh_feature_selection_dataframe):
        pickle_put(tsfresh_feature_selection_dataframe, 'tsfresh_feature_selection_dataframe.pickle', s3)

    step_modified_mrmr_fit = modified_mrmr_fit.ModifiedMRMRFit(
        input_loader = modified_mrmr_fit_load_inputs,
        output_handler = modified_mrmr_fit_handle_outputs
    )
    return step_modified_mrmr_fit

# Build the modified-mRMR fit step with the shared handles and run it right away.
modified_mrmr_fit_step = make_modified_mrmr_fit_step(dcRead, s3, spark)
modified_mrmr_fit_step.run()

In [None]:
#pickle_get('tsfresh_feature_selection_dataframe.pickle',s3)

In [None]:
def make_boruta_fit_step(dcRead, s3, spark):
    """Build the Boruta fitting step.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle providing ``project_home`` and forwarded to the pickle helpers.
        spark: session used to read the observations parquet.

    Returns:
        A ``boruta_fit.BorutaFit`` instance wired with the loader and handler
        defined below. Its output is stored under the same object name that
        make_modified_mrmr_fit_step uses for its output.
    """
    def boruta_fit_load_inputs():
        # Fixed: the path previously contained stray spaces
        # ('{home} /observations. parquet') and so did not match the location
        # the construct-observations step writes to.
        observations_path = f'{s3.project_home}/observations.parquet'
        return {
            'split_data': spark.read.parquet(observations_path),
            'tsfresh_feature_reduction_dataframe': pickle_get('tsfresh_feature_reduction_dataframe.pickle', s3)}

    def boruta_fit_handle_outputs(tsfresh_feature_selection_dataframe):
        pickle_put(tsfresh_feature_selection_dataframe, 'tsfresh_feature_selection_dataframe.pickle', s3)

    step_boruta_fit = boruta_fit.BorutaFit(
        input_loader = boruta_fit_load_inputs,
        output_handler = boruta_fit_handle_outputs)

    return step_boruta_fit

boruta_fit_step = make_boruta_fit_step(dcRead, s3, spark)
# Boruta and modified-mRMR both store their result as
# 'tsfresh_feature_selection_dataframe.pickle'. Running this step
# unconditionally on a top-to-bottom run replaced the modified-mRMR result even
# when FEATURE_SELECTION_METHOD selects "modified_mrmr", so only run it when
# Boruta is the configured method.
if FEATURE_SELECTION_METHOD == "boruta":
    boruta_fit_step.run()

In [None]:
#pickle_get('tsfresh_feature_selection_dataframe.pickle',s3)

In [None]:
def make_select_n_features_step(
    dcRead,
    s3,
    spark,
    max_evals = MAX_EVALS,
    n_features_to_try = N_FEATURES_TO_TRY,
    feature_selection_method = FEATURE_SELECTION_METHOD
    ):
    """Build the step that searches over candidate feature counts.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle providing ``project_home`` and forwarded to the pickle helpers.
        spark: session used to read the observations parquet.
        max_evals: forwarded to the step as ``parameters['max_evals']``.
        n_features_to_try: forwarded as ``parameters['n_features_to_try']``.
        feature_selection_method: forwarded as
            ``parameters['feature_selection_method']``.

    Returns:
        A ``select_n_features.SelectNFeatures`` instance wired with the
        parameters, loader and handler defined below.
    """
    def select_n_features_load_inputs():
        # Fixed: a stray space after the project home ('{home} /observations.parquet')
        # made this path differ from the one the observations are written to.
        observations_path = f'{s3.project_home}/observations.parquet'
        return {
            'split_data': spark.read.parquet(observations_path),
            'tsfresh_feature_selection_dataframe': pickle_get('tsfresh_feature_selection_dataframe.pickle', s3)}

    def select_n_features_handle_outputs(n_feats_best, n_features_tried, train_scores, val_scores, val_y_predictions):
        pickle_put(n_feats_best, 'n_feats_best.pickle', s3)
        pickle_put(n_features_tried, 'select_n_feats_n_feats.pickle', s3)
        # Fixed: was 'select_n_feats_train scores.pickle' (space instead of
        # underscore); now parallel to the validation-score object name below.
        pickle_put(train_scores, 'select_n_feats_train_scores.pickle', s3)
        pickle_put(val_scores, 'select_n_feats_val_scores.pickle', s3)
        pickle_put(val_y_predictions, 'val_y_predictions.pickle', s3)

    step_select_n_features = select_n_features.SelectNFeatures(
        parameters = {
            'n_features_to_try': n_features_to_try,
            'max_evals': max_evals,
            # Fixed: the key was 'feature_selection method' (with a space),
            # unlike the other keys and the keyword argument name.
            'feature_selection_method': feature_selection_method},
        input_loader=select_n_features_load_inputs,
        output_handler=select_n_features_handle_outputs
    )
    return step_select_n_features

# Build the feature-count search step with the default settings and run it right away.
select_n_features_step = make_select_n_features_step(dcRead, s3, spark)
select_n_features_step.run()

In [None]:
#pickle_get('n_feats_best.pickle',s3)
#pickle_get('select_n_feats_n_feats.pickle',s3)
#pickle_get('select_n_feats_train.pickle',s3)
#pickle_get('select_n_feats_val_scores.pickle',s3)
#pickle_get('val_y_predictions.pickle',s3)

In [None]:
def make_modified_mrmr_tune_n_features_step(dcRead, s3, spark):
    """Build the step that turns the chosen feature count into feature names.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle forwarded to the pickle helpers.
        spark: unused here; accepted so all step factories share one signature.

    Returns:
        A ``modified_mrmr_tune_n_features.ModifiedMRMRTuneNFeatures`` instance
        wired with the loader and handler defined below.
    """
    def _load_inputs():
        best_n = pickle_get('n_feats_best.pickle', s3)
        selection_frame = pickle_get('tsfresh_feature_selection_dataframe.pickle', s3)
        return {
            'n': best_n,
            'tsfresh_feature_selection_dataframe': selection_frame,
        }

    def _handle_outputs(feature_names):
        pickle_put(feature_names, 'feature_names.pickle', s3)

    return modified_mrmr_tune_n_features.ModifiedMRMRTuneNFeatures(
        input_loader=_load_inputs,
        output_handler=_handle_outputs,
    )

# Build the feature-name selection step with the shared handles and run it right away.
modified_mrmr_tune_n_features_step = make_modified_mrmr_tune_n_features_step(dcRead, s3, spark)
modified_mrmr_tune_n_features_step.run()

In [None]:
#pickle_get('feature_names.pickle',s3)

In [None]:
def make_feature_selection_transform_step(
    dcRead,
    s3,
    spark,
    non_feature_cols = NON_FEATURE_COLS):
    """Build the step that reduces the observations to the selected features.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle providing ``project_home`` and forwarded to ``pickle_get``.
        spark: session used to read the observations parquet.
        non_feature_cols: forwarded as ``parameters['non_feature_cols']``.

    Returns:
        A ``feature_selection_transform.FeatureSelectionTransform`` instance
        wired with the parameters, loader and handler defined below.
    """
    def _load_inputs():
        observations = spark.read.parquet(f'{s3.project_home}/observations.parquet')
        selected = pickle_get('feature_names.pickle', s3)
        return {'split_data': observations, 'feature_names': selected}

    def _handle_outputs(reduced_data):
        # NOTE(review): the space before '/reduced_data.parquet' looks
        # unintentional, but the preprocess steps read back the very same
        # spaced path, so it must be changed in all three places together.
        write_result = reduced_data.write.parquet(
            f'{s3.project_home} /reduced_data.parquet',
            mode='overwrite')
        # The handler returns the value of the write call under 'reduced_data'.
        return {'reduced_data': write_result}

    return feature_selection_transform.FeatureSelectionTransform(
        parameters={'non_feature_cols': non_feature_cols},
        input_loader=_load_inputs,
        output_handler=_handle_outputs,
    )

# Build the feature-selection transform step with the default non-feature columns and run it right away.
feature_selection_transform_step = make_feature_selection_transform_step(dcRead, s3, spark)
feature_selection_transform_step.run()

In [None]:
def make_preprocess_fit_step(dcRead, s3, spark):
    """Build the step that fits the preprocessing scaler.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle providing ``project_home`` and forwarded to ``pickle_put``.
        spark: session used to read the reduced-data parquet.

    Returns:
        A ``preprocess_fit.PreprocessFit`` instance wired with the loader and
        handler defined below.
    """
    def _load_inputs():
        # NOTE(review): the space before '/reduced_data.parquet' mirrors the
        # path the feature-selection transform step writes to; keep the two in
        # sync if either is corrected.
        reduced = spark.read.parquet (f'{s3.project_home} /reduced_data.parquet')
        return {'split_data': reduced}

    def _handle_outputs(scaler):
        pickle_put(scaler, 'min_max_scaler.pickle', s3)

    return preprocess_fit.PreprocessFit(
        input_loader=_load_inputs,
        output_handler=_handle_outputs,
    )

# Build the scaler-fitting step with the shared handles and run it right away.
preprocess_fit_step = make_preprocess_fit_step(dcRead, s3, spark)
preprocess_fit_step.run()

In [None]:
#pickle_get('min_max_scaler.pickle',s3)

In [None]:
def make_preprocess_transform_step(dcRead, s3, spark):
    """Build the step that applies the fitted scaler and stores the data splits.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle providing ``project_home`` and forwarded to the pickle helpers.
        spark: session used to read the reduced-data parquet.

    Returns:
        A ``preprocess_transform.PreprocessTransform`` instance wired with the
        loader and handler defined below.
    """
    def pt_load_inputs():
        return {
            # NOTE(review): the space before '/reduced_data.parquet' mirrors the
            # path the feature-selection transform step writes to; keep the two
            # in sync if either is corrected.
            'split_data': spark.read.parquet (f'{s3.project_home} /reduced_data.parquet'),
            # Fixed: was 'min_max_scaler. pickle' (stray space), which is not
            # the name make_preprocess_fit_step stores the scaler under.
            'scaler': pickle_get('min_max_scaler.pickle', s3)}

    def pt_handle_outputs(
        X_train,
        y_train,
        X_val,
        y_val,
        X_test,
        y_test):

        pickle_put(X_train, 'X_train.pickle', s3)
        pickle_put(y_train, 'y_train.pickle', s3)
        # Fixed: was 'x_val.pickle' (lower-case x); make_tune_train_step reads
        # 'X_val.pickle', matching the X_train / X_test naming.
        pickle_put(X_val, 'X_val.pickle', s3)
        pickle_put(y_val, 'y_val.pickle', s3)
        pickle_put(X_test, 'X_test.pickle', s3)
        pickle_put(y_test, 'y_test.pickle', s3)

    step_preprocess_transform = preprocess_transform.PreprocessTransform(
        input_loader=pt_load_inputs,
        output_handler=pt_handle_outputs)

    return step_preprocess_transform

# Build the preprocessing-transform step with the shared handles and run it right away.
preprocess_transform_step = make_preprocess_transform_step(dcRead, s3, spark)
preprocess_transform_step.run()

In [None]:
#pickle_get('X_train.pickle',s3)
#pickle_get('y_train.pickle',s3)
#pickle_get('x_val.pickle',s3)
#pickle_get('y_val.pickle',s3)
#pickle_get('X_test.pickle',s3)
#pickle_get('y_test.pickle',s3)

In [None]:
def make_tune_train_step(dcRead, s3, spark, max_evals=MAX_EVALS_TUNE_TRAIN):
    """Build the hyperparameter-tuning and training step.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle forwarded to ``pickle_get`` and ``keras_put``.
        spark: unused here; accepted so all step factories share one signature.
        max_evals: forwarded to the step as ``parameters['max_evals']``.

    Returns:
        A ``tune_train.TuneTrain`` instance wired with the parameters, loader
        and handler defined below.
    """
    def _load_inputs():
        # Each split is stored as '<key>.pickle' under the project home.
        split_keys = ('X_train', 'y_train', 'X_val', 'y_val')
        return {key: pickle_get(f'{key}.pickle', s3) for key in split_keys}

    def _handle_outputs(model):
        keras_put(model, 'model.h5', s3)

    return tune_train.TuneTrain(
        parameters={'max_evals': max_evals},
        input_loader=_load_inputs,
        output_handler=_handle_outputs,
    )

# Build the tune-and-train step with the default evaluation budget and run it right away.
tune_train_step = make_tune_train_step(dcRead, s3, spark)
tune_train_step.run()

In [None]:
#keras_get('model.h5', s3)

In [None]:
def make_evaluate_step (dcRead, s3, spark):
    """Build the step that evaluates the stored model on the test split.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle forwarded to ``keras_get`` and ``pickle_get``.
        spark: unused here; accepted so all step factories share one signature.

    Returns:
        An ``evaluate.Evaluate`` instance wired with the loader and handler
        defined below; the handler prints the three metrics it receives.
    """
    def _load_inputs():
        trained_model = keras_get('model.h5', s3)
        features = pickle_get('X_test.pickle', s3)
        labels = pickle_get('y_test.pickle', s3)
        return {'model': trained_model, 'X_test': features, 'y_test': labels}

    def _handle_outputs(recall, precision, roc_auc):
        report = (
            ('Recall', recall),
            ('Precision', precision),
            ('ROC-AUC', roc_auc),
        )
        for label, value in report:
            print(f'{label}: {value}')

    return evaluate.Evaluate(
        input_loader=_load_inputs,
        output_handler=_handle_outputs,
    )

# Build the evaluation step with the shared handles and run it right away.
evaluate_step = make_evaluate_step(dcRead, s3, spark)
evaluate_step.run()

In [None]:
def make_compile_metadata_step(dcRead, s3, spark):
    """Build the step that compiles the model metadata document.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle forwarded to ``pickle_get`` and ``json_put``.
        spark: unused here; accepted so all step factories share one signature.

    Returns:
        A ``compile_metadata.CompileMetadata`` instance wired with the loader
        and handler defined below.
    """
    def _load_inputs():
        tsfresh_map = pickle_get('full_simplified_tsfresh_map.pickle', s3)
        chosen_features = pickle_get('feature_names.pickle', s3)
        fitted_scaler = pickle_get('min_max_scaler.pickle', s3)
        return {
            'simplified_tsfresh_map': tsfresh_map,
            'selected_features': chosen_features,
            'scaler': fitted_scaler,
        }

    def _handle_outputs(metadata):
        json_put(metadata, 'metadata.json', s3)

    return compile_metadata.CompileMetadata(
        input_loader=_load_inputs,
        output_handler=_handle_outputs,
    )

# Build the metadata-compilation step with the shared handles and run it right away.
compile_metadata_step = make_compile_metadata_step(dcRead, s3, spark)
compile_metadata_step.run()

In [None]:
#json_get('metadata.json', s3)

In [None]:
def make_stage_step(dcRead, s3, spark):
    """Build the step that copies the final artifacts into the 'staged/' prefix.

    Args:
        dcRead: unused here; accepted so all step factories share one signature.
        s3: handle providing ``project_home`` and forwarded to the JSON and
            Keras helpers.
        spark: session used to read the observations parquet.

    Returns:
        A ``stage.Stage`` instance wired with the loader and handler defined
        below.
    """
    def get_run_info():
        # Run information is read from a fixed local file.
        with open('/mnt/metadata/run_information.json', 'r') as f:
            return json.load(f)

    def stage_load_inputs():
        # Fixed: a stray space after the project home ('{home} /observations.parquet')
        # made this path differ from the one the observations are written to.
        observations_path = f'{s3.project_home}/observations.parquet'
        return {
            'split_data': spark.read.parquet(observations_path),
            'metadata': json_get('metadata.json', s3),
            'model': keras_get('model.h5', s3),
            'run_info': get_run_info()
        }

    def stage_handle_outputs(split_data, metadata, model, run_info):
        # Fixed: the object name was 'split_data. parquet' (stray space).
        split_data.write.parquet(
            f'{s3.project_home}/staged/split_data.parquet',
            mode='overwrite'
        )
        json_put(metadata, 'staged/metadata.json', s3)
        keras_put(model, 'staged/model.h5', s3)
        json_put(run_info, 'staged/run_info.json', s3)

    step_stage = stage.Stage(
        input_loader = stage_load_inputs,
        output_handler = stage_handle_outputs
    )
    return step_stage


# Build the staging step with the shared handles and run it right away.
stage_step = make_stage_step(dcRead, s3, spark)
stage_step.run()

In [None]:
#json_get('staged/metadata.json', s3)
#keras_get('staged/model.h5', s3)
#json_get('staged/run_info.json', s3)