In [None]:
from datetime import timezone, datetime, timedelta
import time

# Wall-clock reference; later cells subtract it from time.time() to report
# how long the whole notebook took.
start_time = time.time()

# Announce the start moment expressed in a fixed UTC-03:00 offset.
utc_minus_3 = timezone(timedelta(hours=-3))
dt_now = datetime.now(utc_minus_3)
print('Started at ' + dt_now.strftime("%Y-%m-%d %H:%M"))

# Requirements

In [None]:
# Install third-party packages into the running kernel's environment
# (%pip targets the kernel, unlike a bare shell `pip`).
# hyperopt and mlxtend are imported directly below; xgboost is not imported in
# this notebook and is presumably needed by evaluation_utils — TODO confirm.
# NOTE(review): versions are unpinned, so each run may resolve different
# releases; consider pinning for reproducibility.
%pip install hyperopt
%pip install xgboost
%pip install mlxtend

# Hyperparameter optimization with hyperopt

In [None]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from evaluation_utils import get_selected_features, get_domain_dict, get_search_space_hyperopt, mape_scorer, MAPE
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
import json
import os
import sys
from datetime import datetime, timedelta
import pandas as pd
import boto3
import sagemaker
from mlxtend.feature_selection import ExhaustiveFeatureSelector


# --- Shared configuration for every cell below ---

# Seed handed to the CV splitter and to hyperopt's random state.
seed = 1995
# S3 bucket that holds the training data, experiment results and models.
bucket = 'bucket_name'

# Two boto3 handles: the resource API is used to read objects back,
# the low-level client to upload them.
s3_resource = boto3.resource('s3')
s3_client = boto3.client('s3')

# info_dict collects one summary entry per domain;
# domain_dict drives the per-domain loops.
info_dict = {}
domain_dict = get_domain_dict()

for domain in domain_dict.keys():
    # Each pass runs the whole experiment for one domain: load its training
    # data, select features, tune every enabled estimator with hyperopt, score
    # the tuned pipeline with cross-validation and upload the results to S3.
    print('='*50)
    print(domain)
    print('='*50)
    TARGET = domain_dict[domain]['TARGET']
    target_problem = domain_dict[domain]['target_problem']
    # corr_threshold = domain_dict[domain]['corr_threshold']
    # delay_days = domain_dict[domain]['delay_days']

    data = f's3://{bucket}/{target_problem}/train/data.csv'

    # The loaded frame must be bound to `df`, the name every following line
    # reads. Binding it to any other name raises NameError on a fresh kernel,
    # or silently reuses a stale `df` left over from a previous run.
    df = pd.read_csv(data)
    df['date'] = pd.to_datetime(df['date'])
    # ignore_index=True already renumbers the rows 0..n-1 after sorting, so no
    # separate reset_index call is needed.
    df.sort_values(by='date', inplace=True, ascending=True, ignore_index=True)
    # removing latest 15 days
    df = df.loc[df['date'] < df['date'].max() - timedelta(days=15)].copy()
    print(df.shape)

    print('Selecting features')
    # corrmat = df.corr()
    # top_corr_features = corrmat.index[abs(corrmat[TARGET])>=corr_threshold]
    # top_corr_features = top_corr_features.to_list()
    # print(top_corr_features)
    top_corr_features = list(get_selected_features(df, TARGET))
    print(f'Done. Selected {len(top_corr_features)} features')

    if len(top_corr_features) == 0:
        # Nothing to train on. Note that this stops the whole run, not only
        # the current domain.
        print('0 features chosen\nExit')
        sys.exit(0)

    # Column layout of the persisted frame: date, selected features, target.
    full_columns = top_corr_features[:]
    full_columns.insert(0, 'date')
    full_columns.append(TARGET)
    df_preprocess = df[full_columns].copy()
    # upscaling small values (percentages)
    df_preprocess[TARGET] = df_preprocess[TARGET] * 100
    print('\nMultiplying low-scale values')
    for col in top_corr_features:
        if 'pct_' in col:
            df_preprocess[col] = df_preprocess[col]*100
    # The preprocessed frame is read back by the model-generation cell.
    df_preprocess.to_csv(
        f's3://{bucket}/{target_problem}/preprocess/train_preprocess.csv', index=False, header=True)

    X = df_preprocess[top_corr_features].copy()
    y = df_preprocess[TARGET].copy()

    print('Date interval: ')
    print(df_preprocess['date'].min())
    print(df_preprocess['date'].max())
    print(f'{X.shape[0]} instances - {X.shape[1]} features')

    # Missing feature values are replaced by 0 before scaling.
    X.fillna(0, inplace=True)
    standard_scaler = StandardScaler()

    # Every estimator gets an entry up front, so skipped ones still appear in
    # the results with an empty score list.
    results = {}
    estimators = get_search_space_hyperopt()
    for model_name in estimators:
        results[model_name] = {"scores": list()}

    # configure the cross-validation procedure
    k = 5
    kfold = KFold(n_splits=k, shuffle=True, random_state=seed)
    # kfold = TimeSeriesSplit(n_splits=k)
    if isinstance(kfold, KFold):
        print(f'{k}-fold CV')
        print(f'Test size = {X.shape[0]/k}')
    elif isinstance(kfold, TimeSeriesSplit):
        print(f'{k}-fold Time Series Split')
        print(f'Test size = {X.shape[0]//(k + 1)}')

    for model_name in estimators:
        if estimators[model_name]["skip"]:
            continue
        print('='*50)
        print(f'Model {model_name}')
        model = estimators[model_name]["model"]
        params = estimators[model_name]["space"]
        reduce = estimators[model_name]["reduce"]

        def objective(params):
            """Return the hyperopt loss for one sampled configuration.

            The loss is the per-fold cross-validated mean squared error of a
            scaler + estimator pipeline, collapsed into a single number by the
            model's `reduce` callable.
            """
            clf = model(**params)
            pipeline = Pipeline(
                [('transformer', standard_scaler), ('estimator', clf)])
            # scikit-learn reports negated MSE; flip the sign to get the error.
            scores = -cross_val_score(pipeline, X, y, cv=kfold,
                                      scoring='neg_mean_squared_error', n_jobs=-1)
            # trying to minimize the reduced (e.g. average) fold error
            avg_score = reduce(scores)

            return {'loss': avg_score, 'status': STATUS_OK}

        trials = Trials()
        best_hyperparams = fmin(fn=objective,
                                space=params,
                                algo=tpe.suggest,
                                max_evals=500,
                                trials=trials,
                                rstate=np.random.default_rng(seed)
                                )
        # Resolve the raw fmin result against the search space once and reuse
        # the resolved dict for printing, fitting and reporting.
        best_params = space_eval(params, best_hyperparams)
        print(f'Best hyperparameters: {best_params}')
        clf = model().set_params(**best_params)
        print(clf)

        # Re-score the tuned configuration with the project's MAPE scorer.
        pipeline = Pipeline(
            [('transformer', standard_scaler), ('estimator', clf)])
        scores = cross_val_score(
            pipeline, X, y, cv=kfold, scoring=mape_scorer, n_jobs=-1)
        print(scores)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print(f'{mean_score} +- ({std_score})')

        results[model_name]["scores"] = scores.tolist()
        results[model_name]["mean_scores"] = mean_score
        results[model_name]["std_scores"] = std_score
        # results[model_name]["corr_threshold"] = corr_threshold
        results[model_name]["feature_columns"] = top_corr_features
        results[model_name]["hyperparameters"] = best_params
        results[model_name]["trained_at"] = datetime.today().strftime(
            '%Y-%m-%d___%H-%M-%S')

    # Only the xgboost entry goes into the cross-domain summary; if xgboost
    # was skipped above it contains just the empty "scores" list.
    info_dict[domain] = results['xgboost']

    s3_client.put_object(
            Body=json.dumps(results, indent=4),
            Bucket=bucket,
            Key=f'{target_problem}/experiments/current.json'
    )
    print(json.dumps(results, indent=4))

# Upload the per-domain xgboost summaries gathered by the loop above.
info_payload = json.dumps(info_dict, indent=4)
s3_client.put_object(Bucket=bucket, Key='xgboost_info.json', Body=info_payload)

# Report how long the notebook has been running so far.
elapsed = time.time() - start_time
print(f'Elapsed time (seconds): {elapsed}')


# SageMaker model generation

In [None]:
# Execution role used for training and model registration, plus a SageMaker
# client pinned to us-east-1 for the model delete/create calls.
role = sagemaker.get_execution_role()
sagemaker_client = boto3.client('sagemaker', region_name='us-east-1')

for domain, domain_cfg in domain_dict.items():
    # For each domain: read the stored xgboost experiment, rebuild the scaled
    # training CSV, train the SageMaker XGBoost container on it and
    # (re)register the resulting model under the domain's problem name.
    banner = '='*50
    print(banner)
    print(domain)
    print(banner)
    TARGET = domain_cfg['TARGET']
    target_problem = domain_cfg['target_problem']

    # S3 locations used more than once in this pass.
    experiment_key = f'{target_problem}/experiments/current.json'
    train_uri = f's3://{bucket}/{target_problem}/xgboost/train/train.csv'

    # Load the experiment summary written by the hyperparameter search cell.
    experiment_body = s3_resource.Object(bucket, experiment_key).get()['Body']
    json_content = json.loads(experiment_body.read().decode('utf-8'))
    xgb_entry = json_content['xgboost']
    hyperparameters = xgb_entry['hyperparameters']
    feature_columns = xgb_entry['feature_columns']
    print(json.dumps(hyperparameters, sort_keys=True, indent=4))

    df = pd.read_csv(
        f's3://{bucket}/{target_problem}/preprocess/train_preprocess.csv')
    y = df[TARGET].copy()
    # Fill missing values with 0, then standardise the selected features.
    X = df[feature_columns].fillna(0)
    scaler = StandardScaler().fit(X)
    X = pd.DataFrame(scaler.transform(X), columns=feature_columns)

    # The training file is written headerless with the target as first column.
    training_frame = pd.concat([y, X], axis=1)
    training_frame.to_csv(train_uri, index=False, header=False)
    s3_input_train = sagemaker.TrainingInput(train_uri, content_type='csv')

    container = sagemaker.image_uris.retrieve(
        'xgboost', region=boto3.Session().region_name, version='1.0-1')

    # Record which container image was used next to the experiment results.
    xgb_entry['container'] = str(container)
    s3_client.put_object(
        Bucket=bucket,
        Key=experiment_key,
        Body=json.dumps(json_content),
    )

    model_output = f's3://{bucket}/{target_problem}/xgboost/models/'

    # The tuned n_estimators value is forwarded as the container's num_round.
    # NOTE(review): the full dict (still containing 'n_estimators') is also
    # passed as hyperparameters — confirm the container accepts that key.
    n_round = hyperparameters['n_estimators']
    xgb_estimator = sagemaker.estimator.Estimator(
        image_uri=container,
        hyperparameters=hyperparameters,
        role=role,
        instance_count=1,
        instance_type='ml.m4.xlarge',
        output_path=model_output,
    )
    xgb_estimator.set_hyperparameters(
        objective='reg:squarederror', num_round=n_round)
    xgb_estimator.fit({'train': s3_input_train})

    primary_container = {
        'Image': container,
        'ModelDataUrl': xgb_estimator.model_data,
    }

    # Best-effort removal of a model with the same name before re-creating it;
    # a failure here is only reported, never fatal.
    try:
        delete_response = sagemaker_client.delete_model(
            ModelName=target_problem)
    except Exception as e:
        print('Could not delete model')
        print(e)

    create_model_response = sagemaker_client.create_model(
        ModelName=target_problem,
        ExecutionRoleArn=role,
        PrimaryContainer=primary_container)
    print(f"myINFO : Created Model ARN : {create_model_response['ModelArn']}")


In [None]:
# Total wall-clock time since the first cell recorded start_time.
elapsed = time.time() - start_time
print(f'Elapsed time (seconds): {elapsed}')

# Add the total runtime to the summary and upload it again, overwriting the
# copy stored earlier under the same key.
info_dict.update(total_processing_time=elapsed)
summary_payload = json.dumps(info_dict, indent=4)

s3_client.put_object(Bucket=bucket, Key='xgboost_info.json', Body=summary_payload)

In [None]:
# Final step: ask SageMaker to stop the named notebook instance.
notebook_instance_name = 'schedule-training-notebook'
sagemaker_client = boto3.client('sagemaker', region_name='us-east-1')
sagemaker_client.stop_notebook_instance(NotebookInstanceName=notebook_instance_name)