In [2]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Definitions

import datetime
import json

import numpy as np
import pandas as pd

# Reference dates: `today` is the current date with the time set to midnight;
# `current_eom` adds a zero-length MonthEnd offset, which pandas documents as
# rolling forward to the month end unless the date is already on one.
today = pd.to_datetime('today').normalize()
current_eom = today + pd.offsets.MonthEnd(0)
# Fixed start date, stored as a 'YYYY-MM-DD' string (not a Timestamp).
start_date = '2017-01-01'
# Eleven month-end offsets after `current_eom`.
end_date = current_eom + pd.offsets.MonthEnd(11)

# entity_debug = "GBR"

# Debug switch; no code visible in this section reads it.
debug = True


def get_json(df):
    """Serialise a DataFrame to a JSON array of row records.

    Rows are taken from ``df.to_dict(orient='records')``. Values that the
    standard JSON encoder cannot handle are converted as follows:

    * ``datetime.date`` / ``datetime.datetime`` (which includes
      ``pd.Timestamp``) become ``'YYYY-MM-DD'`` strings,
    * ``pd.NaT`` becomes ``null``,
    * numpy scalars become the equivalent Python scalar,
    * anything else becomes ``null`` (unchanged legacy behaviour).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise.

    Returns
    -------
    str
        JSON text containing one object per row.
    """

    def convert_timestamp(item_date_object):
        # NaT passes the datetime isinstance check below but raises on
        # strftime, so it has to be handled first.
        if item_date_object is pd.NaT:
            return None
        if isinstance(item_date_object, (datetime.date,
                      datetime.datetime)):
            return item_date_object.strftime('%Y-%m-%d')
        # Older pandas versions leave numpy scalars in the records; unwrap
        # them instead of silently emitting null.
        if isinstance(item_date_object, np.generic):
            return item_date_object.item()
        # Legacy fallback: unsupported objects are written as null.
        return None

    records = df.to_dict(orient='records')

    return json.dumps(records, default=convert_timestamp)


# Notebook-wide pandas display options.

pd.set_option('display.max_columns', None)  # never elide columns
pd.set_option('display.expand_frame_repr', False)  # do not wrap wide frames
try:
    # pandas >= 1.0 spells "no truncation" as None; -1 is rejected by 2.x.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas only accepts an integer here and uses -1 for "no limit".
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.precision', 1)
# Takes precedence over display.precision for floats: always two decimals.
pd.set_option('display.float_format', lambda x: '%.2f' % x)


def get_df_name(df):
    """Return the first module-level name bound to ``df``.

    The lookup is by identity (``is``) over ``globals()``, so a copy of a
    frame does not match the original's name.

    Parameters
    ----------
    df : object
        Object (normally a DataFrame) to look up.

    Returns
    -------
    str or None
        The first global name that refers to ``df``, or ``None`` when no
        global name refers to it (previously this raised ``IndexError``).
    """
    # Iterate over a snapshot so a namespace change during the scan cannot
    # raise "dictionary changed size during iteration".
    for (name, value) in list(globals().items()):
        if value is df:
            return name
    return None


def difflist(li1, li2):
    """Return the items of ``li1`` that are not in ``li2``.

    The result is de-duplicated and keeps the order of first appearance in
    ``li1``, so it is deterministic from run to run (the previous set-based
    version returned the same items in arbitrary order).

    Parameters
    ----------
    li1, li2 : iterable of hashable

    Returns
    -------
    list
    """
    excluded = set(li2)
    seen = set()
    remaining = []
    for item in li1:
        if item in excluded or item in seen:
            continue
        seen.add(item)
        remaining.append(item)
    return remaining


def addlist(li1, li2):
    """Return a new list holding the items of ``li1`` followed by ``li2``.

    The previous implementation returned the result of ``list.append``,
    which is always ``None``, and nested ``li2`` inside ``li1`` as a single
    element. Neither input is modified now.

    Parameters
    ----------
    li1, li2 : iterable

    Returns
    -------
    list
    """
    return list(li1) + list(li2)


def remove_percetage(df, column_list):
    """Convert percentage strings such as ``'12.5%'`` into fractions.

    For every column named in ``column_list`` the ``'%'`` characters are
    removed, the text is cast to ``float64``, divided by 100 and rounded to
    four decimals. The columns are overwritten on ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose listed columns hold strings.
    column_list : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in column_list:
        without_sign = df[column].str.replace('%', '')
        as_fraction = without_sign.astype(np.float64) / 100
        df[column] = as_fraction.round(4)
    return df


def coerce_df_columns_to_numeric(df):
    """Convert count/amount-like columns of ``df`` to integers, in place.

    Columns are selected when they either have dtype ``float64`` or their
    name contains one of 'Revenue', 'Conversions', 'Value', 'Pipeline',
    'Offset' or 'Headcount'. Columns whose name contains 'Rate', 'Yield',
    'Diff%', 'Relative_Offset', 'sp500' or 'Return' are always left
    untouched.

    Selected columns are parsed with ``pd.to_numeric(errors='coerce')``,
    missing or unparseable values become 0, and the values are rounded to
    the nearest integer before the integer cast. (The previous version cast
    first, which truncated the values and made the later ``round`` a no-op.)
    Rounding uses the ``DataFrame.round`` rule, i.e. exact halves go to the
    nearest even integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    keep_float_patterns = ('Rate', 'Yield', 'Diff%', 'Relative_Offset',
                           'sp500', 'Return')
    integer_patterns = ('Revenue', 'Conversions', 'Value', 'Pipeline',
                        'Offset', 'Headcount')

    def columns_like(patterns):
        # Names of all columns containing any of the given substrings.
        names = []
        for pattern in patterns:
            names.extend(df.filter(like=pattern, axis=1).columns)
        return names

    protected = set(columns_like(keep_float_patterns))
    candidates = list(df.select_dtypes(include='float64').columns) \
        + columns_like(integer_patterns)

    # De-duplicate while keeping column order so the selection is
    # deterministic and never addresses the same column twice.
    final_cols = []
    for col in candidates:
        if col not in protected and col not in final_cols:
            final_cols.append(col)

    if not final_cols:
        return

    numeric = df[final_cols].apply(pd.to_numeric, errors='coerce')
    df[final_cols] = numeric.fillna(0).round(0).astype(int)

def data_prep(df):
    """Normalise column names, parse the known date columns, zero-fill NaN.

    Side effects on the frame passed in: spaces in column names are replaced
    with underscores, and the 'End_of_Month' and 'Snapshot_Date_Short'
    columns (when present) are converted with ``pd.to_datetime``.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame produced by ``DataFrame.replace`` in which NaN values
        are replaced by 0. The NaN values of the input frame itself are
        left as they are.
    """
    underscored = df.columns.astype(str).str.replace(' ', '_')
    df.columns = underscored

    for date_column in ('End_of_Month', 'Snapshot_Date_Short'):
        if date_column in df.columns:
            df[date_column] = pd.to_datetime(df[date_column])  # Format Date

    return df.replace(np.nan, 0, regex=True)


def show_stats(df):
    """Print and display a quick overview of ``df``.

    Shows the frame's global variable name, ``info()``, the head of the
    transposed ``describe(include='all')`` table, and the first and last
    rows. When the 'End_of_Month' and/or 'Snapshot_Date_Short' columns are
    present, per-date sums of the numeric columns and per-date counts are
    shown as well.

    Relies on the notebook-provided ``display`` function and on
    ``get_df_name`` from this notebook.

    Side effect: 'End_of_Month' and 'Snapshot_Date_Short' are converted with
    ``pd.to_datetime`` on the frame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    None
    """
    try:
        df_name = get_df_name(df)
    except IndexError:
        # The frame is not bound to any module-level name (e.g. a temporary).
        df_name = None

    print(' DF Name: ')
    display(df_name)
    print(' DF Info: ')
    # info() prints its report and returns None, so it is not display()-ed.
    df.info(verbose=True)
    print(' DF Describe: ')
    display(df.describe(include='all').transpose().head())
    print(' DF Head: ')
    display(df.head())
    print(' DF Tail: ')
    display(df.tail())

    date_sections = (('End_of_Month', 'EOM'), ('Snapshot_Date_Short', 'SDS'))
    for (date_column, label) in date_sections:
        if date_column not in df.columns:
            continue
        df[date_column] = pd.to_datetime(df[date_column])  # Format Date
        grouped = df.groupby(by=[date_column], as_index=False)
        # numeric_only keeps datetime/text columns out of the sum; recent
        # pandas raises on them instead of silently dropping them.
        totals = grouped.sum(numeric_only=True).reset_index(drop=True)
        counts = grouped.count().reset_index(drop=True)
        print(' %s Sum:' % label)
        display(totals.head())
        print(' %s Count: ' % label)
        display(counts.head())
    return


In [3]:
#!/usr/bin/python
# -*- coding: utf-8 -*-


def split_last_n_by_series_id(df, n, time_col=None, series_id_cols=None):
    """Split each series into everything-but-the-last-n rows and the last n.

    ``df`` is sorted by the time column (ascending) and grouped by the
    series identifier columns; each group is then cut ``n`` rows before its
    end.

    Parameters
    ----------
    df : pandas.DataFrame
    n : int
        Number of trailing rows per group to put in the tail. ``0`` yields an
        empty tail (the previous slicing returned the whole group as tail
        because ``-0`` slices from the start). Values larger than a group put
        the whole group in the tail.
    time_col : str, optional
        Column to sort by. Defaults to the notebook-level
        ``time_column_name`` variable.
    series_id_cols : str or list of str, optional
        Columns to group by. Defaults to the notebook-level
        ``time_series_id_column_names`` variable.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(head, tail)``.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError('n must be non-negative, got {}'.format(n))
    if time_col is None:
        time_col = time_column_name
    if series_id_cols is None:
        series_id_cols = time_series_id_column_names

    df_grouped = df.sort_values(time_col).groupby(series_id_cols,
            group_keys=False)  # Sort by ascending time

    def cut_position(group):
        # Explicit position from the start avoids the -0 slicing pitfall.
        return max(len(group) - n, 0)

    df_head = df_grouped.apply(lambda dfg: dfg.iloc[:cut_position(dfg)])
    df_tail = df_grouped.apply(lambda dfg: dfg.iloc[cut_position(dfg):])
    return (df_head, df_tail)


In [4]:
import pandas as pd
import numpy as np


def APE(actual, pred):
    """
    Absolute percentage error, element by element.

    Computes ``100 * |actual - pred| / |actual|`` and returns a vector with
    the same length as the inputs. Entries where ``actual`` is zero are not
    filtered out here.
    """

    relative_error = (actual - pred) / actual
    return 100 * np.abs(relative_error)


def MAPE(actual, pred):
    """
    Mean absolute percentage error.

    Positions where either input is NaN, or where ``actual`` is close to
    zero (``np.isclose(actual, 0.0)``), are dropped before averaging the
    values returned by ``APE``.
    """

    has_nan = np.isnan(actual) | np.isnan(pred)
    near_zero = np.isclose(actual, 0.0)
    usable = ~has_nan & ~near_zero
    return np.mean(APE(actual[usable], pred[usable]))

In [5]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from pandas.tseries.frequencies import to_offset


def align_outputs(
    y_predicted,
    X_trans,
    X_test,
    y_test,
    target_column_name,
    predicted_column_name='predicted',
    horizon_colname='horizon_origin',
    ):
    """
    Line predictions up with the original test rows using pandas indexes.

    The predictions (plus the horizon column, when ``X_trans`` has one) are
    given ``X_trans``'s index, that index is turned into ordinary columns,
    and the result is right-merged onto ``X_test`` extended with ``y_test``.
    Rows in which either the actual or the predicted value is missing are
    removed from the returned frame.

    Typical causes of misalignment are:
    * we predicted some periods that were missing in actuals -> drop from eval
    * model was asked to predict past max_horizon -> increase max horizon
    * data at start of X_test was needed for lags -> provide previous periods
    """

    forecast_columns = {predicted_column_name: y_predicted}
    if horizon_colname in X_trans:
        forecast_columns[horizon_colname] = X_trans[horizon_colname]
    df_fcst = pd.DataFrame(forecast_columns)

    # y and X outputs are aligned by the forecast() function contract, so the
    # transformed index can be attached to the predictions directly.
    df_fcst.index = X_trans.index

    # Pair the untouched test features with their actual values.
    X_test_full = X_test.copy()
    X_test_full[target_column_name] = y_test

    # X_test_full's index does not include origin, so both sides are
    # flattened to plain columns before merging on the shared ones.
    df_fcst = df_fcst.reset_index()
    X_test_full = X_test_full.reset_index().drop(columns='index')
    together = df_fcst.merge(X_test_full, how='right')

    # Missing actuals, or edges of the time range affected by lags/rolling
    # windows, leave NaN in one of these two columns; drop those rows.
    required = [target_column_name, predicted_column_name]
    has_both = together[required].notnull().all(axis=1)
    return together[has_both]


In [7]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.estimator import Estimator
from azureml.core.run import Run
from azureml.automl.core.shared import constants


def split_fraction_by_grain(
    df,
    fraction,
    time_column_name,
    grain_column_names=None,
    ):
    """Split every grain (series) into a leading part and a trailing part.

    ``df`` is sorted by ``time_column_name`` and grouped by
    ``grain_column_names``; the last ``int(len(group) * fraction)`` rows of
    each group go to the tail and the rest to the head. When no grain
    columns are given, the whole frame is treated as a single group.

    A group that is too small to contribute a tail row (the product
    truncates to 0) now stays entirely in the head. Previously the ``-0``
    slice moved such a group entirely into the tail.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    fraction : float
        Share of each group to place in the tail. Values ``<= 0`` give an
        empty tail.
    time_column_name : str
        Column used for ordering.
    grain_column_names : list of str, optional
        Columns identifying a series.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(head, tail)``.
    """
    use_tmp_grain = not grain_column_names
    if use_tmp_grain:
        # Work on a copy carrying a constant grouping column rather than
        # adding and removing a column on the caller's frame.
        frame = df.assign(tmp_grain_column='grain')
        group_columns = ['tmp_grain_column']
    else:
        frame = df
        group_columns = grain_column_names

    df_grouped = frame.sort_values(time_column_name).groupby(group_columns,
            group_keys=False)

    def head_length(dfg):
        # Number of leading rows to keep; computed from the start of the
        # group so a tail size of 0 cannot turn into a -0 slice.
        if fraction <= 0:
            return len(dfg)
        tail_rows = min(int(len(dfg) * fraction), len(dfg))
        return len(dfg) - tail_rows

    df_head = df_grouped.apply(lambda dfg: dfg.iloc[:head_length(dfg)])
    df_tail = df_grouped.apply(lambda dfg: dfg.iloc[head_length(dfg):])

    if use_tmp_grain:
        # errors='ignore' covers pandas versions that do not pass grouping
        # columns through apply().
        df_head = df_head.drop(columns='tmp_grain_column', errors='ignore')
        df_tail = df_tail.drop(columns='tmp_grain_column', errors='ignore')

    return (df_head, df_tail)


def split_full_for_forecasting(
    df,
    time_column_name,
    grain_column_names=None,
    test_split=0.2,
    ):
    """Split ``df`` into train and test frames that keep the original index.

    The index is carried through ``split_fraction_by_grain`` in a temporary
    'tmpindex' column and restored, together with the index name, on both
    results. The temporary column lives on a working copy, so the caller's
    frame is never modified, even when the split raises.

    Parameters
    ----------
    df : pandas.DataFrame
    time_column_name : str
        Column used for ordering within each grain.
    grain_column_names : list of str, optional
        Columns identifying a series.
    test_split : float
        Fraction passed to ``split_fraction_by_grain`` as the tail share.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``.

    Raises
    ------
    ValueError
        If ``df`` already has a column named 'tmpindex' (it would otherwise
        be overwritten and lost).
    """
    if 'tmpindex' in df.columns:
        raise ValueError("df must not contain a column named 'tmpindex'")

    index_name = df.index.name
    working = df.assign(tmpindex=df.index)

    (train_df, test_df) = split_fraction_by_grain(working, test_split,
            time_column_name, grain_column_names)

    train_df = train_df.set_index('tmpindex')
    train_df.index.name = index_name

    test_df = test_df.set_index('tmpindex')
    test_df.index.name = index_name

    return (train_df, test_df)


def get_result_df(remote_run):
    """Summarise the best completed child run per algorithm.

    Every child of ``remote_run`` (recursively) whose status equals
    ``constants.RunState.COMPLETE_RUN`` and whose properties contain both
    'run_algorithm' and 'score' contributes one row. Rows are ordered by
    'Score' (ascending when the last seen 'goal' property ends in 'min',
    descending otherwise) and only the first row per algorithm is kept.

    Parameters
    ----------
    remote_run
        Object exposing ``get_children(recursive=True)``.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'run_algorithm', with columns 'run_id', 'primary_metric'
        and 'Score'.
    """
    field_names = ['run_id', 'run_algorithm', 'primary_metric', 'Score']
    per_run = {}
    goal_minimize = False

    for run in list(remote_run.get_children(recursive=True)):
        # We only count in the completed child runs.
        if run.get_status().lower() != constants.RunState.COMPLETE_RUN:
            continue
        props = run.properties
        if 'run_algorithm' not in props or 'score' not in props:
            continue

        per_run[run.id] = [run.id, props['run_algorithm'],
                           props['primary_metric'], float(props['score'])]
        if 'goal' in props:
            goal_minimize = props['goal'].split('_')[-1] == 'min'

    # One column per run, then transpose so that every run becomes a row.
    by_run = pd.DataFrame(per_run, index=field_names)
    ranked = by_run.T.sort_values('Score', ascending=goal_minimize)
    best_per_algorithm = ranked.drop_duplicates(['run_algorithm'])
    return best_per_algorithm.set_index('run_algorithm')


def run_inference(
    test_experiment,
    compute_target,
    script_folder,
    train_run,
    test_dataset,
    lookback_dataset,
    max_horizon,
    target_column_name,
    time_column_name,
    freq,
    ):
    """Submit an inference run for the model produced by ``train_run``.

    Steps performed:

    1. Download the model file and 'conda_env_v_1_0_0.yml' from the training
       run's 'outputs/' folder into the local 'inference/' folder.
    2. Build an ``Environment`` named 'myenv' with Docker enabled and the
       downloaded conda file as its dependencies.
    3. Submit an ``Estimator`` running 'infer.py' from ``script_folder`` to
       ``test_experiment``, tagged with the training run's id, algorithm,
       score and primary metric.

    The model file name is the last path component of the training run's
    'model_data_location' property, or 'model.pkl' when that property is
    absent. A location without any '/' is used as the file name as-is
    (previously this raised ``ValueError`` during tuple unpacking).

    Returns
    -------
    The run object returned by ``test_experiment.submit``.
    """

    model_base_name = 'model.pkl'
    if 'model_data_location' in train_run.properties:
        model_location = train_run.properties['model_data_location']
        # [-1] also works when the location contains no '/' at all.
        model_base_name = model_location.rsplit('/', 1)[-1]
    train_run.download_file('outputs/{}'.format(model_base_name),
                            'inference/{}'.format(model_base_name))
    train_run.download_file('outputs/conda_env_v_1_0_0.yml',
                            'inference/condafile.yml')

    inference_env = Environment('myenv')
    inference_env.docker.enabled = True
    inference_env.python.conda_dependencies = \
        CondaDependencies(conda_dependencies_file_path='inference/condafile.yml'
                          )

    est = Estimator(
        source_directory=script_folder,
        entry_script='infer.py',
        script_params={
            '--max_horizon': max_horizon,
            '--target_column_name': target_column_name,
            '--time_column_name': time_column_name,
            '--frequency': freq,
            '--model_path': model_base_name,
            },
        inputs=[test_dataset.as_named_input('test_data'),
                lookback_dataset.as_named_input('lookback_data')],
        compute_target=compute_target,
        environment_definition=inference_env,
        )

    run = test_experiment.submit(est, tags={
        'training_run_id': train_run.id,
        'run_algorithm': train_run.properties['run_algorithm'],
        'valid_score': train_run.properties['score'],
        'primary_metric': train_run.properties['primary_metric'],
        })

    # Also record the algorithm as a metric on the new run.
    run.log('run_algorithm', run.tags['run_algorithm'])
    return run


def run_multiple_inferences(
    summary_df,
    train_experiment,
    test_experiment,
    compute_target,
    script_folder,
    test_dataset,
    lookback_dataset,
    max_horizon,
    target_column_name,
    time_column_name,
    freq,
    ):
    """Launch one inference run for every row of ``summary_df``.

    For each row, the training run named by its 'run_id' value is looked up
    in ``train_experiment``, ``run_inference`` is called for it, and the id
    of the resulting test run is written to the 'test_run_id' column of the
    matching row(s). The row label, the row itself and the submitted run
    are printed along the way.

    Returns
    -------
    pandas.DataFrame
        The same ``summary_df`` object, with 'test_run_id' filled in.
    """

    for (row_label, row_values) in summary_df.iterrows():
        print(row_label)
        print(row_values)

        train_run_id = row_values.run_id
        train_run = Run(train_experiment, train_run_id)

        test_run = run_inference(
            test_experiment,
            compute_target,
            script_folder,
            train_run,
            test_dataset,
            lookback_dataset,
            max_horizon,
            target_column_name,
            time_column_name,
            freq,
            )
        print(test_run)

        # Record which test run belongs to this training run.
        rows_for_run = summary_df.run_id == train_run_id
        summary_df.loc[rows_for_run, 'test_run_id'] = test_run.id

    return summary_df
