# In [0]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Definitions

import pandas as pd
import numpy as np

# Reference dates shared by the rest of the notebook.
# `today` is the current date with the time-of-day stripped by normalize().
today = pd.to_datetime('today').normalize()
# Month-end anchor derived from `today` via a zero-step MonthEnd offset.
current_eom = today + pd.offsets.MonthEnd(0)
# Lower bound of the date range, kept as an ISO 'YYYY-MM-DD' string.
start_date = '2017-01-01'
# Upper bound of the date range: eleven month-end steps past `current_eom`.
end_date = current_eom + pd.offsets.MonthEnd(11)

# entity_debug = "GBR"

# Module-level debug switch; nothing in the visible source reads it.
debug = False

# pandas display options: show every column and do not wrap wide frames
# across multiple lines.

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# pd.set_option('max_colwidth', -1)

# Floats are rendered with two decimals by the formatter below.
# NOTE(review): 'display.float_format' presumably takes precedence over
# 'display.precision' for float columns -- confirm which one is intended.
pd.set_option('display.precision', 1)
pd.set_option('display.float_format', lambda x: '%.2f' % x)


def get_json(df):
    """Serialise a DataFrame to a JSON array of records.

    Each row becomes one JSON object (``orient='records'``). Date and
    datetime values are written as ``'YYYY-MM-DD'`` strings and missing
    timestamps (``pd.NaT``) as ``null``.

    :param df DataFrame to serialise
    :return JSON string
    """
    # Imported locally so the function does not depend on module-level
    # imports of these two standard-library modules.
    import datetime
    import json

    def convert_timestamp(item_date_object):
        # json.dumps only calls this hook for values it cannot encode itself.
        if item_date_object is pd.NaT:
            # NaT passes the isinstance check below but cannot be formatted
            # with strftime, so map it to JSON null explicitly.
            return None
        if isinstance(item_date_object, (datetime.date,
                      datetime.datetime)):
            return item_date_object.strftime('%Y-%m-%d')
        if isinstance(item_date_object, np.generic):
            # numpy scalars (e.g. np.int64, np.bool_) are not JSON-native;
            # emit their plain Python value instead of silently writing null.
            return item_date_object.item()
        # Best-effort fallback kept from the original behaviour: any other
        # unsupported value is written as null.
        return None

    dict_ = df.to_dict(orient='records')

    return json.dumps(dict_, default=convert_timestamp)


def get_df_name(df):
    """Return the name of the first module-level variable bound to `df`.

    The lookup is by identity (`is`) over this module's globals, so only
    objects assigned to a global name can be found. Raises IndexError when
    no global refers to `df`.
    """
    matches = []
    for key, value in globals().items():
        if value is df:
            matches.append(key)
    return matches[0]


def difflist(li1, li2):
    """Return the distinct items of `li1` that do not occur in `li2`.

    Both inputs are converted to sets, so duplicates are dropped and the
    order of the returned list is not guaranteed.
    """
    kept = set(li1)
    excluded = set(li2)
    remaining = kept - excluded
    return list(remaining)


def addlist(li1, li2):
    """Add `li2` to the list `li1` in place and return `li1`.

    When `li2` is a list, tuple, set or frozenset its items are added one
    by one; any other value (for example a single string) is appended as a
    single item.

    The previous implementation returned the result of ``list.append``,
    which is always ``None``, and nested a list argument as one element.

    :param li1 list that is extended in place
    :param li2 collection of items, or a single item, to add
    :return `li1` after the addition
    """
    if isinstance(li2, (list, tuple, set, frozenset)):
        li1.extend(li2)
    else:
        # Scalars (including strings) stay one element rather than being
        # split into their parts.
        li1.append(li2)
    return li1


def remove_percetage(df, column_list):
    """Turn percentage strings such as '12.5%' into fractions (0.125).

    For every column in `column_list` the '%' character is removed, the
    text is parsed as float64, divided by 100 and rounded to 4 decimals.
    The columns are overwritten on `df`, which is also returned.
    """
    for name in column_list:
        stripped = df[name].str.replace('%', '')
        fraction = stripped.astype(np.float64) / 100
        df[name] = round(fraction, 4)
    return df

# Default column list for coerce_to_numeric. The default argument below is
# bound to this list object when the function is defined, so rebinding
# `numeric_cols` later does not change the default.
numeric_cols = [""]


def coerce_to_numeric(df, num_cols = numeric_cols):
    """Convert currency-formatted text columns to floats, in place.

    For each name in `num_cols` that exists in `df`:
      * '$', ',' and ')' characters are removed,
      * '(' is replaced with '-', so '(200)' becomes -200,
      * the result is cast to float and missing values become 0.

    Names that are not columns of `df` are skipped. Values that still are
    not parseable after the clean-up make ``astype(float)`` raise.

    :param df DataFrame to modify
    :param num_cols iterable of column names to convert
    :return the same `df` object
    """
    for col in num_cols:
        if col not in df.columns:
            continue
        # Raw-string patterns avoid the invalid '\$' escape sequence.
        cleaned = (
            df[col]
            .replace(r'[$,)]', '', regex=True)
            .replace(r'[(]', '-', regex=True)
            .astype(float)
        )
        df[col] = cleaned.fillna(0)
    return df
  
# Default column list for coerce_to_int; bound as the default argument at
# definition time.
int_cols = [""]


def coerce_to_int(df, int_cols = int_cols):
    """Convert the listed columns of `df` to integers, in place.

    Each value is parsed with ``pd.to_numeric(..., errors="coerce")``,
    NaN results are replaced by 0 and the column is cast to int (which
    truncates any fractional part). Names that are not columns of `df`
    are skipped. Returns the same `df` object.
    """
    present = [name for name in int_cols if name in df.columns]
    for name in present:
        parsed = df[name].apply(pd.to_numeric, errors="coerce")
        filled = parsed.replace(np.nan, 0, regex=True)
        df[name] = filled.astype(int)
    return df

def convert_date_cols(df):
    """Parse the known date columns of `df` with pd.to_datetime, in place.

    Only the columns 'End_of_Month', 'Snapshot_Date_Short', 'Snapshot_Date'
    and 'Forecast_Date' are touched, and only when present. Returns the
    same `df` object.
    """
    known_date_columns = (
        'End_of_Month',
        'Snapshot_Date_Short',
        'Snapshot_Date',
        'Forecast_Date',
    )
    for name in known_date_columns:
        if name in df.columns:
            df[name] = pd.to_datetime(df[name])
    return df


def data_prep(df):
    """Standard clean-up applied to a freshly loaded DataFrame.

    Column labels are cast to str with spaces replaced by underscores
    (this renames the columns of the passed-in frame), the known date
    columns are parsed via convert_date_cols, and NaN values are replaced
    by 0. The NaN replacement produces the returned frame.
    """
    renamed = df.columns.astype(str).str.replace(' ', '_')
    df.columns = renamed
    prepared = convert_date_cols(df)
    return prepared.replace(np.nan, 0, regex=True)
  

def move_column_inplace(df, col, pos):
    """Move column `col` of `df` to integer position `pos`, mutating `df`.

    The column is popped and re-inserted under its own name; nothing is
    returned.
    """
    series = df.pop(col)
    df.insert(pos, series.name, series)


def show_stats(df):
    """Display a quick overview of `df` in the notebook.

    Shows the global variable name of the frame, its info, describe(),
    head and tail. For each of the date columns 'End_of_Month',
    'Snapshot_Date_Short' and 'Snapshot_Date' that is present, the column
    is parsed with pd.to_datetime (overwriting it on `df`) and the head of
    the per-date sum and count is displayed.

    NOTE(review): DataFrame.info presumably prints its report and returns
    None, so the display() wrapped around it would show None -- confirm.
    """
    display(' DF Name: ')
    display(get_df_name(df))
    display(' DF Info: ')
    display(df.info(verbose=True))
    display(' DF Describe: ')
    display(df.describe(include='all'))
    display(' DF Head: ')
    display(df.head())
    display(' DF Tail: ')
    display(df.tail())

    # group_by_entity = df.groupby(by=['Fin_Entity_ID'], as_index=False)
    # entity_sum = group_by_entity.sum().reset_index(drop=True)
    # entity_count = group_by_entity.count().reset_index(drop=True)
    # print(" Entity Sum: ")
    # display(entity_sum.head())
    # print(" Studio Count: ")
    # display(entity_count.head())

    # (date column, label shown above the sums, label shown above the counts)
    date_sections = (
        ('End_of_Month', ' EOM Sum:', ' EOM Count: '),
        ('Snapshot_Date_Short', ' SDS Sum:', ' SDS Count: '),
        ('Snapshot_Date', ' SDS Sum:', ' SDS Count: '),
    )
    for column, sum_label, count_label in date_sections:
        if column not in df.columns:
            continue
        df[column] = pd.to_datetime(df[column])  # Format Date
        grouped = df.groupby(by=[column], as_index=False)
        totals = grouped.sum().reset_index(drop=True)
        counts = grouped.count().reset_index(drop=True)
        display(sum_label)
        display(totals.head())
        display(count_label)
        display(counts.head())


def split_last_n_by_series_id(df, n):
    """Split every series in `df` into all-but-the-last-n and last-n rows.

    `df` is sorted ascending by the module-level `time_column_name` and
    grouped by the module-level `time_series_id_column_names`; both names
    must be defined before this function is called. For each group the
    slice ``iloc[:-n]`` goes to the first result and ``iloc[-n:]`` to the
    second, so with n == 0 the first part is empty and the second holds
    every row.

    Returns the tuple (head_frame, tail_frame).
    """
    ordered = df.sort_values(time_column_name)  # Sort by ascending time
    per_series = ordered.groupby(time_series_id_column_names,
                                 group_keys=False)
    head_part = per_series.apply(lambda grp: grp.iloc[:-n])
    tail_part = per_series.apply(lambda grp: grp.iloc[-n:])
    return (head_part, tail_part)


def APE(actual, pred):
    """
    Absolute percentage error, element by element.
    The result has the same length as actual/pred; no guard is applied
    for zero values in actual.
    """

    relative_error = (actual - pred) / actual
    return 100 * np.abs(relative_error)


def MAPE(actual, pred):
    """
    Mean absolute percentage error.
    Positions where either input is NaN, or where actual is close to zero
    (np.isclose), are excluded before averaging the APE values.
    """

    # ~(nan_a | nan_p) & ~near_zero  ==  ~(nan_a | nan_p | near_zero)
    unusable = np.isnan(actual) | np.isnan(pred) | np.isclose(actual, 0.0)
    usable = ~unusable
    return np.mean(APE(actual[usable], pred[usable]))
  

def normalize(x, newLowerBound, newUpperBound):
    """Rescale the values of `x` linearly into a new range, as integers.

    The smallest value of `x` maps to `newLowerBound`, the largest to
    `newUpperBound`; every result is truncated with int().

    When all values of `x` are equal the source range is zero and no
    scaling is possible; every value is then mapped to `newLowerBound`
    instead of failing on a division by zero.

    :param x non-empty sequence of numbers (np.min raises on empty input)
    :param newLowerBound lower end of the target range
    :param newUpperBound upper end of the target range
    :return list of ints with the same length as `x`
    """
    # Local names avoid shadowing the builtins min / max / range.
    lowest = np.min(x)
    highest = np.max(x)
    span = highest - lowest
    new_span = newUpperBound - newLowerBound

    if span == 0:
        return [int(newLowerBound) for _ in x]

    return [int(((a - lowest) / span) * new_span + newLowerBound) for a in x]

def df_crossjoin(df1, df2, **kwargs):
    """
    Make a cross join (cartesian product) between two dataframes by using a constant temporary key.
    The result has a fresh 0..n-1 index; the indices of the inputs are not carried over.
    The inputs are never modified: the temporary key is added to copies, so nothing is
    left behind if the merge fails and joining a dataframe with itself works.
    See: https://github.com/pydata/pandas/issues/5401
    :param df1 dataframe 1
    :param df2 dataframe 2
    :param kwargs keyword arguments that will be passed to pd.merge()
    :return cross join of df1 and df2
    """
    # assign() returns new frames, leaving the callers' objects untouched.
    left = df1.assign(_tmpkey=1)
    right = df2.assign(_tmpkey=1)

    res = pd.merge(left, right, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1)
    return res.reset_index(drop=True)
  
def movecol(df, cols_to_move=None, ref_col='', place='After'):
    """Return `df` with `cols_to_move` placed next to `ref_col`.

    :param df source dataframe (not modified)
    :param cols_to_move column names to relocate, in the order they should
        appear; None or empty leaves the column order unchanged
    :param ref_col existing column used as the anchor
    :param place 'After' puts the moved columns right after `ref_col`,
        'Before' puts them right before it
    :return dataframe with the reordered columns
    :raises ValueError if `place` is neither 'After' nor 'Before', or if
        `ref_col` is not a column of `df`
    """
    if place not in ('After', 'Before'):
        raise ValueError(
            "place must be 'After' or 'Before', got %r" % (place,))

    moved = [] if cols_to_move is None else list(cols_to_move)
    cols = df.columns.tolist()
    ref_pos = cols.index(ref_col)

    if place == 'After':
        seg1 = cols[:ref_pos + 1]
        seg2 = moved
    else:
        seg1 = cols[:ref_pos]
        seg2 = moved + [ref_col]

    # Columns in front of the anchor, minus the ones being relocated.
    seg1 = [i for i in seg1 if i not in seg2]
    # Everything not placed yet keeps its original relative order.
    placed = seg1 + seg2
    seg3 = [i for i in cols if i not in placed]

    return df[seg1 + seg2 + seg3]

# In [0]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from pandas.tseries.frequencies import to_offset


def align_outputs(
    y_predicted,
    X_trans,
    X_test,
    y_test,
    target_column_name,
    predicted_column_name='predicted',
    horizon_colname='horizon_origin',
    ):
    """
    Line the forecast values up with the test inputs and actuals.

    The predictions are indexed like X_trans (and carry its horizon
    column when it has one), the actuals are attached to a copy of
    X_test, and the two are right-merged on their shared columns. Rows
    where either the actual or the prediction is missing are dropped.

    Misalignment between inputs and outputs typically comes from:
    * predictions for periods that have no actuals -> dropped from eval
    * predicting past max_horizon -> increase max horizon
    * the start of X_test being consumed by lags -> provide previous periods
    """

    forecast_columns = {predicted_column_name: y_predicted}
    if horizon_colname in X_trans:
        forecast_columns[horizon_colname] = X_trans[horizon_colname]
    df_fcst = pd.DataFrame(forecast_columns)

    # By the forecast() contract, predictions follow the order of X_trans.
    df_fcst.index = X_trans.index
    df_fcst = df_fcst.reset_index()

    # Attach the actuals to a copy of the test features; its positional
    # index is materialised as 'index' by reset_index and dropped again so
    # the merge only uses real columns.
    X_test_full = X_test.copy()
    X_test_full[target_column_name] = y_test
    X_test_full = X_test_full.reset_index().drop(columns='index')

    together = df_fcst.merge(X_test_full, how='right')

    # Keep only rows with both an actual and a prediction; gaps come from
    # missing actuals or from lag/rolling-window edges.
    required = [target_column_name, predicted_column_name]
    has_both = together[required].notnull().all(axis=1)
    return together[has_both]
