# **Intro and Setup**
## See "kaggle_main_..." code block for GBDT parameter inputs below
## See code block beneath that for some example inputs, useful choices

In [1]:
# imports
from google.colab import drive

from collections import namedtuple, OrderedDict
import datetime
import io
from itertools import chain, product
import math
import multiprocessing as mp
import os
from pathlib import Path
import platform
from pprint import pprint
import psutil
import re
import sys
import time

import numpy as np
import pandas as pd
import pkg_resources
from tensorflow import test, distribute
# %tensorflow_version 2.x
# import tensorflow as tf
# import subprocess as sp  # query Windows for amount of physically-present RAM

# timing
# Pin the process to US Eastern time so the "Done: ..." stamp printed by a cell shows the local
# date and time; this helps keep track of which cells were run, and when.
# POSIX TZ rule: EST is UTC-5; EDT runs from the 2nd Sunday of March (M3.2.0) to the
# 1st Sunday of November (M11.1.0), the US daylight-saving schedule in force since 2007.
from time import strftime, tzset
os.environ['TZ'] = 'EST+05EDT,M3.2.0,M11.1.0'
tzset()  # make the C library re-read TZ (tzset is only available on Unix)
print(f'Done: {strftime("%a %X %x")}')

Done: Sat 08:44:45 11/21/20


### **Mount Google Drive for access to Google Drive local repo and data**

In [2]:
# Mount Google Drive at /content/drive so the notebook can reach the repo and data stored there.
# Click on the URL link presented to you by this command, get your authorization code from Google,
#     then paste it into the input box and hit 'enter' to complete mounting of the drive.

# Drive folder of the repo; the same location that kag_config builds as COLAB_HOME / REPO_DIR.
GDRIVE_REPO_PATH = "/content/drive/My Drive/Colab Notebooks/NRUHSE_2_Kaggle_Coursera/final/Kag"
drive.mount('/content/drive')

Mounted at /content/drive


## **kag_utils.py**

In [3]:
"""
Provide utility functions to assist main Kaggle program.

Created on Tue Oct 20 05:55:03 2020
@author: mgaidis

    functions:
        tdstr():        Return a formatted version of present time and date
        timer(t_ref):   Return a formatted version of elapsed time relative to t_ref
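        dict_to_writelines_list(source_dict, k_v_link_text=': '): Format a dict as a list of
            text strings ending in newline, suitable for writelines
        df_dict_summary_to_writelines_list(dfdict, colinfo=False, console=True): Summarize each
            DataFrame of a {name: DataFrame} dict as a list of strings suitable for writelines
        col_info(target_df, n_cols=3, target_df_name=None, console=True):
            Format (and optionally print) a DataFrame's column data types and memory usage.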

    classes:
        AttrDict: create dictionary-like object where keys can be written like instance attrs.
            dict['key'] is the same as dict.key
"""

# import datetime
# import io
# import math
# import sys
# import time
# import numpy as np
# import pandas as pd


def tdstr():
    """Give the current local time and date as text: weekday, locale time, locale date."""
    stamp_format = "%a %X %x"
    return time.strftime(stamp_format)


def timer(t_ref):
    """Format the time elapsed since the perf_counter reading t_ref, rounded to whole seconds."""
    elapsed_seconds = round(time.perf_counter() - t_ref)
    return str(datetime.timedelta(seconds=elapsed_seconds))


def df_dict_summary_to_writelines_list(dfdict, colinfo=False, console=True):
    """
    Build a writelines-ready text summary for every DataFrame in a {name: DataFrame} dict.

    Inputs:
        dfdict: dict --> {dfname: DataFrame}; entries are reported in the dict's own order
        colinfo: bool --> also include the per-column report produced by col_info
        console: bool --> write the summary to stdout (it is returned either way)

    Returns the list of strings: for each DataFrame a title line, a shape / total-memory line
    (memory in units of 1e6 bytes), the column names, optionally the col_info report, and the
    first two rows.
    """
    summary_lines = []
    for label in dfdict:
        frame = dfdict[label]
        megabytes = (frame.memory_usage(deep=True) / 1e6).sum()
        summary_lines.append(f'---------- {label} ----------\n')
        summary_lines.append(f'DataFrame shape: {frame.shape}     '
                             f'DataFrame total memory usage: {megabytes:.0f} MB\n')
        summary_lines.append(f'DataFrame Column Names: {frame.columns.to_list()}\n')
        if colinfo:
            summary_lines += col_info(frame, console=False)
        summary_lines.append(f'^^^^^\n{frame.head(2)}\n\n')
    if console:
        sys.stdout.writelines(summary_lines)
    return summary_lines


def dict_to_writelines_list(source_dict, k_v_link_text=': '):
    """Render each dict entry as '<key><link text><value>' plus a newline, ready for writelines."""
    return [f'{key}{k_v_link_text}{value}\n' for key, value in source_dict.items()]


def col_info(target_df, n_cols=3, target_df_name=None, console=True):
    """
    Format (and optionally print) pandas column datatypes and memory usage.

    More readable than standard printing, this fn lays the report out as several side-by-side
    blocks of "n_rows" lines, where each line of a block is a triple:
        "column_name   column_dtype   column_memory_use"
    and finishes with the DataFrame shape and total memory usage.

    Inputs:
        target_df: pandas DataFrame (or Series) to analyze and report on
        n_cols: int --> Maximum "width" of the printed table: how many triples
            (name, type, mem) (name, type, mem)... to print in each row.  It is reduced so
            that every block keeps at least 3 lines; below 2 blocks a plain one-block table
            is produced instead.
        target_df_name: string --> optional name shown in the report headings
        console: bool --> print to console (vs. just returning a writelines-formatted list)

    Returns:
        list of strings suitable for writelines
    """
    writelist = []
    col_mem = target_df.memory_usage(deep=True)
    if isinstance(target_df, pd.Series):  # memory_usage is one number here, not one per column
        dfname_str = f'Series {target_df_name}' if target_df_name else 'Series'
        writelist.extend([f"{dfname_str} shape: {target_df.shape}\n",
                          f"{dfname_str} column name: ['{target_df.name}']\n",
                          f"{dfname_str} column memory: {col_mem}\n",
                          f"{dfname_str} column dtype: {target_df.dtypes}\n"])
    else:
        dfname_str = f'DataFrame {target_df_name}' if target_df_name else 'DataFrame'
        # change to kB or MB or GB, based on the largest single entry
        exponent = (len(bin(col_mem.max())) - 3) // 10  # 1024 = 2**10; bin() 2 char prefix '0b'
        col_mem = col_mem / (1024**exponent)
        mem_unit = ['Bytes', 'kBytes', 'MBytes', 'GBytes', 'TBytes', 'PBytes'][exponent]
        writelist.extend([f'{dfname_str} shape: {target_df.shape}\n',
                          f'{dfname_str} total memory: {col_mem.sum():.0f} {mem_unit}\n',
                          f'{dfname_str} column names: {target_df.columns.to_list()}\n'])

        # Index is in df.memory_usage, but not df.dtypes, so compute and concat to dtypes df
        info_df = pd.concat([pd.Series([target_df.index.dtype], index=['Index']),
                             target_df.dtypes], axis=0)
        # Combine the memory and the dtypes dataframes
        info_df = pd.concat([info_df, col_mem], axis=1).reset_index()
        info_df.columns = ['Column Name', 'DType', f'{mem_unit}']
        # Format printout of the memory values
        info_df[mem_unit] = info_df[mem_unit].apply(lambda x: f'{x:.1f}')

        # Don't make the printed blocks too short: limit their number by how many entries
        # (index + columns) are being laid out, not by how many data rows target_df holds.
        n_cols = min(info_df.shape[0] // 3, n_cols)
        if n_cols < 2:  # print all triples in just one block... no special formatting
            writelist.extend(io.StringIO(info_df.to_string()).readlines())
        else:
            n_rows = math.ceil(info_df.shape[0] / n_cols)

            # Create wide DataFrame by copying the original 3 columns and concatenating to the
            # original to form n_cols blocks of triples.  Each new addition is shifted up by
            # n_rows.
            info_df = pd.concat([info_df.shift(n_rows * x) for x in range(0, -n_cols, -1)], axis=1)

            # Truncate rows to eliminate duplicates made by the above copy-shift-concat operation
            # Also, make first row = column names (easier for printing)
            info_df = info_df.iloc[:n_rows].fillna('').T.reset_index().T.reset_index(drop=True)

            # compute max string length in each col, add padding=2 to create a clean column look
            str_lengths = np.vectorize(len)
            col_widths = np.add(str_lengths(info_df.values.astype(str)).max(axis=0), 2)

            for row in range(n_rows + 1):  # create strings of all columns in each row
                print_row = ''
                for col in range(len(info_df.columns)):
                    # format string for right alignment to fit 'col_widths'
                    print_row = print_row + f'{str(info_df.iat[row, col]):>{col_widths[col]}} '
                    # when done with one block of triples; add 4 extra spaces before next block
                    print_row = print_row + ' ' * 4 * ((col + 1) % 3 == 0)
                writelist.append(f'{print_row}\n')

        writelist.extend([f'\n{dfname_str} shape: {target_df.shape}\n',
                          f'{dfname_str} total memory usage: {col_mem.sum():.0f} {mem_unit}\n'])
    if console:
        sys.stdout.writelines(writelist)
    return writelist

# =============================================================================
# https://www.toptal.com/python/python-class-attributes-an-overly-thorough-guide-item 3
#  or pympler for tracking instances
#
# psutil.swap_memory()
# Out[19]: sswap(total=58783318016, used=123, free=456, percent=18.2, sin=0, sout=0)
# psutil.virtual_memory()
# Out[20]: svmem(total=51267125248, available=123, percent=17.3, used=456, free=789)
# =============================================================================


class AttrDict(dict):
    """
    Give attribute-format access to string keys of dictionaries.

    somedict = {'key': 123, 'stuff': 456}
    data = AttrDict(somedict)
    print(data.key)
    print(data.stuff)
    >> 123
    >> 456
    data.key = 'abc'
    print(data.key)
    print(data['key'])
    >> abc
    >> abc
    def fn(**i): print(i)
    fn(**somedict)
    >> {'key': 123, 'stuff': 456}
    fn(**data)
    >> {'key': 'abc', 'stuff': 456}
    data.alpha = 'oh'
    fn(**data)
    >> {'key': 'abc', 'stuff': 456, 'alpha': 'oh'}
    data
    >> {'key': 'abc', 'stuff': 456, 'alpha': 'oh'}

    A name that is not a key raises AttributeError (not KeyError), so hasattr() and
    getattr(obj, name, default) behave as they do for ordinary objects.
    """

    def __getattr__(self, k):
        """Get the value stored under key k with dot notation; AttributeError if absent."""
        # Only called when normal attribute lookup fails, so dict methods are unaffected.
        try:
            return self[k]
        except KeyError:
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {k!r}') from None

    def __setattr__(self, k, v):
        """Set the attribute with dot notation (stored as a dictionary item)."""
        self[k] = v

    def __delattr__(self, k):
        """Delete the key k with dot notation; AttributeError if absent."""
        try:
            del self[k]
        except KeyError:
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {k!r}') from None

## **kag_config.py**

In [4]:
"""
Containers for somewhat-constant parameters used by the Kaggle routines.

Utilize named tuples for convenience in future referencing.
CONFIG = namedtuple('packages pd_opts paths data ftr_paths join_str short_re stat_abbr')
    CONFIG.packages = list of strings of relevant package names for version control purposes
    CONFIG.pd_opts = namedtuple('max_rows max_cols disp_width max_colwidth float_decimals')
    CONFIG.paths = namedtuple('home repo feather data logs outputs')
    CONFIG.data = OrderedDict(name: InputInfo), where:
        name(key) = string name of DataFrame formed from input file data (one key for each file)
        InputInfo(value) = namedtuple('filename filepath all_columns') --> filepath includes name
    CONFIG.ftr_paths = OrderedDict(module_name: FeatherPaths), where:
        module_name(key) = module name of overall program ('eda', 'data', 'tvt')
        FeatherPaths = namedtuple('filename filepath') --> filepath includes name
            where each tuple contains a list of DataFrames/Filenames/Paths from the respective
            tuple's parent module that are suitable for feather file storage to conserve RAM
    CONFIG.join_str = '__'  == characters between feature description string elements
    CONFIG.short_re = regex pattern for shortening column names for readability (_id, cat, grp)
    CONFIG.stat_abbr = dict for shortening stats names for readability (count -> cnt, median -> med)

Created on Wed Nov  4 08:43:16 2020
@author: mgaidis
"""

# import re
# import sys
# from collections import namedtuple, OrderedDict
# from pathlib import Path
# import pandas as pd

# ===============================================================================
# # =============================================================================
# # User-Configurable Settings to Guide the Program Execution and Display
# # =============================================================================
# ===============================================================================
GROUP_STAT_JOIN_STR = '__'  # characters between feature description string elements
# Applied with the replacement r'\3\5' (see snake_list_to_camel_string), so a match keeps only
# groups 3 and 5:  '_id' -> '' ;  'egory' -> '' (category -> cat) ;  'group' -> 'grp'
SHORTEN_NAME_RE = re.compile('(_id)|(egory)|(gr)(ou)(p)')  # del _id; category -> cat; group -> grp
# Looked up by exact statistic name when building aggregate column names (see make_agg_names).
# NOTE(review): the key 'nunique()' includes parentheses, so a stat passed as 'nunique' would not
#   be abbreviated -- confirm which spelling callers use.
STATS_ABBR = {'count': 'cnt', 'median': 'med', 'nunique()': 'nunq'}

# =============================================================================
# Package Version Control: relevant packages for logging the version numbers
# =============================================================================
PACKAGES = ['pandas', 'matplotlib', 'numpy', 'scikit-learn', 'lightgbm']
# , 'keras', 'catboost', 'seaborn', 'nltk', 'networkx', 'graphx', 'tensorflow'

# =============================================================================
# Useful file paths and file names
# =============================================================================
WINDOWS_HOME = Path("C:/Users/mgaid/Documents/GitHub")
COLAB_HOME = Path("/content/drive/My Drive/Colab Notebooks/NRUHSE_2_Kaggle_Coursera/final")
REPO_DIR = 'Kag'
DATA_DIR = 'readonly/final_project_data'
DATA_INPUT_FILENAMES = ['items_enc.csv',
                        'shops_enc.csv',
                        'date_scaling.csv',
                        'stt.csv.gz',
                        'test.csv.gz']
FTR_DIR = 'ftr_files'
LOG_DIR = 'logs'
OUTPUTS_DIR = 'models_and_predictions'

# =============================================================================
# Process modules and associated DataFrames possibly stored as ftr files to conserve RAM
# =============================================================================
FTR_MODULES_DFS = OrderedDict([
    ('eda', ['items_enc', 'shops_enc', 'date_scaling', 'stt']),  # 'test' df intentionally omitted
    ('data', ['monthly_stt']),
    ('tvt', ['train_X', 'train_y', 'val_X', 'val_y', 'test_X'])])

# =============================================================================
# Preferred options for pandas DataFrame display configuration (adjust based on monitor size)
# =============================================================================
MAX_ROWS = 60
MAX_COLS = 40
DISP_WIDTH = 180
MAX_COLWIDTH = None
FLOAT_DECIMALS = 3

# ===============================================================================
# # =============================================================================
# # END OF User-Configurable Settings to Guide the Program Execution and Display
# # =============================================================================
# ===============================================================================

# -----------------------------------------------------------------------------
# Assemble the user-configurable settings above into the single CONFIG namedtuple.
# Side effects when this cell/module runs: the directories in kaggle_paths are created if
# missing, and the header of every data input file is read from disk.
# -----------------------------------------------------------------------------

# pandas display preferences, bundled for set_pandas_options
PandasOpts = namedtuple('PandasOpts', 'max_rows max_cols disp_width max_colwidth float_decimals')
pandas_opts = PandasOpts(MAX_ROWS, MAX_COLS, DISP_WIDTH, MAX_COLWIDTH, FLOAT_DECIMALS)

# Windows uses the local GitHub folder; every other platform is assumed to be Colab
home_path = WINDOWS_HOME if sys.platform == 'win32' else COLAB_HOME
repo_path = home_path / REPO_DIR  # sync with GitHub (only files < 50 or 100 MB allowed)
ftr_path = home_path / FTR_DIR  # local store of fast-loading ftr files, e.g., > 50-100 MB
data_path = repo_path / DATA_DIR
log_path = repo_path / LOG_DIR
output_path = repo_path / OUTPUTS_DIR

DrivePaths = namedtuple('DrivePaths', 'home repo feather data logs outputs')
kaggle_paths = DrivePaths(home_path, repo_path, ftr_path, data_path, log_path, output_path)
for kag_path in kaggle_paths:
    kag_path.mkdir(parents=True, exist_ok=True)  # make directories if they do not already exist

# DataFrame name = file name up to its first '.', e.g. 'stt.csv.gz' -> 'stt'
data_input_dataframe_names = [x.split('.')[0] for x in DATA_INPUT_FILENAMES]
data_input_filepaths = [data_path / x for x in DATA_INPUT_FILENAMES]
# nrows=0 reads only the header row, giving each file's column names without loading its data
data_input_all_columns = [pd.read_csv(x, nrows=0).columns for x in data_input_filepaths]
InputInfo = namedtuple('InputInfo', 'filename filepath all_columns')
# {DataFrame name: InputInfo(filename, filepath, all_columns)}, in DATA_INPUT_FILENAMES order
data_input = OrderedDict([(name, InputInfo(fname, fpath, allcols)) for
                          name, fname, fpath, allcols in zip(data_input_dataframe_names,
                                                             DATA_INPUT_FILENAMES,
                                                             data_input_filepaths,
                                                             data_input_all_columns)])

# One list of DataFrame names (copied from FTR_MODULES_DFS) and one list of '<name>.ftr' paths
# per program module
ftr_filenames = [[x for x in y] for y in FTR_MODULES_DFS.values()]
ftr_filepaths = [[ftr_path / f'{x}.ftr' for x in y] for y in ftr_filenames]
FeatherPaths = namedtuple('FeatherPaths', 'filename filepath')
# {module name: FeatherPaths(list of names, list of paths)}
feather_paths = OrderedDict([(module_name, FeatherPaths(fnames, fpaths)) for
                             module_name, fnames, fpaths in
                             zip(FTR_MODULES_DFS.keys(), ftr_filenames, ftr_filepaths)])

# The single configuration object referenced by the other modules
Cfg = namedtuple('Cfg', 'packages pd_opts paths data ftr_paths join_str short_re stat_abbr')
CONFIG = Cfg(packages=PACKAGES,
             pd_opts=pandas_opts,
             paths=kaggle_paths,
             data=data_input,
             ftr_paths=feather_paths,
             join_str=GROUP_STAT_JOIN_STR,
             short_re=SHORTEN_NAME_RE,
             stat_abbr=STATS_ABBR)


## **kag_features.py**

In [5]:
"""
Define the various choices for features and lags.

Created on Wed Oct 21 08:14:17 2020
@author: mgaidis

    Enable looping (~~ grid search) over variations in choices of categories to keep,
    statistics to use, and lag months (and stats to lag for each month)

    Note that instead of 'cartesian product' type looping over all possible combinations of
        [lags, category-groups, stats],
    we link these specific lag and groups together as one unit for each stats assignment, like:
        [[[lags1, groups1, stats_1-1], [lags1, groups2, stats_1-2], [L2, G1, S_2-1], ...]]
    instead of
        [[[lags1, groups1, stats1], [lags1, groups1, stats2], [L2, G1, S1], [L2, G1, S2], ...]]

"""

# import re
# from collections import OrderedDict
# from kag_config import CONFIG


def snake_list_to_camel_string(snake_list):
    """
    Shorten each snake_case name, convert it to camelCase, and join the results with '_'.

    Shortening applies CONFIG.short_re with the replacement r'\\3\\5' before the name is split
    on '_'; the first piece is kept as-is and every later piece is capitalized.
    """
    def _to_camel(snake_name):
        shortened = re.sub(CONFIG.short_re, r'\3\5', snake_name)
        head, *tail = shortened.split('_')
        return head + ''.join(piece.capitalize() for piece in tail)

    return '_'.join(_to_camel(name) for name in snake_list)


def make_agg_names(group_list, stats_dict, base_features=None):  # vs. base_ = self.cols['no_lag']
    """
    Given a grouping like ['shop_id', 'item_id'], parse with stats dict to get new column names.

    Names are shortened for readability: "_id" is deleted, "category" -> "cat",
    "group" -> "grp", and snake_case -> camelCase; statistic names found in CONFIG.stat_abbr
    are replaced by their abbreviation.

    Inputs:
        group_list: list of column names to group on ('month' is always added, so any 'month'
            entry in the list is ignored)
        stats_dict: dict --> {data column: [statistic names]}; it is not modified
        base_features: optional list of columns carried through the aggregation with 'first'
            (no statistics applied)

    Returns:
        OrderedDict with keys
            'group_name': shortened camelCase name of the grouping, e.g. 'shop_item'
            'group': ['month'] + group_list
            'stats': OrderedDict {column: [stats]} to aggregate, holding its own lists
            'col_names': every output column (base columns and aggregate names)
            'agg_names': only the newly generated aggregate column names
    """
    # =============================================================================
    # Handling base features (without statistics during monthly grouping / aggregation)
    # =============================================================================
    # The first grouping/agg will form the base monthly dataset, and needs to include original
    #   features as well as additional statistics based on a subset of those features.
    #   Further statistical feature generation and merging does not need to repeat the above.
    # =============================================================================
    # Build a private mapping with a separate list per column: callers extend these lists
    #   later, and must not thereby alter stats_dict or every base feature at once.
    stats_agg_dict = OrderedDict()
    if base_features:
        for base_col in base_features:
            stats_agg_dict[base_col] = ['first']
    for data_col, stat_list in stats_dict.items():
        stats_agg_dict[data_col] = list(stat_list)

    group_list = [x for x in group_list if x != 'month']  # 'month' will be assumed; don't need it
    group_by = ['month'] + group_list  # monthly aggregates only
    agg_dict = OrderedDict([
        ('group_name', snake_list_to_camel_string(group_list)),
        ('group', group_by),
        ('stats', stats_agg_dict),
        ('col_names', []),  # include base features that do not get agg stats
        ('agg_names', [])])
    for data_col, stat_list in stats_agg_dict.items():
        camel_data_col = snake_list_to_camel_string([data_col])
        for stat in stat_list:
            if stat == 'first':  # used to aggregate without applying statistics, to base cols
                agg_dict['col_names'].append(data_col)
                continue
            stat = CONFIG.stat_abbr.get(stat, stat)
            agg_name = (f'{agg_dict["group_name"]}{CONFIG.join_str}'
                        f'{camel_data_col}{stat.capitalize()}')
            agg_dict['col_names'].append(agg_name)
            agg_dict['agg_names'].append(agg_name)
    return agg_dict


class LagFeatures:
    """
    Create a set of base + statistical + lagged features for an iteration of the main ML loop.

    Creating new statistical features:
    New features are generated by aggregating statistics while grouping over monthly periods, with
      additional grouping elements as desired (e.g., aggregating with group = ['shop_cat'] and
      statistics = {'sales': ['sum']} will take the monthly sum of sales for each shop_cat
      and make a new feature from it, which will be in a column named shopCat__salesSum).
    If more than one initial grouping element is desired (such as the sum of all sales at shop #x
      for item #y, which would be done by grouping ['shop_cat', 'item_id'] while doing the monthly
      aggregation of 'sales sum') the column will be named using CONFIG.join_str = '__' and
      deleting 'id' with camelCase like so:  shopCat_item__salesSum
    The minimal set of aggregations to compute is collected in min_feat_groups_stats_set, and
      lag_month_dict then names the copies of those aggregate columns shifted by n months.

    Note on inputs:  'group_list' should include the feature column name(s) to be grouped.
        Do not include 'month' in the 'group_list', as that is assumed.
        Include full column name for the feature (with '_id' if in original data file col name).
        The '_id' will be removed later in creating the stats column names, for readability.

    Inputs
    ------
        lag_feature_list: list of FeatureGroup(FG) namedtuples with fields
            1) lag_month : int = a specific number of months to lag by, and
            2) group_list : list of strings = the columns on which to group by month, and
            3) stats_dict : OrderedDict = specification for aggregation stats on this month/group,
                       e.g., OD([('sales', ['count', 'sum']), ('rev', ['sum'])])
            eg: [FG(month1, grouplist1, statsdict_1-1), FG(month1, grouplist2, statsdict_1-2),...]
            The stats dicts and their lists are read but never modified.

    Attributes
    ----------
        lag_feature_list: the input list, kept for posterity
        _lag_feature_dict: OrderedDict
            key: int = number of months to lag the feature
            value: list[agg_name strings] = col names for features to get lagged by KEY months
        min_feat_groups_stats_set: OrderedDict keyed by group name; strips away monthly lag
            info/repeats to get just the smallest set of aggregate grouping stats we need to
            compute (then we apply lags to this set).  Each value holds
                'group': list like ['month', 'shop_id', 'shop_group']
                'stats': merged stats dict like {'sales': ['count', 'sum']}
                'col_names': output column names (filled in by get_all_lag_features)
        printable_lag_features: OrderedDict = 'appealing' version of the lag features for print
            key: int = number of months being lagged
            value: list with one OrderedDict per input feature group, as that group requested it:
                'group_name': group name like 'shop_shopGrp'
                'group': list like ['month', 'shop_id', 'shop_group']
                'stats': stats dict like {'sales': ['count', 'sum']}
                'col_names': list like ['shop_shopGrp__salesCnt', 'shop_shopGrp__salesSum']

        cols: dict = useful groupings of columns for later dataframe manipulations
            {key: value} =
            all_keep: set of cols (from initial dataframes) needed to create all desired features
            keep: dict = names of columns to keep for each individual initial dataframe
                key = dfname string; value = list of column string names
            cat_feats: list of column string names for the categorical type columns
            int_feats: list of column string names for the integer-representable columns
            no_lag: list of column string names for those columns holding features without lags
            final_stt: list of ordered column string names to specify stt df at end of eda module
            min_agg_cols: list of all statistically-aggregated columns in the minimal agg set

        lag_month_dict: OrderedDict{
                key: int = number of months to lag,
                value = OrderedDict{
                    key: str = base column name, like 'shop_shopGrp__salesCnt',
                    value: str = lag-month-specific col name, like 'shop_shopGrp__salesCnt_L4'}}
        all_lag_features: list of all the month-specific column names from lag_month_dict

    Methods
    -------
        get_keep_cols(): from the input info, get the set of ALL datafile-input
            DataFrame columns we need to keep, vs. which we can discard to conserve RAM
        _assign_column_types(): from the input information, assign the various
            data-input dataframe columns to useful groups for future data manipulation
        _add_group_features(lag_month, group_list, stats_dict): process an element of input list
        get_all_lag_features(): fill in the names of the aggregate and lagged columns
    """

    def __init__(self, lag_feature_list):
        self.lag_feature_list = lag_feature_list
        self._lag_feature_dict = OrderedDict()
        self.printable_lag_features = OrderedDict()  # pprintable version of lag feature info
        self.min_feat_groups_stats_set = OrderedDict()
        self.lag_month_dict = OrderedDict()
        self.cols = self._assign_column_types()  # useful groupings of columns

        for feat_group in self.lag_feature_list:
            self._add_group_features(**feat_group._asdict())

        self.all_lag_features = self.get_all_lag_features()

    @staticmethod
    def _copy_stats(stats):
        """Copy a {column: [stats]} mapping so that neither the dict nor its lists are shared."""
        return OrderedDict((data_col, list(stat_list)) for data_col, stat_list in stats.items())

    def get_keep_cols(self):
        """Return the set of col names to keep, to enable discarding unneeded cols (speed, RAM)."""
        # Always keep 'month' and every column of the 'test' input file
        base = ['month'] + [*CONFIG.data['test'].all_columns]
        # ... plus every column that is grouped on, or that has a statistic computed on it
        cols1 = [col for feat_group in self.lag_feature_list for col in [*feat_group.group_list]]
        cols2 = [col for feat_group in self.lag_feature_list for col in [*feat_group.stats_dict]]
        return set(cols1 + cols2 + base)

    def _assign_column_types(self):
        """Provide useful groupings of columns for simpler data manipulation in other modules."""
        columns = {}
        columns['all_keep'] = self.get_keep_cols()  # does not distinguish based on DataFrame name
        columns['keep'] = {}                        # separate keep cols by DataFrame name
        for dfname, dfinfo in CONFIG.data.items():
            columns['keep'][dfname] = [x for x in dfinfo.all_columns if x in columns['all_keep']]
        columns['cat_feats'] = columns['keep']['shops_enc'] + columns['keep']['items_enc']
        columns['no_lag'] = [x for x in columns['cat_feats'] if x not in ['shop_id', 'item_id']]
        columns['int_feats'] = ['month', 'shop_id', 'item_id'] + columns['no_lag']
        columns['final_stt'] = columns['keep']['stt'] + columns['no_lag']
        # 'rev' goes right after 'sales' in the final stt column order
        if 'rev' in columns['all_keep']:
            columns['final_stt'].insert(columns['final_stt'].index('sales') + 1, 'rev')
        # 'price' is always loaded from stt, but (being added after final_stt was built) it is
        # not part of final_stt
        if 'price' not in columns['keep']['stt']:
            columns['keep']['stt'].insert(columns['keep']['stt'].index('sales') + 1, 'price')
        return columns

    def _add_group_features(self, lag_month, group_list, stats_dict):
        """
        Create new group feature(s) based on time-lag and aggregate statistics.

        Also create guidelines for smallest set of stats to compute:
        min_feat_groups_stats_set = OrderedDict(keys = group name, values = minimal stats needed)

        The printable record and the minimal stats set each get their own copy of the stats,
        so merging a later feature group into the minimal set changes neither the record of an
        earlier group nor the caller's stats_dict.
        """
        if lag_month not in self._lag_feature_dict:
            self._lag_feature_dict[lag_month] = []
            self.printable_lag_features[lag_month] = []
        # Only the 1-month-lag shop/item grouping carries the un-lagged base features along
        if (lag_month == 1) and (group_list == ['shop_id', 'item_id']):
            base_features = self.cols['no_lag']
        else:
            base_features = None
        agg_dict = make_agg_names(group_list, stats_dict, base_features)
        self._lag_feature_dict[lag_month] += agg_dict.pop('agg_names')

        # Snapshot for printing: same keys and order, but independent containers
        printable = OrderedDict(agg_dict)
        printable['group'] = list(agg_dict['group'])
        printable['stats'] = self._copy_stats(agg_dict['stats'])
        printable['col_names'] = list(agg_dict['col_names'])
        self.printable_lag_features[lag_month].append(printable)

        # track only the groups + stats we need to compute, to not repeat for every lag month
        grp_name = agg_dict.pop('group_name')
        del agg_dict['col_names']  # will redo this below with complete compilation of min agg set
        if grp_name not in self.min_feat_groups_stats_set:
            agg_dict['stats'] = self._copy_stats(agg_dict['stats'])
            self.min_feat_groups_stats_set[grp_name] = agg_dict
        else:
            # Group already known: merge in any column / statistic not yet requested for it
            known_stats = self.min_feat_groups_stats_set[grp_name]['stats']
            for data_col, stat_list in stats_dict.items():
                if data_col not in known_stats:
                    known_stats[data_col] = list(stat_list)
                    continue
                for stat in stat_list:
                    if stat not in known_stats[data_col]:
                        known_stats[data_col].append(stat)

    def get_all_lag_features(self):
        """Provide lag feature names with associated identification of n months lagged."""
        # Name the output columns of the (now complete) minimal aggregation set
        self.cols['min_agg_cols'] = []
        for gp_name, agg_set_dict in self.min_feat_groups_stats_set.items():
            agg_dict = make_agg_names(agg_set_dict['group'], agg_set_dict['stats'])
            self.min_feat_groups_stats_set[gp_name]['col_names'] = agg_dict['col_names']
            self.cols['min_agg_cols'].extend(agg_dict['agg_names'])  # don't include base cat feats

        # Lagged column name = aggregate name + '_L<months>', in increasing order of lag
        all_lag_feats = []
        for month in sorted(self._lag_feature_dict):
            self.lag_month_dict[month] = OrderedDict()
            for feat in self._lag_feature_dict[month]:
                column_name = f'{feat}_L{month}'
                self.lag_month_dict[month][feat] = column_name
                all_lag_feats.append(column_name)
        return all_lag_feats

"""
EXAMPLE IS BELOW
"""


'\nEXAMPLE IS BELOW\n'

## **kag_program_manager.py**

In [6]:
"""
Provide objects to coordinate operation and input/output of main Kaggle program.

Created on Tue Oct 20 05:55:03 2020
@author: mgaidis

    Classes:
        ProgramManager:
            Provide oversight of the program's execution:
            - Compile computer platform specs and package version control information
            - Provide and/or generate file paths for input and output
            - Organize distribution of run config params for an efficient loop/split grid search
            - Temporary data store to pass information between modules and help reduce RAM demands
            - Assist with run documentation (output log files, etc.)
        MemStats: obtain and organize information on memory-related issues throughout the program.

    Functions:
        set_pandas_options(opts=CONFIG.pd_opts):
            Adjust pandas setup options for speed enhancement and pleasing display output formats.
        get_package_versions(pkg_list=CONFIG.packages):
            For version control, describe key package imports with dict like: {pkg: version, ...}.
        get_runtime_type():
            Check if connected to a CPU vs. GPU-enabled runtime, or possibly a TPU in Colab.
        get_phys_dram():
            On Windows system, query for actual physical installed DRAM in all available slots.
        list_of_lists_splits(param_value):
            Given a 'param_value', determine if/how it will result in splits after pd.df.explode.
"""

# import os
# import sys
# import time
# from collections import namedtuple, OrderedDict
# from itertools import chain
# from pprint import pprint

# import multiprocessing as mp
# import platform
# import subprocess as sp  # query Windows for amount of physically-present RAM
# import pkg_resources
# import psutil

# import numpy as np
# import pandas as pd

# from kag_config import CONFIG
# from kag_utils import tdstr, timer, dict_to_writelines_list
# from kag_features import LagFeatures

# Immutable record with seven file-name / path fields.
# NOTE(review): the meaning of each field is set where the record is constructed -- confirm there.
FNameContainer = namedtuple('FNameContainer',
                            'root base model ftr_paths submit_path output_path log_path')
# ScalersContainer = namedtuple('ScalersContainer', 'robust minmax')


def set_pandas_options(opts=CONFIG.pd_opts):
    """
    Apply the pandas compute and display settings used throughout the program.

    Parameters
    ----------
    opts : settings container; this function reads its 'max_cols', 'disp_width',
        'max_colwidth' and 'float_decimals' attributes.
    """
    def _format_float(value):
        """Render whole-valued numbers without decimals; others with ',' and fixed decimals."""
        if round(value, 0) == value:
            return f'{value:.0f}'
        return f'{value:,.{opts.float_decimals}f}'

    # NOTE(review): both optional accelerators (bottleneck for NaN-aware reductions, numexpr for
    # large boolean/arithmetic ops, df.query() and df.eval()) are switched OFF here, although this
    # code was previously commented as a speed-up -- confirm that disabling them is intended.
    for accelerator_option in ('compute.use_bottleneck', 'compute.use_numexpr'):
        pd.set_option(accelerator_option, False)

    # console display geometry ("display.max_rows" is deliberately left at the pandas default)
    pd.set_option("display.max_columns", opts.max_cols)
    pd.set_option("display.width", opts.disp_width)
    pd.set_option("max_colwidth", opts.max_colwidth)

    # decimal places: opts.float_decimals for non-integer values, 0 for whole values
    pd.options.display.float_format = _format_float


def get_package_versions(pkg_list=CONFIG.packages):
    """
    Build a version-control record of the interpreter and key installed packages.

    Parameters
    ----------
    pkg_list : iterable of distribution names to look up with pkg_resources.

    Returns
    -------
    OrderedDict : {"Python": version, pkg: version, ...} with packages in sorted-name order.
    """
    versions = OrderedDict([("Python", platform.python_version())])
    # get_distribution(name).version is the installed-distribution analogue of pkg.__version__
    versions.update((name, pkg_resources.get_distribution(name).version)
                    for name in sorted(pkg_list))
    return versions


def get_runtime_type():
    """
    Describe the accelerator attached to this runtime: GPU, TPU, or neither (CPU).

    Uses the TensorFlow 'test' and 'distribute' modules imported at the top of the file.

    Returns
    -------
    str : one of 'Colab using GPU at: ...', 'Colab using TPU at: ...', 'Colab using CPU',
          or 'Unknown runtime type' when an ImportError surfaces during the checks.
    """
    try:
        device_name = test.gpu_device_name()
        if 'GPU' not in device_name:
            try:
                # TPU detection: the resolver raising ValueError is treated as "no TPU attached"
                resolver = distribute.cluster_resolver.TPUClusterResolver()
                runtime = f'Colab using TPU at: {resolver.cluster_spec().as_dict()["worker"]}'
            except ValueError:
                runtime = 'Colab using CPU'
        else:
            runtime = f'Colab using GPU at: {device_name}'
    except ImportError:
        # NOTE(review): TensorFlow is imported at file level, so this handler only fires if one
        # of the calls above raises ImportError itself.
        print('Cannot find runtime type (missing TensorFlow?).')
        runtime = 'Unknown runtime type'
    return runtime


def get_phys_dram():
    """
    On Windows system, query for actual physical installed DRAM in all available slots.

    Runs 'wmic MemoryChip get Capacity' and sums the per-slot capacities it reports.

    Returns
    -------
    float : total installed DRAM in GiB (bytes / 2**30); 0 if no slot capacities are reported.

    Raises
    ------
    subprocess.CalledProcessError : if the wmic command exits with a non-zero status.
    """
    # Local import: the file-level 'import subprocess as sp' is commented out, so without this
    # the name 'sp' is undefined and the function fails with NameError when called.
    import subprocess as sp

    wmic_output = sp.check_output('wmic MemoryChip get Capacity', shell=True).decode()
    # first whitespace-separated token is the 'Capacity' column header; the rest are byte counts
    slot_bytes = wmic_output.split()[1:]
    return sum(int(n_bytes) / 2.**30 for n_bytes in slot_bytes)  # 2**30 converts to GiB


def list_of_lists_splits(param_value):
    """
    Report whether 'param_value' will produce multiple splits after pd.df.explode.

    Returns
    -------
    list or False : the LAST nested list holding more than one element, or False when
        'param_value' is not a list or holds no such nested list.
    """
    if not isinstance(param_value, list):
        return False
    multi_valued = [element for element in param_value
                    if isinstance(element, list) and len(element) > 1]
    return multi_valued[-1] if multi_valued else False


class ProgramManager:
    """
    Provide oversight of the program's execution.

    Compile computer platform specs and package version control information
    Provide and/or generate file paths for input and output
    Organize distribution of run configuration parameters for an efficient loop/split grid search
    Serve as a temporary data store to pass information between modules and help reduce RAM use
    Assist with run documentation (output log files, etc.)
    ...

    Class attributes (shared by all instances; several are reassigned by __init__)
    ----------------
    run_platform: ODict    = container for system platform information (built at class creation)
    n_models: int          = total n splits to run in the program (product of nested loop lengths)
    init_datetime: str     = f'{strftime("%y%m%d-%H%M")}' == initialization date-time of prog mgr
    ftr_modules: list      = module names whose ftr file paths are activated in update_filenames()
    file_rootname: str     = root text used to build output filenames (prompted for if empty)

    Instance attributes
    -------------------
    t_timer_start: float   = (reused) ref time for fine tracking of module execution elapsed time
    model_n: int           = current split number being executed within the looping 'grid search'
    filenames: namedtuple  = (root base model ftr_paths submit_path output_path log_path)
    feats: LagFeatures()   = LagFeatures(self.split['features'])... formatted input info
    all_splits: OrderedDict = description of ALL splits to use in the program's 'grid search'
                             --> length = total n of modules = len(['module', 'eda', 'data', ...])
        key1: str          = module name, such as one of ('module','eda','data', ...)
        value1: dict       = description of all user-specified splits within key1's module
                             --> length = n splits (rows) 'exploding' key1's module's parameter df
            key2: int      = iter n (0 to n_rows-1) of key1's module's 'exploded' parameter df
            value2: ODict  = description of values chosen for key1's module's params
                             --> length = number of parameters in the key1 module
                key3: str  = parameter name
                value3:    = user-specified value for key3 param for key2 split in module key1
    split: OrderedDict     = description of parameters of ALL MODULES for a SINGLE split (model_n)
                             --> length = total number of parameter choices for all modules
                             (Overwritten as required when looping to a new iteration param set)
                             module_end() also stores each module's elapsed time here as 't_<module>'
        key: str           = parameter name
        value: (various)   = user-specified parameter value choice for the present iteration/split
    # disable scaling at this time...
    # robust_scalers: dict   = sklearn robust scaler for each relevant col of monthly_stt df*
    # minmax_scalers: dict   = sklearn minmax scaler for each relevant col of monthly_stt df*
    #     *shop_item_salesSum column's scalers are needed to inverse-transform the predictions
    memory_stats_log: list = instances of mem stats captures throughout the entire program
    <module>_dfs: dict     = DataFrames parked by store_dfs(module, ...), removed by delete_dfs()

    Methods
    -------
    mem_capture(program_loc="0", printout=False, single_row=True):
        Store immediate memory data in memory_stats_log in the form of a MemStats class
        instance.  Optional print of latest or all snapshots to console.
    format_memory_stats():
        Format all memory stats gathered into the log, for printing or file saving.
    print_memory_stats(single_row=True):
        Print all program mem stats to console, or just the most recent program location (default).
    format_log_mgr_info():
        Format relevant information for utilizing 'writelines' to write to a file or the console.
    write_log_mgr_info(param_dict=None, console=False)
        Write run information to the log file, and (optionally) to the console as well.
    update_filenames()
        Create named tuple for storing files with filenames adjusted by model_n.
    set_features():
        Using LagFeatures class, transform user input agg stats/lag features format for pandas.
    arrange_run_info(params, output_results):
        Explode splits, print details of run to console to help user verify that all is ok.
        Return a pd.DataFrame container for storage of output results - one row for each run split.
    module_start(module_name):
        Print short status and collect memstats; start timer to track module execution time.
    module_end(module_name, printout=True, single_row=False):
        Collect memory stats; print short status or (optionally) all mem stats since program start.
        Save elapsed time for module.
    store_dfs(module, dfdict):
        Store pd.DataFrames as a class instance attribute. Goal is to avoid pandas gc / RAM issue.
    delete_dfs(module):
        Delete instance attr, reverting to empty default attr. Goal is to avoid pandas gc issues.
    write_ftr_files(module, df_dict):
        Write a dictionary of dataframes ({name: df, name2: df2, ...}) to ftr files on disk.
        Fast-loading ftr file use is one way to help reclaim unnecessarily-reserved RAM for pandas.
    read_ftr_files(module):
        Read and return a dict of DataFrames from ftr files on disk. (Matches the above write_ftr.)
    """

    # Evaluated once, when the class body executes (not per instance)
    run_platform = OrderedDict([
        ('os', sys.platform),  # laptop == 'win32' or COLAB == 'linux'
        ('os_full', platform.platform()),
        ('runtime', 'Windows' if sys.platform == 'win32' else get_runtime_type()),
        ('gb_physical_dram', get_phys_dram() if sys.platform == 'win32' else 0),
        ('n_logical_cpu', psutil.cpu_count(logical=True)),
        ('n_physical_cpu', psutil.cpu_count(logical=False)),
        ('n_multiprocessing_cpu', mp.cpu_count()),
        ('chipset', platform.processor())])
    n_models = 1
    init_datetime = f'{time.strftime("%y%m%d-%H%M")}'
    ftr_modules = []
    file_rootname = ''

    def __init__(self, file_rootname, ftr_modules=None):
        """
        Configure pandas, record run identity on the class, and take the first memory snapshot.

        file_rootname: str  = root text for output filenames (falsy --> user is prompted later)
        ftr_modules: list   = module names that exchange DataFrames via ftr files (default: none)
        """
        set_pandas_options(opts=CONFIG.pd_opts)
        # Stored on the CLASS (not the instance), so the values are shared program-wide
        ProgramManager.file_rootname = file_rootname
        ProgramManager.ftr_modules = ftr_modules if ftr_modules else []
        ProgramManager.init_datetime = f'{time.strftime("%y%m%d-%H%M")}'
        self.t_timer_start = time.perf_counter()
        self.memory_stats_log = []
        self.mem_capture("Program Start")

        self.model_n = 0
        self.filenames = self.update_filenames()
        self.all_splits = OrderedDict()
        self.split = OrderedDict()
        self.feats = None
        # self.scalers = self.update_scalers(None, None)
        # self.robust_scalers = {}
        # self.minmax_scalers = {}
        # self.feature_names = None  # tbd, maybe not needed

    def mem_capture(self, program_loc="0", printout=False, single_row=True):
        """Gather memory consumption info.  Print if desired - latest row or all rows."""
        mem_stat = MemStats(program_loc)
        self.memory_stats_log.append(mem_stat)
        if printout:
            self.print_memory_stats(single_row)

    def format_memory_stats(self):
        """Format all memory stats gathered into the log, for printing or file saving."""
        mem_str_list = MemStats.print_memstats_header(console=False)  # ok for writelines()
        for mem_stat in self.memory_stats_log:
            mem_str_list.extend(mem_stat.print_memstats(console=False))
        return mem_str_list

    def print_memory_stats(self, single_row=True):
        """Print all program stats, or just the most recent program location (default)."""
        print_list = self.format_memory_stats()
        if single_row:
            # keep the 3 header lines plus only the newest snapshot row
            print_list = print_list[:3] + print_list[-1:]
        sys.stdout.writelines(print_list)

    def format_log_mgr_info(self):
        """Format relevant information for writing to a file or console, using 'writelines'."""
        log_data_list = dict_to_writelines_list(get_package_versions(pkg_list=CONFIG.packages),
                                                ': version ')
        log_data_list += ['\n'] + dict_to_writelines_list(self.run_platform)
        log_data_list += ['\n'] + self.format_memory_stats()
        return log_data_list

    def write_log_mgr_info(self, param_dict=None, console=False):
        """
        Write run information to the log file, and optionally echo it to the console.

        param_dict: dict  = if given (and non-empty), pretty-printed ahead of the log lines
        console: bool     = also write the same content to stdout
        The log file at self.filenames.log_path is overwritten on each call.
        """
        # Format once and reuse for both destinations (package lookups need not be repeated)
        log_lines = ['\n'] + self.format_log_mgr_info()
        with open(self.filenames.log_path, 'w+') as output_location:
            if param_dict:
                pprint(param_dict, output_location)
            output_location.writelines(log_lines)
        if console:  # write to the console
            if param_dict:
                pprint(param_dict)
            sys.stdout.writelines(log_lines)

    def update_filenames(self):
        """
        Modify filenames to reflect iteration number, etc., as we loop through main program.

        Prompts the user for a root filename if none has been set on the class.
        Returns an FNameContainer built from init_datetime, file_rootname and model_n.
        """
        # Get a basic filename for saving files with various suffixes
        if not ProgramManager.file_rootname:
            ProgramManager.file_rootname = input(
                "Enter the root filename to use for this run's outputs (e.g., 'v4shops' ): ")
        basename = f'{ProgramManager.init_datetime}_{ProgramManager.file_rootname}'
        modelname = f'{basename}_{self.model_n:02d}'
        submit_path = CONFIG.paths.outputs / f'{modelname}_submission.csv'
        output_path = CONFIG.paths.outputs / f'{modelname}_output.csv'
        log_path = CONFIG.paths.logs / f'{modelname}_log.txt'
        # Only modules listed in ftr_modules get their ftr paths activated
        active_ftr_paths = OrderedDict()
        for ftr_mod in ProgramManager.ftr_modules:
            active_ftr_paths[ftr_mod] = CONFIG.ftr_paths[ftr_mod]
        return FNameContainer(ProgramManager.file_rootname, basename, modelname,
                              active_ftr_paths, submit_path, output_path, log_path)

    def set_features(self):
        """Expand the input features list into pandas-friendly agg stats dicts and col lists."""
        self.feats = LagFeatures(self.split['features'])

    def arrange_run_info(self, params, output_results):
        """
        Explode splits, print details of run to console to help user verify that all is ok.

        params: dict          = {module_name: {parameter_name: parameter_value, ...}, ...}
        output_results: dict  = {output column: default value}; NOTE it is updated IN PLACE with
                                a None-valued entry for every parameter name found in 'params'
        Sets ProgramManager.n_models and self.all_splits.
        Returns a pd.DataFrame with one placeholder row per model to be trained.
        """
        # =============================================================================
        # Find the parameters that have splits in them, and print out to highlight them
        # =============================================================================
        split_summary = {}
        for module_dict in params.values():
            for parameter_name, parameter_value in module_dict.items():
                if parameter_name != 'features':  # features can be huge; print it below
                    splits = list_of_lists_splits(parameter_value)
                    if splits:
                        split_summary[parameter_name] = splits

        # =============================================================================
        # Explode the parameter dictionaries so each row is one iteration of modeling parameters
        # Keep modules isolated so we don't need to do extra calculations while looping
        # =============================================================================
        ProgramManager.n_models = 1
        self.all_splits = OrderedDict()
        for module_name, module_dict in params.items():
            module_df = pd.DataFrame(module_dict)
            for col in module_df.columns:
                module_df = module_df.explode(col)
            module_df.reset_index(drop=True, inplace=True)
            # to_dict('index'...) gives {int: {row}}, while to_dict('records'...) gives [{row}]
            self.all_splits[module_name] = module_df.to_dict('index', into=OrderedDict)
            ProgramManager.n_models *= module_df.shape[0]

        # Make placeholder rows in outputs_df so n_rows equals the number of models to be trained
        # Concat input params with outputs_df to require writing only one log file for entire run
        # Could also get [keys] as [*dict] or as d after 'list(map((d:=[]).extend, all_par_list))'
        input_params = dict.fromkeys(chain(*params.values()), None)  # {'p': None, ...} for ALL p
        output_results.update(input_params)
        # Build the repeated rows explicitly instead of relying on numpy to wrap a dict view
        # into a 0-d object array and repeat it
        placeholder_row = tuple(output_results.values())
        outputs_df = pd.DataFrame(
            [placeholder_row] * ProgramManager.n_models,
            columns=list(output_results.keys())).reset_index(drop=True)

        # =============================================================================
        # Print run 'size' and key splits
        # =============================================================================
        print(f'N train models: {ProgramManager.n_models}')
        print('Splits in this run (excluding features/lags):')
        pprint(split_summary)
        print('\n')
        self.mem_capture("Iteration Parameters Defined", printout=True, single_row=False)

        return outputs_df

    def module_start(self, module_name):
        """Print short status and collect memstats; start timer to track module execution time."""
        print(f'\nModel #{self.model_n + 1} of {ProgramManager.n_models}'
              f' -- Start {module_name} Module @ {tdstr()}')
        self.mem_capture(f'Start {module_name} Module')
        self.t_timer_start = time.perf_counter()

    def module_end(self, module_name, printout=True, single_row=False):
        """Print short status and collect memstats; save elapsed time for module."""
        self.mem_capture(f'End {module_name} Module', printout, single_row)
        elapsed = timer(self.t_timer_start)
        print(f'{module_name} Module complete; elapsed time = {elapsed}')
        self.split[f't_{module_name.lower()}'] = elapsed

    def store_dfs(self, module, dfdict):
        """Store pd.DataFrames as a class instance attr. Goal is to avoid pandas gc / RAM issue."""
        # Empty class-level default first, so the name still resolves (to {}) after delete_dfs()
        setattr(ProgramManager, f'{module.lower()}_dfs', {})
        setattr(self, f'{module.lower()}_dfs', dfdict)

    def delete_dfs(self, module):
        """
        Delete instance attr, reverting to empty default attr. Goal is to avoid pd gc issues.

        Safe to call for a module that never had DataFrames stored (nothing happens).
        """
        attr_name = f'{module.lower()}_dfs'
        # Default of {} covers modules for which store_dfs() was never called; without it the
        # lookup itself raised AttributeError before reaching the guarded delete below
        if getattr(self, attr_name, {}) != {}:
            try:
                delattr(self, attr_name)
            except AttributeError as exc:
                print(f'Unable to delete instance attr {module.lower()}_dfs: {exc}')

    def write_ftr_files(self, module, df_dict):
        """
        Write a dictionary of dataframes ({name: df, name2: df2, ...}) to ftr files on disk.

        Names and paths come from self.filenames.ftr_paths[module], a (names, paths) pair.
        Returns a short text message reporting the elapsed write time.
        """
        t_write_start = time.perf_counter()
        ftr_df_names, ftr_df_paths = self.filenames.ftr_paths[module]
        for ftr_df_name, ftr_df_path in zip(ftr_df_names, ftr_df_paths):
            df_dict[ftr_df_name].to_feather(ftr_df_path)
        return f'{module} ftr file write time: {timer(t_write_start)}'

    def read_ftr_files(self, module):
        """Read and return a dict of dataframes ({name: df, ...}) from ftr files on disk."""
        df_dict = {}
        ftr_df_names, ftr_df_paths = self.filenames.ftr_paths[module]
        for ftr_df_name, ftr_df_path in zip(ftr_df_names, ftr_df_paths):
            df_dict[ftr_df_name] = pd.read_feather(ftr_df_path, columns=None, use_threads=True)
        return df_dict

    # def update_scalers(self, robust, minmax):
    #     """Container for sklearn scalers to enable inverse transforming the final predictions."""
    #     return ScalersContainer(robust, minmax)


class MemStats:
    """
    A class to obtain and organize information on memory-related issues throughout the program.

    Inspired by pandas garbage collection issue where deleted dataframes can block memory access.
    ...

    Attributes
    ----------
    class attributes:
    max_meas_str_len, cell_widths, border:
        Direct the formatting of tabular output info for ALL collected instances of memory stats
        (cell_widths[1] grows to fit the longest program_loc seen by any instance)

    instance attributes:
    program_loc : str = description of which module / subroutine created this class instance
    pid_mem_use : float = RAM in use by this process per the OS, in GiB (bytes / 2**30)
    vm_used : float = RAM in use per the VM, in decimal GB (bytes / 1e9)
    vm_total : float = RAM in total in the VM, in decimal GB (bytes / 1e9)
    vm_available : float = vm_total - vm_used, in decimal GB
    active_proc : list of process objects
        info on which live children are still running with multiprocessing
        = empty list if processes are properly closed/joined
    date_time : string = date and time when the class instance was created

    NOTE(review): pid_mem_use (GiB) and the vm_* values (GB) use different divisors although the
    printed table labels every column 'GB'; values are left as-is -- confirm the intended unit.

    Methods
    -------
    get_ram(): Queries the OS and VM for memory use.
    get_mp_kids(): Query multiproc module to see if child processes are still running ('active')
    print_memstats_header(console=True): 3 line heading for tabular printout of memory stats
    print_memstats(console=True): Format (and optionally print) this instance's memory stats row
    """

    max_meas_str_len = len('Measurement Point')  # = 17
    cell_widths = [21, 0, 6, 7, 8, 8, 11]
    border = '|'

    def __init__(self, program_loc='Start'):
        """Snapshot memory use and live child processes, labelled with 'program_loc'."""
        self.program_loc = program_loc
        # Widen the shared 'Measurement Point' column so every row of the table stays aligned
        MemStats.max_meas_str_len = max(MemStats.max_meas_str_len, len(self.program_loc))
        MemStats.cell_widths[1] = MemStats.max_meas_str_len
        self.date_time = f'{tdstr()}'
        self.get_ram()
        self.get_mp_kids()

    def get_ram(self):
        """Get RAM usage as specified by psutil.Process(pid) and psutil.virtual_memory()."""
        pyproc = psutil.Process(os.getpid())
        # first field of memory_info() (rss, per the psutil docs), converted to GiB
        self.pid_mem_use = pyproc.memory_info()[0] / 2. ** 30
        # One snapshot, so 'used' and 'total' describe the same instant
        vm_snapshot = psutil.virtual_memory()
        self.vm_used = vm_snapshot.used / 1e9
        self.vm_total = vm_snapshot.total / 1e9
        self.vm_available = self.vm_total - self.vm_used

    def get_mp_kids(self):
        """Get a list of active child objects per the multiprocessing configuration being used."""
        # (Empty list if all multiprocesses are joined / closed properly.)
        self.active_proc = mp.active_children()
        # self.n_active_proc = len(self.active_proc)

    @classmethod
    def print_memstats_header(cls, console=True):
        """
        Build (and optionally print) the 3-line header for the memory stats table.

        Returns a list of three newline-terminated strings, suitable for writelines().
        """
        # cell_widths = [21, self.max_meas_str_len, 6, 7, 8, 8, 11]
        str_list1 = [' ', ' ', 'pid', ' ', 'vm', ' ', ' ']
        borders1 = [cls.border, ' ', cls.border, cls.border, ' ', ' ', cls.border, cls.border]
        str_list2 = ['Time and Date', 'Measurement Point', 'pid-GB',
                     'used-GB', 'avail-GB', 'total-GB', 'Active Proc']
        borders2 = [cls.border] * 8
        hdr_str1 = borders1[0]
        hdr_str2 = borders2[0]
        # Each cell is ' <centered text> ' followed by its right-hand border character
        for width, top_text, bottom_text, top_edge, bottom_edge in zip(
                cls.cell_widths, str_list1, str_list2, borders1[1:], borders2[1:]):
            hdr_str1 += f' {top_text:^{width}} {top_edge}'
            hdr_str2 += f' {bottom_text:^{width}} {bottom_edge}'
        hdr_str3 = f'{"-" * len(hdr_str1)}'
        if console:
            print(f'{hdr_str1}\n{hdr_str2}\n{hdr_str3}')
        return [f'{hdr_str1}\n', f'{hdr_str2}\n', f'{hdr_str3}\n']  # format good for writelines()

    def print_memstats(self, console=True):
        """
        Build (and optionally print) this instance's table row, aligned to match the header.

        Returns a one-element list holding the newline-terminated row, for writelines().
        """
        mem_str = (f'{self.border} {self.date_time:<{self.cell_widths[0]}} '
                   f'{self.border} {self.program_loc:<{self.cell_widths[1]}} '
                   f'{self.border} {self.pid_mem_use:>{self.cell_widths[2]}.2f} '
                   f'{self.border} {self.vm_used:>{self.cell_widths[3]}.2f} '
                   f'{self.border} {self.vm_available:>{self.cell_widths[4]}.2f} '
                   f'{self.border} {self.vm_total:>{self.cell_widths[5]}.2f} '
                   f'{self.border} {str(self.active_proc):^{self.cell_widths[6]}} '
                   f'{self.border}')
        if console:
            print(mem_str)
        return [f'{mem_str}\n']  # formatted for adoption by writelines()

# =============================================================================
# Some alternatives for memory tracking:
# =============================================================================
# https://www.toptal.com/python/python-class-attributes-an-overly-thorough-guide-item 3
#  or pympler for tracking instances
#
# =============================================================================
# For reference, here's what you get from psutil queries:
# =============================================================================
# psutil.swap_memory()
# Out[19]: sswap(total=58783318016, used=123, free=456, percent=18.2, sin=0, sout=0)
# psutil.virtual_memory()
# Out[20]: svmem(total=51267125248, available=123, percent=17.3, used=456, free=789)
# =============================================================================


## **kag_eda.py**

In [7]:
"""
Exploratory data analysis tweaks and related preprocessing before time-lag feature generation.

Issue: freeing up memory used by pandas dataframes that are no longer required by the program
    (del + gc.collect() does not reliably accomplish desired result, nor does re-defining the
     df as empty pd.DataFrame())
To keep from overloading Colab memory limits, we utilize multiprocessing function calls,
which use OS-level operations to discard unwanted dataframes, and reliably release unused RAM.

Created on Thu Oct 22 21:58:48 2020
@author: mgaidis
"""

# from collections import namedtuple, OrderedDict
# import multiprocessing as mp
# import pandas as pd
# import numpy as np
# from kag_config import CONFIG
# from kag_utils import df_dict_summary_to_writelines_list


# Picklable argument bundle for files_to_dfs_func: which file to read, which columns to keep
DataFileArgs = namedtuple('DataFileArgs', ['filepath', 'keep_cols'])


def files_to_dfs_func(load_dfs_args):
    """
    Read one csv data file and return only the requested columns, in the requested order.

    Written as a single-argument function so it can be mapped over a multiprocessing pool
    (helps with pandas gc issues and RAM use).

    Parameters
    ----------
    load_dfs_args : object with attributes
        filepath  = location of the csv file to read
        keep_cols = column labels to retain for this model iteration

    Returns
    -------
    pd.DataFrame holding just 'keep_cols' (KeyError if a requested column is absent).
    """
    full_table = pd.read_csv(load_dfs_args.filepath)
    return full_table[load_dfs_args.keep_cols]


def eda_cleanup(mgr):
    """
    Control the loading of data and application of residual exploratory data analysis (EDA) tweaks.

    1) Load pre-processed datafiles from paths defined in the "kag_config.py" module.
       At present, we utilize the following datafiles after EDA and feature augmentation
       described in the following 'utility' notebooks:
           'data_cleaning_and_eda_feature_merging_v2_mg.ipynb' --> stt.csv.gz
           'nlp_clustering_item_names_v1_june2020_mg.ipynb'    --> items_enc.csv
           'time_item_category_shop_correlations_v10_mg.ipynb' --> shops_enc.csv
           'calculate_days_per_month.ipynb',
           'EDA_sales_by_day_of_week_mg.ipynb'                 --> date_scaling.csv
           [no modifications]                                  --> test.csv.gz

       stt.csv.gz -> stt = sales_train_test, with additional datetime feature columns
       items_enc.csv -> items_enc = items with encoded categorical feature additions,
       shops_enc.csv -> shops_enc = shops with encoded categorical feature additions,
       date_scaling.csv -> date_scaling = datetime cols and associated retail scaling features
       test.csv.gz -> test = template needed to format competition predictions for submission

    2) Modify DataFrames using splits defined for the present model iteration
       (drop unneeded features, merge into stt, scale according to date, adjust datatypes)
        a) Use 'multiprocessing' python configuration to help ensure pandas dataframes release RAM
           back to the VM when they are no longer needed (simple 'del' command is unreliable;
           only workaround I have found to work is to multiprocess to call a new process, which
           releases all memory *at the OS level* when process is complete)
        b) Also provide option to save quick-loading "feather" files and eliminate need to
            keep all dataframes in RAM at all times.  This is particularly important when one is
            RAM-limited, as is a not-infrequent occurrence with big data sets in Google Colab.

    Parameters
    ----------
    mgr : ProgramManager-like object; this function reads mgr.split ('scale_sales', 'del_shops',
          'del_item_cats'), mgr.feats.cols ('keep', 'int_feats', 'final_stt') and mgr.ftr_modules.

    Returns
    -------
    mgr : the same object, with mgr.test_df set and the EDA DataFrames either written to ftr
          files (when 'eda' is in mgr.ftr_modules) or stored on it via mgr.store_dfs('eda', ...).
    """
    print("Loading Files from repo into VM...\n")
    scale_col = mgr.split['scale_sales']
    date_scaling_keep = mgr.feats.cols['keep']['date_scaling']
    # Ask for the scaling column only when scaling is switched on, and never twice: a falsy
    # entry (scaling off) or a duplicated label in the keep-list breaks the column selection
    if scale_col and scale_col not in date_scaling_keep:
        date_scaling_keep.append(scale_col)
    data_names = [*CONFIG.data]
    load_dfs_arglist = [DataFileArgs(CONFIG.data[name].filepath, mgr.feats.cols['keep'][name])
                        for name in data_names]
    # [x.filepath for x in CONFIG.data.values()])))

    # ============================================================================================
    #   Load data from storage into pandas DataFrames in RAM / VM
    # ============================================================================================
    # ~~~~  Multiprocessing Block  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    with mp.Pool() as cruncher:
        loaded_dfs = cruncher.map(files_to_dfs_func, load_dfs_arglist)  # blocks until all loaded
        # close/join INSIDE the 'with' so workers exit cleanly; leaving the block terminates
        # the pool, after which close() has nothing left to do
        cruncher.close()
        cruncher.join()
    eda_dfs = OrderedDict(zip(data_names, loaded_dfs))
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    mgr.mem_capture('Joined load_dfs mp')

    mgr.test_df = eda_dfs.pop('test')  # save for future use in formatting Coursera submission
    df_dict_summary_to_writelines_list(eda_dfs, colinfo=False, console=True)  # quick df check

    # ============================================================================================
    #   Merge shops and items into stt
    # ============================================================================================
    eda_dfs['stt'] = eda_dfs['stt'].merge(eda_dfs['shops_enc'], on='shop_id', how='left')
    eda_dfs['stt'] = eda_dfs['stt'].merge(eda_dfs['items_enc'], on='item_id', how='left')
    mgr.mem_capture(program_loc="Merged stt with shops and items")
    df_dict_summary_to_writelines_list({'merged stt': eda_dfs['stt']}, colinfo=True, console=True)

    # ============================================================================================
    #   Delete unwanted shops, item cats; scale sales for days in month, etc.
    # ============================================================================================
    print('----------\n')
    if mgr.split['del_shops']:  # drop undesirable shops
        eda_dfs['stt'] = eda_dfs['stt'].query(f'shop_id != {mgr.split["del_shops"]}')
        print(f'stt shape after deleting shops #{mgr.split["del_shops"]}: {eda_dfs["stt"].shape}')
    if mgr.split['del_item_cats']:  # drop undesirable item categories
        eda_dfs['stt'] = eda_dfs['stt'].query(f'item_category_id != {mgr.split["del_item_cats"]}')
        print(f'stt shape after deleting item categories {mgr.split["del_item_cats"]}: '
              f'{eda_dfs["stt"].shape}\n')
    if scale_col:  # scale by date_scaling as desired
        eda_dfs['stt'] = eda_dfs['stt'].merge(
            eda_dfs['date_scaling'][['month', scale_col]], on='month', how='left')
        eda_dfs['stt']['sales'] = eda_dfs['stt']['sales'] * eda_dfs['stt'][scale_col]
        eda_dfs['stt'] = eda_dfs['stt'].drop(columns=scale_col)  # weight no longer needed

    # ============================================================================================
    #   Insert revenue feature; adjust data types; drop unnecessary cols; set desired col order
    # ============================================================================================
    eda_dfs['stt']['rev'] = eda_dfs['stt'].sales * eda_dfs['stt'].price / 1000
    # float so date_scaling weight is accurate; can use price; divide by 1000 for reasonable range
    # (.astype returns a NEW frame, so the result must be assigned back to take effect)
    float_feats = ['sales', 'price', 'rev']
    eda_dfs['stt'][float_feats] = eda_dfs['stt'][float_feats].astype(np.float32)
    # reduce RAM footprint at this point by downcasting integer features
    eda_dfs['stt'][mgr.feats.cols['int_feats']] = (eda_dfs['stt'][mgr.feats.cols['int_feats']]
                                                   .astype(np.int16))
    # drop unnecessary columns and order the others as desired
    eda_dfs['stt'] = (eda_dfs['stt'][mgr.feats.cols['final_stt']]
                      .reset_index(drop=True))  # reset index saves 25MB
    df_dict_summary_to_writelines_list({'final stt': eda_dfs['stt']}, colinfo=True, console=True)

    mgr.mem_capture(program_loc="Completed stt transformations")

    # ============================================================================================
    #   Save ftr files or return dataframes
    # ============================================================================================
    if 'eda' in mgr.ftr_modules:
        mgr.write_ftr_files('eda', eda_dfs)
        mgr.mem_capture(program_loc="EDA .ftr Saved")
    else:
        mgr.store_dfs('eda', eda_dfs)

    return mgr


## **kag_data.py**

In [8]:
"""
Group by month, generate new statistical and time-lagged features, and merge with original data.

    1) Group training/validation data by months, computing statistics while aggregating
        : min_feat_groups_stats_set: dicts formatted to be easily fed into pandas group/agg funcs.
        : After performing group/agg on this list, then we apply lags as desired to
          each of the various elements, without re-computing the stats for each lag month.
    2) Add Cartesian Product fill using unique 'shop_id' and 'item_id' values from each month
        : Create dummy rows of relevant month-shop-item combinations to provide model with explicit
          'knowledge' of nonexistent transactions.
          This can be a RAM abuser, so limit the months of insertion to only the most important
          (plus additional earlier rows to account for maximum number of months to lag features).
        : If RAM is abundant, one can indicate that cp rows should include all shop-item pairs
          from the test set in addition to the default inclusion of only the shop-item pairs
          made from cp of unique shop_ids and unique item_ids within a given month.
        : Convention during addition of lag statistics is presently to drop any lagged rows where
          the source month shop-item pair is not also present in the destination month.
    3) Merge time-lagged statistics, discarding those that don't have a match with an
        existing shop-item pair in destination month
    4) Clean, Sort, Clip, Scale, and Datatype the monthly_stt df
        : (tbd?? Clip per Kaggle competition recommendations to range [0,20])
        : Note that int dtypes cannot represent N/A values; use fillna(0) or use np.float32 dtype.
          (Price as a feature is not ideal if filling with 0; better to use revenue instead.)
        : Scale the feature columns for better use of full range of available datatype values
          (np.int16, np.int8, np.uint16,...) unless float
        : Use np.iinfo(np.int32) or np.finfo(np.float32) e.g., to get numerical range of np dtypes

    : Make frequent use of multiprocessing pools such that pandas DataFrame garbage collection is
      not an issue (RAM is reclaimed as soon as pool is closed/joined, whereas pd is unreliable).
      By creating iterable sequences to pass to multiprocessing.map(), we can also speed things up.
    : Note that the simple action of .reset_index(drop=True) on a pd df can save 25 MB RAM

Created on Fri Oct 23 05:22:45 2020
@author: mgaidis
"""

# from collections import namedtuple
# from itertools import product
# import multiprocessing as mp
# import numpy as np
# import pandas as pd
# from kag_utils import col_info

# Argument bundles for the multiprocessing workers below: one instance per pool task
#   LagArgs -> copy_shift_type_lag_cols ;  CPArgs -> np_cartesian_product
LagArgs = namedtuple('LagArgs',
                     ['df', 'n_months', 'col_dict', 'shift_eval_str', 'shift_cols'])
CPArgs = namedtuple('CPArgs', ['month', 'monthly_stt', 'query_str'])


def group_and_agg(stt, group_agg_dict):
    """
    Aggregate stt over one grouping and return the result as a flat DataFrame.

    group_agg_dict supplies three entries:
        'group':     column name(s) handed to DataFrame.groupby
        'stats':     aggregation spec handed to .agg
        'col_names': names assigned to the aggregated columns, in the order .agg produced them
    The grouping keys come back as ordinary columns (the index is reset).
    """
    by_group = stt.groupby(group_agg_dict['group'])
    summary = by_group.agg(group_agg_dict['stats'])
    # Replace whatever headers agg() produced with the caller-supplied feature names
    summary.columns = group_agg_dict['col_names']
    return summary.reset_index()


def create_and_merge_groups(stt, agg_dict):
    """
    Aggregate stt once per entry of agg_dict and merge everything into one DataFrame.

    inputs:     stt: DataFrame to be grouped and aggregated
                agg_dict: mapping of group label -> group spec (a dict with 'group', 'stats'
                    and 'col_names' entries, as consumed by group_and_agg).  It must contain
                    the label 'shop_item', whose aggregation forms the base table.

    outputs:    DataFrame holding the 'shop_item' aggregation, with every other group's
                    aggregation left-merged onto it using that group's 'group' columns as keys

    agg_dict is only read, never modified, so the same dict can be reused for later calls.
    """
    # Initialize monthly_stt to capture standard non-statistical features, then continue
    #   (look the entry up rather than pop it, so the caller's dict is left intact)
    mo_stt = group_and_agg(stt, agg_dict['shop_item'])
    for group_label, group_dict in agg_dict.items():
        if group_label == 'shop_item':
            continue  # already used as the base table above
        mo_stt = mo_stt.merge(group_and_agg(stt, group_dict),
                              on=group_dict['group'], how='left')
    return mo_stt


def np_cartesian_product(cp_args):
    """
    Build the Cartesian Product rows for a single month.

    cp_args is a CPArgs namedtuple ('month monthly_stt query_str'):
        month:       value written into the first column of every output row
        monthly_stt: DataFrame providing 'month', 'shop_id' and 'item_id' columns
        query_str:   DataFrame.query expression selecting the rows whose shops/items are used
    Returns an np.int16 array of [month, shop_id, item_id] rows, one for every pairing of a
    unique selected shop_id with a unique selected item_id (item_id varies fastest).
    """
    id_cols = cp_args.monthly_stt[['month', 'shop_id', 'item_id']]
    selected = id_cols.query(cp_args.query_str)
    shop_ids = selected.shop_id.unique()
    item_ids = selected.item_id.unique()
    month_shop_item = [(cp_args.month, shop, item) for shop in shop_ids for item in item_ids]
    return np.array(month_shop_item, dtype=np.int16)


def copy_shift_type_lag_cols(lag_args):
    """
    Build the lagged-feature DataFrame for one lag value, ready to merge with monthly_stt.

    lag_args is a LagArgs namedtuple ('df n_months col_dict shift_eval_str shift_cols'):
        df:         source DataFrame (left unmodified; a deep copy of the selection is made)
        shift_cols: columns to carry over; must include 'month'
        col_dict:   {old name: new name} mapping used to rename the carried-over columns
        n_months:   amount added to 'month', so each row lines up with its destination month
    (shift_eval_str is not used here.)
    """
    lagged = lag_args.df.loc[:, lag_args.shift_cols].copy(deep=True)
    lagged = lagged.rename(columns=lag_args.col_dict)
    lagged['month'] = lagged['month'].add(lag_args.n_months)
    return lagged


# =============================================================================
#     Data Conditioning and Statistical Feature Generation - Main Module
# =============================================================================
def data_conditioning(mgr):
    """
    Generate features and condition data according to parameter settings.

    inputs:     mgr: program manager object.  This function reads
                    mgr.ftr_modules (which modules exchange data through feather files),
                    mgr.split ('clip_train', 'cp_first_mo', 'cp_test_pairs', 'cp_fillna0',
                        'feat_dtype'),
                    mgr.feats (min_feat_groups_stats_set, lag_month_dict, cols),
                    and the EDA DataFrames 'stt' (sales train test), 'shops_enc', 'items_enc',
                        taken from feather files or from mgr.eda_dfs.
                    These parameters guide the grouping, aggregating of stats, clipping,
                    adding cartesian product rows, and creating/merging time-lagged features.

    outputs:    mgr, after the monthly_stt DataFrame (grouped by month, cp rows added,
                    lagged stats added, 'y_target' column added) has been written to feather
                    files or stored via mgr.store_dfs
                (disabled for now...  robust_scalers and minmax_scalers
                    necessary to inverse transform model predictions before submission)

    side effect: the 'sales' column of the EDA 'stt' DataFrame is clipped in place.
    """
    # =============================================================================
    #     Load ftr files if necessary
    # =============================================================================
    if 'eda' in mgr.ftr_modules:
        eda_dfs = mgr.read_ftr_files('eda')
        mgr.mem_capture(program_loc="EDA .ftr Loaded")
    else:
        eda_dfs = mgr.eda_dfs

    # Prefer to clip to Kaggle recommendation [0,20] sales per month only at final prediction time
    #   For now, clip sales per day just to eliminate crazy outliers, particularly to avoid issues
    #   with outliers in initial data propagating weirdness through agg/group stats. Keep negatives.
    eda_dfs['stt'].sales = eda_dfs['stt'].sales.clip(*mgr.split['clip_train'])

    # =============================================================================
    # 1. Aggregate the Monthly Stats (multiproc for RAM/gc issues, not necessarily for speed)
    # =============================================================================
    # ~~~~  Multiprocessing Block  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    with mp.Pool() as cruncher:
        monthly_stt = cruncher.apply(create_and_merge_groups,
                                     [eda_dfs['stt'], mgr.feats.min_feat_groups_stats_set])
        # Shut the workers down gracefully while the pool is still open: leaving the 'with'
        #   block calls terminate(), so close()/join() only have an effect inside it
        cruncher.close()
        cruncher.join()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    print(f'\nmonthly_stt minimal agg stats grouped and merged: shape = {monthly_stt.shape}')
    mgr.mem_capture('Joined Agg mp')
    # Make sure pandas hasn't sucked up too much RAM
    monthly_stt = monthly_stt.astype(np.float32)
    monthly_stt[mgr.feats.cols['int_feats']] = \
        monthly_stt[mgr.feats.cols['int_feats']].astype(np.int16)

    # =============================================================================
    # 2. Cartesian Product (cp) Rows Insertion
    # =============================================================================
    # Start early enough that the largest lag still finds cp rows for month 'cp_first_mo';
    #   months run up to 33 (month 34 is the test month and gets no cp rows of its own)
    cp_arg_list = []
    for cp_month in range(max((mgr.split['cp_first_mo'] - max(mgr.feats.lag_month_dict)), 0), 34):
        query_str = f'(month == {cp_month})'
        if mgr.split['cp_test_pairs']:
            # also draw shop/item ids from the test month when building this month's cp rows
            query_str = query_str + '|(month == 34)'
        cp_arg_list.append(CPArgs(cp_month, monthly_stt, query_str))

    # ~~~~  Multiprocessing Block  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    with mp.Pool() as cruncher:
        cp_array_list = cruncher.map(np_cartesian_product, cp_arg_list)
        cruncher.close()
        cruncher.join()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    mgr.mem_capture('Joined CP mp')

    # Merge cartesian product rows into monthly_stt -- will replace 'no_lag' cols afterwards
    monthly_stt = (monthly_stt.drop(mgr.feats.cols['no_lag'], axis=1)
                              .merge(pd.DataFrame(np.vstack(cp_array_list),
                                                  columns=['month', 'shop_id', 'item_id']),
                                     how='outer', on=['month', 'shop_id', 'item_id']))
    # Insert categorical features info into (empty) CP rows, re-inserting 'no_lag' columns
    monthly_stt = monthly_stt.merge(eda_dfs['shops_enc'], how='left', on='shop_id')
    monthly_stt = monthly_stt.merge(eda_dfs['items_enc'], how='left', on='item_id')
    monthly_stt = ((monthly_stt[mgr.feats.cols['int_feats'] + mgr.feats.cols['min_agg_cols']])
                   .sort_values(['month', 'shop_id', 'item_id']).reset_index(drop=True))
    mgr.mem_capture('Cartesian Product Rows Added')
    col_info(monthly_stt, 2, target_df_name='monthly_stt (with CP rows)')
    print(f'\nNumber of months: {monthly_stt.month.nunique():,d}\n'
          f'Number of shops: {monthly_stt.shop_id.nunique():,d}\n'
          f'Number of items: {monthly_stt.item_id.nunique():,d}\n'
          f'Number of df rows: {len(monthly_stt):,d}\n'
          f'{monthly_stt.describe()}')

    # =============================================================================
    # 3. Merge Time-Lag Features
    # =============================================================================
    print(f'Pre-lag monthly_stt DataFrame length: {len(monthly_stt):,d}\n')
    lag_arg_list = []
    for lag_mo, lag_dict in mgr.feats.lag_month_dict.items():
        # lag_dict maps each source column to its lagged-column name; its keys pick the columns
        lag_arg_list.append(LagArgs(monthly_stt, lag_mo, lag_dict,
                                    shift_eval_str=f'month = month + {lag_mo}',
                                    shift_cols=['month', 'shop_id', 'item_id'] + [*lag_dict]))

    # Get all lag col features in a list of dfs (each list element <--> a given lag month number)
    # ~~~~  Multiprocessing Block  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    with mp.Pool() as cruncher:
        lag_dfs_list = cruncher.map(copy_shift_type_lag_cols, lag_arg_list)
        cruncher.close()
        cruncher.join()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    mgr.mem_capture('Joined Lag mp')

    # The un-lagged shop-item sales sum is the prediction target; the remaining un-lagged
    #   aggregate columns are dropped, leaving only their lagged copies as features
    monthly_stt['y_target'] = monthly_stt.shop_item__salesSum.astype(np.float32)
    monthly_stt = monthly_stt.drop(mgr.feats.cols['min_agg_cols'], axis=1)
    for lagidx, lag_df in enumerate(lag_dfs_list):
        print(f'\nNow merging lag month = {lag_arg_list[lagidx].n_months}...')
        # left merge: lagged rows with no matching shop-item pair in the destination month drop out
        monthly_stt = monthly_stt.merge(lag_df, on=['month', 'shop_id', 'item_id'], how='left')
    # Integer feature dtypes cannot hold NaN, so zero-fill is forced in that case
    if mgr.split['cp_fillna0'] or np.issubdtype(mgr.split['feat_dtype'], np.integer):
        monthly_stt = monthly_stt.fillna(0)
    mgr.mem_capture('Time Lag Features Added')

    # =============================================================================
    #     Save ftr files or return dataframes
    # =============================================================================
    if 'data' in mgr.ftr_modules:
        mgr.write_ftr_files('data', {'monthly_stt': monthly_stt})
        mgr.mem_capture(program_loc="Data .ftr Saved")
    else:
        mgr.store_dfs('data', {'monthly_stt': monthly_stt})

    return mgr


# =============================================================================
# sklearn_preprocessing scaling...
# import math
# from sklearn.preprocessing import MinMaxScaler, RobustScaler
# =============================================================================
# Disabled for now: then do Robust Scaling (if desired) to squeeze outliers into central
#   distribution, then do MinMax Scaling (if desired) to make best use of full range of
#   datatype being used.  Keep in mind that smaller ranges will converge faster during GBDT
#   fitting, so do not extend range too much.
#   Reverse the order of operations when unscaling/clipping after model predictions.)
# # =============================================================================
# # Scaling / Transforming is temporarily disabled until a good use case is found
# # =============================================================================
#     if robust_qtiles:
#         for aggcol in mgr.feats.cols['min_agg_cols']:
#             mgr.robust_scalers[aggcol] = RobustScaler(with_centering=False,
#                                                       quantile_range=robust_qtiles)
#             monthly_stt[aggcol] = (mgr.robust_scalers[aggcol].
#                                    fit_transform(
#                                    monthly_stt[aggcol].to_numpy().reshape(-1, 1)))
#     if minmax_range:
#         for aggcol in mgr.feats.cols['min_agg_cols']:
#             mgr.minmax_scalers[aggcol] = MinMaxScaler(feature_range=minmax_range)
#             monthly_stt[aggcol] = (mgr.minmax_scalers[aggcol].
#                                    fit_transform(
#                                    monthly_stt[aggcol].to_numpy().reshape(-1, 1)))
# # =============================================================================
# # new
# # =============================================================================
#             monthly_stt[aggcol] = monthly_stt[aggcol].apply(lambda x: math.ceil(x))
#     if mgr.split['feat_dtype'] in [np.int16, np.uint16]:
#         monthly_stt = monthly_stt.fillna(0).round()
# =============================================================================

# =============================================================================
#         # np.int16 #.apply(pd.to_numeric, downcast= np.float32)
#         # store as integers to save memory, but INTs cannot handle NA (so, do not use price)
# =============================================================================

# =============================================================================
# from functools import partial  # partial 'binds' arguments to a function; avoid need to pass them
# from functools import reduce
# # def copy_shift_type_lag_cols(lag_args):  # {'mo_stt': monthly_stt, 'lag', 'cols', 'dtype'}
# #     """Create df for a given number of lag months, to merge with monthly_stt."""
# #     cols_to_shift = ['month', 'shop_id', 'item_id'] + list(lag_args['cols'].keys())
# #     lag_df = lag_args['mo_stt'][cols_to_shift].copy(deep=True).rename(lag_args['cols'])
# #     lag = lag_args['lag']
# #     lag_df.eval('month = month + @lag', inplace=True).astype(lag_args['dtype'])
# #     return lag_df
# # def worker_wrapper(arg):
# #     args, kwargs = arg
# #     return worker(*args, **kwargs)
# # def multiproc_arg_wrap(func_arg):
# #     func
#         # lag_dfs_list = lag_pool.map(partial(copy_shift_type_lag_cols,
#         #                                     **{'mo_stt': monthly_stt,
#         #                                        'data_type': mgr.split['feat_dtype']}),
#         #                             arg_list)
# =============================================================================

# print(f'\nmonthly_stt.head:\n{monthly_stt.head(2)}')
# print(f'\nmonthly_stt.tail:\n{monthly_stt.tail(2)}')


## **kag_tvt.py**

In [9]:
"""
Create Train-Validation-Test split to feed into model fitting and prediction.

Created on Fri Oct 23 05:23:40 2020
@author: mgaidis
"""

# import numpy as np
# import pandas as pd
# from kag_utils import col_info


def create_tvt_split(mgr):
    """
    Create Train-Validation-Test split to feed into model fitting and prediction.

    inputs:     mgr: program manager object.  This function reads
                    mgr.ftr_modules (which modules exchange data through feather files),
                    mgr.split ('tr_start_mo', 'tr_final_mo', 'val_months', 'feat_dtype',
                        'use_categorical'),
                    mgr.feats (all_lag_features, cols['int_feats'], cols['cat_feats']),
                    and the 'monthly_stt' DataFrame, taken from feather files or popped
                        from mgr.data_dfs.

    outputs:    mgr, after the dict {train_X, test_X, val_X, val_y, train_y} has been written
                    to feather files or stored via mgr.store_dfs, and mgr.feature_names has
                    been set to the columns of train_X
    """
    # =============================================================================
    #     Load ftr files if necessary
    # =============================================================================
    tvt_dfs = {}
    if 'data' in mgr.ftr_modules:
        tvt_dfs['train_X'] = mgr.read_ftr_files('data').pop('monthly_stt')
        mgr.mem_capture(program_loc="monthly_stt .ftr Loaded")
    else:
        tvt_dfs['train_X'] = mgr.data_dfs.pop('monthly_stt')
    # =============================================================================
    # tvt_dfs['train_X'] contains everything at this point... now we slice it into train/val/test
    # Trying to minimize n variable names to save memory given pd garbage collection issue
    # =============================================================================
    feat_dtype = mgr.split['feat_dtype']

    # if mgr.split['model_type'] == 'LGBM':  # split is sensitive to time-ordered data
    # =============================================================================
    # 1. Remove early months that don't participate in model training (for both X and y)
    # =============================================================================
    # Chained (non-inplace) reset_index gives a fresh frame, so the column assignments below
    #   are not made on a filtered view of the original data
    tvt_dfs['train_X'] = (tvt_dfs['train_X']
                          .query(f'month >= {mgr.split["tr_start_mo"]}')
                          .reset_index(drop=True))

    # =============================================================================
    # 2. Discretize (integer from float) all the non-categorical columns except y-target
    # =============================================================================
    if np.issubdtype(feat_dtype, np.integer):
        # number of equal-width bins; bin indices then run from 0 to (dtype max - 2)
        full_pos_range = np.iinfo(feat_dtype).max - 1
        for ncol in mgr.feats.all_lag_features:
            # negatives are clipped to 0 before binning; labels=False returns the bin index
            tvt_dfs['train_X'][ncol] = pd.cut(tvt_dfs['train_X'][ncol].clip(lower=0),
                                              full_pos_range, labels=False).astype(feat_dtype)

    print(f'\nTVT df after discretizing (float -> int) and removing unused (early) months:'
          f'shape = {tvt_dfs["train_X"].shape}\n')
    col_info(tvt_dfs['train_X'], 2, target_df_name='Full Train-Val-Test DataFrame')
    print(f'\nfinal tvt.head():\n{tvt_dfs["train_X"].head()}\n')
    mgr.mem_capture('TVT about to split', True, False)

    # =============================================================================
    # 3. Create the test input data set by selecting just the test month from all data
    #     Also, delete the y (prediction) column, as it is unknown for the test month
    # =============================================================================
    tvt_dfs['test_X'] = tvt_dfs['train_X'].query('month == 34').drop('y_target', axis=1)

    # =============================================================================
    # 4. Scrape off the validation data set from the remaining training data
    #    val_months == 999 means 'all months from end of training + month 33 at minimum'
    #    (otherwise, use only 'val_months' months after training) (val_months = 1, 2, 3, ...)
    # =============================================================================
    if mgr.split['val_months'] == 999:
        tvt_dfs['val_X'] = tvt_dfs['train_X'].query(
            f'((month > {mgr.split["tr_final_mo"]}) & (month < 34)) | (month == 33)')
    else:
        tvt_dfs['val_X'] = tvt_dfs['train_X'].query(
            f'(month > {mgr.split["tr_final_mo"]}) &'
            f'(month <= {mgr.split["tr_final_mo"] + mgr.split["val_months"]}) &'
            f'(month < 34)')

    # =============================================================================
    # 5. Separate the y target prediction column from the input data columns X
    # =============================================================================
    tvt_dfs['val_y'] = tvt_dfs['val_X'].pop('y_target')

    # =============================================================================
    # 6. Drop the undesired rows from the train data, and separate the y column
    # =============================================================================
    tvt_dfs['train_X'] = tvt_dfs['train_X'].query(f'month <= {mgr.split["tr_final_mo"]}')
    tvt_dfs['train_y'] = tvt_dfs['train_X'].pop('y_target')

    col_info(tvt_dfs['train_X'], 2, target_df_name='tvt train_X')
    # print(tvt_dfs['train_X'].head(2))
    col_info(tvt_dfs['train_y'], 2, target_df_name='tvt train_y')
    # print(tvt_dfs['train_y'].head(2))

    # =============================================================================
    # 7. Make sure all data sets are properly categorized and typed
    #    'target y' can be high accuracy; 'int_cats' and properly-scaled feats can be int
    # =============================================================================
    int_feats = list(mgr.feats.cols['int_feats'])
    for data_set in tvt_dfs:
        if data_set[-1] == 'y':
            tvt_dfs[data_set] = tvt_dfs[data_set].astype(np.float32).reset_index(drop=True)
        else:
            # Cast each column exactly once: 'int_feats' go straight to int16 and never pass
            #   through feat_dtype, which would corrupt ids whenever feat_dtype cannot hold
            #   them exactly (e.g. np.int8 or np.float16); everything else gets feat_dtype.
            #   A missing 'int_feats' column still raises KeyError inside astype.
            dtype_map = {col: feat_dtype for col in tvt_dfs[data_set].columns}
            dtype_map.update({col: np.int16 for col in int_feats})
            tvt_dfs[data_set] = tvt_dfs[data_set].astype(dtype_map).reset_index(drop=True)
            if mgr.split['use_categorical']:
                tvt_dfs[data_set][mgr.feats.cols['cat_feats']] = \
                    tvt_dfs[data_set][mgr.feats.cols['cat_feats']].astype('category')

    # =============================================================================
    #     Save ftr files or return dataframes
    # =============================================================================
    if 'tvt' in mgr.ftr_modules:
        mgr.write_ftr_files('tvt', tvt_dfs)
        mgr.mem_capture(program_loc="TVT .ftr Saved")
    else:
        mgr.store_dfs('tvt', tvt_dfs)

    mgr.feature_names = tvt_dfs['train_X'].columns  # not needed?... newer lgbm versions can use gbm.feature_name_, but not yet in colab
    return mgr

# =============================================================================
# 2. Discretize (integer from float) all the non-categorical columns except y-target
# =============================================================================
# Particularly with decision trees, we don't need to use floats for the regression
#   accuracy required by this competition.  np.int16 (range = -32768 to 32767) gives
#   plenty of feature precision for all of our non-categorical features.  Using int16
#   will presumably help us with RAM limitations and with speed of model fitting, so
#   here the non-categorical features are discretized to fit mgr.split['feat_dtype'],
#   and at the same time (for giggles?) drop negative values for sum/med/cnt, etc.
#   The more-conventional int16 is used rather than uint16 to protect against potential
#   compatibility issues with model fitting.
# As the predictions and comparisons with labeled values are separate from the
#   decision-tree methods that handle input features, we can keep the "y_target"
#   values as float32, and eliminate the need to do any inverse scaling at the end.


## **kag_model.py**

In [14]:
"""
Perform Setup, Fit/Train, and Prediction with a gradient-boosted decision tree (GBDT) model.

Created on Fri Oct 23 05:24:27 2020
@author: mgaidis
"""

# import time
# import numpy as np
# import pandas as pd
# =============================================================================
#       ML Packages
# =============================================================================
import lightgbm as lgb  # LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
# explicitly require expt feature before importing HistGradientBoostingRegressor
# from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# from sklearn.ensemble import HistGradientBoostingRegressor

# from kag_utils import tdstr, timer, AttrDict


# =============================================================================
# Disabled for Now
# def unscale(scaler, target_dict):
#     """Reverse the scaling done prior to model fitting."""
#     for name, data in target_dict.items():
#         target_dict[name] = scaler.inverse_transform(data.reshape(-1, 1)).squeeze()
#     return target_dict
# =============================================================================


def gbdt_model(mgr, gbdt_setup_params, gbdt_fit_params):
    """
    Perform Setup, Fit/Train, and Prediction with a gradient-boosted decision tree model.

    inputs:     mgr: program manager object.  This function reads mgr.ftr_modules,
                    mgr.split['clip_predict'], mgr.feature_names, mgr.test_df,
                    mgr.filenames.submit_path, and the 'tvt' data sets (train_X, train_y,
                    val_X, val_y, test_X), taken from feather files or from mgr.tvt_dfs.
                gbdt_setup_params: keyword arguments passed to lgb.LGBMRegressor(...)
                gbdt_fit_params: extra keyword arguments passed to LGBMRegressor.fit(...)

    outputs:    (mgr, output_dict) where output_dict is an AttrDict holding fit/predict times,
                    best_iteration_, best_score_ (validation rmse), feature names and
                    importances, and train/val R^2 and RMSE computed on clipped predictions

    side effect: writes the submission csv to mgr.filenames.submit_path

    lgbm model = LightGBM is a particular case of a gradient-boosted decision tree model,
        so it is a subroutine in this GBDT function.
        Other GBMs can also be coded in this module.
    """
    output_dict = AttrDict()  # temporary storage of metrics and results
    y_predictions = AttrDict()
    y_labels = AttrDict()

    # =============================================================================
    #     Load ftr files if necessary
    # =============================================================================
    if 'tvt' in mgr.ftr_modules:
        tvt_dfs = mgr.read_ftr_files('tvt')
        mgr.mem_capture(program_loc="TVT .ftr Loaded")
    else:
        tvt_dfs = mgr.tvt_dfs

    # LightGBM is the only model type implemented so far; other GBDT types would branch
    #   here (e.g., on mgr.current_model['model_type'] == 'LGBM')
    t0_model_fit = time.perf_counter()
    # =============================================================================
    #     LGBM Model Setup
    # =============================================================================
    lgbm = lgb.LGBMRegressor(**gbdt_setup_params)

    # =============================================================================
    #     LGBM Model Fitting
    # =============================================================================
    lgbm.fit(
        tvt_dfs['train_X'],  # Input feature matrix or df 'train_X' [n_samples, n_features]
        tvt_dfs['train_y'],  # Target values 'train_y' (shape = [n_samples])
        eval_set=[(tvt_dfs['val_X'], tvt_dfs['val_y'])],  # list [(val_X, val_y)]
        eval_names=None,  # Names of eval_set (list of strs or None, opt. (default=None))
        **gbdt_fit_params)

    # =============================================================================
    #     LGBM Metrics Collection
    # =============================================================================
    output_dict.t_model_fit = timer(t0_model_fit)
    output_dict.best_iteration_ = lgbm.best_iteration_
    # 'valid_0' is the default name of the single eval_set entry; this lookup requires
    #   'rmse' to be among the evaluation metrics
    output_dict.best_score_ = lgbm.best_score_['valid_0']['rmse']
    # output_dict.feature_name_ = lgbm.feature_name_  # shape [n_features] names list --> not available yet in Colab's lgbm version
    output_dict.feature_name_ = mgr.feature_names
    output_dict.feature_importances_ = lgbm.feature_importances_
    # output_dict.model_params = lgbm.get_params()
    print(f'Done fitting; Model LGBM fit time: {output_dict.t_model_fit}')

    # =============================================================================
    #     GBM Predictions -- possible multiprocessing use here?
    # =============================================================================
    print(f'{tdstr()} -- Starting predictions...')
    t0_model_predict = time.perf_counter()
    y_labels.train = tvt_dfs['train_y'].to_numpy()
    y_labels.val = tvt_dfs['val_y'].to_numpy()
    y_predictions.train = lgbm.predict(tvt_dfs['train_X'], num_iteration=lgbm.best_iteration_)
    y_predictions.val = lgbm.predict(tvt_dfs['val_X'], num_iteration=lgbm.best_iteration_)
    y_predictions.test = lgbm.predict(tvt_dfs['test_X'], num_iteration=lgbm.best_iteration_)

    # =============================================================================
    #     Disable Scaling for Now
    #     # =============================================================================
    #     #     Invert Scaling
    #     # =============================================================================
    #     # do minmax scaling after robust; and ~inverse~ scaling with minmax first, then robust
    #     scaling_col = 'shop_item__salesSum'
    #     if mgr.minmax_scalers:
    #         y_predictions = unscale(mgr.minmax_scalers[scaling_col], y_predictions)
    #         y_labels = unscale(mgr.minmax_scalers[scaling_col], y_labels)
    #     if mgr.robust_scalers:
    #         y_predictions = unscale(mgr.robust_scalers[scaling_col], y_predictions)
    #         y_labels = unscale(mgr.minmax_scalers[scaling_col], y_labels)
    # =============================================================================

    # =============================================================================
    #     Clip as per Kaggle Recommendations
    # =============================================================================
    # Only predictions are clipped; the labels used for the metrics below stay as stored
    for name, data in y_predictions.items():
        y_predictions[name] = data.clip(*mgr.split['clip_predict'])

    output_dict.t_model_predict = timer(t0_model_predict)
    output_dict.train_R2 = r2_score(y_labels.train, y_predictions.train)
    output_dict.val_R2 = r2_score(y_labels.val, y_predictions.val)
    # RMSE as sqrt(MSE): mean_squared_error's 'squared=False' option was deprecated in
    #   scikit-learn 1.4 and removed in 1.6, whereas this form works on every version
    output_dict.train_rmse = np.sqrt(mean_squared_error(y_labels.train, y_predictions.train))
    output_dict.val_rmse = np.sqrt(mean_squared_error(y_labels.val, y_predictions.val))

    print(f'Model LGBM fit time: {output_dict.t_model_fit}\n'
          f'Transform and Predict train/val/test time: {output_dict.t_model_predict}')
    print(f'R^2 train  = {output_dict.train_R2:.4f}    '
          f'R^2 val  = {output_dict.val_R2:.4f}\n'
          f'RMSE train = {output_dict.train_rmse:.4f}    '
          f'RMSE val = {output_dict.val_rmse:.4f}\n')

    # re-format feature importances? dict with key=featurename val=importance?
    # save model?

    # =============================================================================
    # Merge predictions with IDs from original test dataset; keep only "ID" and "item_cnt_month"
    # =============================================================================
    y_submission = pd.DataFrame.from_dict({'item_cnt_month': y_predictions.test,
                                           'shop_id': tvt_dfs['test_X'].shop_id,
                                           'item_id': tvt_dfs['test_X'].item_id})
    # NOTE(review): with how='left', any test_df shop-item pair that has no row in test_X ends
    #   up with NaN in 'item_cnt_month' -- confirm upstream guarantees full coverage
    y_submission = mgr.test_df.merge(y_submission, on=['shop_id', 'item_id'], how='left')
    y_submission = y_submission.reset_index(drop=True).drop(['shop_id', 'item_id'], axis=1)

    # =============================================================================
    # Save prediction for every one of the run iterations; can ensemble them later if desired
    # =============================================================================
    y_submission.to_csv(mgr.filenames.submit_path, index=False)

    return mgr, output_dict


## **kaggle_main_01p2.py**

In [15]:
"""
Feature/Split Interface for Manual Entry + Main controller for Kaggle computations.

Created on Wed Oct 21 08:06:32 2020
@author: mgaidis
"""

# from collections import namedtuple, OrderedDict
# import time
# import numpy as np

# # %tensorflow_version 2.x
# # import tensorflow as tf
# from kag_utils import timer
# from kag_program_manager import ProgramManager
# from kag_eda import eda_cleanup
# from kag_data import data_conditioning
# from kag_tvt import create_tvt_split
# from kag_model import gbdt_model


# ================================================================================================
# Define which (if any) DataFrames should be saved to disk between modules to help with RAM issues
#     FEATHERABLE_DF_NAMES = {'eda': ['items_enc', 'shops_enc', 'stt', 'test'],
#                             'data': ['monthly_stt'],
#                             'tvt': ['train_X', 'train_y', 'val_X', 'val_y', 'test_X']}
# ================================================================================================
# NOTE(review): presumably this list is what the modules see as mgr.ftr_modules when deciding
#   whether to pass their DataFrames through feather files -- confirm in the ProgramManager setup
FTR_MODULES = ['eda', 'data']  # write feather data for these modules to disk between modules
FILE_ROOTNAME = 'v1p0_test1'  # use None or '' if you wish to input (via console) filename below

# ================================================================================================
# Define the various choices for features, preprocessing, and ML fitting.
# ================================================================================================
#     1. Define elements that tweak the feature attributes.  (e.g., shops to delete)
#     2. Define preprocessing methods.  (e.g., scaling, clipping)
#     3. Define data augmentation strategy.  (e.g., use of cartesian product)
#     4. Define train-validation-test split configuration.
#     5. Define ML model to use and the related hyperparameters to be used in model fitting.
#
#     Configuration of data structure is such that we can "explode" it to generate a spec row
#     for each iteration we wish to perform.  This provides a cartesian product over the desired
#     hyperparameters and preprocessing methods below, although we do not perform a full cartesian
#     product over the elements defined in 'kag_features.py' (categories, lags, statistics).
# ================================================================================================
# Define features based on time-lags and statistical aggregations during monthly grouping
# ================================================================================================
FG = namedtuple('FeatureGroup', ['lag_month', 'group_list', 'stats_dict'])

# Feature specs are organized by lag: each outer entry is (months to lag by, specs for that lag).
# Each spec is (grouping columns for the aggregate statistics, stats), where stats is an ordered
# list of (source column, [aggregations]) pairs that becomes the FeatureGroup's OrderedDict.
iter1_feature_list = [
    FG(lag_month=lag, group_list=groups, stats_dict=OrderedDict(stats))
    for lag, specs in [
        (1, [
            (['shop_id', 'item_id'], [('sales', ['sum', 'median', 'count']), ('rev', ['sum'])]),
            (['shop_id', 'item_category_id'], [('sales', ['sum', 'median', 'count'])]),
            (['shop_id', 'item_cluster'], [('sales', ['sum', 'median'])]),
            (['shop_id'], [('sales', ['sum', 'count'])]),
            (['item_id'], [('sales', ['sum', 'median', 'count']), ('rev', ['sum'])]),
            (['shop_group'], [('rev', ['sum'])]),
            (['item_category_id'], [('sales', ['sum', 'count']), ('rev', ['sum'])]),
            (['item_group'], [('sales', ['sum']), ('rev', ['sum'])]),
            (['item_cluster'], [('sales', ['sum', 'count']), ('rev', ['sum'])]),
        ]),
        (2, [
            (['shop_id', 'item_id'], [('sales', ['sum', 'count']), ('rev', ['sum'])]),
            (['shop_id', 'item_category_id'], [('sales', ['count']), ('rev', ['sum'])]),
            (['shop_id'], [('sales', ['sum'])]),
            (['item_id'], [('sales', ['sum', 'count']), ('rev', ['sum'])]),
            (['item_category_id'], [('sales', ['sum', 'count'])]),
            (['item_cluster'], [('sales', ['sum', 'count']), ('rev', ['sum'])]),
        ]),
        (3, [
            (['shop_id', 'item_id'], [('sales', ['sum'])]),
            (['shop_id'], [('sales', ['sum'])]),
            (['item_id'], [('sales', ['sum', 'count']), ('rev', ['sum'])]),
            (['item_category_id'], [('sales', ['sum', 'count'])]),
            (['item_cluster'], [('sales', ['count'])]),
        ]),
        (4, [
            (['item_id'], [('sales', ['sum'])]),
        ]),
        (5, [
            (['shop_id', 'item_id'], [('sales', ['sum'])]),
        ]),
        (6, [
            (['shop_id', 'item_id'], [('sales', ['sum'])]),
            (['item_id'], [('sales', ['sum'])]),
        ]),
        (8, [
            (['shop_id', 'item_id'], [('sales', ['sum'])]),
        ]),
    ]
    for groups, stats in specs
]

# ================================================================================================


def define_model_parameters(feature_iterations):
    """Assemble the per-module parameter splits and an empty results template.

    Every section of the returned spec maps a parameter name to its candidate
    values.  Candidates to iterate over are written as ``[[v1, v2, ...]]``; the
    extra list nesting is there for the pd.explode op that produces one split
    per row.  Example inputs and notes on useful choices are collected in the
    example cell further down the notebook.

    Parameters
    ----------
    feature_iterations : list
        Feature-group lists to iterate over; stored, wrapped in one more list,
        under ``['model']['features']``.

    Returns
    -------
    tuple of (OrderedDict, OrderedDict)
        The split specification keyed by 'model', 'eda', 'data', 'tvt',
        'lgbm_setup', 'lgbm_fit' and 'predict', followed by a results container
        whose score and timing fields hold placeholder values.
    """
    # --------------------------------------------------------------------------------------------
    #   model specification (model type, feature categories, stats, lags)
    # --------------------------------------------------------------------------------------------
    model_section = OrderedDict()
    model_section['features'] = [feature_iterations]
    model_section['model_type'] = [['LGBM']]

    # --------------------------------------------------------------------------------------------
    #   exploratory data analysis (eda) - coarse adjustments before monthly grouping
    # --------------------------------------------------------------------------------------------
    eda_section = OrderedDict()
    eda_section['del_shops'] = [[[9, 20]]]
    eda_section['del_item_cats'] = [[[8, 10, 32, 59, 80, 81, 82]]]
    eda_section['scale_sales'] = [['week_retail_weight']]

    # --------------------------------------------------------------------------------------------
    #   data conditioning (datatypes, scaling, cartesian product filling); cp === Cartesian Product
    # --------------------------------------------------------------------------------------------
    monthly_cap = 20 * 30  # limit to 20 sales per day avg. in a month
    data_section = OrderedDict()
    data_section['cp_fillna0'] = [[True]]
    data_section['cp_first_mo'] = [[13]]
    data_section['cp_test_pairs'] = [[False]]
    data_section['clip_train'] = [[(-monthly_cap, monthly_cap)]]
    # small integer dtype for RAM savings, and possible model fitting speedup;
    # np.iinfo(np.int32) or np.finfo(np.float32), e.g., report the range of a np dtype
    data_section['feat_dtype'] = [[np.int16]]
    # Scaling is disabled for now; the entries would be:
    #   'minmax_range':  [[(0, 16000)]]   # or [[False]]; or (1, 16000) for sales_count=1?
    #   'robust_qtiles': [[(20, 80)]]     # or [[False]]

    # --------------------------------------------------------------------------------------------
    #   train / val / test splitting
    # --------------------------------------------------------------------------------------------
    tvt_section = OrderedDict()
    tvt_section['tr_start_mo'] = [[13]]
    tvt_section['tr_final_mo'] = [[29]]
    tvt_section['val_months'] = [[999]]
    tvt_section['use_categorical'] = [[True]]

    # --------------------------------------------------------------------------------------------
    #   gbm regressor SETUP parameters
    # --------------------------------------------------------------------------------------------
    setup_section = OrderedDict()
    setup_section['boosting_type'] = 'gbdt'
    setup_section['metric'] = [['rmse']]
    setup_section['learning_rate'] = [[0.05]]  # default = 0.1
    setup_section['n_estimators'] = [[200]]
    setup_section['colsample_bytree'] = [[0.4]]  # = feature_fraction; default 1
    setup_section['random_state'] = [[42]]
    setup_section['subsample_for_bin'] = [[200000, 800000]]
    setup_section['num_leaves'] = [[31]]
    setup_section['max_depth'] = [[-1]]
    setup_section['min_split_gain'] = [[0.0]]
    setup_section['min_child_weight'] = [[0.001]]
    setup_section['min_child_samples'] = [[20]]
    setup_section['silent'] = [[False]]
    setup_section['importance_type'] = [['split']]
    setup_section['reg_alpha'] = [[0.0]]
    setup_section['reg_lambda'] = [[0.0]]
    setup_section['n_jobs'] = -1
    setup_section['subsample'] = 1.0  # bagging fraction
    setup_section['subsample_freq'] = 0  # bagging frequency
    setup_section['objective'] = 'regression'

    # --------------------------------------------------------------------------------------------
    #   gbm regressor FITTING parameters
    # --------------------------------------------------------------------------------------------
    fit_section = OrderedDict()
    fit_section['early_stopping_rounds'] = [[20]]
    fit_section['eval_metric'] = [['rmse']]
    fit_section['init_score'] = None
    fit_section['eval_init_score'] = None
    fit_section['verbose'] = [[True]]
    fit_section['feature_name'] = [['auto']]
    fit_section['categorical_feature'] = [['auto']]
    fit_section['callbacks'] = None

    # --------------------------------------------------------------------------------------------
    #   output predict parameters (also need inverse scaling transformers)
    # --------------------------------------------------------------------------------------------
    predict_section = OrderedDict()
    predict_section['clip_predict'] = [[(0, 20)]]  # clip final prediction before submission

    # --------------------------------------------------------------------------------------------
    #   output results container: scores, fitted-model attributes, then elapsed-time strings
    # --------------------------------------------------------------------------------------------
    results_template = OrderedDict()
    for score_key in ('train_rmse', 'train_R2', 'val_rmse', 'val_R2'):
        results_template[score_key] = 0.0
    results_template['best_iteration_'] = 0
    results_template['best_score_'] = 0.0
    results_template['feature_name_'] = [""]
    results_template['feature_importances_'] = [0.0]
    for timer_key in ('t_eda', 't_data', 't_tvt', 't_model_fit',
                      't_model_predict', 't_ml', 't_model_n', 't_cumulative'):
        results_template[timer_key] = ""

    # Section order is significant: it is the insertion order of the returned OrderedDict.
    split_spec = OrderedDict([
        ('model', model_section),
        ('eda', eda_section),
        ('data', data_section),
        ('tvt', tvt_section),
        ('lgbm_setup', setup_section),
        ('lgbm_fit', fit_section),
        ('predict', predict_section)])

    return split_spec, results_template

# =============================================================================
# END OF MANUAL USER INPUT PARAMETERS and FEATURES
# =============================================================================
# =============================================================================
# END OF MANUAL USER INPUT PARAMETERS and FEATURES
# =============================================================================
# =============================================================================
# END OF MANUAL USER INPUT PARAMETERS and FEATURES
# =============================================================================


# ================================================================================================
# Main Control Loop  # possibly need == '__main__' for multiprocessing to work smoothly??
# ================================================================================================
if __name__ == '__main__':
    # Entry point: create the run manager, declare the feature iterations and parameter
    # splits, then walk the nested split loops
    #   model > eda > data > tvt > lgbm_setup > lgbm_fit > predict
    # calling gbdt_model once per innermost combination and recording each run in outputs_df.
    mgr = ProgramManager(FILE_ROOTNAME, FTR_MODULES)
    mgr.mem_capture(program_loc="Before defining features")
    # multiple splits in feature combinations can be handled by iterating through this list
    feature_iter_list = []  # details on cats to use, agg stats and lag features
    feature_iter_list.append(iter1_feature_list)  # ...append(iter2_feature_list) ...

    params, output_results = define_model_parameters(feature_iter_list)
    mgr.write_log_mgr_info(params, True)  # write to log.txt file + console
    # NOTE(review): mgr.all_splits is read by every loop below but is not assigned in this
    # block; presumably arrange_run_info (or the constructor) populates it -- confirm.
    outputs_df = mgr.arrange_run_info(params, output_results)

    # Two timer references: t0_model_n is reset after every model run (end of the innermost
    # loop), while t0_cumulative is never reset in this block.
    t0_model_n = time.perf_counter()
    t0_cumulative = time.perf_counter()
    # ============================================================================================
    #   Model Module
    # ============================================================================================
    for model_iter_n, model_iter_dict in mgr.all_splits['model'].items():
        mgr.split.update(model_iter_dict)  # intermediate storage of params for logging
        mgr.set_features()  # expand inputs into pd-friendly dicts of stats and lists of cols
        # The *_done flags below cascade: each is True only when its own loop is on its
        # largest key AND every enclosing loop is already on its last split.
        # (assumes the all_splits keys are orderable iteration indices -- TODO confirm)
        model_done = (model_iter_n == max(mgr.all_splits['model']))  # done model splits?

        # ========================================================================================
        #   EDA Module - load data, merge basic features into stt df, delete unwanteds, scaling
        # ========================================================================================
        for eda_iter_n, eda_iter_dict in mgr.all_splits['eda'].items():
            mgr.split.update(eda_iter_dict)
            mgr.module_start('EDA')
            # ====================================================================================
            # mgr = eda_cleanup(mgr, **eda_iter_dict)
            mgr = eda_cleanup(mgr)
            mgr.module_end("EDA")
            # True if done all splits in params.model and eda_expl... will help reclaim RAM below
            eda_done = model_done and (eda_iter_n == max(mgr.all_splits['eda']))

            # ====================================================================================
            #   Data - Monthly Feature Generation (agg, stats, lags) & Cartesian Product Module
            # ====================================================================================
            for data_iter_n, data_iter_dict in mgr.all_splits['data'].items():
                mgr.split.update(data_iter_dict)
                mgr.module_start('Data')
                # ================================================================================
                mgr = data_conditioning(mgr)
                mgr.module_end("Data")
                data_done = eda_done and (data_iter_n == max(mgr.all_splits['data']))
                # hasattr guard: only call delete_dfs when mgr actually carries 'eda_dfs'
                if data_done and hasattr(mgr, 'eda_dfs'):
                    mgr.delete_dfs('eda')  # no future splits need these dfs; reclaim RAM

                # ================================================================================
                #   TVT Split Module - split data into train/val/test and assign desired dtypes
                # ================================================================================
                for tvt_iter_n, tvt_iter_dict in mgr.all_splits['tvt'].items():
                    mgr.split.update(tvt_iter_dict)
                    mgr.module_start('TVT')
                    # ============================================================================
                    mgr = create_tvt_split(mgr)
                    mgr.module_end("TVT")
                    tvt_done = data_done and (tvt_iter_n == max(mgr.all_splits['tvt']))
                    # hasattr guard: only call delete_dfs when mgr actually carries 'data_dfs'
                    if tvt_done and hasattr(mgr, 'data_dfs'):
                        mgr.delete_dfs('data')  # no future splits need these dfs; reclaim RAM

                    # ============================================================================
                    #   Model Setup / Fitting / Prediction Module
                    # ============================================================================
                    # The three innermost loops only need the split dicts, so they iterate
                    # .values() and keep no *_done flags.
                    for lgbm_setup_iter_dict in mgr.all_splits['lgbm_setup'].values():
                        mgr.split.update(lgbm_setup_iter_dict)
                        for lgbm_fit_iter_dict in mgr.all_splits['lgbm_fit'].values():
                            mgr.split.update(lgbm_fit_iter_dict)
                            for predict_iter_dict in mgr.all_splits['predict'].values():
                                mgr.split.update(predict_iter_dict)
                                mgr.module_start('ML')
                                # ================================================================
                                # NOTE(review): predict_iter_dict is not passed to gbdt_model; it
                                # is only merged into mgr.split above -- presumably gbdt_model
                                # reads the predict params from there; confirm.
                                mgr, output_dict = gbdt_model(mgr,
                                                              lgbm_setup_iter_dict,
                                                              lgbm_fit_iter_dict)
                                mgr.module_end("ML")
                                mgr.split['t_model_n'] = timer(t0_model_n)
                                mgr.split['t_cumulative'] = timer(t0_cumulative)
                                mgr.split.update(output_dict)
                                # Copy every param/result held in mgr.split into row mgr.model_n
                                # of outputs_df; names without a matching column are not stored,
                                # only reported on the console.
                                for param_name, param_value in mgr.split.items():
                                    if param_name in outputs_df.columns:
                                        outputs_df.at[mgr.model_n, param_name] = param_value
                                    else:
                                        print(f'param {param_name} not in outputs_df.')
                                # save params and results - overwrite this "log" each run iter
                                outputs_df.to_csv(mgr.filenames.output_path, index=False)

                                # ================================================================
                                #   Post-Processing Module - write data; check feature importance
                                # ================================================================
                                # NOTE(review): no matching mgr.module_end('Postprocessing') call
                                # appears in this block -- confirm whether that is intended.
                                mgr.module_start('Postprocessing')
                                # ================================================================
                                #   Incorporate postprocess module / feature importances
                                # ================================================================

                                # move on to the next outputs_df row, call update_filenames, and
                                # restart the per-model timer reference
                                mgr.model_n += 1
                                mgr.update_filenames()
                                t0_model_n = time.perf_counter()

    mgr.write_log_mgr_info(param_dict=params, console=False)  # save (overwrite) log.txt file
    # filename_pickle = f'save entire output df, along with {mgr}'  # include postproc stuff too #



OrderedDict([('model',
              OrderedDict([('features',
                            [[[FeatureGroup(lag_month=1, group_list=['shop_id', 'item_id'], stats_dict=OrderedDict([('sales', ['sum', 'median', 'count']), ('rev', ['sum'])])),
                               FeatureGroup(lag_month=1, group_list=['shop_id', 'item_category_id'], stats_dict=OrderedDict([('sales', ['sum', 'median', 'count'])])),
                               FeatureGroup(lag_month=1, group_list=['shop_id', 'item_cluster'], stats_dict=OrderedDict([('sales', ['sum', 'median'])])),
                               FeatureGroup(lag_month=1, group_list=['shop_id'], stats_dict=OrderedDict([('sales', ['sum', 'count'])])),
                               FeatureGroup(lag_month=1, group_list=['item_id'], stats_dict=OrderedDict([('sales', ['sum', 'median', 'count']), ('rev', ['sum'])])),
                               FeatureGroup(lag_month=1, group_list=['shop_group'], stats_dict=OrderedDict([('rev', ['sum'])])),
         

## **Stop execution of the code below by deliberately raising an error**

In [None]:
# Dummy cell to stop the execution so we don't run any of the random code below (if we select "Run All", e.g.)
# The right-hand name `yes` is deliberately not a defined variable (presumably nowhere in the
# notebook -- keep it that way): evaluating it raises NameError, and that error is what stops the run.
stop_running_code_at_this_cell = yes


## **The following code cell contains example inputs and ideas for choosing parameters**

In [None]:
# kag_features example and some other comments and choices for LGBM parameters can
#     be found in this code block....


"""
Example:
=============================================================================
input:
=============================================================================
iter1_feature_list = [
    # lag months = 1
    [1,  # months to lag by
     ['shop_id', 'item_id'],  # grouping for aggregate statistics
     OrderedDict([('sales', ['sum', 'median', 'count']), ('rev', ['sum'])])],  # aggregate stats
    [1, ['shop_id', 'item_category_id'], OrderedDict([('sales', ['sum', 'median', 'count'])])],
    [1, ['shop_id', 'item_cluster'], OrderedDict([('sales', ['sum', 'median'])])],
    [1, ['shop_id'], OrderedDict([('sales', ['sum', 'count'])])],
    [1, ['item_id'], OrderedDict([('sales', ['sum', 'median', 'count']), ('rev', ['sum'])])],
    [1, ['shop_group'], OrderedDict([('rev', ['sum'])])],
    [1, ['item_category_id'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])],
    [1, ['item_group'], OrderedDict([('sales', ['sum']), ('rev', ['sum'])])],
    [1, ['item_cluster'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])],
    # lag months = 2
    [2, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])],
    [2, ['shop_id', 'item_category_id'], OrderedDict([('sales', ['count']), ('rev', ['sum'])])],
    [2, ['shop_id'], OrderedDict([('sales', ['sum'])])],
    [2, ['item_id'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])],
    [2, ['item_category_id'], OrderedDict([('sales', ['sum', 'count'])])],
    [2, ['item_cluster'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])],
    # lag months = 3
    [3, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum'])])],
    [3, ['shop_id'], OrderedDict([('sales', ['sum'])])],
    [3, ['item_id'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])],
    [3, ['item_category_id'], OrderedDict([('sales', ['sum', 'count'])])],
    [3, ['item_cluster'], OrderedDict([('sales', ['count'])])],
    # lag months = 4
    [4, ['item_id'], OrderedDict([('sales', ['sum'])])],
    # lag months = 5
    [5, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum'])])],
    # lag months = 6
    [6, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum'])])],
    [6, ['item_id'], OrderedDict([('sales', ['sum'])])],
    # lag months = 8
    [8, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum'])])]]


=============================================================================
output:
=============================================================================
printable_lag_features
=============================================================================
OrderedDict([(1,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('shop_group', ['first']),
                                                   ('item_cluster', ['first']),
                                                   ('item_category_id', ['first']),
                                                   ('item_group', ['first']),
                                                   ('sales', ['sum', 'median', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['shop_group',
                                           'item_cluster',
                                           'item_category_id',
                                           'item_group',
                                           'shop_item__salesSum',
                                           'shop_item__salesMed',
                                           'shop_item__salesCnt',
                                           'shop_item__revSum'])]),
               OrderedDict([('group_name', 'shop_itemCat'),
                            ('group', ['month', 'shop_id', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'median', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['shop_itemCat__salesSum',
                                           'shop_itemCat__salesMed',
                                           'shop_itemCat__salesCnt'])]),
               OrderedDict([('group_name', 'shop_itemCluster'),
                            ('group', ['month', 'shop_id', 'item_cluster']),
                            ('stats', OrderedDict([('sales', ['sum', 'median'])])),
                            ('col_names', ['shop_itemCluster__salesSum',
                                           'shop_itemCluster__salesMed'])]),
               OrderedDict([('group_name', 'shop'),
                            ('group', ['month', 'shop_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count'])])),
                            ('col_names', ['shop__salesSum',
                                           'shop__salesCnt'])]),
               OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'median', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['item__salesSum',
                                           'item__salesMed',
                                           'item__salesCnt',
                                           'item__revSum'])]),
               OrderedDict([('group_name', 'shopGrp'),
                            ('group', ['month', 'shop_group']),
                            ('stats', OrderedDict([('rev', ['sum'])])),
                            ('col_names', ['shopGrp__revSum'])]),
               OrderedDict([('group_name', 'itemCat'),
                            ('group', ['month', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['itemCat__salesSum',
                                           'itemCat__salesCnt',
                                           'itemCat__revSum'])]),
               OrderedDict([('group_name', 'itemGrp'),
                            ('group', ['month', 'item_group']),
                            ('stats', OrderedDict([('sales', ['sum']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['itemGrp__salesSum',
                                           'itemGrp__revSum'])]),
               OrderedDict([('group_name', 'itemCluster'),
                            ('group', ['month', 'item_cluster']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['itemCluster__salesSum',
                                           'itemCluster__salesCnt',
                                           'itemCluster__revSum'])])
               ]),
             (2,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['shop_item__salesSum',
                                           'shop_item__salesCnt',
                                           'shop_item__revSum'])]),
               OrderedDict([('group_name', 'shop_itemCat'),
                            ('group', ['month', 'shop_id', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['shop_itemCat__salesCnt',
                                           'shop_itemCat__revSum'])]),
               OrderedDict([('group_name', 'shop'),
                            ('group', ['month', 'shop_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop__salesSum'])]),
               OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['item__salesSum',
                                           'item__salesCnt',
                                           'item__revSum'])]),
               OrderedDict([('group_name', 'itemCat'),
                            ('group', ['month', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count'])])),
                            ('col_names', ['itemCat__salesSum',
                                           'itemCat__salesCnt'])]),
               OrderedDict([('group_name', 'itemCluster'),
                            ('group', ['month', 'item_cluster']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['itemCluster__salesSum',
                                           'itemCluster__salesCnt',
                                           'itemCluster__revSum'])])
               ]),
             (3,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop_item__salesSum'])]),
               OrderedDict([('group_name', 'shop'),
                            ('group', ['month', 'shop_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop__salesSum'])]),
               OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['item__salesSum',
                                           'item__salesCnt',
                                           'item__revSum'])]),
               OrderedDict([('group_name', 'itemCat'),
                            ('group', ['month', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count'])])),
                            ('col_names', ['itemCat__salesSum',
                                           'itemCat__salesCnt'])]),
               OrderedDict([('group_name', 'itemCluster'),
                            ('group', ['month', 'item_cluster']),
                            ('stats', OrderedDict([('sales', ['count'])])),
                            ('col_names', ['itemCluster__salesCnt'])])
               ]),
             (4,
              [OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['item__salesSum'])])
               ]),
             (5,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop_item__salesSum'])])
               ]),
             (6,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop_item__salesSum'])]),
               OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['item__salesSum'])])
               ]),
             (8,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop_item__salesSum'])])
               ])
             ])

=============================================================================
cols
=============================================================================
{'all_keep': set({'ID',
                  'item_category_id',
                  'item_cluster',
                  'item_group',
                  'item_id',
                  'month',
                  'rev',
                  'sales',
                  'shop_group',
                  'shop_id'}),
 'cat_feats': ['shop_id',
               'shop_group',
               'item_id',
               'item_cluster',
               'item_category_id',
               'item_group'],
 'final_stt': ['month',
               'sales',
               'rev',
               'shop_id',
               'item_id',
               'shop_group',
               'item_cluster',
               'item_category_id',
               'item_group'],
 'int_feats': ['month',
               'shop_id',
               'item_id',
               'shop_group',
               'item_cluster',
               'item_category_id',
               'item_group'],
 'keep': {'date_scaling': ['month'],
          'items_enc': ['item_id',
                        'item_cluster',
                        'item_category_id',
                        'item_group'],
          'shops_enc': ['shop_id', 'shop_group'],
          'stt': ['month', 'sales', 'price', 'shop_id', 'item_id'],
          'test': ['ID', 'shop_id', 'item_id']},
 'min_agg_cols': ['shop_item__salesSum',
                  'shop_item__salesMed',
                  'shop_item__salesCnt',
                  'shop_item__revSum',
                  'shop_itemCat__salesSum',
                  'shop_itemCat__salesMed',
                  'shop_itemCat__salesCnt',
                  'shop_itemCat__revSum',
                  'shop_itemCluster__salesSum',
                  'shop_itemCluster__salesMed',
                  'shop__salesSum',
                  'shop__salesCnt',
                  'item__salesSum',
                  'item__salesMed',
                  'item__salesCnt',
                  'item__revSum',
                  'shopGrp__revSum',
                  'itemCat__salesSum',
                  'itemCat__salesCnt',
                  'itemCat__revSum',
                  'itemGrp__salesSum',
                  'itemGrp__revSum',
                  'itemCluster__salesSum',
                  'itemCluster__salesCnt',
                  'itemCluster__revSum'],
 'no_lag': ['shop_group', 'item_cluster', 'item_category_id', 'item_group']}

=============================================================================
all_lag_features
=============================================================================
['shop_item__salesSum_L1',
 'shop_item__salesMed_L1',
 'shop_item__salesCnt_L1',
 'shop_item__revSum_L1',
 'shop_itemCat__salesSum_L1',
 'shop_itemCat__salesMed_L1',
 'shop_itemCat__salesCnt_L1',
 'shop_itemCluster__salesSum_L1',
 'shop_itemCluster__salesMed_L1',
 'shop__salesSum_L1',
 'shop__salesCnt_L1',
 'item__salesSum_L1',
 'item__salesMed_L1',
 'item__salesCnt_L1',
 'item__revSum_L1',
 'shopGrp__revSum_L1',
 'itemCat__salesSum_L1',
 'itemCat__salesCnt_L1',
 'itemCat__revSum_L1',
 'itemGrp__salesSum_L1',
 'itemGrp__revSum_L1',
 'itemCluster__salesSum_L1',
 'itemCluster__salesCnt_L1',
 'itemCluster__revSum_L1',
 'shop_item__salesSum_L2',
 'shop_item__salesCnt_L2',
 'shop_item__revSum_L2',
 'shop_itemCat__salesCnt_L2',
 'shop_itemCat__revSum_L2',
 'shop__salesSum_L2',
 'item__salesSum_L2',
 'item__salesCnt_L2',
 'item__revSum_L2',
 'itemCat__salesSum_L2',
 'itemCat__salesCnt_L2',
 'itemCluster__salesSum_L2',
 'itemCluster__salesCnt_L2',
 'itemCluster__revSum_L2',
 'shop_item__salesSum_L3',
 'shop__salesSum_L3',
 'item__salesSum_L3',
 'item__salesCnt_L3',
 'item__revSum_L3',
 'itemCat__salesSum_L3',
 'itemCat__salesCnt_L3',
 'itemCluster__salesCnt_L3',
 'item__salesSum_L4',
 'shop_item__salesSum_L5',
 'shop_item__salesSum_L6',
 'item__salesSum_L6',
 'shop_item__salesSum_L8']

=============================================================================
min_feat_groups_stats_set
=============================================================================
OrderedDict([
    ('shop_item', OrderedDict([('group', ['month', 'shop_id', 'item_id']),
                               ('stats', OrderedDict([('shop_group', ['first']),
                                                      ('item_cluster', ['first']),
                                                      ('item_category_id', ['first']),
                                                      ('item_group', ['first']),
                                                      ('sales', ['sum', 'median', 'count']),
                                                      ('rev', ['sum'])])),
                               ('col_names', ['shop_group',
                                              'item_cluster',
                                              'item_category_id',
                                              'item_group',
                                              'shop_item__salesSum',
                                              'shop_item__salesMed',
                                              'shop_item__salesCnt',
                                              'shop_item__revSum'])])),
    ('shop_itemCat', OrderedDict([('group', ['month', 'shop_id', 'item_category_id']),
                                  ('stats', OrderedDict([('sales', ['sum', 'median', 'count']),
                                                         ('rev', ['sum'])])),
                                  ('col_names', ['shop_itemCat__salesSum',
                                                 'shop_itemCat__salesMed',
                                                 'shop_itemCat__salesCnt',
                                                 'shop_itemCat__revSum'])])),
    ('shop_itemCluster', OrderedDict([('group', ['month', 'shop_id', 'item_cluster']),
                                      ('stats', OrderedDict([('sales', ['sum', 'median'])])),
                                      ('col_names', ['shop_itemCluster__salesSum',
                                                     'shop_itemCluster__salesMed'])])),
    ('shop', OrderedDict([('group', ['month', 'shop_id']),
                          ('stats', OrderedDict([('sales', ['sum', 'count'])])),
                          ('col_names', ['shop__salesSum',
                                         'shop__salesCnt'])])),
    ('item', OrderedDict([('group', ['month', 'item_id']),
                          ('stats', OrderedDict([('sales', ['sum', 'median', 'count']),
                                                 ('rev', ['sum'])])),
                          ('col_names', ['item__salesSum',
                                         'item__salesMed',
                                         'item__salesCnt',
                                         'item__revSum'])])),
    ('shopGrp', OrderedDict([('group', ['month', 'shop_group']),
                             ('stats', OrderedDict([('rev', ['sum'])])),
                             ('col_names', ['shopGrp__revSum'])])),
    ('itemCat', OrderedDict([('group', ['month', 'item_category_id']),
                             ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                    ('rev', ['sum'])])),
                             ('col_names', ['itemCat__salesSum',
                                            'itemCat__salesCnt',
                                            'itemCat__revSum'])])),
    ('itemGrp', OrderedDict([('group', ['month', 'item_group']),
                             ('stats', OrderedDict([('sales', ['sum']),
                                                    ('rev', ['sum'])])),
                             ('col_names', ['itemGrp__salesSum',
                                            'itemGrp__revSum'])])),
    ('itemCluster', OrderedDict([('group', ['month', 'item_cluster']),
                                 ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                        ('rev', ['sum'])])),
                                 ('col_names', ['itemCluster__salesSum',
                                                'itemCluster__salesCnt',
                                                'itemCluster__revSum'])]))
    ])

=============================================================================
lag_month_dict
=============================================================================
OrderedDict([(1, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L1'),
                              ('shop_item__salesMed', 'shop_item__salesMed_L1'),
                              ('shop_item__salesCnt', 'shop_item__salesCnt_L1'),
                              ('shop_item__revSum', 'shop_item__revSum_L1'),
                              ('shop_itemCat__salesSum', 'shop_itemCat__salesSum_L1'),
                              ('shop_itemCat__salesMed', 'shop_itemCat__salesMed_L1'),
                              ('shop_itemCat__salesCnt', 'shop_itemCat__salesCnt_L1'),
                              ('shop_itemCluster__salesSum', 'shop_itemCluster__salesSum_L1'),
                              ('shop_itemCluster__salesMed', 'shop_itemCluster__salesMed_L1'),
                              ('shop__salesSum', 'shop__salesSum_L1'),
                              ('shop__salesCnt', 'shop__salesCnt_L1'),
                              ('item__salesSum', 'item__salesSum_L1'),
                              ('item__salesMed', 'item__salesMed_L1'),
                              ('item__salesCnt', 'item__salesCnt_L1'),
                              ('item__revSum', 'item__revSum_L1'),
                              ('shopGrp__revSum', 'shopGrp__revSum_L1'),
                              ('itemCat__salesSum', 'itemCat__salesSum_L1'),
                              ('itemCat__salesCnt', 'itemCat__salesCnt_L1'),
                              ('itemCat__revSum', 'itemCat__revSum_L1'),
                              ('itemGrp__salesSum', 'itemGrp__salesSum_L1'),
                              ('itemGrp__revSum', 'itemGrp__revSum_L1'),
                              ('itemCluster__salesSum', 'itemCluster__salesSum_L1'),
                              ('itemCluster__salesCnt', 'itemCluster__salesCnt_L1'),
                              ('itemCluster__revSum', 'itemCluster__revSum_L1')])),
             (2, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L2'),
                              ('shop_item__salesCnt', 'shop_item__salesCnt_L2'),
                              ('shop_item__revSum', 'shop_item__revSum_L2'),
                              ('shop_itemCat__salesCnt', 'shop_itemCat__salesCnt_L2'),
                              ('shop_itemCat__revSum', 'shop_itemCat__revSum_L2'),
                              ('shop__salesSum', 'shop__salesSum_L2'),
                              ('item__salesSum', 'item__salesSum_L2'),
                              ('item__salesCnt', 'item__salesCnt_L2'),
                              ('item__revSum', 'item__revSum_L2'),
                              ('itemCat__salesSum', 'itemCat__salesSum_L2'),
                              ('itemCat__salesCnt', 'itemCat__salesCnt_L2'),
                              ('itemCluster__salesSum', 'itemCluster__salesSum_L2'),
                              ('itemCluster__salesCnt', 'itemCluster__salesCnt_L2'),
                              ('itemCluster__revSum', 'itemCluster__revSum_L2')])),
             (3, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L3'),
                              ('shop__salesSum', 'shop__salesSum_L3'),
                              ('item__salesSum', 'item__salesSum_L3'),
                              ('item__salesCnt', 'item__salesCnt_L3'),
                              ('item__revSum', 'item__revSum_L3'),
                              ('itemCat__salesSum', 'itemCat__salesSum_L3'),
                              ('itemCat__salesCnt', 'itemCat__salesCnt_L3'),
                              ('itemCluster__salesCnt', 'itemCluster__salesCnt_L3')])),
             (4, OrderedDict([('item__salesSum', 'item__salesSum_L4')])),
             (5, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L5')])),
             (6, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L6'),
                              ('item__salesSum', 'item__salesSum_L6')])),
             (8, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L8')]))
            ])
"""

# =============================================================================
# Discarding Unused DataFrame Columns
# =============================================================================
# For reference, all column names of the loaded dataframes to choose from:
# items_enc_cols = ['item_id', 'item_tested', 'item_cluster', 'item_category_id',
#                   'item_cat_tested', 'item_group', 'item_category1', 'item_category2',
#                   'item_category3', 'item_category4']
# shops_enc_cols = ['shop_id','shop_tested','shop_group','shop_type','s_type_broad',
#                   'shop_federal_district','fd_popdens','fd_gdp','shop_city']
# date_scaling_cols = ['month', 'year', 'season', 'MoY', 'days_in_M',
#                      'weekday_weight', 'retail_sales', 'week_retail_weight']
# stt_cols = ['day', 'week', 'qtr', 'season', 'month', 'price', 'sales', 'shop_id', 'item_id']
# test_cols = ['ID', 'shop_id', 'item_id']
# =============================================================================
# To save memory, we can discard unnecessary features here by specifying only those that we use
# =============================================================================
# keep_cols = {'items_enc': ['item_id', 'item_category_id', 'item_group', 'item_cluster'],
#              'shops_enc': ['shop_id', 'shop_group'],
#              'date_scaling': ['month', 'week_retail_weight'],
#              'stt': ['month', 'sales', 'price', 'shop_id', 'item_id'],
#              'test': ['ID', 'shop_id', 'item_id']}



"""
# ================================================================================================
# Some Useful/Common Choices for Parameter Splits:
# ================================================================================================
('model_type',      [['HGBR']])])  # for SKLearn version of GBDT (to be implemented)
('del_shops',       [[[9, 20]]]),
('del_shops',       [[[0, 1, 8, 9, 11, 13, 17, 20, 23, 27, 29, 30, 32, 33, 40, 43, 51, 54]]]),
('del_shops',       [[[8, 9, 13, 20, 23, 32, 33, 40]]]),
('del_shops',       [[False]]),
('del_item_cats',   [[[8, 10, 32, 59, 80, 81, 82]]]),
('del_item_cats',   [[[8, 80, 81, 82]]]),
('del_item_cats',   [[[1,4,8,10,13,14,17,18,32,39,46,48,50,51,52,53,59,66,68,80,81,82]]]),
('del_item_cats',   [[False]]),
('scale_sales',     [['week_retail_weight']])])
('scale_sales',     [['days_in_M']])])
('scale_sales',     [['weekday_weight']])])  # relative numbers of each weekday
('scale_sales',     [['retail_sales']])])  # Russian recession retail sales idx
('scale_sales',     [[False]])])
('cp_first_mo',     [[False]]),  # if no Cartesian Product fill is desired (to be implemented)
('feat_dtype',      [[np.float32]]),
('feat_dtype',      [[np.uint16]]),
('minmax_range',    [[(0, 32700)]]),  # fits within int16 (max 32767)
('minmax_range',    [[(0, 65500)]]),  # fits within uint16 (max 65535)
('tr_start_mo',     [[24]]),  # 24 gives less than 1yr data, but avoids Dec. 'outlier' of 2014
('tr_final_mo',     [[29, 32]]),
('tr_final_mo',     [[29, 30, 32]]),
('val_months',      [[1]])])
('val_months',      [[2]])])

# ================================================================================================
# Clarification on meaning of certain parameters:
# ================================================================================================

('features', [feature_iterations]  # list of class instances, 1 per iteration
# cp === Cartesian Product
('cp_fillna0', True  # fill n/a cp rows with 0 (bad for price-based stats, ok for revenue)
('cp_first_mo', 13  # = (first month to receive cp fill) + maxlag
    # e.g., maxlag=6 and cp_first_mo=10 fills months 4-33
('cp_test_pairs', False  # force include all of test set 'shop-item pairs'
    # along with each month's conventional Cartesian Product fill
('clip_train', (0, 20)  # this clips sales after doing monthly groupings (monthly_stt df)
('feat_dtype', np.int16  # if df contains np.NaN, int dtypes cannot represent it; use np.float32
('minmax_range', (0, 16000)  # use >0 for best LGBM results; smaller=faster fit; False=no scaling
('robust_qtiles', (20, 80)  # replace tuple with False if no scaling desired
('use_categorical', True  # relevant df cols -> categorical dtype just before modeling
('val_months', 999  # 999 = all months after training;
    # any other n (e.g., 1 or 2) = n months after tr_final_mo
('colsample_bytree', 0.4  # feature_fraction; default 1 for LGBM, 0 for HGBR = (reverse of LGBM)
('importance_type',  # 'split' = number of times the feature is used in the model;
    # 'gain' = total gain of the splits that use the feature
('eval_metric',  # to split over metrics, use e.g. [['rmse', ['rmse', 'l2']]]
    # to get the two choices 'rmse' and ['rmse', 'l2']
('verbose',  # int (e.g., 4) = print every 4th iteration; True = print every iteration;
    # False = no printing except the best and last iterations
('feature_name',  # list of strings; if 'auto' and data is a pd df, the df column names are used
('categorical_feature',  # if 'auto' and data is a pd df, pd unordered categorical cols are used



# FEATURES Example:
=============================================================================
input:
=============================================================================
iter1_feature_list = [
    # lag months = 1
    FG(1,  # months to lag by
     ['shop_id', 'item_id'],  # grouping for aggregate statistics
     OrderedDict([('sales', ['sum', 'median', 'count']), ('rev', ['sum'])])),  # aggregate stats
    FG(1, ['shop_id', 'item_category_id'], OrderedDict([('sales', ['sum', 'median', 'count'])])),
    FG(1, ['shop_id', 'item_cluster'], OrderedDict([('sales', ['sum', 'median'])])),
    FG(1, ['shop_id'], OrderedDict([('sales', ['sum', 'count'])])),
    FG(1, ['item_id'], OrderedDict([('sales', ['sum', 'median', 'count']), ('rev', ['sum'])])),
    FG(1, ['shop_group'], OrderedDict([('rev', ['sum'])])),
    FG(1, ['item_category_id'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])),
    FG(1, ['item_group'], OrderedDict([('sales', ['sum']), ('rev', ['sum'])])),
    FG(1, ['item_cluster'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])),
    # lag months = 2
    FG(2, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])),
    FG(2, ['shop_id', 'item_category_id'], OrderedDict([('sales', ['count']), ('rev', ['sum'])])),
    FG(2, ['shop_id'], OrderedDict([('sales', ['sum'])])),
    FG(2, ['item_id'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])),
    FG(2, ['item_category_id'], OrderedDict([('sales', ['sum', 'count'])])),
    FG(2, ['item_cluster'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])),
    # lag months = 3
    FG(3, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum'])])),
    FG(3, ['shop_id'], OrderedDict([('sales', ['sum'])])),
    FG(3, ['item_id'], OrderedDict([('sales', ['sum', 'count']), ('rev', ['sum'])])),
    FG(3, ['item_category_id'], OrderedDict([('sales', ['sum', 'count'])])),
    FG(3, ['item_cluster'], OrderedDict([('sales', ['count'])])),
    # lag months = 4
    FG(4, ['item_id'], OrderedDict([('sales', ['sum'])])),
    # lag months = 5
    FG(5, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum'])])),
    # lag months = 6
    FG(6, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum'])])),
    FG(6, ['item_id'], OrderedDict([('sales', ['sum'])])),
    # lag months = 8
    FG(8, ['shop_id', 'item_id'], OrderedDict([('sales', ['sum'])]))]


=============================================================================
output:
=============================================================================
printable_lag_features
=============================================================================
OrderedDict([(1,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('shop_group', ['first']),
                                                   ('item_cluster', ['first']),
                                                   ('item_category_id', ['first']),
                                                   ('item_group', ['first']),
                                                   ('sales', ['sum', 'median', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['shop_group',
                                           'item_cluster',
                                           'item_category_id',
                                           'item_group',
                                           'shop_item__salesSum',
                                           'shop_item__salesMed',
                                           'shop_item__salesCnt',
                                           'shop_item__revSum'])]),
               OrderedDict([('group_name', 'shop_itemCat'),
                            ('group', ['month', 'shop_id', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'median', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['shop_itemCat__salesSum',
                                           'shop_itemCat__salesMed',
                                           'shop_itemCat__salesCnt'])]),
               OrderedDict([('group_name', 'shop_itemCluster'),
                            ('group', ['month', 'shop_id', 'item_cluster']),
                            ('stats', OrderedDict([('sales', ['sum', 'median'])])),
                            ('col_names', ['shop_itemCluster__salesSum',
                                           'shop_itemCluster__salesMed'])]),
               OrderedDict([('group_name', 'shop'),
                            ('group', ['month', 'shop_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count'])])),
                            ('col_names', ['shop__salesSum',
                                           'shop__salesCnt'])]),
               OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'median', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['item__salesSum',
                                           'item__salesMed',
                                           'item__salesCnt',
                                           'item__revSum'])]),
               OrderedDict([('group_name', 'shopGrp'),
                            ('group', ['month', 'shop_group']),
                            ('stats', OrderedDict([('rev', ['sum'])])),
                            ('col_names', ['shopGrp__revSum'])]),
               OrderedDict([('group_name', 'itemCat'),
                            ('group', ['month', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['itemCat__salesSum',
                                           'itemCat__salesCnt',
                                           'itemCat__revSum'])]),
               OrderedDict([('group_name', 'itemGrp'),
                            ('group', ['month', 'item_group']),
                            ('stats', OrderedDict([('sales', ['sum']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['itemGrp__salesSum',
                                           'itemGrp__revSum'])]),
               OrderedDict([('group_name', 'itemCluster'),
                            ('group', ['month', 'item_cluster']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['itemCluster__salesSum',
                                           'itemCluster__salesCnt',
                                           'itemCluster__revSum'])])
               ]),
             (2,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['shop_item__salesSum',
                                           'shop_item__salesCnt',
                                           'shop_item__revSum'])]),
               OrderedDict([('group_name', 'shop_itemCat'),
                            ('group', ['month', 'shop_id', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['shop_itemCat__salesCnt',
                                           'shop_itemCat__revSum'])]),
               OrderedDict([('group_name', 'shop'),
                            ('group', ['month', 'shop_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop__salesSum'])]),
               OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['item__salesSum',
                                           'item__salesCnt',
                                           'item__revSum'])]),
               OrderedDict([('group_name', 'itemCat'),
                            ('group', ['month', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count'])])),
                            ('col_names', ['itemCat__salesSum',
                                           'itemCat__salesCnt'])]),
               OrderedDict([('group_name', 'itemCluster'),
                            ('group', ['month', 'item_cluster']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['itemCluster__salesSum',
                                           'itemCluster__salesCnt',
                                           'itemCluster__revSum'])])
               ]),
             (3,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop_item__salesSum'])]),
               OrderedDict([('group_name', 'shop'),
                            ('group', ['month', 'shop_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop__salesSum'])]),
               OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                   ('rev', ['sum'])])),
                            ('col_names', ['item__salesSum',
                                           'item__salesCnt',
                                           'item__revSum'])]),
               OrderedDict([('group_name', 'itemCat'),
                            ('group', ['month', 'item_category_id']),
                            ('stats', OrderedDict([('sales', ['sum', 'count'])])),
                            ('col_names', ['itemCat__salesSum',
                                           'itemCat__salesCnt'])]),
               OrderedDict([('group_name', 'itemCluster'),
                            ('group', ['month', 'item_cluster']),
                            ('stats', OrderedDict([('sales', ['count'])])),
                            ('col_names', ['itemCluster__salesCnt'])])
               ]),
             (4,
              [OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['item__salesSum'])])
               ]),
             (5,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop_item__salesSum'])])
               ]),
             (6,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop_item__salesSum'])]),
               OrderedDict([('group_name', 'item'),
                            ('group', ['month', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['item__salesSum'])])
               ]),
             (8,
              [OrderedDict([('group_name', 'shop_item'),
                            ('group', ['month', 'shop_id', 'item_id']),
                            ('stats', OrderedDict([('sales', ['sum'])])),
                            ('col_names', ['shop_item__salesSum'])])
               ])
             ])

=============================================================================
cols
=============================================================================
{'all_keep': set({'ID',
                  'item_category_id',
                  'item_cluster',
                  'item_group',
                  'item_id',
                  'month',
                  'rev',
                  'sales',
                  'shop_group',
                  'shop_id'}),
 'cat_feats': ['shop_id',
               'shop_group',
               'item_id',
               'item_cluster',
               'item_category_id',
               'item_group'],
 'final_stt': ['month',
               'sales',
               'rev',
               'shop_id',
               'item_id',
               'shop_group',
               'item_cluster',
               'item_category_id',
               'item_group'],
 'int_feats': ['month',
               'shop_id',
               'item_id',
               'shop_group',
               'item_cluster',
               'item_category_id',
               'item_group'],
 'keep': {'date_scaling': ['month'],
          'items_enc': ['item_id',
                        'item_cluster',
                        'item_category_id',
                        'item_group'],
          'shops_enc': ['shop_id', 'shop_group'],
          'stt': ['month', 'sales', 'price', 'shop_id', 'item_id'],
          'test': ['ID', 'shop_id', 'item_id']},
 'min_agg_cols': ['shop_item__salesSum',
                  'shop_item__salesMed',
                  'shop_item__salesCnt',
                  'shop_item__revSum',
                  'shop_itemCat__salesSum',
                  'shop_itemCat__salesMed',
                  'shop_itemCat__salesCnt',
                  'shop_itemCat__revSum',
                  'shop_itemCluster__salesSum',
                  'shop_itemCluster__salesMed',
                  'shop__salesSum',
                  'shop__salesCnt',
                  'item__salesSum',
                  'item__salesMed',
                  'item__salesCnt',
                  'item__revSum',
                  'shopGrp__revSum',
                  'itemCat__salesSum',
                  'itemCat__salesCnt',
                  'itemCat__revSum',
                  'itemGrp__salesSum',
                  'itemGrp__revSum',
                  'itemCluster__salesSum',
                  'itemCluster__salesCnt',
                  'itemCluster__revSum'],
 'no_lag': ['shop_group', 'item_cluster', 'item_category_id', 'item_group']}

=============================================================================
all_lag_features
=============================================================================
['shop_item__salesSum_L1',
 'shop_item__salesMed_L1',
 'shop_item__salesCnt_L1',
 'shop_item__revSum_L1',
 'shop_itemCat__salesSum_L1',
 'shop_itemCat__salesMed_L1',
 'shop_itemCat__salesCnt_L1',
 'shop_itemCluster__salesSum_L1',
 'shop_itemCluster__salesMed_L1',
 'shop__salesSum_L1',
 'shop__salesCnt_L1',
 'item__salesSum_L1',
 'item__salesMed_L1',
 'item__salesCnt_L1',
 'item__revSum_L1',
 'shopGrp__revSum_L1',
 'itemCat__salesSum_L1',
 'itemCat__salesCnt_L1',
 'itemCat__revSum_L1',
 'itemGrp__salesSum_L1',
 'itemGrp__revSum_L1',
 'itemCluster__salesSum_L1',
 'itemCluster__salesCnt_L1',
 'itemCluster__revSum_L1',
 'shop_item__salesSum_L2',
 'shop_item__salesCnt_L2',
 'shop_item__revSum_L2',
 'shop_itemCat__salesCnt_L2',
 'shop_itemCat__revSum_L2',
 'shop__salesSum_L2',
 'item__salesSum_L2',
 'item__salesCnt_L2',
 'item__revSum_L2',
 'itemCat__salesSum_L2',
 'itemCat__salesCnt_L2',
 'itemCluster__salesSum_L2',
 'itemCluster__salesCnt_L2',
 'itemCluster__revSum_L2',
 'shop_item__salesSum_L3',
 'shop__salesSum_L3',
 'item__salesSum_L3',
 'item__salesCnt_L3',
 'item__revSum_L3',
 'itemCat__salesSum_L3',
 'itemCat__salesCnt_L3',
 'itemCluster__salesCnt_L3',
 'item__salesSum_L4',
 'shop_item__salesSum_L5',
 'shop_item__salesSum_L6',
 'item__salesSum_L6',
 'shop_item__salesSum_L8']

=============================================================================
min_feat_groups_stats_set
=============================================================================
OrderedDict([
    ('shop_item', OrderedDict([('group', ['month', 'shop_id', 'item_id']),
                               ('stats', OrderedDict([('shop_group', ['first']),
                                                      ('item_cluster', ['first']),
                                                      ('item_category_id', ['first']),
                                                      ('item_group', ['first']),
                                                      ('sales', ['sum', 'median', 'count']),
                                                      ('rev', ['sum'])])),
                               ('col_names', ['shop_group',
                                              'item_cluster',
                                              'item_category_id',
                                              'item_group',
                                              'shop_item__salesSum',
                                              'shop_item__salesMed',
                                              'shop_item__salesCnt',
                                              'shop_item__revSum'])])),
    ('shop_itemCat', OrderedDict([('group', ['month', 'shop_id', 'item_category_id']),
                                  ('stats', OrderedDict([('sales', ['sum', 'median', 'count']),
                                                         ('rev', ['sum'])])),
                                  ('col_names', ['shop_itemCat__salesSum',
                                                 'shop_itemCat__salesMed',
                                                 'shop_itemCat__salesCnt',
                                                 'shop_itemCat__revSum'])])),
    ('shop_itemCluster', OrderedDict([('group', ['month', 'shop_id', 'item_cluster']),
                                      ('stats', OrderedDict([('sales', ['sum', 'median'])])),
                                      ('col_names', ['shop_itemCluster__salesSum',
                                                     'shop_itemCluster__salesMed'])])),
    ('shop', OrderedDict([('group', ['month', 'shop_id']),
                          ('stats', OrderedDict([('sales', ['sum', 'count'])])),
                          ('col_names', ['shop__salesSum',
                                         'shop__salesCnt'])])),
    ('item', OrderedDict([('group', ['month', 'item_id']),
                          ('stats', OrderedDict([('sales', ['sum', 'median', 'count']),
                                                 ('rev', ['sum'])])),
                          ('col_names', ['item__salesSum',
                                         'item__salesMed',
                                         'item__salesCnt',
                                         'item__revSum'])])),
    ('shopGrp', OrderedDict([('group', ['month', 'shop_group']),
                             ('stats', OrderedDict([('rev', ['sum'])])),
                             ('col_names', ['shopGrp__revSum'])])),
    ('itemCat', OrderedDict([('group', ['month', 'item_category_id']),
                             ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                    ('rev', ['sum'])])),
                             ('col_names', ['itemCat__salesSum',
                                            'itemCat__salesCnt',
                                            'itemCat__revSum'])])),
    ('itemGrp', OrderedDict([('group', ['month', 'item_group']),
                             ('stats', OrderedDict([('sales', ['sum']),
                                                    ('rev', ['sum'])])),
                             ('col_names', ['itemGrp__salesSum',
                                            'itemGrp__revSum'])])),
    ('itemCluster', OrderedDict([('group', ['month', 'item_cluster']),
                                 ('stats', OrderedDict([('sales', ['sum', 'count']),
                                                        ('rev', ['sum'])])),
                                 ('col_names', ['itemCluster__salesSum',
                                                'itemCluster__salesCnt',
                                                'itemCluster__revSum'])]))
    ])

=============================================================================
lag_month_dict
=============================================================================
OrderedDict([(1, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L1'),
                              ('shop_item__salesMed', 'shop_item__salesMed_L1'),
                              ('shop_item__salesCnt', 'shop_item__salesCnt_L1'),
                              ('shop_item__revSum', 'shop_item__revSum_L1'),
                              ('shop_itemCat__salesSum', 'shop_itemCat__salesSum_L1'),
                              ('shop_itemCat__salesMed', 'shop_itemCat__salesMed_L1'),
                              ('shop_itemCat__salesCnt', 'shop_itemCat__salesCnt_L1'),
                              ('shop_itemCluster__salesSum', 'shop_itemCluster__salesSum_L1'),
                              ('shop_itemCluster__salesMed', 'shop_itemCluster__salesMed_L1'),
                              ('shop__salesSum', 'shop__salesSum_L1'),
                              ('shop__salesCnt', 'shop__salesCnt_L1'),
                              ('item__salesSum', 'item__salesSum_L1'),
                              ('item__salesMed', 'item__salesMed_L1'),
                              ('item__salesCnt', 'item__salesCnt_L1'),
                              ('item__revSum', 'item__revSum_L1'),
                              ('shopGrp__revSum', 'shopGrp__revSum_L1'),
                              ('itemCat__salesSum', 'itemCat__salesSum_L1'),
                              ('itemCat__salesCnt', 'itemCat__salesCnt_L1'),
                              ('itemCat__revSum', 'itemCat__revSum_L1'),
                              ('itemGrp__salesSum', 'itemGrp__salesSum_L1'),
                              ('itemGrp__revSum', 'itemGrp__revSum_L1'),
                              ('itemCluster__salesSum', 'itemCluster__salesSum_L1'),
                              ('itemCluster__salesCnt', 'itemCluster__salesCnt_L1'),
                              ('itemCluster__revSum', 'itemCluster__revSum_L1')])),
             (2, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L2'),
                              ('shop_item__salesCnt', 'shop_item__salesCnt_L2'),
                              ('shop_item__revSum', 'shop_item__revSum_L2'),
                              ('shop_itemCat__salesCnt', 'shop_itemCat__salesCnt_L2'),
                              ('shop_itemCat__revSum', 'shop_itemCat__revSum_L2'),
                              ('shop__salesSum', 'shop__salesSum_L2'),
                              ('item__salesSum', 'item__salesSum_L2'),
                              ('item__salesCnt', 'item__salesCnt_L2'),
                              ('item__revSum', 'item__revSum_L2'),
                              ('itemCat__salesSum', 'itemCat__salesSum_L2'),
                              ('itemCat__salesCnt', 'itemCat__salesCnt_L2'),
                              ('itemCluster__salesSum', 'itemCluster__salesSum_L2'),
                              ('itemCluster__salesCnt', 'itemCluster__salesCnt_L2'),
                              ('itemCluster__revSum', 'itemCluster__revSum_L2')])),
             (3, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L3'),
                              ('shop__salesSum', 'shop__salesSum_L3'),
                              ('item__salesSum', 'item__salesSum_L3'),
                              ('item__salesCnt', 'item__salesCnt_L3'),
                              ('item__revSum', 'item__revSum_L3'),
                              ('itemCat__salesSum', 'itemCat__salesSum_L3'),
                              ('itemCat__salesCnt', 'itemCat__salesCnt_L3'),
                              ('itemCluster__salesCnt', 'itemCluster__salesCnt_L3')])),
             (4, OrderedDict([('item__salesSum', 'item__salesSum_L4')])),
             (5, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L5')])),
             (6, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L6'),
                              ('item__salesSum', 'item__salesSum_L6')])),
             (8, OrderedDict([('shop_item__salesSum', 'shop_item__salesSum_L8')]))
            ])

# =============================================================================
# Example print_package_versions:
# =============================================================================
Python version: 3.8.3
lightgbm version: 3.0.0
matplotlib version: 3.3.2
numpy version: 1.19.2
pandas version: 1.1.3
scikit-learn version: 0.23.2

os: win32
os_full: Windows-10-10.0.19041-SP0
runtime: Windows
gb_physical_dram: 48.0
n_logical_cpu: 12
n_physical_cpu: 6
n_multiprocessing_cpu: 12
chipset: Intel64 Family 6 Model 158 Stepping 10, GenuineIntel

# =============================================================================
# Discarding Unused DataFrame Columns
# =============================================================================
For reference, all column names of the loaded dataframes to choose from:
items_enc_cols = ['item_id', 'item_tested', 'item_cluster', 'item_category_id',
                  'item_cat_tested', 'item_group', 'item_category1', 'item_category2',
                  'item_category3', 'item_category4']
shops_enc_cols = ['shop_id','shop_tested','shop_group','shop_type','s_type_broad',
                  'shop_federal_district','fd_popdens','fd_gdp','shop_city']
date_scaling_cols = ['month', 'year', 'season', 'MoY', 'days_in_M',
                      'weekday_weight', 'retail_sales', 'week_retail_weight']
stt_cols = ['day', 'week', 'qtr', 'season', 'month', 'price', 'sales', 'shop_id', 'item_id']
test_cols = ['ID', 'shop_id', 'item_id']
=============================================================================
To save memory, we can discard unnecessary features here by specifying only those that we use
=============================================================================
keep_cols = {'items_enc': ['item_id', 'item_category_id', 'item_group', 'item_cluster'],
              'shops_enc': ['shop_id', 'shop_group'],
              'date_scaling': ['month', 'week_retail_weight'],
              'stt': ['month', 'sales', 'price', 'shop_id', 'item_id'],
              'test': ['ID', 'shop_id', 'item_id']}

# =============================================================================
# AttrDict example/explanation:
# =============================================================================
    somedict = {'key': 123, 'stuff': 456}
    data = AttrDict(somedict)
    print(data.key)
    print(data.stuff)
    >> 123
    >> 456
    data.key = 'abc'
    print(data.key)
    print(data['key'])
    >> abc
    >> abc
    def fn(**i): print(i)
    fn(**somedict)
    >> {'key': 123, 'stuff': 456}
    fn(**data)
    >> {'key': 'abc', 'stuff': 456}
    data.alpha = 'oh'
    fn(**data)
    {'key': 5, 'stuff': 456, 'alpha': 'oh'}
    data
    >> {'key': 5, 'stuff': 456, 'alpha': 'oh'}
"""



##**Final Project for Coursera's 'How to Win a Data Science Competition'**
April, 2020;  Andreas Theodoulou and Michael Gaidis;  (Competition Info last updated:  3 years ago)

###**About this Competition**

You are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. Creating a robust model that can handle such situations is part of the challenge.

Evaluation: root mean squared error (RMSE). True target values are clipped into [0,20] range.

###**File descriptions**

***sales_train.csv*** - the training set. Daily historical data from January 2013 to October 2015.

***test.csv*** - the test set. You need to forecast the sales for these shops and products for November 2015.

***sample_submission.csv*** - a sample submission file in the correct format.

***items.csv*** - supplemental information about the items/products.

***item_categories.csv***  - supplemental information about the items categories.

***shops.csv*** - supplemental information about the shops.

###**Data fields**

***ID*** - an Id that represents a (Shop, Item) tuple within the test set

***shop_id*** - unique identifier of a shop

***item_id*** - unique identifier of a product

***item_category_id*** - unique identifier of item category

***item_cnt_day*** - number of products sold. You are predicting a monthly amount of this measure

***item_price*** - current price of an item

***date*** - date in format dd/mm/yyyy

***month*** - a consecutive month number. January 2013 is 0, February 2013 is 1,..., October 2015 is 33

***item_name*** - name of item

***shop_name*** - name of shop

***item_category_name*** - name of item category

## **Colab Prep Tips** for those using Google Colab




### **Save Previous Work**
* Click **File -> Save a copy in Drive** and click **Open in new tab** in the pop-up window to save your progress in Google Drive. (This places the copy at the top level of Colab directory.)
* Or, in Google Drive before opening this notebook, right-click on this ipynb and select ***make a copy***, then with the copy in the same directory, right-click and select ***rename*** to update the version number.  Finally, right-click on the new version and ***open in colab***.

### **Select Runtime Type** *before* running notebook:
* Click **Runtime -> Change runtime type** and select **GPU** or **TPU** in Hardware accelerator box to enable faster training.

### **Keep Colab Active**
* To keep Colab connected by clicking on Colab window once every minute, go to Chrome Dev Tools --> Console Tab --> run the following code (April 2020):
<br>Note that this should prevent disconnection after 1.5 hours of inactivity, but without Colab Pro each runtime is still terminated after 12 hours (Pro = 24 hours). The interval below is in milliseconds.
```
function ClickConnect(){
    console.log("Clicked on connect button"); 
    document.querySelector("#ok").click()
}
setInterval(ClickConnect,60000)
```
Note that this will throw an error while the disconnection notification is not shown; that is OK. Once the notification appears, it will be clicked to reconnect.

* If that doesn't work, try this in the console:
```
function ClickConnect(){
    console.log("Clicked on connect button"); 
    document.querySelector("colab-connect-button").click()
}
setInterval(ClickConnect,60000)
```
* Lastly, can try this (older):
```
function KeepClicking(){
   console.log("Clicking");
   document.querySelector("colab-toolbar-button#connect").click()
}setInterval(KeepClicking,600000)
```

### **Save Previous Work**
* Click **File -> Save a copy in Drive** and click **Open in new tab** in the pop-up window to save your progress in Google Drive. (This places the copy at the top level of Colab directory.)
* Or, in Google Drive before opening this notebook, right-click on this ipynb and select ***make a copy***, then with the copy in the same directory, right-click and select ***rename*** to update the version number.  Finally, right-click on the new version and ***open in colab***.
```
from datetime import datetime
from pytz import timezone
amsterdam = timezone('Europe/Amsterdam')
ams_time = amsterdam.localize(datetime(2002, 10, 27, 6, 0, 0))
print(ams_time)
# 2002-10-27 06:00:00+01:00
# It will also know when it's Summer Time
# in Amsterdam (similar to Daylight Savings Time):
ams_time = amsterdam.localize(datetime(2002, 6, 27, 6, 0, 0))
print(ams_time)
# 2002-06-27 06:00:00+02:00
```

### **Import Python Packages; Set Environment Options; Identify Input Data Files**



### **Analysis and Descriptive (Helper) Functions**



###**Define Feature Columns, Statistics, and Lags**

###**Define Dictionaries / Dataframes to Enable Looping Grid Search for Optimal Parameters**

##**Mount Google Drive for access to Google Drive local repo; Load Data**

In [None]:
# import psutil
# print(psutil.cpu_count(logical=False))
# print(psutil.cpu_freq(percpu=False))
# for k,v in os.environ.items():
#     print(k, v)
# import sys
# print(sys.platform)

# Output:
# 2
# None
# ENV /root/.bashrc
# GCS_READ_CACHE_BLOCK_SIZE_MB 16
# CLOUDSDK_CONFIG /content/.config
# CUDA_VERSION 10.1.243
# PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/opt/bin
# HOME /root
# LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
# LANG en_US.UTF-8
# SHELL /bin/bash
# LIBRARY_PATH /usr/local/cuda/lib64/stubs
# CUDA_PKG_VERSION 10-1=10.1.243-1
# SHLVL 1
# GCE_METADATA_TIMEOUT 0
# NCCL_VERSION 2.7.8
# NVIDIA_VISIBLE_DEVICES all
# TF_FORCE_GPU_ALLOW_GROWTH true
# DEBIAN_FRONTEND noninteractive
# CUDNN_VERSION 7.6.5.32
# LAST_FORCED_REBUILD 20200910
# JPY_PARENT_PID 24
# PYTHONPATH /env/python
# DATALAB_SETTINGS_OVERRIDES {"kernelManagerProxyPort":6000,"kernelManagerProxyHost":"172.28.0.3","jupyterArgs":["--ip=\"172.28.0.2\""]}
# NO_GCE_CHECK True
# GLIBCXX_FORCE_NEW 1
# NVIDIA_DRIVER_CAPABILITIES compute,utility
# _ /tools/node/bin/node
# LD_PRELOAD /usr/lib/x86_64-linux-gnu/libtcmalloc.so.4
# NVIDIA_REQUIRE_CUDA cuda>=10.1 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411 brand=tesla,driver>=418,driver<419
# OLDPWD /
# HOSTNAME 931aab700078
# COLAB_GPU 0
# PWD /
# CLOUDSDK_PYTHON python3
# GLIBCPP_FORCE_NEW 1
# PYTHONWARNINGS ignore:::pip._internal.cli.base_command
# TBE_CREDS_ADDR 172.28.0.1:8008
# TERM xterm-color
# CLICOLOR 1
# PAGER cat
# GIT_PAGER cat
# MPLBACKEND module://ipykernel.pylab.backend_inline
# TZ EST+05EDT,M4.1.0,M10.5.0
# KMP_DUPLICATE_LIB_OK True
# KMP_INIT_AT_FORK FALSE
# linux

In [None]:
# Print the source of the running interpreter's `importlib/__init__.py`.
# The path is taken from the imported package itself rather than a fixed
# "/usr/lib/python3.6/..." location, so the cell works on any Python version / OS.
import importlib

ilp = Path(importlib.__file__)
print(ilp.read_text(encoding="utf-8"))

"""A pure Python implementation of import."""
__all__ = ['__import__', 'import_module', 'invalidate_caches', 'reload']

# Bootstrap help #####################################################

# Until bootstrapping is complete, DO NOT import any modules that attempt
# to import importlib._bootstrap (directly or indirectly). Since this
# partially initialised package would be present in sys.modules, those
# modules would get an uninitialised copy of the source version, instead
# of a fully initialised version (either the frozen one or the one
# initialised below if the frozen one is not available).
import _imp  # Just the builtin component, NOT the full Python module
import sys

try:
    import _frozen_importlib as _bootstrap
except ImportError:
    from . import _bootstrap
    _bootstrap._setup(sys, _imp)
else:
    # importlib._bootstrap is the built-in import, ensure we don't create
    # a second copy of the module.
    _bootstrap.__name__ = 'importlib._bootstrap'
    _bootstrap.__pac

##**Data Preparation: Feature Merging and Feature Generation**
###**1) Compute and Merge Statistics-Based Features on Grouped-by-Month training data**
* Note that features based on price are nonsensical if we add cartesian product fill.  However, item sales and item revenues are OK to use.

###**2) Add Cartesian Product rows to the training data:**
* Idea is to help the model by informing it that we explicitly have no information about certain relevant shop_item pairs in certain months.
* Each month in train data will have additional rows such that the Cartesian Product of all shops and items ALREADY PRESENT IN THAT MONTH will be included.
* When we merge lagged features below, we will only forward-shift the shop-item pairs that are present in the later month. *(Might revisit later: if memory requirements are not too big, we can forward-shift all shop-item pairs.)*
* **If not adding Cartesian Product, or if fillna(0) is used, can round features to integers, saving memory (pandas integers cannot store np.NaN; need float32)**

###**3) Add Lagged Statistics columns to the training data:**

In [None]:
# Sampler of assorted Unicode symbols (presumably to check how each renders in the notebook output).
# domino tiles:  https://www.fileformat.info/info/unicode/block/domino_tiles/utf8test.htm
# Each inner tuple is one output line; its entries are printed separated by single spaces.
glyph_rows = (
    ('\u2227'*5, '\u2228'*5, '\u2303'*5, '\u2304'*5, '^^^', '\u02c5', '\u02c4',
     '\u02c6'*5, '\u02ec'*5, '\u22c0'*5, '\u22c1'*5, '\u2306'*5, '\u2305'*5, '\u23f7'*5),
    ('\u25b2'*5, '\u25bc'*5, '\u25c6'*5, '\u25d2'*5, '\u25d3'*5, '\u25b4'*5, '\u25be'*5,
     '\u2635'*5, '\u269c'*5, '\u26d6'*5, '\u2b81'*5, '\u2b7f'*5, '\u26dd'*5),
    ('\u26ba'*5, '\u26bb'*5, '\u2622'*5, '\u262f'*5, '\u2934'*5, '\u2935'*5,
     '\u2b71'*5, '\u2b73'*5, '\u2b9d'*5, '\u2b9f'*5, '\u2bc5'*5, '\u2bc6'*5),
    ('\u2ba4'*5, '\u2ba5'*5, '\u2182'*5, '\u2180'*5, '\u2b12'*5, '\u2b13'*5, '\u2b18'*5,
     '\u2b19'*5, '\u273d'*5, '\u2720'*5, '\u1cf2'*5, '\u1cf6'*5, '\u205a'*5, '\u2021'*5),
    ('\u224b'*5, '\u224d'*5, '\u2259'*5, '\u225a'*5, '\u2263'*5, '\u2251'*5, '\u2253'*5,
     '\u22ce'*5, '\u22cf'*5, '\u2339'*5, '\u2797'*5, '\u27d7'*5, '\u267b'*5),
    ('\u29d6'*5, '\u29d7'*5, '\u2bc1'*5, '\u2b27'*5, '\u2a77'*5, '\u2a8b'*5,
     '\u2ad8'*5, '\u2e44'*5, '\u2e0e'*5, '\u2e1e'*5, '\u2e1f'*5, '\u3013'*5),
    ('\U0001f503'*5, '\U0001f501'*5, '\U0001f3ac', '\U0001f5aa', '\U0001f5ab',
     '\U0001f536'*5, '\U0001f06d'*5, '\U0001f6d1'*5),
)
for glyph_row in glyph_rows:
    print(' '.join(glyph_row))

##**To Do List:**

###**Loop over things to compute statistics, column scaling, adding cartesian product rows, and adding lagged features**
* multiprocess --> pool(merge,[months list]) do months in parallel? (maybe split monthly_stt by months, then do the merges in parallel, then concatenate all the months back together)
* multiprocess --> pool(merge,[lags list]) do lags in parallel?; check for proper column order / reset if necessary  (can I just add the shifted columns, and delete any N/A things where I don't have a shop-item match?, or make a big df with all the lags and then just one single merge (how="left") )

###**Loop over model fitting parameter splits**
* multiprocess.Pool (if not too overwhelming, can do several (or all) model fitting iterations in parallel)
* (?replicate "test" with simple code?) so we don't need to load and carry "test" dataframe in memory throughout.  Or, load from disk when loading ftr files in prediction module.
* del all dataframes containing data, after all loops over model params are done

###**Additional routines**
* Plot feature importance ... each iteration, and ensemble averages (e.g., if rmse is < xxx).  Make df of feature importances (names=columns) vs. run iteration number (rows) and compute mean, stddev, range, quantiles
* Compute ensemble averages: straight average, weighted by rmse, etc.     
```
Simple ensemble averaging ensemble_y_pred_test = []
? ensemble_y_pred_test.append(y_pred_test)
? y_test_pred_avg = np.mean(ensemble_y_pred_test, axis=0)
compute feature importances averaged over ensemble
```
* Look at other locations for multiprocessing
* Look at other locations for timing blocks (and maybe save in MEMORY_STATS, as in have a MEMORY_STATS append at the end of every timed block)
* Look at other locations for MEMORY_STATS
</br>

* Categorical features with LGBM: double-check it is working?
```
categorical_feature 🔗︎, default = "", type = multi-int or string, aliases: cat_feature, categorical_column, cat_column
        used to specify categorical features
        use number for index, e.g. categorical_feature=0,1,2 means columns 0, 1 and 2 are categorical features
        add a prefix name: for column name, e.g. categorical_feature=name:c1,c2,c3 means c1, c2 and c3
        Index starts from 0 and it doesn’t count the label column when passing type is int
        All values should be less than Int32.MaxValue (2147483647)
        Using large values could be memory consuming. Tree decision rule works best when categorical features are presented by consecutive integers starting from zero
        All negative values will be treated as missing values
```
Scikit-learn API: If ‘auto’ and data is pandas DataFrame, pandas unordered categorical columns are used. *????Note: (WHAT API?) only supports categorical int type (**not applicable** for data represented as **pandas DataFrame** in Python-package)*... double-check that we are using a working API
</br>

* Special code needed for **GPU** enabled-LGBM modeling??
</br>

* Possible setup for continued training, especially if we find runtimes are cut off by Colab.  (will probably need to save lgbm.model to disk at each step)...  init_model (string, Booster, LGBMModel or None, optional (default=None)) – 
Filename of LightGBM model, Booster instance or LGBMModel instance used for **continue training**




#**Potentially Useful Code Snippets**

###**Ensembling and Trend/Feature Importance**

In [None]:
# ENSEMBLING and FEATURE IMPORTANCE / TRENDS ############################################
# Design notes only -- nothing in this cell is implemented yet.
#
# TODO ensembling_fn(output_file_names):
#     - pull the submission files off the disk using OUTPUTS_df[model filename], optionally after setting a
#       threshold for inclusion in the ensemble (e.g., OUTPUTS_df[val_rmse] must be in the lowest quantile)
#     - combine anything already saved to disk by average, weighted-average, or other method
#       (default = straight avg of all runs in the above loop)
#
# TODO compute_trends_fn(output_results):
#     - create a df of features & feature importances for each run (or "explode" sideways the OUTPUTS_df
#       features & importances); use pd.info to compute quantiles, mean, stddev for each of the features,
#       and determine if anything looks interesting
#     - look at feature importances all together, and see if anything is obviously good or bad
#     - might want to look at the predict_contrib parameter for LGBM:
#       https://lightgbm.readthedocs.io/en/latest/Parameters.html
#     - look at splits and see if any parameters are obviously good or bad
#       (correlation matrix of parameters with output results?)
#     - sketch:
#         feat_imp = pd.DataFrame.from_dict(OUTPUTS_df["feature_importances_"])
#         OUTPUTS_df.at[RUN_n,"feature_name_"]
#         make empty df with columns from list at = outputs.at[0,featname]
#         append rows with elements = list elements in feature_importances_ for each feature name
#         df now has as many rows as N_TRAIN_iterations
#         compute df.info stats or quantiles/mean/std and store somewhere; make some plots;
#         make some automated recommendations?
nocode = True  # placeholder statement so this notes-only cell still executes

###**Averaging Several Stored Predictions/Submissions from Disk**

In [None]:
# Average several stored submission files to get a simple (unweighted) ensemble submission.
#   Inputs : every *.csv in `source_dir` (relative to GDRIVE_REPO_PATH); each must have an 'item_cnt_month' column
#   Output : ./models_and_predictions/<ensemble_name>_submission.csv
# Loaded frames are kept in the `submissions` dict (keyed by file stem) rather than being turned into
#   dynamically-named variables, so file names need not be valid Python identifiers and no text taken
#   from the directory listing is ever executed.
os.chdir(GDRIVE_REPO_PATH)   # pure-Python equivalent of the %cd magic; all paths below are relative to the repo

source_dir = 'models_and_predictions/bagging_LGBM'
# sorted() makes the load order (and the choice of template frame) deterministic; non-CSV entries are ignored
prediction_files = sorted(f for f in os.listdir(source_dir) if f.lower().endswith('.csv'))
if not prediction_files:
    raise FileNotFoundError(f'No submission .csv files found in {source_dir}')
print("Loading Files from Google Drive repo into Colab...\n")

# filename to save ensemble average predictions for submission
ensemble_name = 'LGBMv6v7_bag06'

print(f'filename {ensemble_name}')
# Load each submission file, keeping the DataFrame by name and its predictions as a numpy array
submissions = {}
preds = []
template_submission = None   # first loaded frame; supplies the non-prediction columns of the output
for f_name in prediction_files:
    data_frame_name = f_name.split(".")[0][:-11]   # drop the extension and the trailing 11-character suffix
    path_name = os.path.join(source_dir, f_name)
    submission = pd.read_csv(path_name)
    submissions[data_frame_name] = submission
    if template_submission is None:
        template_submission = submission
    print(f'Data Frame: {data_frame_name}; n_rows = {len(submission)}, n_cols = {len(submission.columns)}')
    preds.append(submission.item_cnt_month.to_numpy())

# Row-wise averaging only makes sense if every file predicts the same number of rows
row_counts = {len(p) for p in preds}
if len(row_counts) != 1:
    raise ValueError(f'Submission files have differing row counts: {sorted(row_counts)}')

# Simple ensemble averaging
pred_ens_avg = np.mean(preds, axis=0)
ensemble_submission = template_submission.copy(deep=True)
ensemble_submission.item_cnt_month = pred_ens_avg

ensemble_submission.to_csv("./models_and_predictions/" + ensemble_name + '_submission.csv', index=False)

display(ensemble_submission.head(8))
print(f'filename {ensemble_name} saved: {strftime("%a %X %x")}')
print('Coursera:  ')

###**Feature Importances**

In [None]:
# Plot feature importance - Results Visualization
#   Reads 'feature_importances_' / 'feature_name_' for run `itercount` from ITERS, prints the most and least
#   important features, lists low-importance features (candidates for removal), and draws a bar chart of
#   importances relative to the maximum.
itercount = 0
if ITERS.at[itercount, '_model_type'] == 'LGBM':
    print_threshold = 25     # features whose importance is below this percent of the max are listed
    n_head, n_tail = 8, 7    # number of top / bottom rows shown in the summary printout

    # np.asarray lets the vector math and array-style handling below work whether ITERS stores lists or ndarrays
    raw_importances = np.asarray(ITERS.at[itercount, 'feature_importances_'])
    feature_name_ = np.asarray(ITERS.at[itercount, 'feature_name_'])
    # avoid dividing by zero when every importance is zero (or there are no features)
    max_importance = raw_importances.max() if raw_importances.size else 0
    scale = max_importance if max_importance > 0 else 1.0

    fi = pd.DataFrame({'feature': feature_name_, 'value': raw_importances})
    fi = fi.sort_values('value', ascending=False, ignore_index=True)
    fi['norm_value'] = (100 * fi.value / scale).round(2)
    # lag = integer after a trailing "_L" (0 when there is no such suffix); keeping the column purely
    #   integer lets it be sorted (a str/int mix cannot be ordered) and orders lags numerically
    lag_text = fi.feature.str.extract(r'_L(\d+)$', expand=False)
    fi['lag'] = pd.to_numeric(lag_text, errors='coerce').fillna(0).astype(int)
    fi['feature_base'] = fi.feature.str.replace(r'_L\d+$', '', regex=True)

    # show top and bottom rows; with few features just show everything (no out-of-range / duplicated rows)
    if len(fi) > n_head + n_tail:
        print(pd.concat([fi.head(n_head), fi.tail(n_tail)]))
    else:
        print(fi)
    # model_filename_fi = ITERS.at[itercount,'_model_type']+ITERS.at[itercount,'_model_filename'] + "_feature_importance.csv"
    # fi.to_csv("./models_and_predictions/" + model_filename_fi, index=False)

    # printout to assist with removing low-importance features for following runs
    #   .copy() so the edits below act on an independent frame rather than on a slice of fi
    fi_low = fi[fi.norm_value < print_threshold].sort_values(['lag', 'norm_value']).copy()
    if not fi_low.empty:
        fi_low['norm_value'] = [f'{int(round(v)):d}' for v in fi_low.norm_value]
        # pad each label to the width of its feature_base so the two printed lists line up
        fi_low['lag_feature_importance'] = [
            f"{f'L{lag} fi{norm}':{len(base)}s}"
            for lag, norm, base in zip(fi_low.lag, fi_low.norm_value, fi_low.feature_base)
        ]
        print(fi_low.lag_feature_importance.to_list())
        print(fi_low.feature_base.to_list())

    # make importances relative to max importance; bars are drawn in the model's original feature order
    feature_importances_ = 100.0 * raw_importances / scale
    pos = np.arange(feature_importances_.shape[0]) + .5
    fig, ax = plt.subplots(figsize=(24, 12))
    ax.bar(pos, feature_importances_, align='center')
    ax.set_xticks(pos)
    ax.set_xticklabels(feature_name_)
    ax.set_ylabel('Relative Importance')
    ax.set_title('Variable Importance')
    ax.tick_params(axis='x', which='major', labelsize=13, labelrotation=90)
    ax.grid(True, which='major', axis='y')
    ax.tick_params(axis='y', which='major', grid_color='black', grid_alpha=0.7)
    # fig.savefig('LGBM_feature_importance_v1.4_mg.png')
    plt.show()

###**Using GPUs with LGBM**

In [None]:
# GPU use with LGBM modeling:
'''
May want to see if we can better inform LGBM routine when we are using a GPU
https://lightgbm.readthedocs.io/en/latest/GPU-Targets.html#query-opencl-devices-in-your-system
Your system might have multiple GPUs from different vendors (“platforms”) installed. Setting up LightGBM GPU device requires two parameters: 
OpenCL Platform ID (gpu_platform_id) and OpenCL Device ID (gpu_device_id). Generally speaking, each vendor provides an OpenCL platform, 
and devices from the same vendor have different device IDs under that platform. For example, if your system has an Intel integrated GPU and 
two discrete GPUs from AMD, you will have two OpenCL platforms (with gpu_platform_id=0 and gpu_platform_id=1). If the platform 0 is Intel, 
it has one device (gpu_device_id=0) representing the Intel GPU; if the platform 1 is AMD, it has two devices (gpu_device_id=0, gpu_device_id=1) 
representing the two AMD GPUs. If you have a discrete GPU by AMD/NVIDIA and an integrated GPU by Intel, make sure to select the correct gpu_platform_id 
to use the discrete GPU as it usually provides better performance.

On Windows, OpenCL devices can be queried using GPUCapsViewer, under the OpenCL tab. http://www.ozone3d.net/gpu_caps_viewer/ 
Note that the platform and device IDs reported by this utility start from 1. So you should minus the reported IDs by 1.

On Linux, OpenCL devices can be listed using the clinfo command. On Ubuntu, you can install clinfo by executing sudo apt-get install clinfo.

Make sure you list the OpenCL devices in your system and set gpu_platform_id and gpu_device_id correctly. 
In the following examples, our system has 1 GPU platform (gpu_platform_id = 0) from AMD APP SDK. 
The first device gpu_device_id = 0 is a GPU device (AMD Oland), and the second device gpu_device_id = 1 is the x86 CPU backend.

R Example of using GPU (gpu_platform_id = 0 and gpu_device_id = 0 in our system):
> params <- list(objective = "regression",
+                metric = "rmse",
+                device = "gpu",
+                gpu_platform_id = 0,
+                gpu_device_id = 0,
+                nthread = 1,
+                boost_from_average = FALSE,
+                num_tree_per_iteration = 10,
+                max_bin = 32)
> model <- lgb.train(params,
+                    dtrain,
+                    2,
+                    valids,
+                    min_data = 1,
+                    learning_rate = 1,
+                    early_stopping_rounds = 10)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data: 6513, number of used features: 116
[LightGBM] [Info] Using GPU Device: Oland, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 12
[LightGBM] [Info] 40 dense feature groups (0.12 MB) transferred to GPU in 0.004211 secs. 76 sparse feature groups.
[LightGBM] [Info] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Trained a tree with leaves=16 and max_depth=8
[1]:    test's rmse:1.10643e-17
[LightGBM] [Info] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Trained a tree with leaves=7 and max_depth=5
[2]:    test's rmse:0

Running on OpenCL CPU backend devices is in generally slow, and we observe crashes on some Windows and macOS systems. 
Make sure you check the Using GPU Device line in the log and it is not using a CPU. The above log shows that we are using Oland GPU from AMD and not CPU.

Example of using CPU (gpu_platform_id = 0, gpu_device_id = 1). The GPU device reported is Intel(R) Core(TM) i7-4600U CPU, 
so it is using the CPU backend rather than a real GPU.

> params <- list(objective = "regression",
+                metric = "rmse",
+                device = "gpu",
+                gpu_platform_id = 0,
+                gpu_device_id = 1,
+                nthread = 1,
+                boost_from_average = FALSE,
+                num_tree_per_iteration = 10,
+                max_bin = 32)
> model <- lgb.train(params,
+                    dtrain,
+                    2,
+                    valids,
+                    min_data = 1,
+                    learning_rate = 1,
+                    early_stopping_rounds = 10)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data: 6513, number of used features: 116
[LightGBM] [Info] Using requested OpenCL platform 0 device 1
[LightGBM] [Info] Using GPU Device: Intel(R) Core(TM) i7-4600U CPU @ 2.10GHz, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 12
[LightGBM] [Info] 40 dense feature groups (0.12 MB) transferred to GPU in 0.004540 secs. 76 sparse feature groups.
[LightGBM] [Info] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Trained a tree with leaves=16 and max_depth=8
[1]:    test's rmse:1.10643e-17
[LightGBM] [Info] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Trained a tree with leaves=7 and max_depth=5
[2]:    test's rmse:0

Known issues:
Using a bad combination of gpu_platform_id and gpu_device_id can potentially lead to a crash due to OpenCL driver issues on some machines 
(you will lose your entire session content). Beware of it.
****
**** Some systems have both an integrated graphics card (Intel HD Graphics) and a dedicated graphics card (AMD, NVIDIA); the dedicated graphics card may
automatically override the integrated graphics card. The workaround is to disable your dedicated graphics card to use your integrated graphics card.
'''
# NOTE(review): nothing visible in this cell reads `nocode`; presumably it only gives the
# notes-only cell (the reference string closed on the previous line) an executable statement.
# Confirm that no other cell depends on it before removing or renaming it.
nocode=True

### **Old code: LightGBM - Lightweight Gradient-Boosted Decision Tree**
### **Old code: SK_HGBR - SKLearn Histogram Gradient Boosting Regressor**

In [None]:
# model_gbdt = lgb.LGBMRegressor(
#     objective='regression',
#     boosting_type='gbdt',           # gbdt= Gradient Boosting Decision Tree; dart= Dropouts meet Multiple Additive Regression Trees; goss= Gradient-based One-Side Sampling; rf= Random Forest
#     learning_rate=params["lr"],     # You can use callbacks parameter of fit method to shrink/adapt learning rate in training using reset_parameter callback
#     n_estimators=params["maxit"],   # Number of boosted trees to fit = max_iterations
#     metric='rmse',
#     subsample_for_bin=200000,       # Number of samples for constructing bins
#     num_leaves=31,                  # Maximum tree leaves for base learners
#     max_depth=-1,                   # Maximum tree depth for base learners, <=0 means no limit
#     min_split_gain=0.0,             # Minimum loss reduction required to make a further partition on a leaf node of the tree
#     min_child_weight=0.001,         # Minimum sum of instance weight (hessian) needed in a child (leaf)
#     min_child_samples=20,           # Minimum number of data needed in a child (leaf)
#     colsample_bytree=params["reg"], # fraction of columns sampled when building each tree (1.0 = use all columns, i.e., no column dropout)
#     random_state=params["seed"],    # seed value
#     silent=False,                   # whether to print info during fitting
#     importance_type='split',        # feature importance type: 'split'= N times feature is used in model; 'gain'= total gains of splits which use the feature
#     reg_alpha=0.0,                  # L1 regularization
#     reg_lambda=0.0,                 # L2 regularization
#     n_jobs=- 1,                     # N parallel threads to use on computer
#     subsample=1.0,                  # row fraction used for training: keep at 1 for time series data
#     subsample_freq=0                # keep at 0 for time series
#     )


# model_gbdt.fit( 
#     data['X_train'],                        # Input feature matrix (array-like or sparse matrix of shape = [n_samples, n_features])
#     data['y_train'],                        # The target values (class labels in classification, real numbers in regression) (array-like of shape = [n_samples])
#     eval_set=[(data['X_val'], data['y_val'])],              # can have multiple tuples of validation data inside this list
#     eval_names=None,                        # Names of eval_set (list of strings or None, optional (default=None))
#     eval_metric='rmse',                     # Default: 'l2' (= mean squared error, 'mse') for LGBMRegressor; options include 'l2_root'='root_mean_squared_error'='rmse' and 'l1'='mean_absolute_error'='mae' + more
#     early_stopping_rounds=params["estop"],  # Activates early stopping. The model will train until the validation score stops improving. Validation score needs to improve at least every early_stopping_rounds 
#                                             #     to continue training. Requires at least one validation data and one metric. If there’s more than one, will check all of them. But the training data is ignored anyway. 
#                                             #     To check only the first metric, set the first_metric_only parameter to True in additional parameters **kwargs of the model constructor.
#     init_score=None,                        # Init score of train data
#     eval_init_score=None,                   # Init score of eval data (list of arrays or None, optional (default=None))
#     verbose=CONSTANTS["VERBOSITY"] ,        # If True, metric on the eval set is printed at each boosting stage. If n=int, the metric on the eval set is printed at every nth boosting stage. Best and final also print.
#     feature_name='auto',                    # Feature names. If 'auto' and data is pandas DataFrame, data columns names are used. (list of strings or 'auto', optional (default='auto'))
#     categorical_feature='auto',             # Categorical features (list of strings or int, or 'auto', optional (default='auto')) If list of int, interpreted as indices. 
#                                             # If list of strings, interpreted as feature names (need to specify feature_name as well). 
#                                             # If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. All values in categorical features should be less than int32 max value (2147483647). 
#                                             # Large values could be memory-consuming. Consider using consecutive integers starting from zero. All negative values in categorical features are treated as missing values.
#     callbacks=None                          # List of callback functions that are applied at each iteration (list of callback functions or None, optional (default=None)) See Callbacks in Python API for more information.
#     )

    # if mod_type == 'HGBR':
    #     # TTSplit should use TRAIN_FINAL = 33 (train on all data), and it will return also val=month33 for calculation at end (only)
    #     model_gbdt = HistGradientBoostingRegressor(
    #         learning_rate=LR, 
    #         max_iter=maxiter, 
    #         l2_regularization = reg,
    #         early_stopping=False, 
    #         verbosity = verb,
    #         random_state=seed_val)
    
    #     tic = perf_counter()
    #     model_gbdt.fit(X_train_np, y_train)
    #     toc = perf_counter()
    #     model_fit_time = datetime.utcfromtimestamp(toc-tic).strftime('%H:%M:%S')
    #     print(f"model HGBR fit time: {model_fit_time}")
    #     best_iter = maxiter
    #     best_val_rmse = 0
# model_params = {
        #         'objective':param_df.at[iternum,'objective'],
        #         'boosting_type':param_df.at[iternum,'boosting_type'],
        #         'learning_rate':param_df.at[iternum,'learning_rate'],
        #         'n_estimators':param_df.at[iternum,'n_estimators'],
        #         'metric':param_df.at[iternum,'metric'],
        #         'subsample_for_bin':param_df.at[iternum,'subsample_for_bin'],
        #         'num_leaves':param_df.at[iternum,'num_leaves'],
        #         'max_depth':param_df.at[iternum,'max_depth'],
        #         'min_split_gain':param_df.at[iternum,'min_split_gain'],
        #         'min_child_weight':param_df.at[iternum,'min_child_weight'],
        #         'min_child_samples':param_df.at[iternum,'min_child_samples'],
        #         'colsample_bytree':param_df.at[iternum,'colsample_bytree'],
        #         'random_state':param_df.at[iternum,'random_state'],
        #         'silent':param_df.at[iternum,'silent'],
        #         'importance_type':param_df.at[iternum,'importance_type'],
        #         'reg_alpha':param_df.at[iternum,'reg_alpha'],
        #         'reg_lambda':param_df.at[iternum,'reg_lambda'],
        #         'n_jobs':param_df.at[iternum,'n_jobs'],
        #         'subsample':param_df.at[iternum,'subsample'],
        #         'subsample_freq':param_df.at[iternum,'subsample_freq']
                # }

        # fit_params = {
        #         'eval_metric':param_df.at[iternum,'eval_metric'],
        #         'early_stopping_rounds':param_df.at[iternum,'early_stopping_rounds'],
        #         'init_score':param_df.at[iternum,'init_score'],
        #         'eval_init_score':param_df.at[iternum,'eval_init_score'],
        #         'verbose':param_df.at[iternum,'verbose'],
        #         'feature_name':param_df.at[iternum,'feature_name'],
        #         'categorical_feature':param_df.at[iternum,'categorical_feature'],
        #         'callbacks':param_df.at[iternum,'callbacks']
        #         }

        # param_df.at[iternum,"feature_name_"]            = model_gbdt.feature_name_




# # Parameters Dictionary stores everything for dumping to file later
# SPEC = OrderedDict()
# FEATURES["_MODEL_NAME"] = 'LGBMv13_15ens'   # 'LGBMv10_11ens'  # Name of file model substring to save data submission to (= False if user to input it below)
# FEATURES["_MODEL_TYPE"] = 'LGBM'  # 'HGBR'
# FEATURES["_TEST_MONTH"] = 34

# # Optional operations to delete irrelevant shops or item categories, and to scale sales by month length, etc.;  set to FALSE if no operation desired
# FEATURES["_EDA_DELETE_SHOPS"]     = [9,20] #[0,1,8,9,11,13,17,20,23,27,29,30,32,33,40,43,51,54] #[8, 9, 13, 20, 23, 32, 33, 40] # [9,20] #  # False # these are online shops, untested shops, and early-termination + online shops
# FEATURES["_EDA_DELETE_ITEM_CATS"] = [8, 10, 32, 59, 80, 81, 82]  #[1,4,8,10,13,14,17,18,32,39,46,48,50,51,52,53,59,66,68,80,81,82] #  #[8, 80, 81, 82]  # False # hokey categories, untested categories, really hokey categories
# FEATURES["_EDA_SCALE_MONTH"]         = 'week_retail_weight'  # False # scale sales by days in month, number of each weekday, and Russian recession retail sales index

# # columns to keep for this round of modeling (dropping some of the less important features to save memory):
# FEATURES["COLS_KEEP_ITEMS"]             = ['item_id', 'item_group', 'item_cluster', 'item_category_id']  #, 'item_category4']
# FEATURES["COLS_KEEP_SHOPS"]             = ['shop_id','shop_group']
# FEATURES["COLS_KEEP_DATE_SCALING"]      = ['month', 'days_in_M', 'weekday_weight', 'retail_sales', 'week_retail_weight']
# FEATURES["COLS_KEEP_BASE_TRAIN_TEST"]   = ['month', 'price', 'sales', 'shop_id', 'item_id']

# # re-order columns for organized readability, for the (to be created) combined sales-train-test (stt) dataset
# FEATURES["COLS_ORDER_STT"]        = ['month', 'sales', 'revenue', 'shop_id', 'item_id', 'shop_group', 'item_category_id', 'item_group', 'item_cluster'] #,   'revenue','item_category4','shop_group'
# FEATURES["PROVIDED_INTEGER_FEATURES"]        = [e for e in FEATURES["COLS_ORDER_STT"] if e not in {'sales','price','revenue'}]  
# FEATURES["FEATURES_MONTHLY_STT_START"]       = [e for e in FEATURES["COLS_ORDER_STT"] if e not in {'month','sales','price','revenue','shop_id','item_id'}]  # these are categorical features that need to be merged onto test data set
# FEATURES["PROVIDED_CATEGORICAL_FEATURES"]    = [e for e in FEATURES["COLS_ORDER_STT"] if e not in {'month','sales','price','revenue'}]
# FEATURES["_USE_CATEGORICAL"]         = True  # pd dataframe columns "PROVIDED_CATEGORICAL_FEATURES" are changed to categorical dtype just before model fitting/creation

# FEATURES["AGG_STATS"] = OrderedDict()
# FEATURES["AGG_STATS"]["sales"]     = ['sum', 'median', 'count']
# FEATURES["AGG_STATS"]["revenue"]   = ['sum']  # revenue can handle fillna(0) cartesian product; price doesn't make sense with fillna(0), so don't use that at this time
# #FEATURES["AGG_STATS"]["price"]     = ['median','std']

# # aggregate statistics columns (initial computation shall be 'sales per month' prediction target for shop_id-item_id pair grouping)
# FEATURES["STATS_FEATURES"] = [['shop_id', 'item_id'], ['shop_id', 'item_category_id'], ['shop_id', 'item_cluster']] + FEATURES["PROVIDED_CATEGORICAL_FEATURES"]

# FEATURES["LAGS_MONTHS"] = [1,2,3,4,5,6,7,8]  # month lags to include in model 
# FEATURES["LAG_FEATURES"] = {}
# for i in FEATURES["LAGS_MONTHS"]:
#     FEATURES["LAG_FEATURES"][i] = ['y_sales', 'shop_id_x_item_category_id_sales_sum', 'item_id_sales_sum', 'item_cluster_sales_sum'] 
# FEATURES["LAG_FEATURES"][1] = ['y_sales', 'shop_id_x_item_id_sales_median', 'shop_id_x_item_id_sales_count', 'shop_id_x_item_id_revenue_sum', 
#                      'shop_id_x_item_category_id_sales_sum', 'shop_id_x_item_category_id_sales_median', 'shop_id_x_item_category_id_sales_count', 
#                      'shop_id_x_item_cluster_sales_sum', 'shop_id_x_item_cluster_sales_median', 
#                      'shop_id_sales_sum', 'shop_id_sales_count', 
#                      'item_id_sales_sum', 'item_id_sales_median', 'item_id_sales_count', 'item_id_revenue_sum', 
#                      'shop_group_revenue_sum', 
#                      'item_category_id_sales_sum', 'item_category_id_sales_count', 'item_category_id_revenue_sum', 
#                      'item_group_sales_sum', 'item_group_revenue_sum', 
#                      'item_cluster_sales_sum', 'item_cluster_sales_count', 'item_cluster_revenue_sum']

# FEATURES["LAG_FEATURES"][2] = ['y_sales', 'shop_id_x_item_id_sales_count', 'shop_id_x_item_id_revenue_sum', 
#                      'shop_id_x_item_category_id_sales_sum', 'shop_id_x_item_category_id_sales_count', 'shop_id_x_item_category_id_revenue_sum', 
#                      'shop_id_x_item_cluster_sales_sum', 'shop_id_x_item_cluster_sales_count', 
#                      'shop_id_sales_sum', 'item_id_sales_sum', 'item_id_sales_count', 'item_id_revenue_sum', 
#                      'item_category_id_sales_sum', 'item_category_id_sales_count', 
#                      'item_group_sales_sum', 
#                      'item_cluster_sales_sum', 'item_cluster_sales_count', 'item_cluster_revenue_sum']

# FEATURES["LAG_FEATURES"][3] = ['y_sales', 'shop_id_x_item_id_sales_count', 
#                      'shop_id_x_item_category_id_sales_sum', 
#                      'shop_id_sales_sum', 
#                      'item_id_sales_sum', 'item_id_sales_count', 'item_id_revenue_sum', 
#                      'item_category_id_sales_sum', 'item_category_id_sales_count', 
#                      'item_cluster_sales_sum', 'item_cluster_sales_count']

# # keep at least the highest importance feature for each lag, but remove all others with < 20% importance (month 13-32 training)
# FEATURES["LAG_FEATURES"][2] = [e for e in FEATURES["LAG_FEATURES"][2] if e not in {'item_group_sales_sum','shop_id_x_item_category_id_sales_sum','shop_id_x_item_cluster_sales_sum','shop_id_x_item_cluster_sales_count'}]
# FEATURES["LAG_FEATURES"][3] = [e for e in FEATURES["LAG_FEATURES"][3] if e not in {'item_cluster_sales_sum','shop_id_x_item_category_id_sales_sum','shop_id_x_item_id_sales_count'}]
# FEATURES["LAG_FEATURES"][4] = [e for e in FEATURES["LAG_FEATURES"][4] if e not in {'shop_id_x_item_category_id_sales_sum','y_sales','item_cluster_sales_sum'}]
# FEATURES["LAG_FEATURES"][5] = [e for e in FEATURES["LAG_FEATURES"][5] if e not in {'item_cluster_sales_sum','item_id_sales_sum','shop_id_x_item_category_id_sales_sum'}]
# FEATURES["LAG_FEATURES"][6] = [e for e in FEATURES["LAG_FEATURES"][6] if e not in {'item_id_sales_sum','item_cluster_sales_sum','shop_id_x_item_category_id_sales_sum'}]
# FEATURES["LAG_FEATURES"][7] = [e for e in FEATURES["LAG_FEATURES"][7] if e not in {'y_sales','item_cluster_sales_sum','shop_id_x_item_category_id_sales_sum'}]
# FEATURES["LAG_FEATURES"][8] = [e for e in FEATURES["LAG_FEATURES"][8] if e not in {'item_id_sales_sum','item_cluster_sales_sum','shop_id_x_item_category_id_sales_sum'}]

# # LAG_STATS_SET is SET of all aggregate statistics columns for all lags (allows us to shed the other stats, keeping memory requirements low)
# LAG_STATS_SET = FEATURES["LAG_FEATURES"][1]
# for l in FEATURES["LAGS_MONTHS"][1:]:
#     LAG_STATS_SET = LAG_STATS_SET + [x for x in FEATURES["LAG_FEATURES"][l] if x not in LAG_STATS_SET]
# FEATURES["STT_MONTHLY_COLS"] = FEATURES["PROVIDED_INTEGER_FEATURES"] + LAG_STATS_SET

# # Define various constants that drive the attributes of the various features
# FEATURES["_CLIP_TRAIN_H"]   = 20          # this clips sales after doing monthly groupings (monthly_stt dataframe) will also clip item_cnt_month predictions to 20 after the model runs
# FEATURES["_CLIP_TRAIN_L"]   = 0                   
# FEATURES["_CLIP_PREDICT_H"] = 20          # this clips the final result before submission to coursera
# FEATURES["_CLIP_PREDICT_L"] = 0    

# FEATURES["_USE_ROBUST_SCALER"]         = True        # scale features to reduce influence of outliers
# FEATURES["_ROBUST_SCALER_QUANTILES"]   = (20,80)
# FEATURES["_USE_MINMAX_SCALER"]         = True        # scale features to use large range of np.int16
# FEATURES["_MINMAX_SCALER_RANGE"]       = (0,16000)   # int16 = (0,32700); uint16 = (0,65500)  --> keep this range positive for best results with LGBM; keep range smaller for faster LGBM fitting
# FEATURES["_FEATURE_DATA_TYPE"]         = np.int16    # np.float32 #np.int16   np.uint16          # if fill n/a = 0, can adjust feature values to be integer values and save memory (not finding that int can store np.NAN)
# FEATURES["_USE_CARTPROD_FILL"]         = True        # use cartesian fill, or not
# FEATURES["_CARTPROD_TEST_PAIRS"]  = False       # include all shop-item pairings from test month as well as the in-month pairings
# FEATURES["_CARTPROD_FILLNA0"]    = True        # fill n/a cartesian additions with zeros (not good for price-based stats, however)
# FEATURES["_CARTPROD_FIRST_MONTH"] = 13          # month number + max lag at which to start adding Cartesian product rows (e.g., maxlag=6mo and _CARTPROD_FIRST_MONTH=10 will cartesian fill from 4 to 33)
# FEATURES["TRAIN_MONTH_START"]         = [13]        # == 24 ==> less than a year of data, but avoids December 'outlier' of 2014
# FEATURES["TRAIN_MONTH_END"]           = [29]        # [29,32] #,30,32]
# FEATURES["N_VAL_MONTHS"]              = [False]     #1 # ; if false, val is all months after training, up to and including 33; otherwise val is this many months after train_month_end

# # Define hyperparameters for modeling
# FEATURES["LEARNING_RATE"]       = [0.05]  # default = 0.1
# FEATURES["MAX_ITERATIONS"]      = [200] # default = 100
# FEATURES["EARLY_STOPPING"]      = [20]
# FEATURES["REGULARIZATION"]      = [0.4] # default = 1 for LGBM, 0 for HGBR (these models use inverse forms of regularization)
# FEATURES["VERBOSITY"]           = True #4 four is to print every 4th iteration; True is every iteration; False is no print except best and last
# FEATURES["SEED_VALUES"]         = [42]

# FEATURES["ALL_exploded_shape[0]"] = (len(FEATURES["SEED_VALUES"])*len(FEATURES["N_VAL_MONTHS"])*len(FEATURES["TRAIN_MONTH_END"])*len(FEATURES["TRAIN_MONTH_START"])*
#                          len(FEATURES["EARLY_STOPPING"])*len(FEATURES["MAX_ITERATIONS"])*len(FEATURES["REGULARIZATION"])*len(FEATURES["LEARNING_RATE"]) )


# print(f'Done: {strftime("%a %X %x")}')








# def unscale(scaler,target):
#     return scaler.inverse_transform(target.reshape(-1, 1)).squeeze()

# def GBDT_model(data=df, CONSTANTS=SPEC, params=OrderedDict()):
#     """
#     data is entire dataframe with train, validation, and test rows, and all columns including target prediction at "y_target"
#     constants is dictionary of setup constants
#     params is dictionary of this particular model train/val split and model fitting/prediction parameters
#     """
#     results = OrderedDict()
#     if CONSTANTS["_MODEL_TYPE"] == 'LGBM':
        
#         train_start = params["train_start_mo"]
#         train_end   = params["train_final_mo"]
#         val_months  = params["val_mo"]
#         test_month  = CONSTANTS["TEST_MONTH"]

#         train   = data.query('(month >= @train_start) & (month <= @train_end)')
#         y_train = train['y_target'].astype(np.float32)
#         y_train = y_train.reset_index(drop=True)
#         X_train = train.drop(['y_target'], axis=1)
#         X_train = X_train.reset_index(drop=True)
#         feature_names = X_train.columns

#         if val_months:
#             val = data.query('(month > (@train_end)) & (month <= (@train_end + @val_months)) & (month < @test_month)')
#         else:
#             val = data.query('((month > (@train_end)) & (month < @test_month)) | (month == (@test_month-1))')
#         y_val = val['y_target'].astype(np.float32)
#         y_val = y_val.reset_index(drop=True)
#         X_val = val.drop(['y_target'], axis=1)
#         X_val = X_val.reset_index(drop=True)

#         X_test = data.query('month == @test_month').drop(['y_target'], axis=1)
#         X_test = X_test.reset_index(drop=True)

#         print('X_train:')
#         print_col_info(X_train,8)
#         print(f'\n{X_train.head(2)}\n\n')
#         print('X_val:')
#         print_col_info(X_val,8)
#         print(f'\n{X_val.head(2)}\n\n')
#         print('X_test:')
#         print_col_info(X_test,8)
#         print(f'\n{X_test.head(2)}\n\n')
#         data_types = X_train.dtypes

#         del [[data, train, val]]

#         print('Starting training...')
#         model_gbdt = lgb.LGBMRegressor(
#             objective='regression',
#             boosting_type='gbdt',           # gbdt= Gradient Boosting Decision Tree; dart= Dropouts meet Multiple Additive Regression Trees; goss= Gradient-based One-Side Sampling; rf= Random Forest
#             learning_rate=params["lr"],     # You can use callbacks parameter of fit method to shrink/adapt learning rate in training using reset_parameter callback
#             n_estimators=params["maxit"],   # Number of boosted trees to fit = max_iterations
#             metric='rmse',
#             subsample_for_bin=200000,       # Number of samples for constructing bins
#             num_leaves=31,                  # Maximum tree leaves for base learners
#             max_depth=-1,                   # Maximum tree depth for base learners, <=0 means no limit
#             min_split_gain=0.0,             # Minimum loss reduction required to make a further partition on a leaf node of the tree
#             min_child_weight=0.001,         # Minimum sum of instance weight (hessian) needed in a child (leaf)
#             min_child_samples=20,           # Minimum number of data needed in a child (leaf)
#             colsample_bytree=params["reg"], # fraction of columns sampled when building each tree (1.0 = use all columns, i.e., no column dropout)
#             random_state=params["seed"],    # seed value
#             silent=False,                   # whether to print info during fitting
#             importance_type='split',        # feature importance type: 'split'= N times feature is used in model; 'gain'= total gains of splits which use the feature
#             reg_alpha=0.0,                  # L1 regularization
#             reg_lambda=0.0,                 # L2 regularization
#             n_jobs=- 1,                     # N parallel threads to use on computer
#             subsample=1.0,                  # row fraction used for training: keep at 1 for time series data
#             subsample_freq=0,               # keep at 0 for time series
#             )

#         tic = perf_counter()
#         model_gbdt.fit( 
#             X_train,                                # Input feature matrix (array-like or sparse matrix of shape = [n_samples, n_features])
#             y_train,                                # The target values (class labels in classification, real numbers in regression) (array-like of shape = [n_samples])
#             eval_set=[(X_val, y_val)],              # can have multiple tuples of validation data inside this list
#             eval_names=None,                        # Names of eval_set (list of strings or None, optional (default=None))
#             eval_metric='rmse',                     # Default: 'l2' (= mean squared error, 'mse') for LGBMRegressor; options include 'l2_root'='root_mean_squared_error'='rmse' and 'l1'='mean_absolute_error'='mae' + more
#             early_stopping_rounds=params["estop"],  # Activates early stopping. The model will train until the validation score stops improving. Validation score needs to improve at least every early_stopping_rounds 
#                                                     #     to continue training. Requires at least one validation data and one metric. If there’s more than one, will check all of them. But the training data is ignored anyway. 
#                                                     #     To check only the first metric, set the first_metric_only parameter to True in additional parameters **kwargs of the model constructor.
#             init_score=None,                        # Init score of train data
#             eval_init_score=None,                   # Init score of eval data (list of arrays or None, optional (default=None))
#             verbose=CONSTANTS["VERBOSITY"] ,        # If True, metric on the eval set is printed at each boosting stage. If n=int, the metric on the eval set is printed at every nth boosting stage. Best and final also print.
#             feature_name='auto',                    # Feature names. If 'auto' and data is pandas DataFrame, data columns names are used. (list of strings or 'auto', optional (default='auto'))
#             categorical_feature='auto',             # Categorical features (list of strings or int, or 'auto', optional (default='auto')) If list of int, interpreted as indices. 
#                                                     # If list of strings, interpreted as feature names (need to specify feature_name as well). 
#                                                     # If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. All values in categorical features should be less than int32 max value (2147483647). 
#                                                     # Large values could be memory-consuming. Consider using consecutive integers starting from zero. All negative values in categorical features are treated as missing values.
#             callbacks=None                          # List of callback functions that are applied at each iteration (list of callback functions or None, optional (default=None)) See Callbacks in Python API for more information.
#             )

#         toc = perf_counter()
#         results["model_fit_time"] = datetime.utcfromtimestamp(toc-tic).strftime('%H:%M:%S')
#         print(f'Model LGBM fit time: {results["model_fit_time"]}')
#         results["best_iter"] = model_gbdt.best_iteration_
#         results["best_val_rmse"] = 0 #best_score


#     # if mod_type == 'HGBR':
#     #     # TTSplit should use TRAIN_FINAL = 33 (train on all data), and it will return also val=month33 for calculation at end (only)
#     #     model_gbdt = HistGradientBoostingRegressor(
#     #         learning_rate=LR, 
#     #         max_iter=maxiter, 
#     #         l2_regularization = reg,
#     #         early_stopping=False, 
#     #         verbosity = verb,
#     #         random_state=seed_val)
    
#     #     tic = perf_counter()
#     #     model_gbdt.fit(X_train_np, y_train)
#     #     toc = perf_counter()
#     #     model_fit_time = datetime.utcfromtimestamp(toc-tic).strftime('%H:%M:%S')
#     #     print(f"model HGBR fit time: {model_fit_time}")
#     #     best_iter = maxiter
#     #     best_val_rmse = 0
        
#     print("Starting predictions...")
#     tic = perf_counter()
#     y_pred_train =  model_gbdt.predict( X_train, num_iteration=model_gbdt.best_iteration_ )
#     y_pred_val =    model_gbdt.predict( X_val,   num_iteration=model_gbdt.best_iteration_ )
#     y_pred_test =   model_gbdt.predict( X_test,  num_iteration=model_gbdt.best_iteration_ )
#     y_train =       y_train.to_numpy()
#     y_val =         y_val.to_numpy()
#     # always do minmax scaling after robust scaling; and do inverse scaling with minmax first, then robust
#     if CONSTANTS["_USE_MINMAX_SCALER"]:
#         y_pred_train =  unscale(minmax_scalers['y_sales'],  y_pred_train)
#         y_pred_val =    unscale(minmax_scalers['y_sales'],  y_pred_val)
#         y_pred_test =   unscale(minmax_scalers['y_sales'],  y_pred_test)
#         y_train =       unscale(minmax_scalers['y_sales'],  y_train)
#         y_val =         unscale(minmax_scalers['y_sales'],  y_val)
#     if CONSTANTS["_USE_ROBUST_SCALER"]:
#         y_pred_train =  unscale(robust_scalers['y_sales'],  y_pred_train)
#         y_pred_val =    unscale(robust_scalers['y_sales'],  y_pred_val)
#         y_pred_test =   unscale(robust_scalers['y_sales'],  y_pred_test)
#         y_train =       unscale(robust_scalers['y_sales'],  y_train)
#         y_val =         unscale(robust_scalers['y_sales'],  y_val)
#     y_pred_train =  y_pred_train.clip(CONSTANTS["_CLIP_PREDICT_L"], CONSTANTS["_CLIP_PREDICT_H"])
#     y_pred_val =    y_pred_val.clip(  CONSTANTS["_CLIP_PREDICT_L"], CONSTANTS["_CLIP_PREDICT_H"])
#     y_pred_test =   y_pred_test.clip( CONSTANTS["_CLIP_PREDICT_L"], CONSTANTS["_CLIP_PREDICT_H"]) 
#     toc = perf_counter()
#     results["predict_time"] = datetime.utcfromtimestamp(toc-tic).strftime('%H:%M:%S')
#     print(f'Transform and Predict train/val/test time: {results["predict_time"]}')

#     results["train_r2"],   results["val_r2"]    = sk_r2(y_train, y_pred_train),            sk_r2(y_val, y_pred_val)
#     results["train_rmse"], results["val_rmse"]  = np.sqrt(sk_mse(y_train, y_pred_train)),  np.sqrt(sk_mse(y_val, y_pred_val))
#     print(f'R^2 train  = {results["train_r2"]:.4f}      R^2 val  = {results["val_r2"]:.4f}')
#     print(f'RMSE train = {results["train_rmse"]:.4f}    RMSE val = {results["val_rmse"]:.4f}\n')

#     return model_gbdt, model_gbdt.get_params(), X_test, y_pred_test, feature_names, data_types, results

# print(f'Done: {strftime("%a %X %x")}')





# ensemble_feature_names = []
# ensemble_y_pred_test = []
# ensemble_df_columns = ['lr', 'reg', 'max_iter', 'estop', 'start', 'end', 'n_val_mo', 'seed', 'trR2', 'valR2', 'tr_rmse', 'val_rmse', 'best_iter', 'best_val_rmse', 'model_time','predict_time','total_time']
# ensemble_df_rows = []
# model_params = OrderedDict()
# itercount = 0
# for lr in FEATURES["LEARNING_RATE"]:
#     for reg in FEATURES["REGULARIZATION"]:
#         for maxit in FEATURES["MAX_ITERATIONS"]:
#             for estop in FEATURES["EARLY_STOPPING"]:
#                 for train_start_mo in FEATURES["TRAIN_MONTH_START"]:
#                     for train_final_mo in FEATURES["TRAIN_MONTH_END"]:
#                         for val_mo in FEATURES["N_VAL_MONTHS"]:
#                             for seed in FEATURES["SEED_VALUES"]:
#                                 itercount += 1
#                                 print(f'\n\nBelow: Model {itercount} of {FEATURES["ALL_exploded_shape[0]"]}: LR = {lr}; LFF = {reg}, train_start = {train_start_mo}; train_end = {train_final_mo}; seed = {seed}\n')
#                                 time0 = time.time()
#                                 model_params["lr"] = lr
#                                 model_params['reg'] = reg
#                                 model_params['maxit'] = maxit
#                                 model_params['estop'] = estop
#                                 model_params['train_start_mo'] = train_start_mo
#                                 model_params['train_final_mo'] = train_final_mo
#                                 model_params['val_mo'] = val_mo
#                                 model_params['seed'] = seed
#                                 ##model_fit, y_pred_test, train_r2, val_r2, train_rmse, val_rmse, best_iter, best_val_rmse, model_fit_time, predict_time = 
#                                 model_fit, model_params, X_test, y_pred_test, feature_names, data_types, results = GBDT_model(df, SPEC, model_params)
#                                 time2 = time.time(); model_time = datetime.utcfromtimestamp(time2 - time0).strftime('%H:%M:%S')

#                                 ensemble_feature_names.append(feature_names)
#                                 ensemble_y_pred_test.append(y_pred_test)
#                                 ##ensemble_df_rows.append([lr,reg,maxit,estop,train_start_mo,train_final_mo,val_mo,seed,train_r2,val_r2,train_rmse,val_rmse,best_iter,best_val_rmse,model_fit_time,predict_time,model_time])

#                                 # intermediate save after each model fit set of parameters, in case of crash or disconnect from Colab
#                                 # Simple ensemble averaging
#                                 y_test_pred_avg = np.mean(ensemble_y_pred_test, axis=0)
#                                 # Merge the test predictions with IDs from the original test dataset, and keep only columns "ID" and "item_cnt_month"
#                                 y_submission = pd.DataFrame.from_dict({'item_cnt_month':y_test_pred_avg,'shop_id':X_test.shop_id,'item_id':X_test.item_id})
#                                 y_submission = test.merge(y_submission, on=['shop_id','item_id'], how= 'left').reset_index(drop=True).drop(['shop_id','item_id'],axis=1)
#                                 y_submission.to_csv("./models_and_predictions/" + FEATURES["_MODEL_NAME"] + '_submission.csv', index=False)
#                                 ##ensemble_scores = pd.DataFrame(ensemble_df_rows, columns = ensemble_df_columns)
#                                 ##ensemble_scores.to_csv("./models_and_predictions/" + model_filename_ens, index=False)
#                                 time3 = time.time(); iteration_time = datetime.utcfromtimestamp(time3 - time0).strftime('%H:%M:%S')
#                                 #print(f'TTSplit Execution Time = {ttsplit_time};  
#                                 print(f'Model fit/predict Execution Time = {model_time};  Total Iteration Execution Time = {iteration_time}')
#                                 print(f'Below: Model {itercount} of {FEATURES["ALL_exploded_shape[0]"]}: LR = {lr}; LFF = {reg}, train_start = {train_start_mo}; train_end = {train_final_mo}; seed = {seed}\n')
# print(model_params)
# print(feature_names)
# print(data_types)
# print(results)
# #display(ensemble_scores)

# print(f'\nDone: {strftime("%a %X %x")}\n')

# NOTE(review): presumably a placeholder so that this otherwise fully commented-out cell still
# contains one executable statement — confirm that nothing else reads `nocode`.
nocode = True

## **Random Stuff**

### **K-Fold Training Splits; Ensemble Average; Save Intermediate Results**

In [None]:

%cd "{GDRIVE_REPO_PATH}"

ensemble_y_pred_test = []
ensemble_df_columns = ['tr_rmse','val_rmse','trR2','valR2','lr','reg','max_iter','estop','bin_sample','start','end','val_key','seed','best_iter','best_val_rmse','model_t','predict_t','total_t']
ensemble_df_rows = []


    if not ITERS.at[itercount,"_model_filename"]:
        ITERS.at[itercount,"_model_filename"] = input("Enter the Base Model Name Substring for Output File Naming (like: 'v4mg_01' )")
    filename_parameters = ITERS.at[itercount,"_model_type"] + ITERS.at[itercount,"_model_filename"] + "_params.csv"
    filename_submission = ITERS.at[itercount,"_model_type"] + ITERS.at[itercount,"_model_filename"] + '_submission.csv'

    print(f'\n\nBelow: Model {itercount+1} of {len(ITERS)}: lr= {ITERS.at[itercount,"learning_rate"]}; Reg= {ITERS.at[itercount,"colsample_bytree"]}, ',end='')
    print(f'train_start = {ITERS.at[itercount,"_train_start_month"]}; train_end = {ITERS.at[itercount,"_train_final_month"]}; seed = {ITERS.at[itercount,"random_state"]}\n')
    
    time0 = time.time()
    # CHANGE --> only redo this inside the loop if the months change
    DataSets, ITERS.at[itercount,"feature_name_"] = TTSplit(data=df, params=ITERS, iternum=itercount)

    y_pred_test, results_dict, parameters_of_model = GBDT_model(DataSets, model_params_dict, fit_params_dict) #ITERS, itercount)
    time1 = time.time()
    

    ITERS.at[itercount,"time_predict_end"] = datetime.utcfromtimestamp(time1 - time0).strftime('%H:%M:%S')
    print(f'Total Iteration Execution Time = {ITERS.at[itercount,"time_predict_end"]}')

    # intermediate save after each model fit set of parameters, in case of crash or disconnect from Colab
    # Simple ensemble averaging
    ensemble_y_pred_test.append(y_pred_test)
    y_test_pred_avg = np.mean(ensemble_y_pred_test, axis=0)
    # Merge the test predictions with IDs from the original test dataset, and keep only columns "ID" and "item_cnt_month"
    y_submission = pd.DataFrame.from_dict({'item_cnt_month':y_test_pred_avg,'shop_id':DataSets['X_test'].shop_id,'item_id':DataSets['X_test'].item_id})
    y_submission = test.merge(y_submission, on=['shop_id','item_id'], how= 'left').reset_index(drop=True).drop(['shop_id','item_id'],axis=1)
    y_submission.to_csv("./models_and_predictions/" + filename_submission, index=False)

    ITERS.to_csv("./models_and_predictions/" + filename_parameters, index=False)

    ensemble_df_rows.append(ITERS[['tr_rmse','val_rmse','tr_R2','val_R2','learning_rate','colsample_bytree','n_estimators','early_stopping_rounds','subsample_for_bin','_train_start_month','_train_final_month',
                                          '_validate_months','random_state','best_iteration_','best_score_','time_model_fit','time_model_predict','time_predict_end']].iloc[itercount].to_list())
    ensemble_scores = pd.DataFrame(ensemble_df_rows, columns = ensemble_df_columns)

    print(f'\nModel {itercount+1} of {len(ITERS)}: lr= {ITERS.at[itercount,"learning_rate"]}; Reg= {ITERS.at[itercount,"colsample_bytree"]}, ',end='')
    print(f'train_start = {ITERS.at[itercount,"_train_start_month"]}; train_end = {ITERS.at[itercount,"_train_final_month"]}; seed = {ITERS.at[itercount,"random_state"]}\n')
    display(ensemble_scores)
    
    itercount += 1

print(f'\nDone: {strftime("%a %X %x")}\n')

### **Document Results**

In [None]:
# Plain-text summary of the run, meant to be copied and pasted into version-control notes

# Opening banner: model name and model type, followed by an empty "Coursera:" field
print('\n------------------------------------------\n------------------------------------------')
print(f'{FEATURES["_MODEL_NAME"]}  Model Type: {FEATURES["_MODEL_TYPE"]}\nCoursera: \n------------------------------------------')
display_params()

# Score table for every fitted model, then summary statistics of its numeric columns
print('------', ensemble_scores, '------', sep='\n')
print(ensemble_scores.describe(include=np.number, percentiles=[]))

# First 8 and last 7 rows of `fi` (presumably sorted by importance — confirm where `fi` is built)
print(f'------\nHighest and Lowest Feature Importance for Final Model:\n{fi.iloc[[*range(8), *range(-7, 0)], :]}\n------')

# First 8 rows of the submission frame, then the closing banner
print(y_submission.head(8), '------------------------------------------\n\n', sep='\n')


### **Record Results**