In [None]:
# Notebook setup: silence FutureWarning noise, import pandas and the
# project-local helpers, and widen the DataFrame column display.
import warnings
# Installed before the remaining imports so the filter is already active
# when they execute.
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

# Project-local modules. NOTE(review): Splitting, Hyperparameters and
# ClusterTransform are not referenced in the cells visible here — confirm
# whether they are still needed.
from splitting import Splitting
from analysis import AnalysisMetrics
from hyperparams import Hyperparameters
from transforms import ClusterTransform
from modeling import Model 

# Record the pandas version in the cell output.
print(pd.__version__)


In [None]:
# ---------------------------------------------------------------------------
# Candidate definition: feature list, duplication map and one hyperparameter
# set per account, followed by Model.create() with the input/output locations.
# ---------------------------------------------------------------------------

# Columns passed to Model.create(features=...).
features = [
    u'original_pid',
    u'account_id',
    u'original_product_dimension_25',
    u'original_product_dimension_26',
    u'week_agg_8',
    u'baseline_units',
    u'consumer_length',
    u'promotion_type',
    u'discount_perc_cohort',
    u'promoted_niv',
    u'previous_promotion_week_distance',
    u'total_nr_products',
]

# Account name -> list of account names, passed as duplication_map.
# NOTE(review): presumably the accounts whose rows are reused when fitting the
# keyed account's model — confirm against modeling.Model.
duplication_map = {
    'Okey Group': [
        '5Pyaterochka (X5)',
        'Dixy',
        'Okey Group',
    ],
    'Lenta': [
        'Lenta',
    ],
    '5Pyaterochka (X5)': [
        '5Pyaterochka (X5)',
        'Lenta',
        'Dixy',
        'Okey Group',
        'Magnit',
    ],
    'Dixy': [
        '5Pyaterochka (X5)',
        'Lenta',
        'Dixy',
        'Okey Group',
        'Magnit',
    ],
    'Magnit': [
        '5Pyaterochka (X5)',
        'Lenta',
        'Magnit',
    ],
}

# One hyperparameter set per account. The float literals (for example
# 0.6000000000000001) are kept exactly as given so the configuration stays
# numerically unchanged.
# NOTE(review): the keys look like gradient-boosting settings plus encoder
# choices and an under-prediction weight — confirm how Model consumes them.
params = {
    u'5Pyaterochka (X5)': dict(
        colsample_bytree=0.55,
        gamma=0.8,
        learning_rate=0.05,
        max_depth=5,
        min_child_weight=6.0,
        n_estimators=111,
        reg_alpha=0.55,
        reg_lambda=1.1,
        subsample=0.4,
        transformer_nominal='JamesSteinEncoder',
        transformer_ordinal='OrdinalEncoder',
        under_predict_weight=2.0,
    ),
    u'Dixy': dict(
        colsample_bytree=0.4,
        gamma=0.75,
        learning_rate=0.1,
        max_depth=5,
        min_child_weight=7.0,
        n_estimators=59,
        reg_alpha=0.6000000000000001,
        reg_lambda=1.3,
        subsample=0.55,
        transformer_nominal='JamesSteinEncoder',
        transformer_ordinal='OrdinalEncoder',
        under_predict_weight=2.0,
    ),
    u'Lenta': dict(
        colsample_bytree=0.8,
        gamma=0.7000000000000001,
        learning_rate=0.08,
        max_depth=2,
        min_child_weight=1.0,
        n_estimators=137,
        reg_alpha=0.7000000000000001,
        reg_lambda=1.0,
        subsample=0.8,
        transformer_nominal='JamesSteinEncoder',
        transformer_ordinal='OrdinalEncoder',
        under_predict_weight=2.5,
    ),
    u'Magnit': dict(
        colsample_bytree=0.4,
        gamma=0.9500000000000001,
        learning_rate=0.08,
        max_depth=4,
        min_child_weight=4.0,
        n_estimators=152,
        reg_alpha=0.9500000000000001,
        reg_lambda=1.1,
        subsample=0.7000000000000001,
        transformer_nominal='JamesSteinEncoder',
        transformer_ordinal='OrdinalEncoder',
        under_predict_weight=3.0,
    ),
    u'Okey Group': dict(
        colsample_bytree=0.5,
        gamma=1.0,
        learning_rate=0.09,
        max_depth=4,
        min_child_weight=1.0,
        n_estimators=98,
        reg_alpha=0.8,
        reg_lambda=1.1500000000000001,
        subsample=0.45,
        transformer_nominal='JamesSteinEncoder',
        transformer_ordinal='OrdinalEncoder',
        under_predict_weight=2.0,
    ),
}

model = Model(model_name='Candidate_20210324')

# cat_feature alternatives previously tried in this cell (currently disabled):
#   cat_feature=None
#   cat_feature=['5Pyaterochka (X5)','Lenta','Dixy','Okey Group']
model.create(
    params=params,
    feature_filename='./outputs/im_feature_info_dict_mars_ru_20210212.txt',
    features=features,
    target='total_units',
    cat_feature='account_banner',
    output_dir='outputs',
    data_filename='../data/20210212_mars_ru_prod_trainset.msgpack',
    filter_filename='./outputs/im_data_retrieval-v6-20210212.txt',
    account_filter=['5Pyaterochka (X5)', 'Lenta', 'Dixy', 'Okey Group', 'Magnit'],
    future_data_filename='../data/20210212_mars_ru_prod_futureset.msgpack',
    future_target='total_units_2',
    duplication_map=duplication_map,
)

In [None]:
# Fit the candidate configured in the previous cell; `model` must already
# have been set up via model.create(...).
model.train()

In [None]:
# Build the metrics object for the freshly trained model.
# NOTE(review): number_tests=100 and filter_threshold=300 are magic numbers;
# their exact meaning is defined in analysis.AnalysisMetrics — confirm there.
analysis_metrics = AnalysisMetrics(
    model=model,
    reevaluate=True,
    number_tests=100,
    use_product_filter=True,
    filter_threshold=300,
)

# Hand the metrics to the model first, then save the model.
model.update_info(analysis_metrics)
model.save()

In [None]:
# Plot only the 'r2' metric for the current analysis_metrics object.
analysis_metrics.plot_metrics(['r2'])

In [None]:
# Bare expression: the notebook displays the repr of the current `model`.
model

In [None]:
# Load a different candidate ("Candidate_20210323_2") and plot its metrics.
# NOTE(review): this rebinds `model` and `analysis_metrics`, so the candidate
# created and trained in the earlier cells is no longer reachable under these
# names; every later cell that reads `model` sees the loaded candidate instead.
# Consider distinct names if both candidates are needed side by side.
model = Model(model_name="Candidate_20210323_2")
model.load()
analysis_metrics = AnalysisMetrics(model=model)

# Called without a metric list here, unlike the earlier plot_metrics(['r2']).
analysis_metrics.plot_metrics()

In [None]:
# Disabled calls, kept for reference; uncomment a line to run it.
# analysis_metrics.get_historic_overall_results()
# analysis_metrics.get_historic_account_results()

In [None]:
# Disabled calls, kept for reference; uncomment a line to run it.
# analysis_metrics.metrics_median('overall')
# model.get_duplication_info()

# Display the `clusters` attribute of whichever `model` is currently bound.
# When the cells are run top to bottom this is the candidate loaded as
# "Candidate_20210323_2", not the one trained earlier in the notebook.
model.clusters