In [None]:
from forecast_process import *

## Write to Elasticsearch

In [None]:
def to_elastic(flat_result_all_cat, index_name, doctype, hosts = None):
    """
    (Re)create an Elasticsearch index and bulk-load documents into it.

    Parameters
    ----------
    flat_result_all_cat: list of dict
        Documents to index; handed unchanged to ``DocTools.bulk``.
    index_name: string
        Index to create. ``create`` is called with ``overwrite = True``.
    doctype: string
        Document type; used as the mapping key and passed to the bulk call.
    hosts: list of string, optional
        Elasticsearch host addresses (port 9200 is appended to each).
        Defaults to the two nodes this notebook has always written to.

    Returns
    -------
    object
        Whatever ``DocTools.bulk`` returns, so the caller can inspect the
        outcome of the bulk request instead of it being silently dropped.
    """
    if hosts is None:
        hosts = ['192.168.0.179', '192.168.0.178']

    settings = {
        "index":{
            "number_of_shards": 5,
            "number_of_replicas": 1,
            "mapping": {
                "total_fields":{
                    "limit": "1000"
                }
            },
        }
    }

    # Dynamic template: every dynamically detected string field is mapped as `keyword`.
    mapping = {
            doctype: {
               "dynamic_templates": [
                   {"strings": {
                        "match_mapping_type": "string",
                        "mapping": {
                          "type": "keyword"
                        }
                      }
                    }
                ]
            }
        }

    uri = ['http://{}:{}'.format(ip, '9200') for ip in hosts]
    es = DocTools(uri)
    esi = es.indextool()
    # NOTE(review): overwrite = True presumably drops an existing index of the same name — confirm in DocTools.
    esi.create(index_name, overwrite = True, settings = settings, mapping = mapping)
    return es.bulk(index_name, flat_result_all_cat, doctype = doctype)

## Build data

In [None]:
def init_func(group_file):
    """
    Load daily per-SKU sales and the SKU cluster file as pandas DataFrames.

    Parameters
    ----------
    group_file: string
        Path of a CSV file with exactly five columns, which are renamed
        positionally to brand, line, series, price_segment, sku_id.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The daily sales table (one row per SKU and transaction date with
        `daily_quantity` = sum of quantity and `daily_price` = mean unit price,
        restricted to doc_type PTX/HDF and non-zero unit price), and the
        cluster table with an integer `sku_id`.
    """
    # Project bootstrap; the results of load() and context() are not used below.
    _project = datalabframework.project.load()
    engine = datalabframework.project.engine()
    _spark = engine.context()

    fact_columns = [
        'sku_id', 'sku_name', 'transaction_date', 'quantity',
        'doc_type', 'unit_price', 'cat_id', 'cat_group_id',
        'cat_root_id', 'cat_name', 'cat_group_name', 'cat_root_name',
        'brand_id', 'brand_name',
    ]
    grouping_columns = [
        'sku_id', 'sku_name', 'transaction_date', 'cat_id', 'cat_group_id', 'cat_root_id',
        'cat_name', 'cat_group_name', 'cat_root_name', 'brand_id', 'brand_name',
    ]

    fact_transaction = engine.load('fact_table').select(*fact_columns)
    wanted_doc_types = F.expr('doc_type == "PTX"') | F.expr('doc_type == "HDF"')
    daily_sales_sdf = (
        fact_transaction
        .where(wanted_doc_types)
        .where(F.expr('unit_price != 0'))
        .groupby(*grouping_columns)
        .agg(F.sum('quantity').alias('daily_quantity'), F.avg('unit_price').alias('daily_price'))
        .orderBy('transaction_date')
    )

    # Read the cluster definition before collecting the Spark result.
    clusters = pd.read_csv(group_file)
    clusters.columns = ['brand', 'line', 'series', 'price_segment', 'sku_id']

    daily_sales = daily_sales_sdf.toPandas()
    for int_column in ('sku_id', 'daily_quantity'):
        daily_sales[int_column] = daily_sales[int_column].astype(int)
    daily_sales['transaction_date'] = pd.to_datetime(daily_sales['transaction_date'])

    clusters['sku_id'] = clusters['sku_id'].astype(int)
    return daily_sales, clusters

In [None]:
def select_trans_group(product_quantity_date, cat_root_name = 'laptop',  group_products = None):
    """
    Select the transactions of a root category, optionally restricted to a
    set of products, and calculate the total quantity sold per date.

    Parameters
    ----------
    product_quantity_date: pandas.DataFrame
        Daily quantities per SKU; must contain the columns `cat_root_name`,
        `sku_id`, `transaction_date` and `daily_quantity`.
    cat_root_name: string
        Name of the selected root category.
    group_products: pandas.DataFrame, optional
        Products to keep, identified by a `sku_id` column (inner-joined on
        `sku_id`). When None, every product of the category is kept.

    Returns
    -------
    pandas.DataFrame
        Columns `transaction_date` and `daily_quantity` (summed per date).
    """
    cat_trans = product_quantity_date[product_quantity_date['cat_root_name'] == cat_root_name]
    # The default None used to be passed straight to merge(), which raises;
    # treat it as "no product restriction" as the summary line promises.
    if group_products is not None:
        cat_trans = cat_trans.merge(group_products, on = 'sku_id')
    total_by_date = cat_trans.groupby('transaction_date').agg({'daily_quantity':'sum'}).reset_index()
    return total_by_date

## Forecast Process

In [None]:
def caculate_history_and_forecast(product_quantity_date, cat_root_name, group_sku, attr_dict, freq_ = 'D'):
    """
    Forecast the total sales of one product group and tag every result
    record with the attributes that identify the group.

    Parameters
    ----------
    product_quantity_date: pandas.DataFrame
        Daily quantities per SKU, as consumed by `select_trans_group`.
    cat_root_name: string
        Root category to select.
    group_sku: pandas.DataFrame
        Products of the group (needs a `sku_id` column).
    attr_dict: dict
        Key/value pairs merged into every returned record.
    freq_: string
        Frequency passed through to `adaptive_forecast_process`.

    Returns
    -------
    tuple
        (flat_result_cv, flat_result_test, hist_data, preds) as returned by
        `adaptive_forecast_process`, each updated in place with `attr_dict`;
        four Nones when the group has no transactions.
    """
    total_by_date = select_trans_group(product_quantity_date, cat_root_name, group_sku)
    if total_by_date.shape[0] == 0:
        return None, None, None, None
    flat_result_cv, flat_result_test, hist_data, preds = adaptive_forecast_process(total_by_date, freq_)
    if flat_result_test:
        flat_result_test.update(attr_dict)
    # Tag CV records on their own condition: the caller collects them
    # independently of the test result, and they may be None.
    if flat_result_cv:
        for cv in flat_result_cv:
            cv.update(attr_dict)
    if hist_data:
        for data in hist_data:
            data.update(attr_dict)
    if preds is not None:
        for pred in preds:
            pred.update(attr_dict)
    return flat_result_cv, flat_result_test, hist_data, preds

In [None]:
def run(freq_str):
    """
    Forecast every laptop cluster at the requested granularity and dump the
    results as JSON files under json_output/laptop_cluster/<freq_str>/.

    Parameters
    ----------
    freq_str: string
        'month' (frequency 'M') or 'week' (frequency 'W-SUN'); any other
        value falls back to daily ('D'). Also used as the output sub-folder.

    Returns
    -------
    None
        Four files are written: info_test_forecast.json,
        info_cv_forecast.json, future_prediction.json and history_data.json.
    """
    import os  # local import: only needed to make sure the output folder exists

    freq_ = {'month': 'M', 'week': 'W-SUN'}.get(freq_str, 'D')
    product_quantity_date, cluster_laptop = init_func('csv_folder/cluster_laptop_products.csv')
    combination_attr = [['brand'], ['brand', 'line'], ['brand', 'line', 'series'], ['brand', 'price_segment'],
                        ['brand', 'line', 'price_segment'], ['brand', 'line', 'series', 'price_segment']]
    flat_test_result_all_group = []
    flat_cv_result_all_group = []
    preds_future = []
    history_data = []
    for comb in combination_attr:
        # Every distinct value tuple of this attribute combination is one group.
        distinct_value_attr_df = cluster_laptop[comb].drop_duplicates().values
        print('number distint:', len(distinct_value_attr_df))
        for n, value_attr in enumerate(distinct_value_attr_df):
            print('n = ', n, ' ', value_attr)
            selected_cluster = cluster_laptop
            for attr, value in zip(comb, value_attr):
                selected_cluster = selected_cluster[selected_cluster[attr] == value]
            group_sku_id = selected_cluster[['sku_id']]
            # Attributes outside the current combination stay None.
            attr_dict = dict.fromkeys(['brand', 'line', 'series', 'price_segment'])
            attr_dict.update(dict(zip(comb, value_attr)))
            print(attr_dict)
            flat_result_cv, flat_result_test, hist_data, preds = caculate_history_and_forecast(
                product_quantity_date, 'laptop', group_sku_id, attr_dict, freq_)
            if flat_result_test:
                flat_test_result_all_group.append(flat_result_test)
            if flat_result_cv:
                flat_cv_result_all_group.extend(flat_result_cv)
            if hist_data:
                history_data.extend(hist_data)
            if preds:
                preds_future.extend(preds)

    output_dir = 'json_output/laptop_cluster/' + freq_str
    # Do not lose a finished run just because the folder is missing.
    os.makedirs(output_dir, exist_ok = True)
    outputs = [
        ('/info_test_forecast.json', flat_test_result_all_group),
        ('/info_cv_forecast.json', flat_cv_result_all_group),
        ('/future_prediction.json', preds_future),
        ('/history_data.json', history_data),
    ]
    for file_name, payload in outputs:
        # `with` closes (and flushes) each file before the next cell reads it.
        with open(output_dir + file_name, 'w', encoding = 'utf8') as out_file:
            json.dump(payload, out_file)


In [None]:
# Run the whole pipeline at daily granularity; results are written under
# json_output/laptop_cluster/day/.
# NOTE(review): the cells below read the 'week' outputs, which this call does not
# produce — on a fresh kernel run('week') must have been executed beforehand.
run('day')

In [None]:
# Load the per-group test metrics of the weekly run; `with` closes the file handle.
with open('json_output/laptop_cluster/' + 'week'+ '/info_test_forecast.json', 'r', encoding = 'utf8') as in_file:
    info_test_forecast = json.load(in_file)

In [None]:
# Coerce both WAPE metrics of every record to float, in place.
for record in info_test_forecast:
    for metric in ('wape_test', 'wape_cv'):
        record[metric] = float(record[metric])

In [None]:
# Load the future predictions of the weekly run; `with` closes the file handle.
with open('json_output/laptop_cluster/' + 'week'+ '/future_prediction.json', 'r', encoding = 'utf8') as in_file:
    preds_future = json.load(in_file)

In [None]:
# Load the historical series of the weekly run; `with` closes the file handle.
with open('json_output/laptop_cluster/' + 'week'+ '/history_data.json', 'r', encoding = 'utf8') as in_file:
    history = json.load(in_file)

In [None]:
# Tabulate the history records so they can be filtered for inspection.
pd_hist = pd.DataFrame(history)

In [None]:
# Brand-level HP history: brand is 'HP' and line, series and price_segment are all missing.
pd_hist[
    (pd_hist['brand'] == 'HP')
    & pd_hist['line'].isna()
    & pd_hist['series'].isna()
    & pd_hist['price_segment'].isna()
]

In [None]:
# Number of prediction records loaded.
len(preds_future)

In [None]:
# Clean the prediction records in place:
#  - a NaN `below_error` resets both error bounds to the sentinel -1.0
#    (only `below_error` is tested; `upper_error` is overwritten with it);
#  - the brand spelling 'ASUS' is normalised to 'Asus'.
for prediction in preds_future:
    if math.isnan(prediction['below_error']):
        prediction.update(below_error = -1.0, upper_error = -1.0)
    if prediction['brand'] == 'ASUS':
        prediction['brand'] = 'Asus'

In [None]:
# Index the weekly test metrics; to_elastic creates the index with overwrite = True.
to_elastic(info_test_forecast, 'week_cluster_laptop_info_test_forecast', 'week_forecast')

In [None]:
# Index the cleaned weekly predictions; to_elastic creates the index with overwrite = True.
to_elastic(preds_future, 'week_cluster_laptop_prediction_future', 'week_forecast')

In [None]:
# Index the weekly history records; to_elastic creates the index with overwrite = True.
to_elastic(history, 'week_cluster_laptop_history_data', 'week_forecast')

In [None]:
# NOTE(review): `future_prediction` is not assigned anywhere in this notebook; unless
# the star import from forecast_process provides it, this cell raises NameError.
# The predictions loaded earlier are bound to `preds_future` — confirm which was intended.
future_prediction

## Test 

In [None]:
# Reload the daily sales table and the laptop cluster definition for manual checks.
product_quantity_date, cluster_group = init_func('csv_folder/cluster_laptop_products.csv')

In [None]:
# All cluster rows of brand HP (every column is kept, not only sku_id).
group_sku_id = cluster_group.loc[cluster_group['brand'] == 'HP']

In [None]:
# SKU ids of the HP cluster rows, as a plain list.
t = list(group_sku_id['sku_id'])

In [None]:
s = ["19030456", "19030364", "1810091", "1810978", "19030431", "19030432", "19020005", "1702882", "18101019", "1701726", "1701983", "18101020", "18101021", "1700079", "1700080", "1700131", "1702436", "1702864", "1703322", "1800471", "1702431", "1703473", "1800543", "1703321", "1800472", "1805399", "1806177", "18120012", "18120171", "1805398", "19010044", "1700132", "1701645", "19010177", "19010176", "1700767", "1701094", "1701417", "1701646", "1702317", "1702356", "1800641", "1702536", "1702614", "1800141", "1702535", "1805184", "1809209", "1810092", "1809210", "1809211", "1808428", "18120052", "1700033", "1700835", "1700965", "1702286", "1702287", "1702528", "1702525", "1806168", "1702280", "1800932", "1800581", "1806176", "1702829", "1807526", "1808473", "1808027", "1807525", "1807527", "18110381", "19030342", "1809550", "1702200", "1809549", "1701195", "1800606", "1807494", "1700034", "1700035", "1700172", "1700832", "1700833", "1700834", "1700013", "1702437", "1702421", "1702683", "1702783", "1702621", "1800848", "1800849", "1702620", "1704925", "1704926", "1704927", "1800539", "1800540", "1800541", "1807495", "1808426", "18120051", "19010425", "19010426", "1806192", "1806193", "1806194", "18120015", "18120124", "18120125", "1700015", "1700768", "1700769", "1700771", "1701264", "1701267", "1701828", "1602768", "1603762", "1700016", "1700770", "1701265", "1603758", "1603761", "1701262", "1702686", "1702687", "1702688", "1702615", "1702616", "1800644", "1800846", "1800847", "1702613", "1702617", "1704809", "1704810", "1704811", "1704812", "1800142", "1800612", "1800642", "1800977", "1805173", "1702612", "1800147", "1805462", "1806245", "1806246", "1806247", "1807332", "1807334", "1807333", "1807457", "1808102", "18120010", "18120126", "18120127", "18110331", "18120011", "1809231", "19010043", "1809173", "1809172", "1809326", "1809327", "1807458", "1807459", "1702531", "1702532", "1702420", "1805027", "1700170", "1700171", "1702357", "1703124", "1806213", 
"1702358", "1702359", "1800470", "1800529", "1806214", "1806215", "1702999", "1809208", "19010042", "1701038", "1700017", "1603756", "1701630", "1701631", "1704805", "1809232", "1704807", "1703319", "1704806", "1800146", "1703320", "19030067", "19030458", "19030066", "1700804", "1603672", "1700842", "1800169", "1704820", "1810516", "1703107", "1704817", "1704815", "1800542", "1704816", "1704818", "19030313", "19030314", "19030315", "19030316", "1700018", "1700839", "1701980", "1700841", "1702199", "1603534", "1800525", "1703106", "1703110", "1704819", "1800524", "1800526", "1808427", "1703108", "1703109", "19030457", "19030299", "19030317", "19020315", "19020316", "1704879", "1808474", "1701451", "1704880", "1805174"]

In [None]:
# Sanity check: does the reference list `s` have as many entries as the HP SKU list `t`?
len(s) == len(t)

In [None]:
# Print every reference SKU (a string in `s`) that is absent from the HP SKU list `t`.
# A set gives constant-time membership instead of scanning the list for each SKU.
hp_sku_ids = set(t)
for sku in s:
    if int(sku) not in hp_sku_ids:
        print(sku)

In [None]:
# Root category used by the manual test cells below.
cat_root_name = 'laptop'

In [None]:
# Total laptop quantity per date for the HP SKUs selected above.
total_by_date = select_trans_group(product_quantity_date, cat_root_name, group_sku_id)

In [None]:
# Call the forecasting routine directly on the HP totals with frequency 'W-SUN'.
flat_result_cv, flat_result_test, hist_data, preds = adaptive_forecast_process(total_by_date, 'W-SUN')

In [None]:
# Inspect the history records returned by the 'W-SUN' forecast.
hist_data

In [None]:
# NOTE(review): this cell repeats the previous one (same expression, same value).
hist_data

In [None]:
# Same HP group through the wrapper, with no extra attributes ({}) and frequency 'M'.
# This rebinds flat_result_cv, flat_result_test, hist_data and preds.
flat_result_cv, flat_result_test, hist_data, preds= caculate_history_and_forecast(product_quantity_date, cat_root_name, group_sku_id, {}, freq_ = 'M')

In [None]:
# Inspect the history records returned by the 'M' run above.
hist_data