In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
# (The joined path used to be discarded, so nothing was ever listed.)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

        
import statsmodels.tsa.stattools as ts
import matplotlib.pyplot as plt
import plotly.express as px
from pandas.plotting import register_matplotlib_converters

# Call the converter-registration hook imported above from pandas.plotting
# once, ahead of the plotting cells further down.
register_matplotlib_converters()

In [None]:
# Load the per-good statistics table, using its first column as the index.
# NOTE(review): the extension is spelled '.cvs' — confirm it matches the
# dataset's actual filename.
d = pd.read_csv('/kaggle/input/good-types/good_types_stat.cvs', index_col=0)

# Of the rows with pvalue above 0.05, show the 60 with the largest pvalue.
above_threshold = d.loc[d['pvalue'] > 0.05]
above_threshold.sort_values(by='pvalue').tail(60)

# Discover Sales Type

In [None]:
def read():
    """Load the sales table and return it with one row per date and one column per id.

    Reads ``calendar.csv`` and ``sales_train_validation.csv``, drops the
    descriptive columns of the sales table, transposes it so the remaining
    column labels become rows, and replaces those labels with the matching
    ``date`` from the calendar (joined on the calendar's ``d`` column).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` (converted with ``pd.to_datetime``); one column
        per value of the sales table's ``id`` column.
    """
    calendar = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')
    sales_train = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')

    # Keep only `id` plus the per-day columns, then flip the table so every
    # former column label becomes a row (held in a column named 'index').
    descriptive_columns = ['item_id', 'dept_id', 'cat_id', 'state_id', 'store_id']
    per_id = sales_train.drop(columns=descriptive_columns).set_index(['id'])
    per_day = per_id.T.reset_index()

    # Translate each day label into its calendar date.
    day_to_date = calendar[['date', 'd']]
    dated = per_day.merge(day_to_date, left_on='index', right_on='d')

    result = dated.drop(columns=['d', 'index']).set_index('date')
    result.index = pd.to_datetime(result.index)
    return result

def dftest(timeseries):
    """Run ``ts.adfuller`` on ``timeseries`` and return its p-value.

    The test is called with ``autolag='AIC'``; the p-value is the element at
    position 1 of the result.
    """
    adf_result = ts.adfuller(timeseries, autolag='AIC')
    pvalue = adf_result[1]
    return pvalue
    
def get_zero_stat(df, zero_continue_interval, Debug = False):
    """Locate the sale boundaries of one series and its last long no-sales gap.

    Every position holding a non-zero value counts as a sale. A gap is "long"
    when two consecutive sale positions lie more than
    ``zero_continue_interval`` positions apart.

    Parameters
    ----------
    df : indexed series of values (called with one column of the pivot table)
    zero_continue_interval : threshold on the distance between consecutive sales
    Debug : when equal to True, print the four results

    Returns
    -------
    tuple
        ``(first_sale, start_last_long_sales, last_sale,
        last_long_nosales_interval_length)``. The first three are index
        labels. ``start_last_long_sales`` is the label of the first sale after
        the last long gap, or ``first_sale`` when there is no long gap. The
        length is the positional distance between the sales on either side of
        that gap, or 0 when there is none.

    Raises
    ------
    IndexError
        If the series contains no non-zero value.
    """
    sale_positions = df.to_numpy().nonzero()[0]
    first_sale = df.index[sale_positions[0]]
    last_sale = df.index[sale_positions[-1]]

    # steps[k] is the distance from sale k to sale k + 1; adding 1 turns a
    # long step into the position (within sale_positions) of the sale that
    # ends the gap.
    steps = np.diff(sale_positions)
    gap_closers = np.flatnonzero(steps > zero_continue_interval) + 1

    if gap_closers.size == 0:
        start_last_long_sales = first_sale
        last_long_nosales_interval_length = 0
    else:
        closer = gap_closers[-1]
        gap_end = sale_positions[closer]
        gap_start = sale_positions[closer - 1]
        last_long_nosales_interval_length = gap_end - gap_start
        start_last_long_sales = df.index[gap_end]

    if Debug == True:
        template = 'first_sale = {}, start_last_long_sales = {}, last_sale = {}, last_long_nosales_interval_length = {}'
        print(template.format(first_sale, start_last_long_sales, last_sale, last_long_nosales_interval_length))

    return first_sale, start_last_long_sales, last_sale, last_long_nosales_interval_length

def get_series_types(pivot_tbl, zero_continue_interval = 30, Debug = False):
    """Summarise every column of ``pivot_tbl`` as one row of sale statistics.

    For each column, ``get_zero_stat`` supplies the first sale, the start of
    the last long selling stretch, the last sale and the length of the last
    long no-sales gap. ``dftest`` is then run on the slice from the start of
    that last stretch to the last sale, provided the slice has more than 10
    rows; otherwise the p-value is recorded as -1.

    Parameters
    ----------
    pivot_tbl : pandas.DataFrame
        One column per series; the index labels are used for slicing.
    zero_continue_interval : int, default 30
        Forwarded to ``get_zero_stat``.
    Debug : bool, default False
        Forwarded to ``get_zero_stat``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``pivot_tbl`` (row labels 0..n-1) with the
        columns ``good`` (the column name), ``first_sale``,
        ``start_last_long_sales``, ``last_sale``, ``sparse level`` (fraction
        of non-zero entries between first and last sale), ``pvalue`` and
        ``last_long_nosales_interval_length``.
    """
    column_names = ['good', 'first_sale', 'start_last_long_sales', 'last_sale',
                    'sparse level', 'pvalue', 'last_long_nosales_interval_length']
    total = pivot_tbl.shape[1]

    # Collect plain records and build the frame once at the end; enlarging a
    # DataFrame one row at a time copies it on every iteration.
    rows = []

    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for i, (columnName, columnData) in enumerate(pivot_tbl.items(), start=1):
        first_sale, start_last_long_sales, last_sale, last_long_nosales_interval_length = get_zero_stat(
            columnData, zero_continue_interval, Debug = Debug)

        # Label-based slice covering the last long selling stretch.
        last_stretch = columnData[start_last_long_sales: last_sale]
        if len(last_stretch) > 10:
            pvalue = dftest(last_stretch)
        else:
            # Sentinel: stretch too short for the test to be run.
            pvalue = -1

        # Share of non-zero entries between the first and the last sale.
        selling_period = columnData[first_sale: last_sale]
        sparse_level = len(selling_period.to_numpy().nonzero()[0]) / len(selling_period)

        rows.append([columnName, first_sale, start_last_long_sales, last_sale,
                     sparse_level, pvalue, last_long_nosales_interval_length])

        if i % 1000 == 0:
            print('iteration {}/{}'.format(i, total))

    return pd.DataFrame(rows, columns = column_names)

In [None]:
# Restrict the table to a single id; passing a list keeps the result a
# DataFrame rather than a Series.
selected_ids = ['HOBBIES_1_001_CA_1_validation']
pivot_tbl = read()[selected_ids]

In [None]:
# One summary row per column of pivot_tbl.
series_types = get_series_types(pivot_tbl, Debug=False)

# Store the gap length as int.
series_types['last_long_nosales_interval_length'] = (
    series_types['last_long_nosales_interval_length'].astype('int')
)

# Whole days from the start of the last long selling stretch to the last sale.
series_types['last_long_sales_interval_length'] = series_types.apply(
    lambda row: (row['last_sale'] - row['start_last_long_sales']).days,
    axis=1,
)

# Show Long Sales/No Sales Intervals

In [None]:
# Histogram of the length (in days) of the last long selling stretch.
plt.figure(figsize=(15, 5))
_ = plt.hist(series_types['last_long_sales_interval_length'], bins=170)

In [None]:
# Histogram of the length of the last long no-sales gap.
plt.figure(figsize=(15, 5))
_ = plt.hist(series_types['last_long_nosales_interval_length'], bins=170)

In [None]:
# Summary statistics, reporting the 10th, 25th, 50th and 75th percentiles.
summary_percentiles = [0.1, 0.25, 0.5, 0.75]
series_types.describe(percentiles=summary_percentiles)

# Check One Good

In [None]:
# Rows whose pvalue exceeds 0.05, ordered by ascending pvalue.
series_types.loc[series_types['pvalue'] > 0.05].sort_values(by='pvalue')

In [None]:
# Pick one row of series_types by its label and unpack the fields used by
# the plotting cells below.
idx = 0
good = series_types.loc[idx, 'good']
start_sales = series_types.loc[idx, 'first_sale']
start_last_long_sales = series_types.loc[idx, 'start_last_long_sales']
last_sales = series_types.loc[idx, 'last_sale']
print('good is ' + good)

In [None]:
# Bar chart of the selected good over every date in pivot_tbl.
df = pivot_tbl.loc[:, good]
fig = px.bar(df, x=df.index, y=good, title='Whole History', height=350)
fig.show()

In [None]:
# Bar chart limited to the dates between the first and the last sale (inclusive).
in_sales_window = (pivot_tbl.index >= start_sales) & (pivot_tbl.index <= last_sales)
df = pivot_tbl.loc[in_sales_window, good]
fig = px.bar(df, x=df.index, y=good, title='All Sales', height=450)
fig.show()

In [None]:
# Bar chart limited to the last long selling stretch: from its start to the
# last sale (inclusive).
in_last_stretch = (pivot_tbl.index >= start_last_long_sales) & (pivot_tbl.index <= last_sales)
df = pivot_tbl.loc[in_last_stretch, good]
fig = px.bar(df, x=df.index, y=good, title='Last Long Sales', height=450)
fig.show()