In [38]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, LassoCV, Lasso, ElasticNetCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
# other imports
import numpy as np
import pandas as pd
from statsmodels.regression.linear_model import OLS
from scipy.stats import norm
import json
import random
# suppress all warnings
import warnings
warnings.filterwarnings("ignore")

# Load CSV

In [39]:
df = pd.read_csv('/Users/arthur.pentecoste/ProjectRA/Demand_Paper/Code_DAC/72_cleaned_data.csv')
df

Unnamed: 0,sku,class,week,desc,units,revenue,promoFlag,promoPrice,forecast,fatigue,month,price
0,559215,724,2013111,UNIBALL 207 GEL RT BLK DZ,323,7072.87,False,,,0,11,21.897430
1,559215,724,2013112,UNIBALL 207 GEL RT BLK DZ,321,7018.90,False,,,0,11,21.865732
2,559215,724,2013113,UNIBALL 207 GEL RT BLK DZ,290,6372.39,False,,,0,11,21.973759
3,559215,724,2013114,UNIBALL 207 GEL RT BLK DZ,126,2786.73,False,,,0,11,22.116905
4,559215,724,2013115,UNIBALL 207 GEL RT BLK DZ,228,5055.72,False,,,0,11,22.174211
...,...,...,...,...,...,...,...,...,...,...,...,...
17202,105455,724,2016102,ENERGEL RTX PEARL RT MED BLK 3,4,25.96,False,,,46,10,6.490000
17203,105455,724,2016103,ENERGEL RTX PEARL RT MED BLK 3,4,25.96,False,,,47,10,6.490000
17204,105455,724,2016104,ENERGEL RTX PEARL RT MED BLK 3,6,38.94,False,,,48,10,6.490000
17205,105455,724,2016111,ENERGEL RTX PEARL RT MED BLK 3,8,51.92,False,,,49,11,6.490000


In [40]:
df.describe()

Unnamed: 0,sku,class,week,units,revenue,promoPrice,forecast,fatigue,month,price
count,17207.0,17207.0,17207.0,17207.0,17207.0,1528.0,1528.0,17207.0,17207.0,17207.0
mean,564013.0,723.01511,2014946.0,200.17365,2062.385408,7.802598,592.948,9.179229,6.566746,12.632323
std,230643.2,2.097862,886.1273,452.983603,3877.511502,4.279726,1374.193311,13.853754,3.441437,7.024332
min,105455.0,720.0,2013111.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0
25%,409908.0,721.0,2014084.0,25.0,251.64,4.89,100.0,0.0,4.0,7.39
50%,555025.0,723.0,2015054.0,73.0,844.26,6.0,208.366,2.0,7.0,10.900996
75%,722395.0,724.0,2016023.0,175.0,2159.27,10.0,518.0,13.0,10.0,16.805109
max,1231065.0,728.0,2016112.0,8604.0,52339.57,22.45,22684.75,91.0,12.0,46.65


# Preprocessing steps (optional)

In [41]:
df['trend'] = df.week // 1000 - 2013
df['promo'] = df.promoFlag.astype('int')

data = pd.get_dummies(data=df, columns=['month','class'])

In [42]:
data

Unnamed: 0,sku,week,desc,units,revenue,promoFlag,promoPrice,forecast,fatigue,price,...,month_11,month_12,class_720,class_721,class_722,class_723,class_724,class_726,class_727,class_728
0,559215,2013111,UNIBALL 207 GEL RT BLK DZ,323,7072.87,False,,,0,21.897430,...,1,0,0,0,0,0,1,0,0,0
1,559215,2013112,UNIBALL 207 GEL RT BLK DZ,321,7018.90,False,,,0,21.865732,...,1,0,0,0,0,0,1,0,0,0
2,559215,2013113,UNIBALL 207 GEL RT BLK DZ,290,6372.39,False,,,0,21.973759,...,1,0,0,0,0,0,1,0,0,0
3,559215,2013114,UNIBALL 207 GEL RT BLK DZ,126,2786.73,False,,,0,22.116905,...,1,0,0,0,0,0,1,0,0,0
4,559215,2013115,UNIBALL 207 GEL RT BLK DZ,228,5055.72,False,,,0,22.174211,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17202,105455,2016102,ENERGEL RTX PEARL RT MED BLK 3,4,25.96,False,,,46,6.490000,...,0,0,0,0,0,0,1,0,0,0
17203,105455,2016103,ENERGEL RTX PEARL RT MED BLK 3,4,25.96,False,,,47,6.490000,...,0,0,0,0,0,0,1,0,0,0
17204,105455,2016104,ENERGEL RTX PEARL RT MED BLK 3,6,38.94,False,,,48,6.490000,...,0,0,0,0,0,0,1,0,0,0
17205,105455,2016111,ENERGEL RTX PEARL RT MED BLK 3,8,51.92,False,,,49,6.490000,...,1,0,0,0,0,0,1,0,0,0


# Create dictionary

In [43]:
#Create dictionary
skuSet = df.sku.unique()
skuData = {}
colnames = list(data.columns)
for i in skuSet:
    df_i = data[data.sku == i]
    skuData[i] = {'X': df_i[colnames[9:]].values,
                  'y': df_i.units.values}

# DAC Algorithm

In [44]:
def learn_structure(theta = 0.01, 
                    upp = 0.9, 
                    low = 0.1,
                    num_clusters = 9,
                    print_structure = False):
    
    d = skuData[skuSet[0]]['X'].shape[1]
    n = skuSet.size
    aggre_level = []
    clus_columns = []
    n_cols_alg = 0
    z = num_clusters
    all_coeff = np.zeros((n,d))
    all_coeff[0,:] = skuModels[skuSet[0]].params


    for j in range(d):

        # a n-1 vector recording if two betas have the same mean
        test_j = np.zeros(n-1)

        for i in range(1,n):
            sku = skuSet[i]
            all_coeff[i,j] = skuModels[sku].params[j]

            z_stat = ( np.abs(skuModels[skuSet[0]].params[j] - skuModels[sku].params[j]) / 
                      np.sqrt(np.square(skuModels[skuSet[0]].bse[j]) + np.square(skuModels[sku].bse[j])) )
            p_value = 1 - norm.cdf(z_stat)
            if p_value >= theta:
                test_j[i-1] = 1

        if print_structure:
            print('Feature:', colnames[j+9])
            print('Ratio:', np.mean(test_j))

        if np.sum(test_j) >= upp*(n-1):
            aggre_level.append('dept')
            n_cols_alg += 1

        elif np.sum(test_j) <= low*(n-1):
            aggre_level.append('sku')
            n_cols_alg += n

        else:
            aggre_level.append('clus')
            clus_columns.append(j)
            n_cols_alg += z

        if print_structure:
            print(aggre_level[-1])
            print()

    if len(clus_columns) > 0:
        X_clus = all_coeff[:, clus_columns]
        kmeans = KMeans(n_clusters=z, random_state=0).fit(X_clus)

    X_alg_train = np.zeros((train_size, n_cols_alg))
    X_alg_test = np.zeros((test_size, n_cols_alg))

    count = 0
    for i in range(d):
        if aggre_level[i] == 'dept':

            for sku in skuSet:
                # find the corresponding range
                idx_train = range_dict[sku]['train']
                idx_test = range_dict[sku]['test']

                # stack the data
                X_alg_train[idx_train[0]:idx_train[1], count] = X_dict[sku]['train'][:,i]
                X_alg_test[idx_test[0]:idx_test[1], count] = X_dict[sku]['test'][:,i]

            count += 1

        elif aggre_level[i] == 'clus':

            for j in range(z):
                # the indices of items in cluster j
                clus_items = list(np.where(kmeans.labels_ == j)[0])
                for idx in clus_items:
                    sku = skuSet[idx]
                    # find the corresponding range
                    idx_train = range_dict[sku]['train']
                    idx_test = range_dict[sku]['test']

                    # stack the data
                    X_alg_train[idx_train[0]:idx_train[1], count] = X_dict[sku]['train'][:,i]
                    X_alg_test[idx_test[0]:idx_test[1], count] = X_dict[sku]['test'][:,i]


                count += 1
        else:
            for sku in skuSet:
                # find the corresponding range
                idx_train = range_dict[sku]['train']
                idx_test = range_dict[sku]['test']

                # stack the data
                X_alg_train[idx_train[0]:idx_train[1], count] = X_dict[sku]['train'][:,i]
                X_alg_test[idx_test[0]:idx_test[1], count] = X_dict[sku]['test'][:,i]

                count += 1
        
    return X_alg_train, X_alg_test


In [45]:
def mape(y_true, y_pred): 
    return np.median(np.abs((y_true - y_pred) / y_true))

# Hyperparameters

In [60]:
params_dac=[0.9,0.2,7,0.05]

In [61]:
results={}

results['decentralized']={}
results['decentralized']['R2']=[]
results['decentralized']['MSE']=[]
results['decentralized']['MAPE']=[]


results['centralized']={}
results['centralized']['R2']=[]
results['centralized']['MSE']=[]
results['centralized']['MAPE']=[]


results['dac']={}
results['dac']['R2']=[]
results['dac']['MSE']=[]
results['dac']['MAPE']=[]

# Decentralized

In [62]:
X_dict = {}
y_dict = {}
    
skuModels = {}
y_pred = []
y_test = []
y_train = []
    
train_size = 0
test_size = 0
range_dict = {}
row_train = 0
row_test = 0
    
for i in skuSet:
    X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(skuData[i]['X'], skuData[i]['y'], test_size = 0.3)
        
    X_dict[i] = {'train': X_train_i, 'test': X_test_i}
    y_dict[i] = {'train': y_train_i, 'test': y_test_i}
        
    train_size += y_train_i.size
    test_size += y_test_i.size
    range_dict[i] = {'train': (row_train, row_train + y_train_i.size), 
                     'test':  (row_test, row_test + y_test_i.size) }
        
    row_train += y_train_i.size
    row_test += y_test_i.size
        
    model_i = OLS(y_train_i, X_train_i, hasconst = False)
    skuModels[i] = model_i.fit()
    y_pred += list(skuModels[i].predict(X_test_i))
    y_test += list(y_test_i)
    y_train += list(y_train_i)
    
y_train = np.array(y_train)
y_test = np.array(y_test)
    
results['decentralized']['R2'].append(r2_score(y_test, np.array(y_pred)))
results['decentralized']['MSE'].append(mean_squared_error(y_test, np.array(y_pred)))
results['decentralized']['MAPE'].append(mape(y_test,np.array(y_pred)))

# Centralized

In [63]:
X_cen_train = X_dict[skuSet[0]]['train'] 
X_cen_test = X_dict[skuSet[0]]['test']
for sku in skuSet[1:]:
    X_cen_train = np.concatenate((X_cen_train, X_dict[sku]['train']), axis = 0)
    X_cen_test = np.concatenate((X_cen_test, X_dict[sku]['test']), axis = 0)
    
model_cen = LinearRegression(fit_intercept=False).fit(X_cen_train, y_train)
    
results['centralized']['R2'].append(r2_score(y_test, model_cen.predict(X_cen_test)))
results['centralized']['MSE'].append(mean_squared_error(y_test, model_cen.predict(X_cen_test)))
results['centralized']['MAPE'].append(mape(y_test, model_cen.predict(X_cen_test)))

# DAC

In [64]:
upp,low,z,theta = params_dac
X_alg_train, X_alg_test = learn_structure(upp = upp, low = low,num_clusters = z,theta=theta)
model_dac = LinearRegression().fit(X_alg_train, y_train)


results['dac']['R2'].append(r2_score(y_test, model_dac.predict(X_alg_test)))
results['dac']['MSE'].append(mean_squared_error(y_test, model_dac.predict(X_alg_test)))
results['dac']['MAPE'].append(mape(y_test, model_dac.predict(X_alg_test)))

# Results

In [65]:
results

{'decentralized': {'R2': [0.9097815737605137],
  'MSE': [20498.27641554144],
  'MAPE': [0.23476801758091598]},
 'centralized': {'R2': [0.15208436445306872],
  'MSE': [192652.54116008233],
  'MAPE': [0.9502016014221815]},
 'dac': {'R2': [0.9090446406267634],
  'MSE': [20665.712932725706],
  'MAPE': [0.24600776336477986]}}