In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from datetime import datetime
from datetime import timedelta
import random
from importlib import reload
from IPython.display import Audio
import m5forecasting as m5
sound_file = './sound/radio.wav'

In [2]:
df = m5.df
all_data = m5.all_data

In [46]:
# rolling = [[2,3,4], [1,3,5], [1,2]]
# start_day = 1913
# end_day = 1913
# num_days = end_day - start_day + 1
# total_entries = num_days * 30490
# # rolling lag features
# count = 0
# temp_training = np.zeros((total_entries, len(rolling)))
# for time_series_idx in range(30490):
#     for i in range(end_day - num_days,end_day):
#         temp_training[count] = [all_data[time_series_idx, [i - lag for lag in rolling_lags]].sum() / len(rolling_lags) 
#                                 for rolling_lags in rolling]
#         count += 1

## Feature Generation

In [51]:
def getYearMonthDay(day):
    temp = datetime(2011,1,28) + timedelta(days=day)
    return [temp.year, temp.month, temp.day]

def getDay(day):
    temp = day % 7
    return [temp, int(temp > 0 and temp < 3)]

def getSnapValue(day, state_index):
    return int(m5.calendar[m5.calendar.d == "d_" + str(day)].iloc[:, 11 + state_index])

# item number, item department, item category, store, state
departmentInt = {}
current_id = "HOBBIES_1"
count = 0
for i in range(3049):
    if df.iloc[i]["dept_id"] != current_id or i == 3048:
        departmentInt[current_id] = count
        count += 1
        current_id = df.iloc[i]["dept_id"]

categoryInt = {}
current_id = "HOBBIES"
count = 0
for i in range(3049):
    if df.iloc[i]["cat_id"] != current_id or i == 3048:
        categoryInt[current_id] = count
        count += 1
        current_id = df.iloc[i]["cat_id"]

storeInt = {}
current_id = "CA_1"
count = 0
for i in range(0,30490,3049):
    if df.iloc[i]["store_id"] != current_id:
        storeInt[current_id] = count
        count += 1
        current_id = df.iloc[i]["store_id"]
        storeInt[current_id] = count

stateInt = {}
current_id = "CA"
count = 0
for i in range(0,30490,3049):
    if df.iloc[i]["state_id"] != current_id:
        stateInt[current_id] = count
        count += 1
        current_id = df.iloc[i]["state_id"]
        stateInt[current_id] = count

def generateTrainingData(start_day, end_day, lags, rolling, all_data, withLabels = True, verbose=False):
    num_days = end_day - start_day + 1
    num_series = 30490
    total_entries = num_days * num_series
    training_data = np.zeros((total_entries, len(lags)))
    categoricalfeatures = [0] * len(lags)
    if withLabels:
        training_labels = np.zeros(total_entries)
    
    # lag features
    count = 0
    for time_series_idx in range(30490):
        for i in range(end_day - num_days,end_day):
            training_data[count] = all_data[time_series_idx, [i - lag for lag in lags]]
            if withLabels:
                training_labels[count] = all_data[time_series_idx, i]
            count += 1
    
    if verbose:
        print("Finished lag features")
    
    # rolling lag features
    count = 0
    categoricalfeatures.extend([0] * len(rolling))
    temp_training = np.zeros((total_entries, len(rolling)))
    for time_series_idx in range(30490):
        for i in range(end_day - num_days,end_day):
            temp_training[count] = [all_data[time_series_idx, [i - lag for lag in rolling_lags]].sum() / len(rolling_lags) for rolling_lags in rolling]
            count += 1
    training_data = np.concatenate((training_data, temp_training), axis=1)
    
    if verbose:
        print("Finished rolling lag features")
    
    # item features - item number, department, category, store, state
    temp_training = np.zeros((total_entries, 5))
    categoricalfeatures.extend([1] * 5)
    for i in range(0, total_entries, num_days):
        item_number = i // num_days % 3049
        temp_training[i:i + num_days, 0] = item_number
        temp_training[i:i + num_days, 1] = departmentInt[df["dept_id"].iloc[item_number]]
        temp_training[i:i + num_days, 2] = categoryInt[df["cat_id"].iloc[item_number]]
        temp_training[i:i + num_days, 3] = storeInt[df["store_id"].iloc[i // num_days]]
        temp_training[i:i + num_days, 4] = stateInt[df["state_id"].iloc[i // num_days]]
    training_data = np.concatenate((training_data, temp_training), axis=1)
    
    if verbose:
        print("Finished item features")
    
    # snap feature
    temp_training = np.zeros((total_entries, 1))
    categoricalfeatures.append(1)
    for idx, day in enumerate(range(start_day, end_day + 1)):
        temp_training[range(idx, idx + 3049 * 4 * num_days, num_days), 0] = getSnapValue(day, 0)
        temp_training[range(idx + 3049 * 4 * num_days, idx + 3049 * 7 * num_days, num_days), 0] = getSnapValue(day, 1)
        temp_training[range(idx + 3049 * 7 * num_days, idx + 3049 * 10 * num_days, num_days), 0] = getSnapValue(day, 2)
    training_data = np.concatenate((training_data, temp_training), axis=1)
    
    # month, day, year
    temp_training = np.zeros((total_entries, 3))
    categoricalfeatures.extend([1] * 3)
    for i in range(total_entries):
        temp_training[i] = getYearMonthDay(i % num_days + (end_day + 1 - num_days))
    training_data = np.concatenate((training_data, temp_training), axis=1)
    
    if verbose:
        print("Finished date features")
    
    # day of week and is weekend
    temp_training = np.zeros((total_entries, 2))
    categoricalfeatures.extend([1] * 2)
    for i in range(total_entries):
        temp_training[i] = getDay(i % num_days + (end_day + 1 - num_days))
    training_data = np.concatenate((training_data, temp_training), axis=1)
    
    if verbose:
        print("Finished day features")
    
    if withLabels:
        return training_data, training_labels, [idx for idx, val in enumerate(categoricalfeatures) if val == 1]
    else:
        return training_data

## Model Training / Testing

In [4]:
def custom_asymmetric_train(y_pred, y_true):
    y_true = y_true.get_label()
    residual = (y_true - y_pred).astype("float")
    grad = np.where(residual < 0, -2.0 * residual, -2 * residual * 1.15)
    hess = np.where(residual < 0, 2.0, 2.0 * 1.15)
    return grad, hess

def custom_asymmetric_valid(y_pred, y_true):
    residual = (y_true - y_pred).astype("float")
    loss = np.where(residual < 0, (residual**2)*10.0, residual**2) 
    return "custom_asymmetric_eval", np.mean(loss), False

In [5]:
def Level_12_WRMSSE(y_pred, y_true):
    y_true = y_true.get_label()
    diff = (y_true - y_pred).astype("float")
    diff_squared = (diff * diff).reshape((30490, 28))
    return "Lev 12", np.sum(np.sqrt(np.sum(diff_squared, axis = 1)) * m5.weights[12] / 12.0), False

def Level_1_WRMSSE(y_pred, y_true):
    y_true = y_true.get_label()
    diff = (y_true - y_pred).astype("float").reshape((30490, 28))
    diff_squared = np.square(m5.transformer[1](diff))
    return "Lev 1", np.sum(np.sqrt(np.sum(diff_squared, axis = 1)) * m5.weights[1] / 12.0), False

In [391]:
def Level_1_12_WRMSSE(y_pred, y_true):
    response1 = Level_1_WRMSSE(y_pred, y_true)
    response12 = Level_12_WRMSSE(y_pred, y_true)
    return "Lev 1: {} Lev 12:".format(response1[1]), response12[1], False

## Ignore Above

In [22]:
def byLevelWRMSSE(y_pred, y_true, level):
    y_true = y_true.get_label()
    diff = (y_true - y_pred).astype("float").reshape((30490, 28))
    diff_squared = np.square(m5.transformer[level](diff))
    return "Lev {}".format(level), np.sum(np.sqrt(np.sum(diff_squared, axis = 1)) * m5.weights[level] / 12.0), False

In [6]:
minWeight = min([i for i in m5.weights[12] if i != 0])
maximum = m5.weights[12].max()
def level12Weighted(y_pred, y_true):
    y_true = y_true.get_label()
    num_days = int(len(y_pred) / 30490)
    residual = (y_true - y_pred).astype("float")
    for i in range(30490): 
        residual[i * num_days:(i+1) * num_days] = max(minWeight, m5.weights[12][i]) / maximum * residual[i * num_days:(i+1) * num_days]
    grad = -5.0 * residual
    hess = 5.0 * np.ones(residual.shape)
    return grad, hess

In [19]:
def groupByDay(y_pred, y_true):
    y_true = y_true.get_label()
    num_days = int(len(y_pred) / 30490)
    residual = (y_true - y_pred).astype("float")
    grad = np.zeros(y_pred.shape)
    hess = np.zeros(y_pred.shape)
    for i in range(num_days):
        all_series = residual[range(i, total, num_days)]
        grad[range(i, total, num_days)] -= all_series.sum() / (30490 * 2.0)
    return grad, hess

In [20]:
def combined1(y_pred, y_true):
    l12grad, l12hess = level12Weighted(y_pred, y_true)
    groupgrad, grouphess = groupByDay(y_pred, y_true)
    return l12grad + groupgrad, l12hess + grouphess

In [None]:
def groupByTimeSeries():
    # put all data from one time series under one sqrt so that it affects the gradient
    for i in range(30490):
        residual[i * num_days: (i+1) * num_days]

In [7]:
params = {
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'seed': 42,
    'bagging_fraction': 0.2,
    'bagging_freq': 2, 
    'colsample_bytree': 0.95,
    'colsample_bynode': 0.5,
    'num_leaves': 1000,
    'lambda_l1': 0.2,
    'lambda_l2': 0.2,
    'metric': "None",
    'max_bin': 3049,
    'first_metric_only': True
}

In [8]:
def learningRate(num_round):
    return 0.08 + 0.01 * num_round / 500

Data
- SNAP data
- weather data
- rolling averages
- seasonal data
- prices?

Custom Loss
- Independent gradients with RMSE and weights
- Dependent gradients - group by time series and group by day
- Various levels

In [24]:
def expandWeights(weight, days):
    new_weights = np.zeros(len(weight) * days)
    for i in range(days):
        new_weights[range(i, len(new_weights), days)] = weight
    return new_weights

In [52]:
%%time
lags = list(range(7, 15, 7)) + list(range(15, 40, 1))
rolling = [range(1,8), range(2,9), range(3,10), range(4,11), range(5, 12), range(6, 13), range(7, 14)]
start_day = 1200
end_day = 1885
num_train_days = end_day - start_day + 1
num_val_days = 1913 - end_day + 1
training_data, training_labels, categories = generateTrainingData(start_day, end_day, lags, rolling, all_data, verbose=True)

val_data, val_labels, _ = generateTrainingData(end_day + 1, 1913, lags, rolling, all_data, verbose=True)

training_dataset = lgb.Dataset(training_data, 
                               label=training_labels, 
                               free_raw_data=False, 
                               weight=expandWeights(m5.weights[12], num_train_days))
val_dataset = lgb.Dataset(val_data,
                          label=val_labels, 
                          free_raw_data=False, 
                          reference=training_dataset)

Finished lag features
Finished rolling lag features
Finished item features
Finished date features
Finished day features
Finished lag features
Finished rolling lag features
Finished item features
Finished date features
Finished day features
CPU times: user 19min 47s, sys: 47.3 s, total: 20min 35s
Wall time: 20min 33s


In [None]:
np.save("features/training_data.npy", training_data)
np.save("features/training_labels.npy", training_data)
np.save("features/training_data.npy", training_data)
np.save("features/training_data.npy", training_data)
np.save("features/training_data.npy", training_data)

In [53]:
%%time
model = lgb.train(params, 
                  training_dataset, 
                  categorical_feature=categories, 
                  num_boost_round=4000,
                  valid_sets = [val_dataset],
                  fobj=level12Weighted,
                  feval = lambda y_pred, y_true: [byLevelWRMSSE(y_pred, y_true, level) for level in [1, 3, 6, 10, 12]],
                  early_stopping_rounds=100,
                  verbose_eval=3,
                  learning_rates=learningRate
                 )

New categorical_feature is [34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds
[3]	valid_0's Lev 1: 0.594939	valid_0's Lev 3: 0.517056	valid_0's Lev 6: 0.515979	valid_0's Lev 10: 0.222723	valid_0's Lev 12: 0.120143
[6]	valid_0's Lev 1: 0.587216	valid_0's Lev 3: 0.510337	valid_0's Lev 6: 0.509264	valid_0's Lev 10: 0.21739	valid_0's Lev 12: 0.117888
[9]	valid_0's Lev 1: 0.579719	valid_0's Lev 3: 0.503825	valid_0's Lev 6: 0.502764	valid_0's Lev 10: 0.212499	valid_0's Lev 12: 0.115876
[12]	valid_0's Lev 1: 0.572331	valid_0's Lev 3: 0.4974	valid_0's Lev 6: 0.49637	valid_0's Lev 10: 0.207807	valid_0's Lev 12: 0.113941
[15]	valid_0's Lev 1: 0.5652	valid_0's Lev 3: 0.491205	valid_0's Lev 6: 0.490209	valid_0's Lev 10: 0.203465	valid_0's Lev 12: 0.112204
[18]	valid_0's Lev 1: 0.558233	valid_0's Lev 3: 0.485161	valid_0's Lev 6: 0.484198	valid_0's Lev 10: 0.199476	valid_0's Lev 12: 0.110648
[21]	valid_0's Lev 1: 0.551485	valid_0's Lev 3: 0.479307	valid_0's Lev 6: 0.478385	valid_0's Lev 10: 0.195692	valid_0's Lev 1

[183]	valid_0's Lev 1: 0.314951	valid_0's Lev 3: 0.274923	valid_0's Lev 6: 0.276336	valid_0's Lev 10: 0.108715	valid_0's Lev 12: 0.0799883
[186]	valid_0's Lev 1: 0.312054	valid_0's Lev 3: 0.272429	valid_0's Lev 6: 0.273863	valid_0's Lev 10: 0.107933	valid_0's Lev 12: 0.0797655
[189]	valid_0's Lev 1: 0.309211	valid_0's Lev 3: 0.269982	valid_0's Lev 6: 0.271435	valid_0's Lev 10: 0.107184	valid_0's Lev 12: 0.0795584
[192]	valid_0's Lev 1: 0.306379	valid_0's Lev 3: 0.267546	valid_0's Lev 6: 0.269018	valid_0's Lev 10: 0.106439	valid_0's Lev 12: 0.0793484
[195]	valid_0's Lev 1: 0.303583	valid_0's Lev 3: 0.265139	valid_0's Lev 6: 0.266632	valid_0's Lev 10: 0.105705	valid_0's Lev 12: 0.0791398
[198]	valid_0's Lev 1: 0.30082	valid_0's Lev 3: 0.262762	valid_0's Lev 6: 0.264278	valid_0's Lev 10: 0.104978	valid_0's Lev 12: 0.0789348
[201]	valid_0's Lev 1: 0.298117	valid_0's Lev 3: 0.260436	valid_0's Lev 6: 0.26197	valid_0's Lev 10: 0.104288	valid_0's Lev 12: 0.0787494
[204]	valid_0's Lev 1: 0.2954

KeyboardInterrupt: 

## Recursive Predictor

In [11]:
%%time
all_data = np.array(df.iloc[:,6:])
for i in range(28):
    test_data = generateTrainingData(1914 + i, 1914 + i, lags, all_data, withLabels=False)
    preds = model.predict(test_data)
    all_data = np.concatenate((all_data, preds.reshape((30490, 1))), axis = 1)
    if (i + 1) % 7 == 0:
        print("Finished day {}".format(i+1))
predictions = all_data[:, -28:]
all_data = np.array(df.iloc[:,6:])

Finished day 7
Finished day 14
Finished day 21
Finished day 28
CPU times: user 58min 29s, sys: 16.6 s, total: 58min 46s
Wall time: 4min 41s


In [222]:
predictions.shape

(30490, 28)

## Preparing for Submission

In [16]:
modelName = "WeightedLev12_CustLoss_Recursive"
submissionReady = True
try:
    if tempName != modelName:
        modelNumber = 1
except:
    modelNumber = 1

In [17]:
try:
    submission
except:
    submission = pd.read_csv("submissions/sample_submission.csv")

try:
    for idx, col in enumerate(submission.columns.values):
        if (col != "id"):
            submission[col] = np.concatenate((predictions[:, idx - 1], np.zeros(30490)), axis = 0)
    if submissionReady:
        submission.to_csv("submissions/{}_{}.csv".format(modelName, modelNumber), index=False)
        modelNumber += 1
        tempName = modelName
    Audio(sound_file, autoplay=True)
except:
    Audio(sound_file, autoplay=True)

## Analyzing Submission

In [12]:
loss = m5.WRMSSE(predictions)

In [15]:
loss.getLossByLevel()

array([0.03114119, 0.03597479, 0.04509371, 0.05176154, 0.06080759,
       0.05439107, 0.06341521, 0.06035788, 0.06733157, 0.07376391,
       0.07209218, 0.0709939 ])

In [363]:
predictions

array([[0.87363992, 0.88076001, 0.8674595 , ..., 0.80392866, 0.88931671,
        0.96785068],
       [0.12801542, 0.21768327, 0.25947869, ..., 0.58718253, 0.73988839,
        0.75394457],
       [0.61364024, 0.61756487, 0.57133333, ..., 0.67876188, 0.9176863 ,
        0.84674448],
       ...,
       [0.76337234, 0.74738471, 0.72485069, ..., 0.94270864, 0.86811901,
        0.87433442],
       [0.89914451, 0.83169341, 0.7941819 , ..., 0.73284713, 0.913416  ,
        1.1372374 ],
       [0.45932018, 0.77602184, 1.08207415, ..., 0.86362387, 1.007293  ,
        1.09448519]])

In [352]:
np.arange(20).reshape((4,5))

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [367]:
np.sum(np.square(predictions - actuals), axis = 0)

array([100019.93324995,  83367.38996016,  91031.97160793,  89281.7725182 ,
       111754.72564358, 137812.66195427, 148735.55006789, 131057.25625598,
       149678.98943148, 137090.57126126, 133787.96511714, 171787.95659446,
       193377.202348  , 179735.17123683, 175064.79215575, 124920.41921029,
       157498.65193641, 133301.93631517, 134005.2117409 , 201860.46232403,
       236577.64660356, 139264.36576748, 117248.85819343, 110172.76577734,
       114640.65986176, 124651.1363411 , 175640.15521601, 180264.41339906])

In [373]:
predictions - actuals

array([[ 0.87363992,  0.88076001,  0.8674595 , ..., -2.19607134,
         0.88931671, -0.03214932],
       [ 0.12801542, -0.78231673,  0.25947869, ...,  0.58718253,
         0.73988839,  0.75394457],
       [ 0.61364024,  0.61756487, -0.42866667, ..., -2.32123812,
         0.9176863 , -0.15325552],
       ...,
       [ 0.76337234,  0.74738471, -0.27514931, ..., -0.05729136,
         0.86811901, -1.12566558],
       [-0.10085549, -2.16830659,  0.7941819 , ..., -0.26715287,
        -0.086584  ,  1.1372374 ],
       [ 0.45932018,  0.77602184,  1.08207415, ..., -1.13637613,
        -3.992707  ,  0.09448519]])

In [374]:
modelName

'WeightedLev12_CustLoss_Recursive'

In [393]:
actuals

array([[0, 0, 0, ..., 3, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 3, 0, 1],
       ...,
       [0, 0, 1, ..., 1, 0, 2],
       [1, 3, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 2, 5, 1]])

In [None]:
def analyzeDay(preds, day, level):
    transformed_preds = transformer[level](preds)[:, day - 1]
    transformed_actuals = transformer[level](np.array(actuals))[:, day - 1]
    return (weights[level] * np.sqrt(28) * np.abs(transformed_preds - transformed_actuals)).sum()

In [424]:
import calendar

In [425]:
calendar

<module 'calendar' from '/Users/rajatmittal/.pyenv/versions/3.7.3/lib/python3.7/calendar.py'>

In [54]:
np.save("test", [1,2,3])

In [56]:
np.load("test.npy")

array([1, 2, 3])

In [58]:
priceByDay = pd.read_csv("Other_Data/priceByDay.csv")

In [59]:
m5.weights[12]

array([9.70740759e-06, 5.11647351e-07, 2.70314355e-06, ...,
       3.30844514e-06, 1.13730592e-06, 5.81508798e-07])

In [61]:
m5.bare_revenue

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,8.38,25.14,0.00,8.38,8.38,8.38,25.14,0.00,8.38,8.38
1,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,3.97,0.00,0.00,0.00,0.00
2,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,5.94,2.97,5.94,2.97,2.97,2.97,0.00,2.97,2.97,2.97
3,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,4.64,0.00,23.20,18.56,4.64,0.00,4.64,13.92,32.48,9.28
4,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,5.76,2.88,2.88,0.00,2.88,2.88,5.76,5.76,5.76,11.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0.0,0.0,5.0,5.0,0.0,7.50,2.5,10.00,2.50,0.00,...,5.96,0.00,0.00,0.00,0.00,0.00,2.98,0.00,0.00,2.98
30486,0.0,0.0,0.0,0.0,0.0,12.35,0.0,2.47,2.47,7.41,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.48,0.00
30487,0.0,24.0,0.0,8.0,8.0,16.00,4.0,32.00,20.00,8.00,...,7.96,3.98,0.00,7.96,0.00,3.98,0.00,0.00,3.98,0.00
30488,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,0.00,0.00,1.28,0.00,0.00,1.28,0.00,3.84,1.28,3.84


In [76]:
priceByDay.iloc[:,2:] * df.iloc[:,6:]

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,8.38,25.14,0.00,8.38,8.38,8.38,25.14,0.00,8.38,8.38
1,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,3.97,0.00,0.00,0.00,0.00
2,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,5.94,2.97,5.94,2.97,2.97,2.97,0.00,2.97,2.97,2.97
3,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,4.64,0.00,23.20,18.56,4.64,0.00,4.64,13.92,32.48,9.28
4,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,5.76,2.88,2.88,0.00,2.88,2.88,5.76,5.76,5.76,11.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0.0,0.0,5.0,5.0,0.0,7.50,2.5,10.00,2.50,0.00,...,5.96,0.00,0.00,0.00,0.00,0.00,2.98,0.00,0.00,2.98
30486,0.0,0.0,0.0,0.0,0.0,12.35,0.0,2.47,2.47,7.41,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.48,0.00
30487,0.0,24.0,0.0,8.0,8.0,16.00,4.0,32.00,20.00,8.00,...,7.96,3.98,0.00,7.96,0.00,3.98,0.00,0.00,3.98,0.00
30488,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,...,0.00,0.00,1.28,0.00,0.00,1.28,0.00,3.84,1.28,3.84


In [69]:
m5.bare_revenue.loc[30486, "d_6"]

12.35

In [77]:
(priceByDay.iloc[:,2:] * df.iloc[:,6:]).loc[30486, "d_6"]

12.350000000000001

In [78]:
m5.bare_revenue.to_csv("revenue.csv", index=False)

In [79]:
m5.df.to_csv("sales_train_validation.csv", index=False)

In [84]:
m5.calendar.to_csv("calendar.csv", index=False)

In [83]:
m5.full_df.to_csv("sales_train_evaluation.csv", index=False)

In [72]:
priceByDay

Unnamed: 0,item_id,store_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001,CA_1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,8.38,8.38,8.38,8.38,8.38,8.38,8.38,8.38,8.38,8.38
1,HOBBIES_1_002,CA_1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,3.97,3.97,3.97,3.97,3.97,3.97,3.97,3.97,3.97,3.97
2,HOBBIES_1_003,CA_1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,2.97,2.97,2.97,2.97,2.97,2.97,2.97,2.97,2.97,2.97
3,HOBBIES_1_004,CA_1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,4.64,4.64,4.64,4.64,4.64,4.64,4.64,4.64,4.64,4.64
4,HOBBIES_1_005,CA_1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,2.88,2.88,2.88,2.88,2.88,2.88,2.88,2.88,2.88,2.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823,WI_3,2.50,2.50,2.50,2.50,2.50,2.50,2.50,2.50,...,2.98,2.98,2.98,2.98,2.98,2.98,2.98,2.98,2.98,2.98
30486,FOODS_3_824,WI_3,2.47,2.47,2.47,2.47,2.47,2.47,2.47,2.47,...,2.48,2.48,2.48,2.48,2.48,2.48,2.48,2.48,2.48,2.48
30487,FOODS_3_825,WI_3,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00,...,3.98,3.98,3.98,3.98,3.98,3.98,3.98,3.98,3.98,3.98
30488,FOODS_3_826,WI_3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.28,1.28,1.28,1.28,1.28,1.28,1.28,1.28,1.28,1.28


In [75]:
df.iloc[:, 6:]

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,0,0,0,0,0,0,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,0,0,0,0,0,0,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,2,2,0,3,1,4,1,0,...,2,0,0,0,0,0,1,0,0,1
30486,0,0,0,0,0,5,0,1,1,3,...,0,0,0,0,0,0,0,0,1,0
30487,0,6,0,2,2,4,1,8,5,2,...,2,1,0,2,0,1,0,0,1,0
30488,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3
