<a href="https://colab.research.google.com/github/ozakiryota/kaggle/blob/main/store_sales/store_sales_rnn_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download and check datasets

Activate Kaggle API

In [None]:
from google.colab import files
import os

# Location where the Kaggle CLI looks for the API token.
jason_path = "/root/.kaggle/kaggle.json"

# Only prompt for an upload when the token is not installed yet, so the cell
# can be re-run without asking again.
if not os.path.exists(jason_path):
    # Opens the Colab upload widget; the result maps file name -> file content.
    uploaded = files.upload()

    for fn in uploaded.keys():
        print("User uploaded file '{name}' with length {length} bytes".format(
            name=fn, length=len(uploaded[fn])))

    # Then move kaggle.json into the folder where the API expects to find it.
    # NOTE(review): the shell line assumes the uploaded file is literally named
    # kaggle.json in the current directory; a differently named upload makes
    # the `mv` step fail.
    !mkdir -p /root/.kaggle/ && mv kaggle.json /root/.kaggle/ && chmod 600 /root/.kaggle/kaggle.json

else:
    print(jason_path, "is already exists.")

Download the datasets

In [None]:
# Install the Kaggle CLI and download the competition data into the current
# working directory.
!pip install kaggle
!kaggle competitions download -c store-sales-time-series-forecasting
# -n: never overwrite files that already exist, so re-running the cell is harmless.
# NOTE(review): assumes the download step produced train.csv.zip and
# transactions.csv.zip; confirm against the installed kaggle CLI version.
# NOTE(review): later cells read /content/oil.csv, /content/holidays_events.csv
# and /content/test.csv, none of which is extracted by this cell.
!unzip -n train.csv.zip
!unzip -n transactions.csv.zip

Check the datasets

In [None]:
import pandas as pd

# Load the training table for a quick sanity check.
# (Point the path at "/content/test.csv" to inspect the test split instead.)
df = pd.read_csv("/content/train.csv")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
# Preview only the first rows instead of rendering the whole table.
df.head()

# Functions and classes

In [None]:
import pandas as pd

def makeDataList(main_csv_path_list, oil_csv_path, holiday_csv_path):
    """Build the wide, fully numeric feature table used by the model.

    Args:
        main_csv_path_list: Paths of the main CSV files (e.g. train and/or
            test). They are concatenated in the given order and must provide
            the columns ``id``, ``date``, ``store_nbr`` and ``family``.
        oil_csv_path: CSV with the columns ``date`` and ``dcoilwtico``.
        holiday_csv_path: CSV with the columns ``date``, ``type``, ``locale``,
            ``locale_name``, ``description`` and ``transferred``.

    Returns:
        A DataFrame without a ``date`` column. Every value column of the main
        tables is spread into one column per (store_nbr, family) pair whose
        label is the tuple ``(value_name, store_nbr, family)``; the remaining
        columns are ``dcoilwtico`` and the one-hot encoded holiday fields.

    Raises:
        ValueError: If ``main_csv_path_list`` is empty.
    """
    # Read everything first and concatenate once; growing a DataFrame inside
    # the loop re-copies all accumulated rows on every iteration.
    main_frames = [pd.read_csv(path) for path in main_csv_path_list]
    if not main_frames:
        raise ValueError("main_csv_path_list must contain at least one CSV path")
    datalist = pd.concat(main_frames, ignore_index=True)

    # One row per date, one column per (value, store_nbr, family).
    datalist = (
        datalist.drop("id", axis=1)
        .set_index(["date", "store_nbr", "family"])
        .sort_index()
        .unstack(["store_nbr", "family"])
    )

    # pandas does not support merging a frame with multi-level columns against
    # frames with single-level columns. Flatten the column labels to plain
    # tuples and expose "date" as an ordinary column so both merges below are
    # regular column-on-column joins.
    datalist.columns = datalist.columns.to_flat_index()
    datalist = datalist.reset_index()

    datalist = pd.merge(datalist, pd.read_csv(oil_csv_path), on="date", how="left")
    # Days without an oil quote get the mean price.
    datalist["dcoilwtico"] = datalist["dcoilwtico"].fillna(datalist["dcoilwtico"].mean())

    # NOTE(review): if the holiday file lists the same date more than once,
    # this left join emits one output row per match and the table is no longer
    # one-row-per-day; confirm against the data.
    holidays = pd.read_csv(holiday_csv_path).drop(["locale_name", "description"], axis=1)
    datalist = pd.merge(datalist, holidays, on="date", how="left")
    # Dates without a holiday entry get an explicit "NA" category.
    for column in ("type", "locale", "transferred"):
        datalist[column] = datalist[column].fillna("NA")

    datalist = datalist.drop("date", axis=1)
    # One-hot encode the remaining non-numeric (holiday) columns.
    return pd.get_dummies(datalist)

In [None]:
## TEST
# Build the feature table from the training data only. The resulting global
# `datalist` is reused by the test cells that follow.
datalist = makeDataList(["/content/train.csv"], "/content/oil.csv", "/content/holidays_events.csv")
print("datalist.columns =\n", datalist.columns)
# First row of the table as a raw array.
print("datalist.values[0] =", datalist.values[0])
datalist

In [None]:
def getSaleColumns(datalist):
    """Return the labels of all sales columns of ``datalist``.

    A sales column is one whose label is a tuple starting with ``"sales"``,
    e.g. ``("sales", store_nbr, family)``.

    Args:
        datalist: Object exposing an iterable ``columns`` attribute
            (typically a pandas DataFrame).

    Returns:
        List of the matching column labels, in column order.
    """
    # The tuple check keeps plain labels (strings, integers, empty strings)
    # from being indexed: they can never be sales columns, and indexing some
    # of them would raise.
    return [
        col
        for col in datalist.columns
        if isinstance(col, tuple) and len(col) > 0 and col[0] == "sales"
    ]

In [None]:
## TEST
# List every column label of `datalist` that getSaleColumns selects.
print(getSaleColumns(datalist))

In [None]:
## TEST
from sklearn.model_selection import train_test_split

# The rows of `datalist` are in date order and carry no date column anymore,
# so the split must not shuffle: the first 90% of the days become the training
# part and the last 10% the validation part, keeping consecutive rows
# consecutive in time.
train_datalist, val_datalist = train_test_split(datalist, test_size=0.1, shuffle=False)
print(train_datalist)
print(val_datalist)

In [None]:
import numpy as np

import torch.utils.data as data

class DatasetMaker(data.Dataset):
    """Sliding-window dataset over a time-ordered feature table.

    Sample ``i`` consists of

    * ``inputs_past``: all columns of rows ``[i, i + past_size)``,
    * ``inputs_info``: the non-sales columns of the following ``future_size``
      rows,
    * ``labels``: the sales columns of the last past row plus the
      ``future_size`` future rows (``future_size + 1`` rows in total), so
      ``labels[k]`` is the previous-step value and ``labels[k + 1]`` the
      target that belongs to ``inputs_info[k]``.
    """

    def __init__(self, datalist, past_size, future_size):
        """
        Args:
            datalist: DataFrame whose rows are consecutive time steps; its
                sales columns are the ones selected by ``getSaleColumns``.
            past_size: Number of rows fed to the encoder.
            future_size: Number of rows to predict.
        """
        self.datalist = datalist.values.astype(np.float32)
        self.past_size = past_size
        self.future_size = future_size

        sale_columns = getSaleColumns(datalist)
        self.info_datalist = datalist.drop(sale_columns, axis=1).values.astype(np.float32)
        self.sale_datalist = datalist[sale_columns].values.astype(np.float32)

    def __len__(self):
        """Number of complete windows; 0 when the table is too short for one."""
        # Clamp at zero: len() rejects negative values.
        return max(0, len(self.datalist) - (self.past_size + self.future_size) + 1)

    def __getitem__(self, index):
        """Return ``(inputs_past, inputs_info, labels)`` for window ``index``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``index`` is out of range. This is also what ends
                plain iteration over the dataset.
        """
        num_windows = len(self)
        if index < 0:
            index += num_windows
        if not 0 <= index < num_windows:
            raise IndexError("window index out of range")

        past_end = index + self.past_size   # future_start
        future_end = past_end + self.future_size
        inputs_past = self.datalist[index:past_end]
        inputs_info = self.info_datalist[past_end:future_end]
        # Start one row early so the last known sales are available as the
        # first previous-step value.
        labels = self.sale_datalist[past_end - 1:future_end]
        return inputs_past, inputs_info, labels

In [None]:
## TEST
# Build the windowed dataset and look at its first sample.
dataset = DatasetMaker(datalist, 16, 16)
# A single lookup yields all three parts of the sample.
inputs_past0, inputs_info0, labels0 = dataset[0]
print("dataset.__len__() =", dataset.__len__())
print("inputs_past0 =\n", inputs_past0)
print("inputs_past0.shape =", inputs_past0.shape)
print("inputs_info0 =\n", inputs_info0)
print("inputs_info0.shape =", inputs_info0.shape)
print("labels0 =\n", labels0)
print("labels0.shape =", labels0.shape)

In [None]:
## TEST
import torch

# Draw one shuffled mini-batch from the windowed dataset and check the batched
# tensor sizes. The globals `inputs_past`, `inputs_info` and `labels` are
# reused by the encoder/decoder test cell.
batch_size = 10
# drop_last=True discards a final batch that has fewer than batch_size samples.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
batch_itr = iter(dataloader)
inputs_past, inputs_info, labels = next(batch_itr)

print("inputs_past.size() =", inputs_past.size())
print("inputs_info.size() =", inputs_info.size())
# print("labels =\n", labels)
print("labels.size() =", labels.size())

In [None]:
from torch import nn

class Encoder(nn.Module):
    """Recurrent encoder that condenses the past window into hidden states."""

    def __init__(self, input_dim, hidden_dim, num_layers):
        """
        Args:
            input_dim: Number of features per time step.
            hidden_dim: Size of the hidden state of every RNN layer.
            num_layers: Number of stacked RNN layers.
        """
        super().__init__()

        self.input_dim = input_dim
        self.rnn = nn.RNN(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Encode ``x`` and return the final hidden state of every layer.

        ``x`` is batch-first (the RNN is built with ``batch_first=True``).
        """
        # Passing None as the initial state leaves the default start state in
        # place; the per-step outputs are not needed.
        _, last_hidden = self.rnn(x, None)
        return last_hidden

class Decoder(nn.Module):
    """Recurrent decoder followed by a stack of linear layers."""

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        """
        Args:
            input_dim: Number of features per decoder input step.
            hidden_dim: Size of the hidden state of every RNN layer.
            num_layers: Number of stacked RNN layers.
            output_dim: Number of values predicted per step.
        """
        super().__init__()

        self.input_dim = input_dim
        self.rnn = nn.RNN(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        wide_dim = 2 * hidden_dim
        head = nn.Sequential(
            nn.Linear(hidden_dim, wide_dim),
            nn.Linear(wide_dim, hidden_dim),
            nn.Linear(hidden_dim, output_dim),
        )
        # The same module is registered under two attribute names, so its
        # parameters show up in the state dict under both "fc." and "cnn.".
        self.fc = head
        self.cnn = head

    def forward(self, x, h):
        """Advance the RNN from state ``h`` over ``x``.

        Returns:
            ``(prediction, final_h)``: the linear stack applied to the final
            hidden state of the last RNN layer, and the final hidden states of
            all layers.
        """
        _, final_h = self.rnn(x, h)
        top_layer_state = final_h[-1]
        return self.fc(top_layer_state), final_h

In [None]:
## TEST
# Feature widths are read from the first sample of the dataset.
inputs_past0, inputs_info0, labels0 = dataset[0]
enc = Encoder(
    input_dim=inputs_past0.shape[1],
    hidden_dim=128,
    num_layers=2,
)
dec = Decoder(
    input_dim=inputs_info0.shape[1] + labels0.shape[1],
    hidden_dim=128,
    num_layers=2,
    output_dim=labels0.shape[1],
)
print(enc)
print(dec)
# Encode the past window, then run a single decoder step on the first future
# step: its info features concatenated with the previous sales values.
h = enc(inputs_past)
first_step_features = (inputs_info[:, 0, :], labels[:, 0, :])
dec_inputs = torch.cat(first_step_features, dim=1).unsqueeze(1)
outputs, h = dec(dec_inputs, h)
print("outputs =\n", outputs)
print("outputs.size() =", outputs.size())
print("h =\n", h)
print("h.size() =", h.size())

# Training

In [None]:
import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.optim as optim

class Trainer:
    """Train the encoder/decoder pair on the store-sales training data.

    The feature table is split chronologically into a training and a
    validation part. After every epoch in which the validation loss improves,
    both state dicts are written to ``save_weights_path_list``.
    """

    def __init__(self, num_epochs, train_csv_path, oil_csv_path, holiday_csv_path, batch_size, lr, save_weights_path_list):
        """
        Args:
            num_epochs: Number of passes over the training data.
            train_csv_path: Path of the main training CSV.
            oil_csv_path: Path of the oil-price CSV.
            holiday_csv_path: Path of the holidays CSV.
            batch_size: Mini-batch size of both data loaders.
            lr: Learning rate of the Adam optimizer.
            save_weights_path_list: ``[encoder_path, decoder_path]`` for the
                saved state dicts.
        """
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print("self.device = ", self.device)

        self.num_epochs = num_epochs
        self.save_weights_path_list = save_weights_path_list
        past_size = 16
        future_size = 16

        datalist = makeDataList([train_csv_path], oil_csv_path, holiday_csv_path)
        # The rows are in date order and every sample is a window of
        # consecutive rows, so the split must keep that order: the first 90%
        # of the days are used for training, the last 10% for validation.
        train_datalist, val_datalist = train_test_split(datalist, test_size=0.1, shuffle=False)
        train_dataset = DatasetMaker(train_datalist, past_size, future_size)
        val_dataset = DatasetMaker(val_datalist, past_size, future_size)
        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
        self.dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

        hidden_dim = 256
        num_layers = 3
        # Take the feature widths from this trainer's own dataset instead of a
        # notebook-level variable left over from another cell.
        sample_past, sample_info, sample_labels = train_dataset[0]
        num_sales = sample_labels.shape[1]
        self.enc = Encoder(
            input_dim = sample_past.shape[1],
            hidden_dim = hidden_dim,
            num_layers = num_layers
        )
        self.dec = Decoder(
            input_dim = sample_info.shape[1] + num_sales,
            hidden_dim = hidden_dim,
            output_dim = num_sales,
            num_layers = num_layers
        )
        self.enc.to(self.device)
        self.dec.to(self.device)
        print(self.enc)
        print(self.dec)
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam([
            {"params": self.enc.parameters()},
            {"params": self.dec.parameters()}
            ], lr=lr
        )

    def train(self):
        """Run the training loop, save the best weights and plot the losses.

        Raises:
            RuntimeError: If a phase's data loader yields no batch.
        """
        ## time
        start_clock = time.time()
        ## record
        record_loss_dict = {"train": [], "val": []}
        min_loss_epoch = float("inf")
        ## Report roughly ten times per run. The interval is at least 1 so
        ## runs shorter than ten epochs do not take a modulo by zero.
        log_interval = max(1, self.num_epochs // 10)
        ## loop
        for epoch in range(self.num_epochs):
            is_log_epoch = epoch == 0 or (epoch + 1) % log_interval == 0
            if is_log_epoch:
                print("----------")
                print("Epoch {}/{}".format(epoch+1, self.num_epochs))
            ## phase
            for phase in ["train", "val"]:
                is_train = phase == "train"
                ## setting
                self.enc.train(is_train)
                self.dec.train(is_train)
                ## buffer
                loss_epoch = 0.0
                num_inputs_epoch = 0
                for inputs_past, inputs_info, labels in self.dataloaders_dict[phase]:
                    inputs_past = inputs_past.to(self.device)
                    inputs_info = inputs_info.to(self.device)
                    labels = labels.to(self.device)
                    ## reset gradient
                    self.optimizer.zero_grad()
                    ## buffer
                    loss = 0.0
                    ## switch computing gradient
                    with torch.set_grad_enabled(is_train):
                        ## forward
                        h = self.enc(inputs_past)
                        ## One decoder step per future row. The decoder always
                        ## receives the true previous sales (labels[:, i]) and
                        ## is scored on the next ones (labels[:, i+1]).
                        for i in range(labels.shape[1] - 1):
                            dec_inputs = torch.cat((inputs_info[:, i, :], labels[:, i, :]), dim=1).unsqueeze(1)
                            outputs, h = self.dec(dec_inputs, h)
                            loss += self.criterion(outputs, labels[:, i+1, :])
                        ## backward
                        if is_train:
                            loss.backward()
                            self.optimizer.step()
                    ## add
                    loss_epoch += loss.item() * labels.size(0)
                    num_inputs_epoch += labels.size(0)
                if num_inputs_epoch == 0:
                    raise RuntimeError(
                        "The '{}' data loader produced no batch; the data is too short "
                        "for the window and batch sizes.".format(phase)
                    )
                ## average loss
                loss_epoch = loss_epoch / num_inputs_epoch
                record_loss_dict[phase].append(loss_epoch)
                if is_log_epoch:
                    print("{} Loss: {:.4f}".format(phase, loss_epoch))
            ## save (the first epoch always saves, so weight files exist even
            ## if the validation loss never improves afterwards)
            if epoch == 0 or record_loss_dict["val"][-1] < min_loss_epoch:
                min_loss_epoch = record_loss_dict["val"][-1]
                torch.save(self.enc.state_dict(), self.save_weights_path_list[0])
                torch.save(self.dec.state_dict(), self.save_weights_path_list[1])
        ## time (sampled once so minutes and seconds describe the same instant)
        elapsed = time.time() - start_clock
        mins = elapsed // 60
        secs = elapsed % 60
        print ("training time: ", mins, " [min] ", secs, " [sec]")
        ## graph
        self.showGraph(record_loss_dict)

    def showGraph(self, record_loss_dict):
        """Plot the per-epoch training and validation losses."""
        plt.figure()
        plt.plot(range(len(record_loss_dict["train"])), record_loss_dict["train"], label="Training")
        plt.plot(range(len(record_loss_dict["val"])), record_loss_dict["val"], label="Validation")
        plt.legend()
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("loss: train=" + str(record_loss_dict["train"][-1]) + ", val=" + str(record_loss_dict["val"][-1]))
        plt.show()

# Training configuration.
num_epochs = 1000
train_csv_path = "/content/train.csv"
oil_csv_path = "/content/oil.csv"
holiday_csv_path = "/content/holidays_events.csv"
batch_size = 100
lr = 0.01
# Output files for the state dicts, in the order [encoder, decoder].
save_weights_path_list = ["/content/enc_weights.pth", "/content/dec_weights.pth"]

# Build the trainer (loads the data and creates both networks) and run it.
trainer = Trainer(num_epochs, train_csv_path, oil_csv_path, holiday_csv_path, batch_size, lr, save_weights_path_list)
trainer.train()

# Evaluation

In [None]:
import time
import pandas as pd

import torch

class Evaluator:
    """Forecast the test period with trained weights and write a submission CSV.

    The last ``past_size`` rows that have sales values are encoded. The
    decoder is then rolled forward over the rows without sales, feeding each
    prediction back in as the previous-step sales of the next row.
    """

    def __init__(self, train_csv_path, test_csv_path, oil_csv_path, holiday_csv_path, weights_path, save_csv_path_list):
        """
        Args:
            train_csv_path: Path of the main training CSV.
            test_csv_path: Path of the main test CSV; it must provide an
                ``id`` column.
            oil_csv_path: Path of the oil-price CSV.
            holiday_csv_path: Path of the holidays CSV.
            weights_path: Sequence ``[encoder_weights_path,
                decoder_weights_path]``. Despite the singular name it holds
                both paths; the name is kept for existing callers.
            save_csv_path_list: Path of the CSV file to write. Despite the
                name it is a single path; the name is kept for existing
                callers.
        """
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print("self.device = ", self.device)

        past_size = 16

        self.test_csv_path = test_csv_path
        # Use the constructor arguments instead of notebook-level variables
        # that merely happen to exist when the cell is run.
        self.save_csv_path = save_csv_path_list
        datalist = makeDataList([train_csv_path, test_csv_path], oil_csv_path, holiday_csv_path)
        self.sale_columns = getSaleColumns(datalist)
        # Rows with sales values are the known past; rows without are the
        # period to forecast.
        has_sales = datalist[self.sale_columns[0]].notna()
        self.past_datalist = datalist.loc[has_sales][-past_size:]
        self.future_datalist = datalist.loc[~has_sales].drop(self.sale_columns, axis=1)

        hidden_dim = 256
        num_layers = 3
        self.enc = Encoder(
            input_dim = self.past_datalist.shape[1],
            hidden_dim = hidden_dim,
            num_layers = num_layers
        )
        self.dec = Decoder(
            input_dim = self.future_datalist.shape[1] + len(self.sale_columns),
            hidden_dim = hidden_dim,
            output_dim = len(self.sale_columns),
            num_layers = num_layers
        )
        self.enc.to(self.device)
        self.dec.to(self.device)
        # map_location moves the stored tensors to whichever device is in use,
        # regardless of the device they were saved from.
        loaded_enc_weights = torch.load(weights_path[0], map_location=self.device)
        loaded_dec_weights = torch.load(weights_path[1], map_location=self.device)
        if torch.cuda.is_available():
            print("Weights have been loaded [GPU -> GPU]:", weights_path)
        else:
            print("Weights have been loaded [GPU -> CPU]:", weights_path)
        self.enc.load_state_dict(loaded_enc_weights)
        self.dec.load_state_dict(loaded_dec_weights)
        print(self.enc)
        print(self.dec)

    def evaluate(self):
        """Predict the sales of every future row and write them to the CSV."""
        ## time
        start_clock = time.time()
        ## setting
        self.enc.eval()
        self.dec.eval()
        ## ndarray -> tensor (a batch dimension of size 1 is added in front)
        inputs_past = torch.from_numpy(self.past_datalist.values.astype(np.float32)).unsqueeze(0).to(self.device)
        inputs_future = torch.from_numpy(self.future_datalist.values.astype(np.float32)).unsqueeze(0).to(self.device)
        ## the last known sales are the previous-step input of the first step
        prev_sales = torch.from_numpy(self.past_datalist[self.sale_columns].values[-1].astype(np.float32)).unsqueeze(0).to(self.device)
        print("inputs_future.size() =", inputs_future.size())
        ## forward
        predictions = []
        with torch.no_grad():
            h = self.enc(inputs_past)
            for i in range(inputs_future.size(1)):
                dec_inputs = torch.cat((inputs_future[:, i, :], prev_sales), dim=1).unsqueeze(1)
                ## each prediction becomes the previous-step sales of the next step
                prev_sales, h = self.dec(dec_inputs, h)
                predictions.append(prev_sales)
        ## one row per future step; concatenated once instead of growing a
        ## tensor inside the loop
        if predictions:
            sales = torch.cat(predictions, dim=0)
        else:
            sales = prev_sales.new_empty((0, prev_sales.size(1)))
        ## save
        self.writeCSV(sales)
        ## time
        elapsed = time.time() - start_clock
        mins = elapsed // 60
        secs = elapsed % 60
        print ("evaluation time: ", mins, " [min] ", secs, " [sec]")

    def writeCSV(self, sales):
        """Pair the flattened predictions with the test ids and save them.

        ``sales`` is flattened row by row, so the number of values must equal
        the number of ids in the test CSV.
        NOTE(review): this presumes the test ids are ordered step by step and,
        within a step, like the sales columns; confirm against the test file.
        """
        result_df = pd.DataFrame({"id": pd.read_csv(self.test_csv_path)["id"].values, "sales": sales.cpu().detach().numpy().reshape(-1)})
        result_df.to_csv(self.save_csv_path, index=False)
        print(result_df)


# Input tables for the forecast.
train_csv_path = "/content/train.csv"
test_csv_path = "/content/test.csv"
oil_csv_path = "/content/oil.csv"
holiday_csv_path = "/content/holidays_events.csv"

# Trained state dicts, in the order [encoder, decoder].
weights_path_list = ["/content/enc_weights.pth", "/content/dec_weights.pth"]
# Where the submission file is written.
save_csv_path = "/content/submission.csv"

# Arguments are passed positionally: the fifth is the list of weight paths,
# the sixth the output CSV path.
evaluator = Evaluator(train_csv_path, test_csv_path, oil_csv_path, holiday_csv_path, weights_path_list, save_csv_path)
evaluator.evaluate()

# Submission

In [None]:
import sys

# Ask for an explicit confirmation so that re-running the notebook does not
# upload a submission by accident.
print("Submit the file? (y/n)")
ans = input()
# Keep asking until the answer is exactly "y" or "n".
while ans != "y" and ans != "n":
    print("Type 'y' or 'n'")
    ans = input()
if ans == "y":
    # NOTE(review): submission.csv is given as a relative path; this presumes
    # the working directory is the one the evaluation cell wrote to
    # (/content) - confirm.
    !kaggle competitions submit -c store-sales-time-series-forecasting -f submission.csv -m "store_sales_rnn_seq2seq.ipynb"