In [1]:
from typing import List, Callable, Union, Any, TypeVar, Tuple
from torch import nn
import torch
from torch.nn import functional as F
import numpy as np
import pandas as pd
from torch import tensor as Tensor
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from sklearn.model_selection import train_test_split
import random
from models.rnn import CMV_LSTM, MV_LSTM
from models.vae import ConditionalVAE, VanillaVAEEncoder, VanillaVAEDecoder
from models.rvae import UnitRVAE, RVAE
from utils.loss import VAE_Loss

In [2]:
def set_seeds(seed):
    """Seed Python's `random`, NumPy and PyTorch with `seed` and pin cuDNN flags.

    Args:
        seed: Value handed unchanged to `random.seed`, `np.random.seed` and
            `torch.manual_seed`.

    Side effects:
        Sets `torch.backends.cudnn.deterministic = True` and
        `torch.backends.cudnn.benchmark = False`.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [3]:
# Notebook-wide settings.
seed = 42
seq_len = 10          # passed to RVAE(...) as its last argument
batch_size = 5
test_batch_size = 3

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Autograd anomaly detection is a debugging aid (it adds overhead to autograd);
# it is switched on globally here.
torch.autograd.set_detect_anomaly(True)

In [4]:
# Load the three source tables (economy, search trends, weather).
# Each table is restricted to rows with year > 2015, stripped of its `year` /
# `month` columns plus a fixed list of excluded features, and divided by a
# per-table constant before being used as a NumPy feature matrix.

# --- Economy: divided by 1e10. Columns containing NaN are NOT dropped here
# --- (the dropna call for this table was left disabled).
df_econs = pd.read_csv('data/total_economy_data.csv', index_col=0)
df_econs = df_econs.loc[df_econs['year'] > 2015].reset_index(drop=True)
df_econs1 = df_econs.drop(columns=[
    'year',
    'month',
    'By Section_Creadit cooperative institutions',
    'Interest Rates on Loans and Discounts_Trust Accounts Loans To Corporations',
])
arr_econs = df_econs1.to_numpy() / 1e10

# --- Search trends: columns containing NaN are dropped, values divided by 20.
df_trends = pd.read_csv('data/total_trends.csv', index_col=0)
df_trends = df_trends.loc[df_trends['year'] > 2015].reset_index(drop=True)
df_trends1 = df_trends.drop(columns=[
    'year',
    'month',
    'naver_건물위생관리',
    'naver_잡화류',
    'naver_항체치료제',
    'naver_가구업',
    'naver_방송업',
    'naver_호텔숙박업',
    'naver_무인감시카메라',
    'naver_필름제조',
]).dropna(axis=1)
arr_trends = df_trends1.to_numpy() / 20

# --- Weather: columns containing NaN are dropped, values divided by 10.
df_weathers = pd.read_csv('data/average_weather.csv', index_col=0)
df_weathers = df_weathers.loc[df_weathers['year'] > 2015].reset_index(drop=True)
df_weathers1 = df_weathers.drop(columns=[
    'year',
    'month',
    'avgtgmin',
    'tamax',
    'tmmax',
    'avgcatot',
    'tamin',
    'tmmin',
    'taavg',
    'sumssday',
    'pa',
    'avghm',
    'ta',
    'avgtamin',
    'avgtamax',
    'ps',
    'daydur',
]).dropna(axis=1)
arr_weathers = df_weathers1.to_numpy() / 10

In [5]:
# Build the RVAE. Its first three arguments are the feature widths (last axis)
# of the economy, trend and weather matrices; the last one is `seq_len`.
# NOTE(review): 28 / 16 / 4 are presumably the per-stream sizes produced by the
# model (28 + 16 + 4 = 48, the number of environment_* columns exported later)
# — confirm against the RVAE signature.
input_widths = [arr.shape[-1] for arr in (arr_econs, arr_trends, arr_weathers)]
rvae = RVAE(*input_widths, 28, 16, 4, seq_len)
rvae.rnn.init_hidden()

In [6]:
# Restore the trained weights into the freshly built model.
# `map_location=device` remaps the stored tensors onto the device chosen in the
# config cell, so a checkpoint saved from CUDA tensors still loads on a
# CPU-only machine instead of raising.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
rvae.load_state_dict(torch.load('best_total_time_series1.pt', map_location=device))
model = rvae

In [7]:
class timeseries(Dataset):
    """Map-style dataset pairing `x[idx]` with `y[idx]`.

    `x` and `y` are stored as given (no copy or tensor conversion); the dataset
    length is `len(x)`, captured once at construction time.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y
        self.len = len(self.x)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target
def get_total_datas(seq_length, data, shuffle=True, seed=42):
    """Build every (history window, next row) pair with at least `seq_length` rows of history.

    For each target index `i` in `[seq_length, len(data))` and each window
    length `j` in `[seq_length, i]`, the input is `data[i - j:i]` and the
    target is `data[i]`. Both are converted to float32 tensors and given a
    leading axis of size 1.

    Args:
        seq_length: Minimum window length and first usable target index.
        data: Indexable sequence whose items and slices expose `.shape`
            (e.g. a NumPy array).
        shuffle: When true, both lists are permuted together through
            `shuffle_data_list`.
        seed: Seed forwarded to `shuffle_data_list`.

    Returns:
        `(inputs, targets)` — two lists of tensors of equal length.
    """
    def _batch_of_one(chunk):
        # float32 tensor with an extra leading axis of size 1
        return torch.tensor(chunk, dtype=torch.float32).reshape([1, *chunk.shape])

    inputs, targets = [], []
    for target_idx in range(seq_length, len(data)):
        for window in range(seq_length, target_idx + 1):
            targets.append(_batch_of_one(data[target_idx]))
            inputs.append(_batch_of_one(data[target_idx - window:target_idx]))

    if not shuffle:
        return inputs, targets

    shuffled_inputs, shuffled_targets = shuffle_data_list(inputs, targets, seed=seed)
    return shuffled_inputs, shuffled_targets


def get_total_datas_by_length(seq_length, data, shuffle=True, seed=42, min_batch_size=1):
    """Group (window, next row) pairs by window length.

    For every window length `j` in `[seq_length, len(data) - min_batch_size]`
    and every start `i` in `[0, len(data) - j)`, the input is `data[i:i + j]`
    and the target is `data[i + j]`, both as float32 tensors with a leading
    axis of size 1.

    Args:
        seq_length: Smallest window length.
        data: Indexable sequence whose items and slices expose `.shape`
            (e.g. a NumPy array).
        shuffle: When true, the samples of each length are permuted (inputs and
            targets together). `random` is seeded with `seed` and each length
            then gets its own seed drawn via `random.randint(1, 512)`.
        seed: Seed for the global `random` module used when shuffling.
        min_batch_size: Upper bound on window length is
            `len(data) - min_batch_size`, so every length has at least this
            many samples.

    Returns:
        `(inputs_by_length, targets_by_length)` — two dicts keyed by window
        length, each mapping to a list of tensors.
    """
    def _batch_of_one(chunk):
        # float32 tensor with an extra leading axis of size 1
        return torch.tensor(chunk, dtype=torch.float32).reshape([1, *chunk.shape])

    n_rows = len(data)
    window_lengths = range(seq_length, n_rows - min_batch_size + 1)

    inputs_by_length = {}
    targets_by_length = {}
    for length in window_lengths:
        starts = range(n_rows - length)
        inputs_by_length[length] = [_batch_of_one(data[s:s + length]) for s in starts]
        targets_by_length[length] = [_batch_of_one(data[s + length]) for s in starts]

    if not shuffle:
        return inputs_by_length, targets_by_length

    random.seed(seed)
    shuffled_inputs = {}
    shuffled_targets = {}
    for length in window_lengths:
        # The per-length seed is drawn first, then shuffle_data_list re-seeds
        # `random` with it; the next draw therefore depends on that state.
        per_length_seed = random.randint(1, 512)
        permuted_x, permuted_y = shuffle_data_list(
            inputs_by_length[length], targets_by_length[length], seed=per_length_seed
        )
        shuffled_inputs[length] = permuted_x
        shuffled_targets[length] = permuted_y
    return shuffled_inputs, shuffled_targets


def get_test_list(seq_length, data, base_idx):
    """List every (history window, target row) pair whose target index is >= `base_idx`.

    For each target index `i` in `[base_idx, len(data))` and each window length
    `j` in `[seq_length, i]`, yields `(data[i - j:i], data[i])` as a tuple of
    float32 tensors (no extra leading axis is added).

    Returns:
        A list of `(window, target)` tuples, ordered by target index and then
        by increasing window length.
    """
    def _f32(chunk):
        return torch.tensor(chunk, dtype=torch.float32)

    return [
        (_f32(data[target_idx - window:target_idx]), _f32(data[target_idx]))
        for target_idx in range(base_idx, len(data))
        for window in range(seq_length, target_idx + 1)
    ]


def get_test_datas_by_length(seq_length, data, base_idx, min_batch_size=1):
    """Group test (window, next row) pairs by window length.

    Same windowing as the training helper — window `data[i:i + j]`, target
    `data[i + j]` — but only pairs whose target index `i + j` is at least
    `base_idx` are kept. Window lengths that end up with no sample are left
    out of the result.

    Args:
        seq_length: Smallest window length.
        data: Indexable sequence whose items and slices expose `.shape`
            (e.g. a NumPy array).
        base_idx: First target index that counts as test data.
        min_batch_size: Largest window length considered is
            `len(data) - min_batch_size`.

    Returns:
        `(inputs_by_length, targets_by_length)` — dicts keyed by window length
        holding lists of float32 tensors with a leading axis of size 1.
    """
    def _batch_of_one(chunk):
        # float32 tensor with an extra leading axis of size 1
        return torch.tensor(chunk, dtype=torch.float32).reshape([1, *chunk.shape])

    n_rows = len(data)
    inputs_by_length = {}
    targets_by_length = {}
    for length in range(seq_length, n_rows - min_batch_size + 1):
        kept_starts = [s for s in range(n_rows - length) if s + length >= base_idx]
        if not kept_starts:
            continue
        inputs_by_length[length] = [_batch_of_one(data[s:s + length]) for s in kept_starts]
        targets_by_length[length] = [_batch_of_one(data[s + length]) for s in kept_starts]
    return inputs_by_length, targets_by_length


def shuffle_data_list(*args, seed=42):
    """Apply one shared random permutation to several equally indexed sequences.

    Args:
        *args: Sequences to permute. The permutation has `len(args[0])`
            entries and is applied to every argument, so matching positions
            stay aligned.
        seed: Seed passed to `random.seed`.

    Returns:
        A list with one new list per input sequence, in the same order.

    Side effects:
        Re-seeds the global `random` module with `seed`.
    """
    random.seed(seed)
    order = list(range(len(args[0])))
    random.shuffle(order)
    return [[sequence[pos] for pos in order] for sequence in args]


# `timeseries` is defined once, at the top of this cell. The identical second
# definition that followed here only re-bound the same name (silently shadowing
# the first), so it has been removed.

In [11]:

# Accumulator for the exported rows, and the header that goes with them:
# 'year' followed by environment_0 ... environment_47.
values_list = []
columns = ['year'] + [f'environment_{i}' for i in range(48)]

In [12]:
def _encoder_input(df, year, drop_cols, scale, trim_last=0):
    """Build the encoder input for one source table.

    Keeps the rows of `df` with 2015 < year <= `year`, removes `drop_cols`,
    optionally discards the last `trim_last` rows, divides by `scale`, and
    returns a float32 tensor on `device` with a leading axis of size 1.
    """
    in_range = (df['year'] > 2015) & (df['year'] < year + 1)
    features = df[in_range].drop(columns=drop_cols)
    if trim_last:
        features = features.iloc[:-trim_last]
    scaled = features.to_numpy() / scale
    # The inputs must live on the same device as the model, which is moved below.
    return torch.tensor(scaled, dtype=torch.float32, device=device).unsqueeze(0)


# Columns removed from each table before encoding (same lists as the loading cell).
# NOTE(review): the loading cell also drops NaN columns for trends/weather
# (dropna(axis=1)); that step is not applied here. Confirm no feature column
# contains NaN, otherwise the widths differ from the ones the model was built with.
econ_drop_cols = ['year', 'month', 'By Section_Creadit cooperative institutions', 'Interest Rates on Loans and Discounts_Trust Accounts Loans To Corporations']
trend_drop_cols = ['year', 'month', 'naver_건물위생관리', 'naver_잡화류', 'naver_항체치료제', 'naver_가구업', 'naver_방송업', 'naver_호텔숙박업', 'naver_무인감시카메라', 'naver_필름제조']
weather_drop_cols = ['year', 'month', 'avgtgmin', 'tamax', 'tmmax', 'avgcatot', 'tamin', 'tmmin', 'taavg', 'sumssday', 'pa', 'avghm', 'ta', 'avgtamin', 'avgtamax', 'ps', 'daydur']

model = model.to(device)
model.eval()

# `reparameterize` presumably draws random noise (TODO confirm in RVAE); seeding
# here makes the exported values repeatable from run to run.
set_seeds(seed)

# Start from an empty accumulator so re-running this cell does not append
# a second copy of every year.
values_list.clear()

# Pure inference: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    for year in [2018, 2019, 2020, 2021, 2022]:
        values = [year]

        # Loop-local names are used on purpose so the arrays/frames created by
        # the loading cell (arr_econs, df_trends1, ...) are not overwritten.
        econs_in = _encoder_input(df_econs, year, econ_drop_cols, 1e10)
        # NOTE(review): for 2022 the last two trend rows are discarded while the
        # economy/weather tables keep theirs — presumably the trailing trend
        # rows are incomplete; confirm this asymmetry is intended.
        trends_in = _encoder_input(
            df_trends, year, trend_drop_cols, 20,
            trim_last=2 if year == 2022 else 0,
        )
        weathers_in = _encoder_input(df_weathers, year, weather_drop_cols, 10)

        # Encode each stream, then draw its latent; the call order is kept
        # fixed so random-number consumption is the same on every run.
        mu1, log_var1 = model.vae_encoder1(econs_in)
        z1 = model.reparameterize(mu1, log_var1)
        mu2, log_var2 = model.vae_encoder2(trends_in)
        z2 = model.reparameterize(mu2, log_var2)
        mu3, log_var3 = model.vae_encoder3(weathers_in)
        z3 = model.reparameterize(mu3, log_var3)

        # The recurrent part returns one output per stream; they are joined
        # side by side and flattened into this year's row.
        out_econs, out_trends, out_weathers = model.rnn(z1, z2, z3)
        torch_envs = torch.hstack([out_econs, out_trends, out_weathers])
        values.extend(torch_envs.squeeze().tolist())
        values_list.append(values)

In [15]:
# One row per processed year: the year followed by its 48 environment values.
df_env = pd.DataFrame(data=values_list, columns=columns)

In [16]:
# Persist the table as CSV. `index` is left at its default, so the DataFrame
# index is written as the first (unnamed) column.
# NOTE(review): the file name is spelled 'envionment' — keep it in sync with
# whatever reads this file before correcting it.
df_env.to_csv('data/envionment_variables.csv')

In [17]:
# Preview the first rows of the exported table.
df_env.head()

Unnamed: 0,year,environment_0,environment_1,environment_2,environment_3,environment_4,environment_5,environment_6,environment_7,environment_8,...,environment_38,environment_39,environment_40,environment_41,environment_42,environment_43,environment_44,environment_45,environment_46,environment_47
0,2018,0.907988,0.25279,-0.836119,0.208836,0.689215,0.308029,0.222713,-0.143369,0.269238,...,0.664215,1.152391,-0.149386,0.789055,-1.348532,0.905948,4.394809,3.209903,3.618974,4.242476
1,2019,0.895566,0.154985,-0.677076,0.262682,0.664284,0.308673,0.23509,-0.044086,0.192558,...,0.457368,1.195705,-0.215413,0.457419,-1.123654,0.626728,4.370901,3.21224,3.61692,4.204389
2,2020,0.868523,0.293043,-0.856014,0.408743,0.712269,0.384764,0.237792,-0.049457,0.175862,...,0.576555,0.992868,-0.260887,0.578474,-1.061072,0.84063,4.407979,3.225611,3.633551,4.243101
3,2021,0.800536,0.205418,-0.814548,0.302683,0.727625,0.248605,0.116277,-0.09842,0.314634,...,0.539748,1.102761,-0.200335,0.732133,-1.103896,0.752479,4.385896,3.212321,3.608874,4.244657
4,2022,0.873671,0.207385,-0.783847,0.259146,0.637665,0.18302,0.190067,-0.056027,0.288552,...,0.515271,1.044406,-0.478153,0.518133,-1.070263,0.54614,4.337283,3.202262,3.593646,4.201489
