## Import

In [1]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/LG_AIMERS

/content/drive/.shortcut-targets-by-id/1bo8OXmxCtxx_kqaMVbLxgeD7-6fO--3a/LG_AIMERS


## Hyperparameter Setting

In [32]:
CFG = {
    'TRAIN_WINDOW_SIZE':90, # number of days used as the model input window
    'PREDICT_SIZE':21, # number of days to predict
    'EPOCHS':10,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':2048,
    'SEED':41
}

In [6]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch (CPU and all
    CUDA devices), exports PYTHONHASHSEED, and configures cuDNN for
    reproducible runs.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also covers multi-GPU sessions; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different algorithm from run to run, which
    # defeats the fixed seed, so it has to be disabled for reproducibility.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정

### 데이터 불러오기

In [7]:
# Load the training table and drop the 'ID' and '제품' columns; later cells rely
# on the first four remaining columns being the category/brand columns.
train_data = pd.read_csv('./Part2/train.csv').drop(columns=['ID','제품'])

In [8]:
brand = pd.read_csv('./Part2/brand_keyword_cnt.csv')

In [9]:
sales = pd.read_csv('./Part2/sales.csv')

In [10]:
# Element-wise division of `sales` (columns 6 onward) by `train_data` (columns 4
# onward); pandas aligns both operands on row index and column labels.
# Presumably sales amount / sold quantity = unit price per day — TODO confirm.
# 0/0 cells (and any non-matching labels) come out as NaN and are handled below.
cost = sales.iloc[:,6:] / train_data.iloc[:,4:]

### 데이터 전처리

In [11]:
# Fill missing values along each row (i.e. across the columns of `cost`):
# forward fill first, then backward fill for gaps at the start of a row.
# The result is assigned back explicitly: an in-place fill on `cost.T` operates
# on a temporary transposed frame and only reaches `cost` if that transpose
# happens to be a view. `fillna(method=...)` is also deprecated in recent pandas.
cost = cost.ffill(axis=1).bfill(axis=1)

In [12]:
# Index labels of the 85 rows of `cost` that contain the most missing values
# (`cost.T.isnull().sum()` counts the NaNs of each row of `cost`).
# NOTE(review): 85 is hard-coded — presumably the number of rows that are still
# NaN after the fill above; confirm, or select rows with any remaining NaN instead.
cost_null_ids = cost.T.isnull().sum().sort_values(ascending=False)[:85].index

In [13]:
# Replace each flagged row of `cost` with the column-wise mean of all rows that
# share the same '소분류' value (NaNs are skipped by `mean`).
for idx in cost_null_ids:
  row_class = train_data['소분류'].iloc[idx]
  same_class_mask = train_data['소분류'] == row_class
  peer_rows = train_data.loc[same_class_mask].index
  cost.iloc[idx, :] = cost.iloc[peer_rows, :].mean(axis=0)

In [14]:
# Min-max scale the numeric values (here: every column of `cost`).
numeric_cols = cost.columns
# Min and max of each ROW (axis=1), i.e. one (min, max) pair per row of `cost`
min_values = cost[numeric_cols].min(axis=1)
max_values = cost[numeric_cols].max(axis=1)
# Range (max - min) of each row; a range of 0 is replaced by 1 to avoid dividing by zero
ranges = max_values - min_values
ranges[ranges == 0] = 1  ## the original code handled this case with 0
# Apply the min-max scaling row by row
cost[numeric_cols] = (cost[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# Keep the per-row min and max as dictionaries keyed by row index
scale_min_dict_cost = min_values.to_dict()
scale_max_dict_cost = max_values.to_dict()

In [None]:
# brand_null_ids = brand.set_index('브랜드').T.isnull().sum().sort_values(ascending=False)[:35].index

In [15]:
# Replace every missing value in `brand` with 0 (modifies `brand` in place).
brand.fillna(0,inplace=True)

In [16]:
# Min-max scale the numeric values (here: every column of `brand` after the first).
numeric_cols = brand.columns[1:]
# Min and max of each ROW (axis=1), i.e. one (min, max) pair per row of `brand`
min_values = brand[numeric_cols].min(axis=1)
max_values = brand[numeric_cols].max(axis=1)
# Range (max - min) of each row; a range of 0 is replaced by 1 to avoid dividing by zero
ranges = max_values - min_values
ranges[ranges == 0] = 1  ## the original code handled this case with 0
# Apply the min-max scaling row by row
brand[numeric_cols] = (brand[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# Keep the per-row min and max as dictionaries keyed by row index
scale_min_dict_brand = min_values.to_dict()
scale_max_dict_brand = max_values.to_dict()

In [17]:
# Min-max scale the numeric values (here: every column of `train_data` from the 5th on).
numeric_cols = train_data.columns[4:]
# Min and max of each ROW (axis=1), i.e. one (min, max) pair per row of `train_data`
min_values = train_data[numeric_cols].min(axis=1)
max_values = train_data[numeric_cols].max(axis=1)
# Range (max - min) of each row; a range of 0 is replaced by 1 to avoid dividing by zero
ranges = max_values - min_values
ranges[ranges == 0] = 1  ## the original code handled this case with 0
# Apply the min-max scaling row by row
train_data[numeric_cols] = (train_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# Keep the per-row min and max as dictionaries keyed by row index;
# they are used later to undo the scaling of the predictions
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

In [18]:
# Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류']

for col in categorical_columns:
    label_encoder.fit(train_data[col])
    train_data[col] = label_encoder.transform(train_data[col])

label_encoder.fit(train_data['브랜드'])
train_data['브랜드'] = label_encoder.transform(train_data['브랜드'])
brand['브랜드'] = label_encoder.transform(brand['브랜드'])

In [None]:
def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE'],stride=1):
    '''
    Build (input block, target block) pairs by sliding a window over every row.

    For each row of `data` a window of `train_size + predict_size` values is
    moved over the series in steps of `stride`. The first `train_size` steps
    become one input sample, the remaining `predict_size` steps its target.

    data : DataFrame whose first 4 columns are the encoded category/brand
           columns and whose remaining columns are the per-step series
    train_size : number of steps used as model input
    predict_size : number of steps to predict
    stride : step between the starts of two consecutive windows

    Reads the module-level `brand` and `cost` frames: `brand` is looked up by
    the row's brand code (4th column), `cost` by row position.

    Returns (input_data, target_data) with shapes
    (num_rows * n_windows, train_size, 4 + 3) and (num_rows * n_windows, predict_size).
    Samples are stored row-major: all windows of row 0, then all of row 1, ...
    '''
    num_rows = len(data)
    window_size = train_size + predict_size

    # These values do not depend on the row or the window, so compute them once
    # instead of on every inner-loop iteration.
    n_meta = data.iloc[:, :4].shape[1]
    window_starts = range(0, len(data.columns) - 4 - window_size + 1, stride)
    n_windows = len(window_starts)

    input_data = np.empty((num_rows * n_windows, train_size, n_meta + 3))
    target_data = np.empty((num_rows * n_windows, predict_size))

    for i in tqdm(range(num_rows)): # for every row (item)
        encode_info = np.array(data.iloc[i, :4]) # category / brand codes
        sales_data = np.array(data.iloc[i, 4:]) # the series itself
        # Series of the brand whose code matches this row's brand code (first column dropped)
        temp_brand = brand[brand['브랜드']==encode_info[-1]].values[0][1:]
        temp_cost = np.array(cost.iloc[i,:])

        assert len(temp_brand)==len(sales_data)

        # The repeated metadata block is identical for every window of this row.
        meta_block = np.tile(encode_info, (train_size, 1))
        base = i * n_windows

        for idx, j in enumerate(window_starts):
            input_end = j + train_size
            # columns: 4 metadata values, brand value, cost value, series value
            input_data[base + idx] = np.column_stack((meta_block,
                                                      temp_brand[j:input_end],
                                                      temp_cost[j:input_end],
                                                      sales_data[j:input_end]))
            target_data[base + idx] = sales_data[input_end : j + window_size]

    return input_data, target_data

In [19]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
    '''
    Build the model input used to predict the evaluation (test) period.

    For every row the last `train_size` values of the series, of the matching
    `brand` row and of the matching `cost` row are stacked next to the row's
    4 metadata columns.

    data : DataFrame whose first 4 columns are the encoded category/brand columns
    train_size : number of most recent steps used as input (= training window)

    Returns an array of shape (num_rows, train_size, 4 + 3).
    '''
    num_rows = len(data)
    n_features = len(data.iloc[0, :4]) + 3
    input_data = np.empty((num_rows, train_size, n_features))

    for row in tqdm(range(num_rows)):
        meta = np.array(data.iloc[row, :4])
        recent_sales = np.array(data.iloc[row, -train_size:])
        # brand series looked up through the row's brand code (last metadata value)
        brand_row = brand[brand['브랜드'] == meta[-1]].values[0]
        recent_brand = brand_row[-train_size:]
        recent_cost = np.array(cost.iloc[row, -train_size:])

        meta_block = np.tile(meta, (train_size, 1))
        input_data[row] = np.column_stack((meta_block,
                                           recent_brand[:train_size],
                                           recent_cost[:train_size],
                                           recent_sales[:train_size]))

    return input_data

In [None]:
# Build the training windows (every 2nd start position) and the inference input.
train_input, train_target = make_train_data(train_data,stride=2)
test_input = make_predict_data(train_data)

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [20]:
test_input = make_predict_data(train_data)

  0%|          | 0/15890 [00:00<?, ?it/s]

In [None]:
len(train_input)

2780750

In [None]:
# ids = []

# for idx,t in enumerate(train_input):
#   if np.isnan(t).any():
#     ids.append(idx)

# np.delete(train_input,ids,axis=0)

In [None]:
# # Train / Validation Split
# data_len = len(train_input)
# val_input = train_input[-int(data_len*0.2):]
# val_target = train_target[-int(data_len*0.2):]
# train_input = train_input[:-int(data_len*0.2)]
# train_target = train_target[:-int(data_len*0.2)]

In [None]:
# Cache the generated windows on disk. The file names must match the ones read
# back by the np.load cell ('./train_inputs.npy' and './train_targets.npy');
# the targets were previously written to './train_target.npy', which that cell
# never reads.
np.save('./train_inputs.npy',train_input)
np.save('./train_targets.npy',train_target)

In [None]:
import pickle

# Persist the per-row min / max of `train_data` so the predictions can be
# rescaled later without recomputing the scaling.
with open('./scale_min_dict.pkl','wb') as f:
  pickle.dump(scale_min_dict,f)

with open('./scale_max_dict.pkl','wb') as f:
  pickle.dump(scale_max_dict,f)

In [21]:
# Load the cached training inputs and targets from the working directory.
train_input = np.load('./train_inputs.npy')
train_target = np.load('./train_targets.npy')

In [22]:
# Load the cached validation inputs and targets.
# NOTE(review): no visible cell writes './val_inputs.npy' or './val_targets.npy';
# they must already exist on disk — confirm how they were produced.
val_input = np.load('./val_inputs.npy')
val_target = np.load('./val_targets.npy')

In [9]:
import pickle

# Restore the per-row min / max dictionaries used to undo the scaling.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('./scale_min_dict.pkl','rb') as f:
  scale_min_dict = pickle.load(f)

with open('./scale_max_dict.pkl','rb') as f:
  scale_max_dict = pickle.load(f)

In [None]:
# Train / Validation Split
# The samples are cut into consecutive chunks of `per_item` windows; within each
# chunk the first 80% go to training and the rest to validation.
# Assumes the windows are stored chunk by chunk per item, as make_train_data does.

#per_item = int(len(train_input)/len(train_data))
# NOTE(review): 15890 is hard-coded — it matches the row count shown by the
# progress bars above, so presumably len(train_data); confirm.
per_item = int(len(train_input)/15890)
t_len = int(per_item * 0.8)
v_len = per_item - t_len

# positions of the training / validation samples
t_list = []
v_list = []

for i in range(0,len(train_input),per_item):
    t_list.extend(list(range(i,i+t_len)))
    v_list.extend(list(range(i+t_len,i+per_item)))

# Validation arrays are taken first because train_input / train_target are
# overwritten by their training subsets right after.
val_input = train_input[v_list]
val_target = train_target[v_list]
train_input = train_input[t_list]
train_target = train_target[t_list]
# train_input = np.delete(train_input,v_list)
# train_target = np.delete(train_target,v_list)

In [23]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((2224600, 90, 7),
 (2224600, 21),
 (556150, 90, 7),
 (556150, 21),
 (15890, 90, 7))

### Custom Dataset

In [24]:
class CustomDataset(Dataset):
    """Dataset over an indexable collection of inputs and optional targets.

    X : indexable collection of input samples
    Y : indexable collection of targets, or None for inference data

    Indexing returns `(input, target)` tensors when targets are present and
    only the input tensor otherwise. Samples are converted with `torch.Tensor`.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        features = torch.Tensor(self.X[index])
        if self.Y is None:
            # inference mode: there is no target to return
            return features
        return features, torch.Tensor(self.Y[index])

In [33]:
# Training batches are reshuffled every epoch; batch size comes from CFG.
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

# Validation batches keep their original order.
val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

### 모델 선언

In [34]:
class BaseModel(nn.Module):
    """LSTM encoder followed by a two-layer fully connected head.

    input_size : number of features per time step (default 7)
    hidden_size : LSTM hidden width; the head narrows it to hidden_size // 2
    num_layers : number of stacked LSTM layers
    output_size : number of predicted values (default CFG['PREDICT_SIZE'])
    """

    def __init__(self, input_size=7, hidden_size=512, num_layers=1, output_size=CFG['PREDICT_SIZE']):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)

        half_width = hidden_size // 2
        head_layers = [
            nn.Linear(hidden_size, half_width),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(half_width, output_size),
        ]
        self.fc = nn.Sequential(*head_layers)

        # final activation: clamps the head output to non-negative values
        self.actv = nn.ReLU()

    def init_hidden(self, batch_size, device):
        """Return zero-filled (hidden state, cell state) tensors for the LSTM."""
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(state_shape, device=device)
        c0 = torch.zeros(state_shape, device=device)
        return (h0, c0)

    def forward(self, x):
        """Map a batch x of shape (B, seq_len, input_size) to (B, output_size)."""
        initial_state = self.init_hidden(x.size(0), x.device)

        # run the whole sequence through the LSTM (batch_first layout)
        lstm_out, _ = self.lstm(x, initial_state)

        # only the output of the last time step feeds the head
        final_step = lstm_out[:, -1, :]
        prediction = self.actv(self.fc(final_step))

        # drops dimension 1 only when it has size 1 (i.e. output_size == 1)
        return prediction.squeeze(1)

### 모델 학습

In [35]:
def train(model, optimizer, train_loader, val_loader, device):
    """Train `model` for CFG['EPOCHS'] epochs and return the best snapshot.

    After every epoch the model is evaluated with `validation`; whenever the
    validation loss improves, an independent deep copy of the model is kept.

    Args:
        model: network to train; moved to `device` and updated in place.
        optimizer: optimizer built over `model.parameters()`.
        train_loader: iterable of (X, Y) training batches.
        val_loader: iterable of (X, Y) validation batches.
        device: device the model and the batches are moved to.

    Returns:
        A deep copy of the model taken at the epoch with the lowest validation
        loss, or None if no epoch improved on the initial best loss (e.g. zero
        epochs or NaN losses).
    """
    import copy  # local import so this cell does not depend on another cell's imports

    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = float('inf')
    best_model = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')

        if best_loss > val_loss:
            best_loss = val_loss
            # Snapshot the weights: keeping a plain reference to `model` would
            # be overwritten by later epochs, so the "best" model would always
            # be the last one.
            best_model = copy.deepcopy(model)
            print('Model Saved')
    return best_model

In [36]:
def validation(model, val_loader, criterion, device):
    """Return the mean per-batch loss of `model` over `val_loader`.

    Switches the model to eval mode and runs without gradient tracking.
    Every batch contributes equally to the mean, whatever its size.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for features, targets in tqdm(iter(val_loader)):
            features = features.to(device)
            targets = targets.to(device)
            batch_loss = criterion(model(features), targets)
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

## Run !!

In [17]:
# Experiment: 1-layer LSTM trained with Adam at CFG["LEARNING_RATE"].
# `infer_model` holds whatever train() returns.
model = BaseModel(num_layers=1)
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
infer_model = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.03278] Val Loss : [0.02328]
Model Saved


  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.03060] Val Loss : [0.02347]


  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.02999] Val Loss : [0.02293]
Model Saved


  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [4] Train Loss : [0.02950] Val Loss : [0.02304]


  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [5] Train Loss : [0.02851] Val Loss : [0.02044]
Model Saved


  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [6] Train Loss : [0.02880] Val Loss : [0.02100]


  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [7] Train Loss : [0.02711] Val Loss : [0.01961]
Model Saved


  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [8] Train Loss : [0.02658] Val Loss : [0.01877]
Model Saved


  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [9] Train Loss : [0.02169] Val Loss : [0.01589]
Model Saved


  0%|          | 0/544 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Epoch : [10] Train Loss : [0.01993] Val Loss : [0.01645]


In [37]:
# Experiment: 3-layer LSTM trained with Adam at CFG["LEARNING_RATE"].
# Rebinds `model`, `optimizer` and `infer_model`; the cells below use this `infer_model`.
model = BaseModel(num_layers=3)
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
infer_model = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.03265] Val Loss : [0.02316]
Model Saved


  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.03153] Val Loss : [0.02310]
Model Saved


  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.03114] Val Loss : [0.02399]


  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [4] Train Loss : [0.03074] Val Loss : [0.02369]


  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [5] Train Loss : [0.03012] Val Loss : [0.02387]


  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [6] Train Loss : [0.03007] Val Loss : [0.02282]
Model Saved


  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [7] Train Loss : [0.02897] Val Loss : [0.02197]
Model Saved


  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [8] Train Loss : [0.02066] Val Loss : [0.01638]
Model Saved


  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [9] Train Loss : [0.01848] Val Loss : [0.01545]
Model Saved


  0%|          | 0/1087 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch : [10] Train Loss : [0.01812] Val Loss : [0.01603]


## 모델 추론

In [38]:
# Inference data has no targets (Y=None), so the loader yields input tensors only;
# shuffle=False keeps the predictions in the row order of `test_input`.
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [39]:
def inference(model, test_loader, device):
    """Run `model` over `test_loader` and collect the predictions.

    Args:
        model: trained network; switched to eval mode by this function.
        test_loader: iterable yielding input batches only (no targets).
        device: device each batch is moved to; the model is expected to be
            on the same device already.

    Returns:
        NumPy array with one prediction row per input sample, in loader order.
    """
    # Dropout must be disabled for deterministic predictions; do not rely on
    # the caller having left the model in eval mode.
    model.eval()
    predictions = []

    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)

            output = model(X)

            # move the model output to the CPU and convert it to a numpy array
            output = output.cpu().numpy()

            predictions.extend(output)

    return np.array(predictions)

In [40]:
pred = inference(infer_model, test_loader, device)

  0%|          | 0/8 [00:00<?, ?it/s]

In [41]:
# 추론 결과를 inverse scaling
# Undo the per-row min-max scaling: pred * (max - min) + min, using the min/max
# stored under the same row position. `pred` is modified in place, so running
# this cell a second time would rescale the already rescaled values.
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]

# Post-processing of the result
#pred = np.round(pred, 0).astype(int)

In [45]:
# Round the predictions to 0 decimals and cast them to integers.
pred = np.round(pred, 0).astype(int)

## Submission

In [46]:
submit = pd.read_csv('./Part2/sample_submission.csv')
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Overwrite every column after the first with the predictions; `pred` must have
# one row per submission row and one column per remaining submission column.
submit.iloc[:,1:] = pred
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1
3,3,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,1,1,1,1
4,4,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,1,1,1,1


In [48]:
submit.to_csv('./baseline_submit.csv', index=False)