# 2020_DACON_CUP

## 1. Import Library

In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
# Fix the PyTorch RNG seed so that random initialisation is repeatable.
torch.manual_seed(1015)
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## 2. Load Data

In [None]:
# Load the training CSV (EUC-KR encoded) and collapse it to one row per date.
train = pd.read_csv("/content/drive/MyDrive/ML&DL_Project/Dacon/2020_DACON_CUP/open_data/train.csv", encoding = 'euc-kr')
train['DateTime'] = pd.to_datetime(train.DateTime)
train['date'] = train.DateTime.dt.date
# numeric_only=True: the datetime64 `DateTime` column cannot be summed, and
# recent pandas raises a TypeError instead of silently dropping it.
train = train.groupby('date').sum(numeric_only=True).reset_index()
train

In [None]:
# Print the column dtypes and non-null counts of the per-date frame.
train.info()

## 3. Data Processing

### 1) 월별 데이터 시각화

In [None]:
# Average the per-date totals within each calendar month for plotting.
df = train.copy()
# `date` holds plain `datetime.date` objects (object dtype), which the `.dt`
# accessor rejects, so convert to datetime64 before taking the monthly period.
df['YearMonth'] = pd.to_datetime(df['date']).dt.to_period('M')
# numeric_only=True keeps the non-numeric `date` column out of the mean.
year_month = df.groupby('YearMonth').mean(numeric_only=True).reset_index()

In [None]:
# One panel per metric, stacked vertically, each plotted against 'yymm' labels.
month_labels = year_month['YearMonth'].dt.strftime('%y%m')
panels = [('사용자', 'r'), ('세션', 'g'), ('신규방문자', 'b'), ('페이지뷰', 'c')]

fig, axes = plt.subplots(4, 1, figsize=(14, 20))
for ax, (metric, colour) in zip(axes, panels):
    ax.plot(month_labels, year_month[metric], colour)
    ax.set_title(metric)
plt.show()

2020년 11월은 8일까지의 데이터만 가지고 있어서, 월별 추세는 2020년 10월까지의 데이터만 참고한다.

### 2) Data Scaling

In [None]:
# Min-max scale every metric column to [0, 1]. `mini` and `size` are kept as
# module-level names because the prediction step uses them to undo the scaling.
feature_cols = train.columns[1:]  # every column after the leading `date` column
mini = train[feature_cols].min()
size = train[feature_cols].max() - mini
# A constant column has zero range; divide by 1 instead so it scales to 0
# rather than NaN.
size = size.replace(0, 1)
# Replace the columns outright rather than writing floats into integer columns
# through `.iloc`, which newer pandas versions deprecate.
train[feature_cols] = (train[feature_cols] - mini) / size

input_window = 30   # rows (dates) fed to the model
output_window = 7   # rows (dates) predicted per step

# Extract the values once; slicing a NumPy array is much cheaper than calling
# `.iloc[...].values` on every iteration.
values = train[feature_cols].to_numpy(dtype=float)
n_features = values.shape[1]
# A window starting at `start` consumes rows [start, start + input + output),
# so the last valid start is n - (input + output); hence the "+ 1" in the count.
n_windows = max(values.shape[0] - (input_window + output_window) + 1, 0)

window_x = np.zeros((n_windows, input_window, n_features))
window_y = np.zeros((n_windows, output_window, n_features))

for start in range(n_windows):
    end = start + input_window
    window_x[start] = values[start:end]
    window_y[start] = values[end:end + output_window]

In [None]:
# Display the frame again to inspect the scaled values.
train

## 4. Model

### 1) LSTM

In [None]:
class LSTM(nn.Module):
    """LSTM forecaster that maps an input sequence to its last few time steps.

    The input goes through one LSTM layer, then three passes through a second
    LSTM layer (the same module each time, so the passes share weights), and a
    linear head is applied to the trailing ``output_window`` time steps.

    Args:
        input_size: Number of features per time step of the input.
        hidden_size: Hidden-state width of both LSTM layers.
        output_window: Number of trailing time steps returned per sequence.
            Defaults to 7.
        output_size: Number of values produced per returned time step.
            Defaults to 4.

    Raises:
        ValueError: If ``output_window`` or ``output_size`` is smaller than 1.
    """

    def __init__(self, input_size, hidden_size, output_window=7, output_size=4):
        super().__init__()
        if output_window < 1 or output_size < 1:
            raise ValueError("output_window and output_size must be >= 1")
        self.hidden_size = hidden_size
        self.output_window = output_window
        self.output_size = output_size
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            batch_first=True)
        # Reused three times in forward(); see the class docstring.
        self.hidden_lstm = nn.LSTM(input_size=hidden_size,
                                   hidden_size=hidden_size,
                                   batch_first=True)

        self.time_fc = nn.Linear(hidden_size, output_size)

    def forward(self, x_time):
        """Return the forecast for a batch of sequences.

        Args:
            x_time: Tensor laid out as (batch, time, input_size), as required
                by the ``batch_first=True`` LSTM layers.

        Returns:
            Tensor viewed as (-1, output_window, output_size).
        """
        out_time, _state = self.lstm(x_time)
        for _pass in range(3):
            out_time, _state = self.hidden_lstm(out_time)

        # Only the trailing `output_window` steps feed the linear head.
        out_time = self.time_fc(out_time[:, -self.output_window:, :])

        return out_time.view(-1, self.output_window, self.output_size)
    
# Build the network for 4 input features with a 30-wide hidden state and move
# its parameters to the selected device.
model = LSTM(input_size = 4, hidden_size = 30).to(device)

In [None]:
# Move the training windows to the model's device as float32 tensors.
# `as_tensor` accepts both the NumPy arrays and, if this cell is re-run, the
# tensors produced by a previous run.
window_x = torch.as_tensor(window_x).float().to(device)
window_y = torch.as_tensor(window_y).float().to(device)

# Train model (full batch: every epoch runs all windows in a single step)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)
# `size_average` is deprecated; reduction='mean' is the equivalent setting.
criterion = nn.MSELoss(reduction = 'mean')
num_epochs  = 500
train_error = []
model.train()
for t in range(num_epochs):
    train_pred = model(window_x)
    loss = criterion(train_pred, window_y)
    # Store a plain float: appending the tensor itself would keep every
    # epoch's autograd graph alive for the whole run.
    train_error.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if t % 10 == 0 and t != 0:
        print(f"{t} Epochs train MSE: {loss.item():1.5f}")

### 2) Keras

## 5. Prediction

In [None]:
# Read the submission template (EUC-KR encoded) that the forecasts are written into.
submission = pd.read_csv(
    "/content/drive/MyDrive/ML&DL_Project/Dacon/2020_DACON_CUP/open_data/submission.csv",
    encoding='euc-kr',
)
submission

In [None]:
# 1464 consecutive hourly periods starting at 2020-11-09 00:00.
pr_h = pd.period_range(start='2020-11-09', periods=1464, freq='H')
# Single-column frame holding those periods under a default integer index.
pr_df = pd.DataFrame({'DateTime': pr_h})
pr_df

In [None]:
# Add the four metric columns, all initialised to zero.
for metric in ('사용자', '세션', '신규방문자', '페이지뷰'):
    pr_df[metric] = 0
pr_df

In [None]:
# Work on a copy so later edits do not modify `pr_df`.
submission_df = pr_df.copy()

In [None]:
# Roll the model forward one `output_window` at a time, seeded with the most
# recent `input_window` rows of the scaled training frame.
last_month = train.iloc[-input_window:, 1:].to_numpy(dtype=float)[np.newaxis, ...]
last_month = torch.tensor(last_month).float().to(device)

n_rows = len(submission)
# Inference only: without no_grad every step's autograd graph would stay
# attached to `last_month` and grow with each iteration.
with torch.no_grad():
    for start in range(0, n_rows, output_window):
        next_week = model(last_month)
        # Slide the input window: drop the oldest rows, append the forecast.
        last_month = torch.cat([last_month[:, output_window:, :], next_week], dim=1)

        pred_week = next_week.cpu().detach().numpy().reshape(output_window, -1)
        # Undo the min-max scaling, then truncate to whole numbers.
        pred_week = pred_week * size.values + mini.values
        pred_week = pred_week.astype(int)

        # The final chunk may be shorter than a full window. The forecast rows
        # are ordered from `start` onward, so keep the FIRST `n_fill` of them.
        n_fill = min(output_window, n_rows - start)
        submission.iloc[start:start + n_fill, 1:] = pred_week[:n_fill]
submission

In [None]:
# Render the `DateTime` column as strings.
submission_df = submission_df.assign(DateTime=submission_df['DateTime'].astype(str))
submission_df

In [None]:
# Parse the timestamps and collapse the frame to one row per date.
submission_df['DateTime'] = pd.to_datetime(submission_df.DateTime)
submission_df['date'] = submission_df.DateTime.dt.date
# numeric_only=True: the datetime64 `DateTime` column cannot be summed, and
# recent pandas raises a TypeError instead of silently dropping it.
submission_df = submission_df.groupby('date').sum(numeric_only=True).reset_index()
submission_df

In [None]:
# Write the filled-in submission back to Drive, without the index, as EUC-KR.
submission.to_csv(
    '/content/drive/MyDrive/ML&DL_Project/Dacon/2020_DACON_CUP/[MH]submission.csv',
    index=False,
    encoding='euc-kr',
)