## 套件安裝

In [None]:
# Mount Google Drive inside the Colab runtime so the data files stored there are reachable.
import os
from google.colab import drive
drive.mount('/content/drive')

# Later cells open their files by bare/relative name, so make the data folder the working directory.
os.chdir('/content/drive/MyDrive/Colab Notebooks/DA/data') # switch to the data directory
os.listdir() # list the directory contents as a sanity check

Mounted at /content/drive


['light_test_source_labels.csv',
 'light_train_target_labels.csv',
 'light_train_source_labels.csv',
 'sample.csv',
 'test_source_events.csv',
 'train_source_events.csv',
 'train_target_events.csv',
 'model_weights',
 '.ipynb_checkpoints',
 'test_origin_fourfourplatform.npy',
 'test_origin_fourfourelse.npy',
 'test_origin_allfeature.npy',
 'train_origin_fourfourplatform.npy',
 'train_origin_fourfourelse.npy',
 'train_origin_allfeature.npy',
 'duration_data_scaled.npy',
 'train_duration_target.npy',
 'test_duration_data_scaled.npy',
 'test_5.npy',
 'train_5.npy',
 'xgb_test.npy',
 'xgb_train.npy',
 'submit.csv',
 't1_test.npy',
 't2_test.npy',
 't2_train.npy',
 't1_train.npy']

In [None]:
pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
# always needed
import math, os, random, csv
import pandas as pd
import numpy as np

# log and save
import json, logging, pickle, sys, shutil, copy
# torch
import torch
import torch.nn
from torch.nn import Conv2d, MaxPool2d, Flatten, Linear, ReLU
import torchvision
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,ConcatDataset
from torchvision import datasets, models, transforms
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
from torchinfo import summary

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter
%matplotlib inline
import seaborn as sns

# others
import matplotlib.pyplot as plt
from PIL import Image

# sklearn
from sklearn import preprocessing

# statistics
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import KFold
import statistics

# seeds
def same_seeds(seed):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU always; CUDA devices only when CUDA is available).

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    # Use deterministic cuDNN kernels and switch off the cuDNN auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [None]:
# Ask CUDA to launch kernels synchronously (a debugging aid).
# NOTE(review): this slows GPU work and is set after torch was imported — confirm it is still wanted.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

In [None]:
def show_loss(train_loss, valid_loss):
    """Draw the training and validation loss curves in two side-by-side panels.

    Args:
        train_loss: Sequence of training-loss values; plotted in blue on the left.
        valid_loss: Sequence of validation-loss values; plotted in red on the right.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # (axis, values, line colour, panel title) for each of the two panels.
    panels = (
        (axes[0], train_loss, 'blue', 'Training Loss'),
        (axes[1], valid_loss, 'red', 'Validation Loss'),
    )
    for ax, values, colour, title in panels:
        ax.plot(values, color=colour, linewidth=2)
        ax.set_title(title, fontsize=16)
        ax.set_xlabel('iteration', fontsize=12)
        ax.set_ylabel('Loss', fontsize=12)
        ax.grid(True)
        # Hide the top and right borders of the panel.
        for side in ('top', 'right'):
            ax.spines[side].set_visible(False)

    # Adjust the horizontal spacing between the two panels.
    plt.subplots_adjust(wspace=0.3)
    plt.show()

In [None]:
# Central hyper-parameters and paths read by the cells below.
config = dict(
    seed=507,                                   # random seed
    valid_ratio=0.15,                           # fraction of the training data held out for validation
    test_ratio=0.1,
    n_epochs=25,                                # maximum number of training epochs
    batch_size=32,                              # mini-batch size for every DataLoader
    learning_rate=3e-4,                         # optimizer learning rate
    early_stop=10,                              # epochs without improvement before training stops
    save_path='./model_weights/Combined.ckpt',  # where the best checkpoint is written
    input_size=4,                               # features per time step of the first sequence input
)

## 資料長相（train 有 source 與 target／test 只有 source）

In [None]:
#('./preprocessed_output/train_duration_target.npy')
#('./preprocessed_output/duration_data_scaled.npy')
#('./preprocessed_output/test_duration_data_scaled.npy')

In [None]:
# duration_mtx = np.load('./preprocessed_output/train_duration_target.npy')
# duration_mtx[3]

## 加載資料

In [None]:
class TrainDataset(Dataset):
    """Training dataset that serves six aligned float32 tensors per sample.

    Every array is read from a ``.npy`` file with ``np.load``. The two sequence
    arrays are reshaped to ``(n_samples, -1, features)`` with
    ``config['input_size']`` features per step for ``file_name_X`` and 5
    features per step for ``file_ls_X``.

    Args:
        file_name_X: Path of the first sequence-feature array (stored as ``x``).
        file_name_Y: Path of the target array (stored as ``y``).
        file_ls_X: Path of the second sequence-feature array (stored as ``lsx``).
        trans_1: Path of the array stored as ``t1``.
        trans_2: Path of the array stored as ``t2``.
        xgb: Path of the array stored as ``xgb``
            (presumably XGBoost predictions — TODO confirm).
    """

    @staticmethod
    def _to_float32(array):
        # Wrap a NumPy array as a torch tensor and cast it to float32.
        return torch.from_numpy(array).to(torch.float32)

    def __init__(self, file_name_X, file_name_Y, file_ls_X, trans_1, trans_2, xgb):
        step_width = config['input_size']

        # Sequence inputs: split the flat feature axis into (steps, features per step).
        main_seq = np.load(file_name_X)
        main_seq = main_seq.reshape(main_seq.shape[0], -1, step_width)
        ls_seq = np.load(file_ls_X)
        ls_seq = ls_seq.reshape(ls_seq.shape[0], -1, 5)

        self.x = self._to_float32(main_seq)
        self.y = self._to_float32(np.load(file_name_Y))
        self.lsx = self._to_float32(ls_seq)
        self.xgb = self._to_float32(np.load(xgb))
        self.t1 = self._to_float32(np.load(trans_1))
        self.t2 = self._to_float32(np.load(trans_2))

        self.n_samples = self.x.shape[0]
        self.input_size = step_width

    def __getitem__(self, index):
        """Return ``(x, y, lsx, t1, t2, xgb)`` for the sample at ``index``."""
        fields = (self.x, self.y, self.lsx, self.t1, self.t2, self.xgb)
        return tuple(field[index] for field in fields)

    def __len__(self):
        """Return the number of samples (first dimension of ``x``)."""
        return self.n_samples

class TestDataset(Dataset):
    """Inference dataset that serves five aligned float32 tensors per sample.

    Every array is read from a ``.npy`` file with ``np.load``. The two sequence
    arrays are reshaped to ``(n_samples, -1, features)`` with
    ``config['input_size']`` features per step for ``file_name_X`` and 5
    features per step for ``file_ls_X``. No target array is loaded.

    Args:
        file_name_X: Path of the first sequence-feature array (stored as ``x``).
        file_ls_X: Path of the second sequence-feature array (stored as ``lsx``).
        trans_1: Path of the array stored as ``t1``.
        trans_2: Path of the array stored as ``t2``.
        xgb: Path of the array stored as ``xgb``
            (presumably XGBoost predictions — TODO confirm).
    """

    @staticmethod
    def _to_float32(array):
        # Wrap a NumPy array as a torch tensor and cast it to float32.
        return torch.from_numpy(array).to(torch.float32)

    def __init__(self, file_name_X, file_ls_X, trans_1, trans_2, xgb):
        step_width = config['input_size']

        # Sequence inputs: split the flat feature axis into (steps, features per step).
        main_seq = np.load(file_name_X)
        main_seq = main_seq.reshape(main_seq.shape[0], -1, step_width)
        ls_seq = np.load(file_ls_X)
        ls_seq = ls_seq.reshape(ls_seq.shape[0], -1, 5)

        self.x = self._to_float32(main_seq)
        self.lsx = self._to_float32(ls_seq)
        self.xgb = self._to_float32(np.load(xgb))
        self.t1 = self._to_float32(np.load(trans_1))
        self.t2 = self._to_float32(np.load(trans_2))

        self.n_samples = self.x.shape[0]
        self.input_size = step_width

    def __getitem__(self, index):
        """Return ``(x, lsx, t1, t2, xgb)`` for the sample at ``index``."""
        fields = (self.x, self.lsx, self.t1, self.t2, self.xgb)
        return tuple(field[index] for field in fields)

    def __len__(self):
        """Return the number of samples (first dimension of ``x``)."""
        return self.n_samples

In [None]:
# Seed every RNG before anything random happens, so the train/validation split and the
# shuffling order are the same on every Restart & Run All.
same_seeds(config['seed'])

dataset_train = TrainDataset('duration_data_scaled.npy', 'train_duration_target.npy','train_5.npy','t1_train.npy','t2_train.npy','xgb_train.npy')
dataset_test = TestDataset('test_duration_data_scaled.npy', 'test_5.npy', 't1_test.npy','t2_test.npy', 'xgb_test.npy')

# NOTE(review): the original note says the batch size should be a divisor of the
# test-set size — confirm this still holds for the current data.
batch_size = config['batch_size']
valid_ratio = config['valid_ratio']
valid_set_size = int(valid_ratio * len(dataset_train)) # validation set size
train_set_size = len(dataset_train) - valid_set_size # training set size
dataset_train, dataset_valid = torch.utils.data.random_split(dataset_train, [train_set_size, valid_set_size])

train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, drop_last=True)
# Validation is not shuffled: combined with drop_last=True, shuffling would score every
# epoch on a different subset and make the per-epoch losses incomparable.
# NOTE(review): drop_last=True is kept because the model forward passes may assume full batches.
valid_loader = DataLoader(dataset_valid, batch_size=batch_size, shuffle=False, drop_last=True)
# Keep every test row, in file order, so predictions line up with the submission template.
test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, drop_last=False)

In [None]:
# Peek at a single training batch to confirm the tensor shapes that reach the model.
# The sequence inputs are (batch, steps, features per step); the last recorded run
# printed 259 steps with 4 and 5 features, and 28 values for every other tensor.
for data, labels, lsx, t1, t2, xgb in train_loader:
  print(data.shape, labels.shape, lsx.shape, t1.shape, t2.shape, xgb.shape)
  break

torch.Size([32, 259, 4]) torch.Size([32, 28]) torch.Size([32, 259, 5]) torch.Size([32, 28]) torch.Size([32, 28]) torch.Size([32, 28])


## 模型

In [None]:
class SimpleLSTM(nn.Module):
    """Two-layer LSTM followed by a two-layer dense head with sigmoid outputs.

    Takes sequences with 5 features per time step and returns 28 values in
    [0, 1] per sample, computed from the LSTM output at the last time step.
    """

    def __init__(self):
        super(SimpleLSTM, self).__init__()

        self.input_size = 5   # features per time step
        self.seq_len = 259    # sequence length of the training data (informational; not enforced)

        self.n_layers = 2       # number of stacked LSTM layers
        self.hidden_dim = 256   # size of the LSTM hidden state
        self.output_size = 28   # number of values predicted per sample

        self.lstm = nn.LSTM(input_size=self.input_size,
                            hidden_size=self.hidden_dim,
                            num_layers=self.n_layers,
                            dropout=0.25,        # dropout between the stacked layers
                            batch_first=True     # inputs are (batch, seq, feature)
                            )
        self.fc_1 = nn.Linear(self.hidden_dim, 128)
        self.fc_2 = nn.Linear(128, self.output_size)
        self.sigmoid = nn.Sigmoid()  # squashes the head's outputs into [0, 1]
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the network on a batch of sequences.

        Args:
            x: Float tensor of shape ``(batch, seq_len, 5)``. The batch size may
                differ from call to call (e.g. a smaller final batch).

        Returns:
            Tensor of shape ``(batch, 28)`` with values in [0, 1].
        """
        # Size the initial states from the input itself rather than from notebook
        # globals, so partial batches and any device/dtype work.
        n_batch = x.size(0)
        h_0 = x.new_zeros(self.n_layers, n_batch, self.hidden_dim)
        c_0 = x.new_zeros(self.n_layers, n_batch, self.hidden_dim)

        out, _ = self.lstm(x, (h_0, c_0))  # (batch, seq_len, hidden_dim)

        out = out[:, -1, :]                # keep only the last time step: (batch, hidden_dim)
        out = self.relu(self.fc_1(out))
        out = self.sigmoid(self.fc_2(out))
        return out

class GRU(nn.Module):
    """Two-layer GRU followed by a two-layer dense head with sigmoid outputs.

    Returns 28 values in [0, 1] per sample, computed from the GRU output at the
    last time step.

    Args:
        input_size: Number of features per time step. When omitted (the default)
            it is read from ``config['input_size']``.
    """

    def __init__(self, input_size=None):
        super(GRU, self).__init__()

        # Fall back to the notebook-wide config only when no explicit width is given.
        self.input_size = config['input_size'] if input_size is None else input_size
        self.seq_len = 259    # sequence length of the training data (informational; not enforced)

        self.n_layers = 2       # number of stacked GRU layers
        self.hidden_dim = 256   # size of the GRU hidden state
        self.output_size = 28   # number of values predicted per sample

        self.gru = nn.GRU(input_size=self.input_size,
                          hidden_size=self.hidden_dim,
                          num_layers=self.n_layers,
                          dropout=0.25,        # dropout between the stacked layers
                          batch_first=True     # inputs are (batch, seq, feature)
                          )

        self.fc_1 = nn.Linear(self.hidden_dim, 128)
        self.fc_2 = nn.Linear(128, self.output_size)
        self.sigmoid = nn.Sigmoid()  # squashes the head's outputs into [0, 1]
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the network on a batch of sequences.

        Args:
            x: Float tensor of shape ``(batch, seq_len, input_size)``. The batch
                size may differ from call to call (e.g. a smaller final batch).

        Returns:
            Tensor of shape ``(batch, 28)`` with values in [0, 1].
        """
        # Size the initial state from the input itself rather than from notebook
        # globals, so partial batches and any device/dtype work.
        h_0 = x.new_zeros(self.n_layers, x.size(0), self.hidden_dim)

        out, _ = self.gru(x, h_0)  # (batch, seq_len, hidden_dim)

        out = out[:, -1, :]        # keep only the last time step: (batch, hidden_dim)
        out = self.relu(self.fc_1(out))
        out = self.sigmoid(self.fc_2(out))
        return out

In [None]:
class Combined_model(nn.Module):
    """Stacking head that fuses four recurrent sub-models with three extra inputs.

    The outputs of the two LSTM and two GRU sub-models are concatenated with
    ``t1``, ``t2`` and ``xgb`` along the feature axis, passed through a single
    linear layer (196 -> 28) and squashed with a sigmoid.

    Args:
        LSTM_1, LSTM_2: Sub-models applied to ``x2``.
        GRU_1, GRU_2: Sub-models applied to ``x1``.
    """

    def __init__(self, LSTM_1, LSTM_2, GRU_1, GRU_2):
        super().__init__()
        self.LSTM_1, self.LSTM_2 = LSTM_1, LSTM_2
        self.GRU_1, self.GRU_2 = GRU_1, GRU_2
        # 196 input features = 7 concatenated blocks of 28 values each.
        self.fc = nn.Linear(196, 28)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x1, x2, t1, t2, xgb):
        """Return the fused prediction, shape ``(batch, 28)``.

        Args:
            x1: Input handed to both GRU sub-models.
            x2: Input handed to both LSTM sub-models.
            t1, t2, xgb: Extra per-sample feature blocks appended after the
                sub-model outputs.
        """
        lstm_parts = [branch(x2) for branch in (self.LSTM_1, self.LSTM_2)]
        gru_parts = [branch(x1) for branch in (self.GRU_1, self.GRU_2)]
        fused = torch.cat([*lstm_parts, *gru_parts, t1, t2, xgb], dim=1)
        return self.sigmoid(self.fc(fused))

## 訓練過程

In [None]:
def train(train_loader, valid_loader, model, model_name, epochs_num):
  """Fit ``model`` with BCE loss, checkpointing whenever validation loss improves.

  Every batch from either loader must unpack to
  ``(datas, labels, lsx, t1, t2, xgb)``; the model is called as
  ``model(datas, lsx, t1, t2, xgb)``. The best weights are written to
  ``config['save_path']``, training stops after ``config['early_stop']``
  consecutive epochs without improvement, and the per-batch loss curves are
  plotted with ``show_loss`` when training ends (including an early stop).

  Args:
    train_loader: Iterable of training batches.
    valid_loader: Iterable of validation batches.
    model: Module to optimise; expected to already live on ``device``.
    model_name: Label shown in the progress bar.
    epochs_num: Maximum number of epochs to run.

  Returns:
    None.

  Raises:
    ValueError: If either loader yields no batches in an epoch.
  """
  criterion = nn.BCELoss()
  optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], eps=1e-08, weight_decay=0.00001)
  loss_train = []  # per-batch training losses over the whole run (for plotting)
  loss_valid = []  # per-batch validation losses over the whole run (for plotting)
  best_loss, early_stop_count = math.inf, 0

  for epoch in range(epochs_num):
    # ------------------------- training ------------------------- #
    model.train()
    epoch_train = []  # reset every epoch so the reported mean is this epoch's, not a running average
    train_pbar = tqdm(train_loader, position=0, leave=True,
                      desc=f"{model_name} [{epoch+1}/{epochs_num}]")

    for batch in train_pbar:
      datas, labels, lsx, t1, t2, xgb = [part.to(device) for part in batch]
      optimizer.zero_grad()
      outputs = model(datas, lsx, t1, t2, xgb)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      epoch_train.append(loss.item())

    if not epoch_train:
      raise ValueError('train_loader produced no batches')
    loss_train.extend(epoch_train)
    mean_loss_train = sum(epoch_train)/len(epoch_train)

    # ------------------------ validation ------------------------ #
    model.eval()
    epoch_valid = []
    with torch.no_grad():  # no gradients are needed while evaluating
      for batch in valid_loader:
        datas, labels, lsx, t1, t2, xgb = [part.to(device) for part in batch]
        outputs = model(datas, lsx, t1, t2, xgb)
        epoch_valid.append(criterion(outputs, labels).item())

    if not epoch_valid:
      raise ValueError('valid_loader produced no batches')
    loss_valid.extend(epoch_valid)
    mean_loss_valid = sum(epoch_valid)/len(epoch_valid)
    print(f"Epoch [{epoch+1}/{epochs_num}]: Train loss: {mean_loss_train:.4f}, Valid loss: {mean_loss_valid:.4f}")

    # ---------------- checkpointing / early stopping ---------------- #
    if mean_loss_valid < best_loss:
      best_loss = mean_loss_valid
      torch.save(model.state_dict(), config['save_path'])
      print(f'Saving model with loss {best_loss:.4f}')
      early_stop_count = 0
    else:
      early_stop_count += 1
    if early_stop_count >= config['early_stop']:
      print('\n Model is not improving, Stop training session.')
      break  # leave the loop (not the function) so the curves below are still drawn

  show_loss(loss_train, loss_valid)

## 真正訓練


In [None]:
num_epoch = config['n_epochs']

# Build the four recurrent sub-models on the active device.
model_ls_1 = SimpleLSTM().to(device)
model_ls_2 = SimpleLSTM().to(device)
model_gru_1 = GRU().to(device)
model_gru_2 = GRU().to(device)

# Warm-start each sub-model from its own checkpoint. map_location remaps the stored
# tensors onto the active device, so GPU-saved checkpoints also load on a CPU runtime.
model_ls_1.load_state_dict(torch.load("./model_weights/LSTM_2.ckpt", map_location=device))
model_ls_2.load_state_dict(torch.load("./model_weights/LSTM_4.ckpt", map_location=device))
model_gru_1.load_state_dict(torch.load("./model_weights/GRU_2.ckpt", map_location=device))
model_gru_2.load_state_dict(torch.load("./model_weights/GRU_4.ckpt", map_location=device))

# Train the fusion model; the sub-models are part of it and are updated as well.
model = Combined_model(model_ls_1, model_ls_2, model_gru_1, model_gru_2).to(device)
train(train_loader, valid_loader, model, "Combined_model", epochs_num = num_epoch)

  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [1/25]: Train loss: 0.3225, Valid loss: 0.2890
Saving model with loss 0.2890


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [2/25]: Train loss: 0.3056, Valid loss: 0.2854
Saving model with loss 0.2854


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [3/25]: Train loss: 0.2979, Valid loss: 0.2827
Saving model with loss 0.2827


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [4/25]: Train loss: 0.2933, Valid loss: 0.2814
Saving model with loss 0.2814


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [5/25]: Train loss: 0.2902, Valid loss: 0.2798
Saving model with loss 0.2798


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [6/25]: Train loss: 0.2876, Valid loss: 0.2784
Saving model with loss 0.2784


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [7/25]: Train loss: 0.2854, Valid loss: 0.2771
Saving model with loss 0.2771


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [8/25]: Train loss: 0.2834, Valid loss: 0.2759
Saving model with loss 0.2759


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [9/25]: Train loss: 0.2816, Valid loss: 0.2748
Saving model with loss 0.2748


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [10/25]: Train loss: 0.2799, Valid loss: 0.2738
Saving model with loss 0.2738


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [11/25]: Train loss: 0.2783, Valid loss: 0.2729
Saving model with loss 0.2729


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [12/25]: Train loss: 0.2768, Valid loss: 0.2720
Saving model with loss 0.2720


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [13/25]: Train loss: 0.2753, Valid loss: 0.2711
Saving model with loss 0.2711


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [14/25]: Train loss: 0.2739, Valid loss: 0.2703
Saving model with loss 0.2703


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [15/25]: Train loss: 0.2724, Valid loss: 0.2696
Saving model with loss 0.2696


  0%|          | 0/809 [00:00<?, ?it/s]

Epoch [16/25]: Train loss: 0.2710, Valid loss: 0.2690
Saving model with loss 0.2690


  0%|          | 0/809 [00:00<?, ?it/s]

## 產生輸出

In [None]:
# Rebuild the architecture, then restore the trained weights of the whole fusion model.
model_ls_1 = SimpleLSTM().to(device)
model_ls_2 = SimpleLSTM().to(device)
model_gru_1 = GRU().to(device)
model_gru_2 = GRU().to(device)
model = Combined_model(model_ls_1, model_ls_2, model_gru_1, model_gru_2).to(device)
# map_location remaps the stored tensors onto the active device (GPU checkpoint -> CPU runtime).
# NOTE(review): this path differs from config['save_path'] ('./model_weights/Combined.ckpt'),
# which is where the training cell writes — confirm which checkpoint is intended.
model.load_state_dict(torch.load('./model_weights/Combined_12.ckpt', map_location=device))
model.eval()

# Predict batch by batch without tracking gradients; keep results on the CPU.
output_list = []
with torch.no_grad():
  for data, lsx, t1, t2, xgb in test_loader:
    data, lsx, t1, t2, xgb = data.to(device), lsx.to(device), t1.to(device), t2.to(device), xgb.to(device)
    output_list.append(model(data, lsx, t1, t2, xgb).cpu())

In [None]:
# Stack the per-batch prediction tensors row-wise into one array, then wrap it in a DataFrame.
flatten_data = np.concatenate([batch_out.numpy() for batch_out in output_list])
showans = pd.DataFrame(flatten_data)

In [None]:
# Start from the submission template and overwrite its prediction columns.
sample = pd.read_csv("sample.csv")
answer = sample.copy()  # real copy, so the template frame stays untouched

n_slots = showans.shape[1]
# Fail loudly on a shape mismatch instead of letting index alignment fill in NaN.
if len(answer) != len(showans):
    raise ValueError(f"sample.csv has {len(answer)} rows but there are {len(showans)} predictions")
if answer.shape[1] < n_slots + 1:
    raise ValueError(f"sample.csv needs at least {n_slots + 1} columns, found {answer.shape[1]}")

# Column 0 is left as-is; columns 1..n_slots receive the predictions. Assigning whole
# columns by label (positional values, no alignment) avoids the per-column iloc warning.
slot_columns = answer.columns[1:n_slots + 1]
answer[slot_columns] = showans.to_numpy()
display(answer)

  answer.iloc[:,i] = showans.iloc[:, i-1]


Unnamed: 0,user_id,time_slot_0,time_slot_1,time_slot_2,time_slot_3,time_slot_4,time_slot_5,time_slot_6,time_slot_7,time_slot_8,...,time_slot_18,time_slot_19,time_slot_20,time_slot_21,time_slot_22,time_slot_23,time_slot_24,time_slot_25,time_slot_26,time_slot_27
0,30460,0.136288,0.085705,0.019214,0.427842,0.168958,0.066080,0.022516,0.616759,0.077352,...,0.022974,0.724521,0.072827,0.075245,0.025629,0.618372,0.050624,0.065789,0.016060,0.282963
1,30461,0.204280,0.546389,0.011290,0.010799,0.213028,0.407617,0.002016,0.002662,0.023321,...,0.002135,0.002285,0.024015,0.318412,0.002127,0.002621,0.023879,0.337252,0.005974,0.006505
2,30462,0.170092,0.227714,0.016523,0.049773,0.210666,0.263143,0.015040,0.073456,0.254715,...,0.025976,0.086442,0.258050,0.250440,0.018487,0.081047,0.226401,0.243056,0.025726,0.058556
3,30463,0.056051,0.033010,0.007883,0.006049,0.077017,0.044307,0.005041,0.004160,0.143646,...,0.008383,0.007221,0.170270,0.060841,0.007331,0.006015,0.159589,0.053758,0.013536,0.006454
4,30464,0.049972,0.091410,0.015634,0.007525,0.086638,0.141788,0.014555,0.004840,0.071748,...,0.013614,0.005553,0.057135,0.121193,0.013413,0.006250,0.047863,0.115987,0.022186,0.006859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7611,38071,0.065709,0.116049,0.017941,0.030103,0.117875,0.129660,0.007934,0.013192,0.040543,...,0.008666,0.016195,0.052199,0.124226,0.007769,0.021539,0.048501,0.118603,0.014414,0.020617
7612,38072,0.639472,0.601375,0.034275,0.033696,0.499519,0.424203,0.007678,0.007637,0.237665,...,0.008724,0.008282,0.223911,0.226424,0.005070,0.008764,0.225102,0.231784,0.018243,0.011962
7613,38073,0.654077,0.682941,0.049995,0.048425,0.508809,0.448746,0.010341,0.009170,0.168144,...,0.009632,0.009347,0.144483,0.225180,0.005768,0.009821,0.157961,0.229242,0.019434,0.013998
7614,38074,0.073681,0.134187,0.037545,0.022768,0.107770,0.141624,0.017237,0.012969,0.069730,...,0.018961,0.016339,0.083943,0.152278,0.018792,0.018747,0.078273,0.115737,0.027185,0.015984


In [None]:
# Write the submission file; index=False keeps the DataFrame's row index out of the CSV.
answer.to_csv("submit.csv", index=False)

In [None]:
# Show torchinfo's layer-by-layer parameter count for the combined model.
summary(model)

Layer (type:depth-idx)                   Param #
Combined_model                           --
├─SimpleLSTM: 1-1                        --
│    └─LSTM: 2-1                         795,648
│    └─Linear: 2-2                       32,896
│    └─Linear: 2-3                       3,612
│    └─Sigmoid: 2-4                      --
│    └─ReLU: 2-5                         --
├─SimpleLSTM: 1-2                        --
│    └─LSTM: 2-6                         795,648
│    └─Linear: 2-7                       32,896
│    └─Linear: 2-8                       3,612
│    └─Sigmoid: 2-9                      --
│    └─ReLU: 2-10                        --
├─GRU: 1-3                               --
│    └─GRU: 2-11                         595,968
│    └─Linear: 2-12                      32,896
│    └─Linear: 2-13                      3,612
│    └─Sigmoid: 2-14                     --
│    └─ReLU: 2-15                        --
├─GRU: 1-4                               --
│    └─GRU: 2-16                   