## 套件安裝

In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')

os.chdir('/content/drive/MyDrive/Colab/資料分析期末') #切換該目錄
os.listdir() #確認目錄內容

Mounted at /content/drive


['LSTM_torch_duration.ipynb',
 'data',
 'models',
 'preprocessed_output',
 '過往東東',
 'share_data',
 'model選特徵0601.ipynb',
 'LSTM_GRU_torch_ensemble.ipynb',
 'test其餘特徵處理.ipynb',
 'train的其餘特徵.ipynb',
 'XGBoost.ipynb',
 'submit.csv',
 'LSTM_CV.ipynb']

In [2]:
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [3]:
# always needed
import math, os, random, csv
import pandas as pd
import numpy as np

# log and save
import json, logging, pickle, sys, shutil, copy
# torch
import torch
import torch.nn
from torch.nn import Conv2d, MaxPool2d, Flatten, Linear, ReLU
import torchvision
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,ConcatDataset
from torchvision import datasets, models, transforms
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
from torchinfo import summary

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter
%matplotlib inline
import seaborn as sns

# others
import matplotlib.pyplot as plt
from PIL import Image

# sklearn
from sklearn import preprocessing

# statistics
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statistics

# seeds
def same_seeds(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [5]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [6]:
def show_loss(train_loss, valid_loss):
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    axes[0].plot(train_loss, color='blue', linewidth=2)
    axes[0].set_title('Training Loss', fontsize=16)
    axes[0].set_xlabel('iteration', fontsize=12)
    axes[0].set_ylabel('Loss', fontsize=12)
    axes[0].grid(True)
    axes[0].spines['top'].set_visible(False)
    axes[0].spines['right'].set_visible(False)

    # Plot validation loss
    axes[1].plot(valid_loss, color='red', linewidth=2)
    axes[1].set_title('Validation Loss', fontsize=16)
    axes[1].set_xlabel('iteration', fontsize=12)
    axes[1].set_ylabel('Loss', fontsize=12)
    axes[1].grid(True)
    axes[1].spines['top'].set_visible(False)
    axes[1].spines['right'].set_visible(False)
    # Adjust the spacing between subplots
    plt.subplots_adjust(wspace=0.3)
    plt.show()

In [26]:
config = {
    'seed': 507,
    'valid_ratio': 0.20,
    'test_ratio': 0.1,   
    'n_epochs': 40,             
    'batch_size': 32, 
    'learning_rate': 6e-4,              
    'early_stop': 10,  
    'save_path': './share_data/model_weights/LSTM_plat.ckpt',
    'input_size': 8
}

## 資料長相（train有source, target / test只有source

In [27]:
#('./preprocessed_output/train_duration_target.npy')
#('./preprocessed_output/duration_data_scaled.npy')
#('./preprocessed_output/test_duration_data_scaled.npy')

In [28]:
# duration_mtx = np.load('./preprocessed_output/train_duration_target.npy')
# duration_mtx[3]

## 加載資料

In [29]:
class TrainDataset(Dataset):
    # data loading
    def __init__(self, file_name_X, file_name_Y):
        #X = pd.read_csv(file_name_X).drop(columns=['user_id']).to_numpy()
        X = np.load(file_name_X)
        X = X.reshape(X.shape[0], -1, config['input_size']) #變成每七天一個vector feature input_size=28, seq_len=37
        Y = np.load(file_name_Y)
        self.x = torch.from_numpy(X).to(torch.float32)
        self.y = torch.from_numpy(Y).to(torch.float32)
        self.n_samples = self.x.shape[0]
        self.input_size = config['input_size']

    # working for indexing
    def __getitem__(self, index):
        return self.x[index], self.y[index] #符合lstm的輸入 (batch_first)
    def __len__(self):
        return self.n_samples

class TestDataset(Dataset):
    # data loading
    def __init__(self, file_name_X):
        X = np.load(file_name_X)
        X = X.reshape(X.shape[0], -1, config['input_size']) 
        self.x = torch.from_numpy(X).to(torch.float32)
        self.n_samples = self.x.shape[0]
        self.input_size = config['input_size']

    # working for indexing
    def __getitem__(self, index):
        return self.x[index] #符合lstm的輸入 seqlen再改
    def __len__(self):
        return self.n_samples

In [30]:
dataset_train = TrainDataset('./share_data/feature/0603_4+1/train_dur_plat.npy', './share_data/finaldura33-22/train_duration_target.npy')
dataset_test = TestDataset('./share_data/feature/0603_4+1/test_dur_plat.npy')

batch_size = config['batch_size'] #符合test loader的因數
valid_ratio = config['valid_ratio']
valid_set_size = int(valid_ratio * len(dataset_train)) #驗證集大小
train_set_size = len(dataset_train) - valid_set_size #訓練集大小
dataset_train, dataset_valid = torch.utils.data.random_split(dataset_train, [train_set_size, valid_set_size])

train_loader = DataLoader(dataset_train ,batch_size=batch_size, shuffle=True, drop_last=True)
valid_loader = DataLoader(dataset_valid, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = DataLoader(dataset_test, batch_size=batch_size,shuffle=False, drop_last=False)

In [31]:
for _, (data, labels) in enumerate(train_loader):
  print(data.shape, labels.shape) #train:1036 columns, test:28 columns 
  break; ## seqlen根据数据特征数量调整，1036/28 = 37

torch.Size([32, 259, 8]) torch.Size([32, 28])


## 模型
https://blog.csdn.net/qq_40728805/article/details/103959254?spm=1001.2101.3001.6650.3&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-3-103959254-blog-112799785.235%5Ev36%5Epc_relevant_default_base3&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-3-103959254-blog-112799785.235%5Ev36%5Epc_relevant_default_base3&utm_relevant_index=4 <br>
对于单层单向的LSTM， 其h_n最后一层输出h_n[-1,:,:]，和output最后一个时步的输出output[-1,:,:]相等。如果是分类任务的话，就可以把output[-1, :, :]或者output送到一个分类器分类。

比如：在做文字识别中，先对文本行图片提取feature，如shape为(B, 512, 1, 16)其中512是channel 维度，1是height，16是width，tensor处理为(B, 16, 512)或(16, B, 512)因为lstm要求输入是3D的，CNN的feature是4D的。

那这样，512就是input_size, 16就是seq_len
<br>
輸入LSTM:(batchsize,seqlen,inputsize) when batch_first=True 


https://discuss.pytorch.org/t/understanding-lstm-input/31110/5 LSTM input

In [32]:
class SimpleLSTM(nn.Module):

    def __init__(self):
        super(SimpleLSTM, self).__init__()

        self.input_size = config['input_size'] #一週七天*4 slot
        self.seq_len = 259

        self.n_layers = 2 # LSTM的层数
        self.hidden_dim = 256 # 隐状态的维度，即LSTM输出的隐状态的维度
        self.output_size = 28 #輸出為每筆資料有28個欄位
        
        self.lstm = nn.LSTM(input_size = self.input_size, # 输入的维度
                            hidden_size = self.hidden_dim, # LSTM输出的hidden_state的维度
                            num_layers = self.n_layers, # LSTM的层数
                            dropout= 0.25, 
                            batch_first=True # 第一个维度是否是batch_size
                           )
        #self.fc_1 = nn.Linear(self.hidden_dim*self.seq_len, 128) #fc1
        self.fc_1 = nn.Linear(self.hidden_dim, 128) #fc1
        self.fc_2 =  nn.Linear(128, self.output_size) #fc2
        self.sigmoid = nn.Sigmoid() # 线性层输出后，还需要过一下sigmoid
        self.relu = nn.ReLU()
    
    def forward(self, x):
        """
        x: 本次的输入, 其size为(batch_size, 259, 4), 259 為sequence len, 4為input_dim
        其中h和c的size都为(n_layers, batch_size, hidden_dim), 即(2, 64, 128)
        """
        h_0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device) #非雙向
        c_0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)

        out, (ht,ct) = self.lstm(x, (h_0,c_0)) #out 為 (batch_size64, seqlen37, hidden_dim128) 

        out = out[:, -1, :]
        out = out.reshape(batch_size, -1)
        out = self.fc_1(out) #first Dense
        out = self.relu(out) #relu
        out = self.fc_2(out) 
        out = self.sigmoid(out)
        return out

## 訓練過程

In [33]:
def train(train_loader, valid_loader, model, model_name, epochs_num):

  criterion = nn.BCELoss()
  optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], eps=1e-08, weight_decay=0.00001)
  loss_train = []
  loss_valid = []
  best_loss, early_stop_count = math.inf, 0

  for epoch in range(epochs_num):
    model.train() # Set model to train mode
    train_pbar = tqdm(train_loader, position=0, leave=True)
    for datas, labels in train_pbar: #each batch 
      datas, labels = datas.to(device), labels.to(device) 
      optimizer.zero_grad() 
      outputs = model(datas) 
      loss = criterion(outputs, labels)  
      loss.backward() #反向傳播
      optimizer.step()
      loss_train.append(loss.detach().cpu().numpy())
    
    mean_loss_train = sum(loss_train)/len(loss_train)
    #-----------------------validation----------------------#
    model.eval()
    with torch.no_grad(): #代表這裡面不會有backpropagation梯度，不會有調整優化
      for datas, labels in valid_loader:
        datas, labels = datas.to(device), labels.to(device) 
        outputs = model(datas) 
        loss = criterion(outputs, labels)  
        loss_valid.append(loss.detach().cpu().numpy())
      
      mean_loss_valid = sum(loss_valid)/len(loss_valid)
      print(f"Epoch [{epoch+1}/{epochs_num}]: Train loss: {mean_loss_train:.4f}, Valid loss: {mean_loss_valid:.4f}")
      if mean_loss_valid < best_loss:
        best_loss = mean_loss_valid
        torch.save(model.state_dict(), config['save_path'])
        print(f'Saving model with loss {best_loss:.4f}')
        early_stop_count = 0
      else:
        early_stop_count += 1
      if early_stop_count >= config['early_stop']:
        print('\n Model is not improving, Stop training session.')
        return
  
  show_loss(loss_train, loss_valid)

## 真正訓練


In [34]:
num_epoch = config['n_epochs']
model = SimpleLSTM().to(device)
#model.load_state_dict(torch.load('./share_data/model_weights/LSTM_2.ckpt'))
train(train_loader, valid_loader, model, "SimpleLSTM", epochs_num = num_epoch)

  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [1/40]: Train loss: 0.3417, Valid loss: 0.3204
Saving model with loss 0.3204


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [2/40]: Train loss: 0.3291, Valid loss: 0.3169
Saving model with loss 0.3169


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [3/40]: Train loss: 0.3238, Valid loss: 0.3138
Saving model with loss 0.3138


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [4/40]: Train loss: 0.3200, Valid loss: 0.3114
Saving model with loss 0.3114


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [5/40]: Train loss: 0.3172, Valid loss: 0.3098
Saving model with loss 0.3098


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [6/40]: Train loss: 0.3147, Valid loss: 0.3078
Saving model with loss 0.3078


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [7/40]: Train loss: 0.3122, Valid loss: 0.3059
Saving model with loss 0.3059


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [8/40]: Train loss: 0.3099, Valid loss: 0.3042
Saving model with loss 0.3042


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [9/40]: Train loss: 0.3078, Valid loss: 0.3024
Saving model with loss 0.3024


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [10/40]: Train loss: 0.3060, Valid loss: 0.3011
Saving model with loss 0.3011


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [11/40]: Train loss: 0.3045, Valid loss: 0.2998
Saving model with loss 0.2998


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [12/40]: Train loss: 0.3031, Valid loss: 0.2988
Saving model with loss 0.2988


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [13/40]: Train loss: 0.3019, Valid loss: 0.2979
Saving model with loss 0.2979


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [14/40]: Train loss: 0.3008, Valid loss: 0.2971
Saving model with loss 0.2971


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [15/40]: Train loss: 0.2998, Valid loss: 0.2964
Saving model with loss 0.2964


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [16/40]: Train loss: 0.2989, Valid loss: 0.2957
Saving model with loss 0.2957


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [17/40]: Train loss: 0.2980, Valid loss: 0.2951
Saving model with loss 0.2951


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [18/40]: Train loss: 0.2973, Valid loss: 0.2947
Saving model with loss 0.2947


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [19/40]: Train loss: 0.2966, Valid loss: 0.2942
Saving model with loss 0.2942


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [20/40]: Train loss: 0.2959, Valid loss: 0.2939
Saving model with loss 0.2939


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [21/40]: Train loss: 0.2952, Valid loss: 0.2935
Saving model with loss 0.2935


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [22/40]: Train loss: 0.2946, Valid loss: 0.2932
Saving model with loss 0.2932


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [23/40]: Train loss: 0.2940, Valid loss: 0.2929
Saving model with loss 0.2929


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [24/40]: Train loss: 0.2934, Valid loss: 0.2927
Saving model with loss 0.2927


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [25/40]: Train loss: 0.2929, Valid loss: 0.2925
Saving model with loss 0.2925


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [26/40]: Train loss: 0.2923, Valid loss: 0.2923
Saving model with loss 0.2923


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [27/40]: Train loss: 0.2917, Valid loss: 0.2922
Saving model with loss 0.2922


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [28/40]: Train loss: 0.2912, Valid loss: 0.2921
Saving model with loss 0.2921


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [29/40]: Train loss: 0.2906, Valid loss: 0.2921
Saving model with loss 0.2921


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [30/40]: Train loss: 0.2900, Valid loss: 0.2920
Saving model with loss 0.2920


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [31/40]: Train loss: 0.2894, Valid loss: 0.2921


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [32/40]: Train loss: 0.2888, Valid loss: 0.2921


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [33/40]: Train loss: 0.2882, Valid loss: 0.2922


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [34/40]: Train loss: 0.2875, Valid loss: 0.2924


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [35/40]: Train loss: 0.2869, Valid loss: 0.2925


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [36/40]: Train loss: 0.2862, Valid loss: 0.2927


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [37/40]: Train loss: 0.2855, Valid loss: 0.2929


  0%|          | 0/761 [00:00<?, ?it/s]

Epoch [38/40]: Train loss: 0.2847, Valid loss: 0.2932


  0%|          | 0/761 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

## 產生輸出

In [None]:
model.load_state_dict(torch.load(config['save_path']))
model.eval()
output_list = []
with torch.no_grad():
  for datas in test_loader:
    datas = datas.to(device) 
    output_list.append(model(datas).cpu())

In [None]:
flatten_data = np.concatenate(output_list, axis=0)
showans = pd.DataFrame(flatten_data)

In [None]:
sample = pd.read_csv("./data/sample.csv")
answer = sample
for i in range(1,29):
    answer.iloc[:,i] = showans.iloc[:, i-1]
display(answer)

  answer.iloc[:,i] = showans.iloc[:, i-1]


Unnamed: 0,user_id,time_slot_0,time_slot_1,time_slot_2,time_slot_3,time_slot_4,time_slot_5,time_slot_6,time_slot_7,time_slot_8,...,time_slot_18,time_slot_19,time_slot_20,time_slot_21,time_slot_22,time_slot_23,time_slot_24,time_slot_25,time_slot_26,time_slot_27
0,30460,0.123907,0.224272,0.003279,0.003082,0.142430,0.321396,0.000994,0.001031,0.096523,...,0.000935,0.001194,0.111894,0.353946,0.000901,0.001270,0.079652,0.266952,0.003793,0.002512
1,30461,0.102939,0.185393,0.004005,0.003548,0.129835,0.282556,0.001458,0.001383,0.096030,...,0.001416,0.001594,0.109022,0.309991,0.001345,0.001753,0.077026,0.241336,0.005250,0.003044
2,30462,0.103387,0.175322,0.003193,0.003263,0.131468,0.276981,0.001150,0.001266,0.097116,...,0.001134,0.001491,0.111643,0.306096,0.001027,0.001591,0.076865,0.238410,0.004379,0.002862
3,30463,0.089631,0.141037,0.020358,0.012543,0.106749,0.202722,0.011563,0.005643,0.054132,...,0.009105,0.006975,0.062891,0.175116,0.009711,0.008316,0.042861,0.135850,0.019545,0.008518
4,30464,0.106915,0.144447,0.006049,0.005515,0.124612,0.202685,0.002817,0.002208,0.068567,...,0.002325,0.002851,0.078381,0.181470,0.002133,0.003142,0.053185,0.144932,0.006955,0.004335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7611,38071,0.079378,0.088654,0.023474,0.018693,0.093897,0.138168,0.016458,0.011105,0.068622,...,0.013516,0.012630,0.080952,0.123675,0.014205,0.015380,0.058333,0.095301,0.024091,0.012063
7612,38072,0.118578,0.153997,0.005977,0.004893,0.117522,0.190515,0.001745,0.001685,0.138839,...,0.001522,0.001771,0.140761,0.196097,0.001672,0.001942,0.097596,0.131858,0.004355,0.002738
7613,38073,0.115622,0.145717,0.006663,0.005578,0.114456,0.183145,0.002048,0.002001,0.140734,...,0.001788,0.002116,0.142947,0.187821,0.001985,0.002334,0.099291,0.127435,0.004945,0.003096
7614,38074,0.071805,0.082415,0.028543,0.020624,0.082159,0.133344,0.019759,0.012574,0.059109,...,0.016245,0.014303,0.072575,0.118348,0.018199,0.016710,0.051813,0.091005,0.027687,0.012947


In [None]:
answer.to_csv("submit.csv", index=False)

In [None]:
summary(model)

Layer (type:depth-idx)                   Param #
SimpleLSTM                               --
├─LSTM: 1-1                              802,816
├─Linear: 1-2                            32,896
├─Linear: 1-3                            3,612
├─Sigmoid: 1-4                           --
├─ReLU: 1-5                              --
Total params: 839,324
Trainable params: 839,324
Non-trainable params: 0