# User Colab Path Setting

In [1]:
# Project directory on Google Drive (example value); the path-config cell
# changes into it when the platform is 'colab'.
colab_dir = (
    '/content/drive/MyDrive/DeepLearning/'
    'Challange_comb_shift_flip/'
    'MagNet_comb_modelB_cycle_dataClean'
)

# Platform selector. 'auto' requests automatic detection; the other values
# listed by the author are 'colab', 'windows_local', 'linux_local' and 'unknown'.
platform = 'auto'

# Training process

### Default path config

In [2]:
# Checkpoint file name and dataset directory, both relative to the
# current working directory.
dataset_path = "data/tl_dataset"
model_saved_name = "model_tl.ckpt"

### Path config

In [3]:
import os

# Resolve the runtime platform.
# An explicit choice made in the settings cell ('colab', 'windows_local',
# 'linux_local', 'unknown') is respected as-is; detection only runs when
# `platform` is 'auto'. Previously detection always ran and silently
# overwrote the user's selection.
if platform == 'auto':
    try:
        import google.colab  # noqa: F401  (importable only inside Colab)
    except ImportError:
        if os.path.exists('c:/'):  # a C: drive is taken as "running on Windows"
            platform = 'windows_local'
        elif os.path.exists('/home/'):  # a /home directory is taken as "running on Linux"
            platform = 'linux_local'
        else:
            platform = 'unknown'
    else:
        platform = 'colab'

# On Colab, mount Google Drive and work from the project directory so the
# relative dataset/checkpoint paths resolve inside the Drive folder.
if platform == 'colab':
    from google.colab import drive
    drive.mount("/content/drive")
    os.chdir(colab_dir)

print('\ncurrent execution path: ', os.getcwd())  # current working directory
print('\ncurrent platform: ', platform)  # resolved platform name

Mounted at /content/drive

current execution path:  /content/drive/MyDrive/DeepLearning/MagNet_Final_test_comb/MagNet_comb_modelB_cycle_dataClean

current platform:  colab


## Cuda check

In [4]:
import torch

# Probe CUDA once and derive both flags from that single answer.
cuda_ready = torch.cuda.is_available()
gpu_num = torch.cuda.device_count() if cuda_ready else 0

if not cuda_ready:
    print('cuda unavailable!')
else:
    print('cuda good!')
    if gpu_num >= 1:
        print('GPU num: ', gpu_num)  # number of visible GPUs
        for gpu_idx in range(gpu_num):
            # device name
            print('GPU type: ', torch.cuda.get_device_name(gpu_idx))
            # total device memory, reported in units of 1e9 bytes
            total_bytes = torch.cuda.get_device_properties(gpu_idx).total_memory
            print('GPU memory: {:.2f} Gbyte'.format(total_bytes / 1e9))
    else:
        print('GPU unavailable')


cuda good!
GPU num:  1
GPU type:  NVIDIA A100-SXM4-40GB
GPU memory: 42.48 Gbyte


## Start coding

In [5]:
# Echo the resolved environment, one value per line: platform name,
# working directory, CUDA flag, and the absolute path of the current directory.
for env_value in (platform, os.getcwd(), cuda_ready, os.path.abspath('')):
    print(env_value)

colab
/content/drive/MyDrive/DeepLearning/MagNet_Final_test_comb/MagNet_comb_modelB_cycle_dataClean
True
/content/drive/MyDrive/DeepLearning/MagNet_Final_test_comb/MagNet_comb_modelB_cycle_dataClean


In [6]:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt


import NW_LSTM
import NN_DataLoader

In [7]:
# Select the compute device: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
print("Device using ",device)

# Build the model provided by the project's NW_LSTM module and move it to the device
model = NW_LSTM.get_global_model().to(device)

# Print the model architecture and its total parameter count
print(model)
print("Total number of parameters: ", sum(p.numel() for p in model.parameters()))

# Resume from an existing checkpoint when one is present.
# map_location remaps stored tensors onto the active device, so a checkpoint
# written on a GPU machine can still be read on a CPU-only one.
try:
    model.load_state_dict(torch.load(model_saved_name, map_location=device))
    print("Pre-train model loaded")
except FileNotFoundError:
    print("No model found, start training from scratch")
except Exception as load_error:
    # Still best-effort (training continues), but the real cause is reported
    # instead of being mislabeled as a missing file. Note that the training
    # cell may later overwrite this checkpoint file.
    print(f"Could not load '{model_saved_name}' "
          f"({type(load_error).__name__}: {load_error}), start training from scratch")



Device using  cuda
LSTMSeq2One(
  (lstm): LSTM(1, 30, num_layers=3, batch_first=True)
  (fc1): Linear(in_features=32, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=196, bias=True)
  (fc3): Linear(in_features=196, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=96, bias=True)
  (fc5): Linear(in_features=96, out_features=32, bias=True)
  (fc6): Linear(in_features=32, out_features=32, bias=True)
  (fc7): Linear(in_features=32, out_features=16, bias=True)
  (fc8): Linear(in_features=16, out_features=1, bias=True)
  (relu): ReLU()
  (leaky_relu): LeakyReLU(negative_slope=0.01)
  (elu): ELU(alpha=1.0)
  (sigmoid): Sigmoid()
)
Total number of parameters:  90653
Pre-train model loaded


### Define training parameters

In [8]:
# Loss criterion: RelativeLoss from the project's NW_LSTM module.
# Alternatives the author left disabled: nn.MSELoss(), NW_LSTM.RelativeLoss_abs()
loss_fn = NW_LSTM.RelativeLoss()

# AdamW over all model parameters with an initial learning rate of 2e-4
optimizer = optim.AdamW(params=model.parameters(), lr=2e-4)

# Learning-rate schedule: cosine annealing, stepped once per epoch by the training loop.
# Disabled alternative: StepLR(optimizer, step_size=100, gamma=1, last_epoch=-1)
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optimizer,
    T_max=200,
    eta_min=0,
    last_epoch=-1,
)

### Training loop

In [9]:
import time

# Default parameters for a desktop environment (short run)
epochs = 10
valid_batch_size = 1000

# Colab uses a much longer schedule and a larger validation batch
if platform == "colab":
    epochs = 1500
    valid_batch_size = 3000

# Training data, served in mini-batches of 128
train_dataloader = NN_DataLoader.get_dataLoader(
    os.path.normpath(os.path.join(dataset_path, "train.mat")),
    batch_size=128)

# Validation data: only the first batch drawn from the loader is used.
# It is moved to the device once and reused after every epoch.
valid_dataloader = NN_DataLoader.get_dataLoader(
    os.path.normpath(os.path.join(dataset_path, "valid.mat")),
    batch_size=valid_batch_size)
valid_inputs, valid_targets = next(iter(valid_dataloader))
valid_inputs, valid_targets = valid_inputs.to(device), valid_targets.to(device)

# Baseline: validation loss of the starting model (pre-trained or freshly
# initialized). A checkpoint is written only when a later epoch beats it.
with torch.no_grad():
    valid_outputs = model(valid_inputs)
    minium_loss = loss_fn(valid_outputs, valid_targets)

# Wall-clock reference for the remaining-time estimate
t0 = time.perf_counter()

# Train the model
for epoch in range(epochs):

    # Train one epoch
    loss = None
    for train_inputs, train_targets in train_dataloader:
        # Move data to device
        train_inputs, train_targets = train_inputs.to(device), train_targets.to(device)

        # Forward pass
        train_outputs = model(train_inputs)

        # Compute loss
        loss = loss_fn(train_outputs, train_targets)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Fail with a clear message instead of a NameError at the first log line
    if loss is None:
        raise RuntimeError("train_dataloader yielded no batches; check the training dataset")

    # Compute validation loss and keep the best checkpoint.
    # NOTE(review): validation is skipped after the very first epoch — confirm this is intentional.
    if epoch > 0:
        with torch.no_grad():
            valid_outputs = model(valid_inputs)
            valid_loss = loss_fn(valid_outputs, valid_targets)

        if valid_loss < minium_loss:
            minium_loss = valid_loss
            torch.save(model.state_dict(), model_saved_name)
            print(f"  Model saved , Validation Loss: {valid_loss.item():.3e}, lr: {optimizer.param_groups[0]['lr']:.3e}")

    # Update the learning rate once per epoch
    scheduler.step()

    # Log every 10 epochs. The reported training loss is that of the last
    # mini-batch of the epoch; the time estimate uses the mean duration of
    # all epochs completed so far.
    if (epoch + 1) % 10 == 0:
        t_epoch = (time.perf_counter() - t0) / (epoch + 1)
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {loss.item():.3e}, "
            #   f"Validation Loss: {valid_loss.item():.3e} ,"
              f"Remain time: {t_epoch/60 * (epochs - epoch - 1):.1f} min")


  Model saved , Validation Loss: 5.886e-03, lr: 2.000e-04
  Model saved , Validation Loss: 3.331e-03, lr: 2.000e-04
  Model saved , Validation Loss: 3.038e-03, lr: 1.999e-04
  Model saved , Validation Loss: 2.414e-03, lr: 1.998e-04
  Model saved , Validation Loss: 1.936e-03, lr: 1.997e-04
  Model saved , Validation Loss: 1.876e-03, lr: 1.996e-04
Epoch 10/1500, Training Loss: 2.073e-03, Remain time: 13.0 min
  Model saved , Validation Loss: 1.757e-03, lr: 1.988e-04
  Model saved , Validation Loss: 1.244e-03, lr: 1.969e-04
Epoch 20/1500, Training Loss: 7.850e-04, Remain time: 13.5 min
  Model saved , Validation Loss: 1.090e-03, lr: 1.941e-04
  Model saved , Validation Loss: 9.721e-04, lr: 1.935e-04
  Model saved , Validation Loss: 9.389e-04, lr: 1.930e-04
  Model saved , Validation Loss: 8.019e-04, lr: 1.924e-04
  Model saved , Validation Loss: 7.883e-04, lr: 1.911e-04
  Model saved , Validation Loss: 7.581e-04, lr: 1.898e-04
Epoch 30/1500, Training Loss: 5.417e-04, Remain time: 12.9 min

## GPU monitor
### nvidia-smi -l 3