# User Colab Path Setting

In [11]:
# Project directory on Google Drive (example value for Colab).
colab_dir = "/content/drive/MyDrive/DeepLearning/Grp_Small/TL/MagNet_small_modelE_cycle"

# Requested execution platform: 'auto' asks for auto-detection.
# Other recognised values: 'colab', 'windows_local', 'linux_local', 'unknown'.
platform = "auto"

# Training process

### Default path config

In [12]:
# Checkpoint file that the model weights are loaded from and saved to.
model_saved_name = "model_tl.ckpt"

# Directory that holds the dataset files (train.mat / valid.mat).
dataset_path = "data/tl_dataset"

### Path config

In [13]:
import os

# Resolve the execution platform.
# `platform` comes from the user-settings cell: 'auto' requests detection,
# any other known value is an explicit override and is respected as-is
# (previously the override was always overwritten by the detection result).
KNOWN_PLATFORMS = ('colab', 'windows_local', 'linux_local', 'unknown')

if platform == 'auto':
    try:
        # Colab is recognised by the google.colab package being importable.
        import google.colab  # noqa: F401
    except ImportError:
        if os.name == 'nt':  # Windows, regardless of which drive letter it lives on
            platform = 'windows_local'
        elif os.path.exists('/home/'):  # heuristic for a Linux host
            platform = 'linux_local'
        else:
            platform = 'unknown'
    else:
        platform = 'colab'
elif platform not in KNOWN_PLATFORMS:
    raise ValueError(
        f"Unsupported platform {platform!r}; expected 'auto' or one of {KNOWN_PLATFORMS}")

if platform == 'colab':
    # Mount Google Drive and work from the project directory stored on it.
    # Re-running this cell is safe: by then `platform` is already 'colab'.
    from google.colab import drive
    drive.mount("/content/drive")
    os.chdir(colab_dir)

print('\ncurrent execution path: ', os.getcwd())  # current working directory
print('\ncurrent platform: ', platform)  # resolved platform name

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

current execution path:  /content/drive/MyDrive/DeepLearning/Grp_Small/TL/MagNet_small_modelE_cycle

current platform:  colab


## Cuda check

In [14]:
import torch

# Probe CUDA once; both summary flags are derived from this single result.
cuda_ready = torch.cuda.is_available()
gpu_num = torch.cuda.device_count() if cuda_ready else 0

if not cuda_ready:
    print('cuda unavailable!')
else:
    print('cuda good!')
    if gpu_num < 1:
        print('GPU unavailable')
    else:
        print('GPU num: ', gpu_num)  # number of visible GPUs
        for gpu_index in range(gpu_num):
            # Device name, then total memory converted from bytes to 1e9-byte units.
            print('GPU type: ', torch.cuda.get_device_name(gpu_index))
            total_bytes = torch.cuda.get_device_properties(gpu_index).total_memory
            print('GPU memory: {:.2f} Gbyte'.format(total_bytes / 1e9))


cuda good!
GPU num:  1
GPU type:  Tesla V100-SXM2-16GB
GPU memory: 16.94 Gbyte


## Start coding

In [15]:
# Recap of the environment: platform, working directory, CUDA flag,
# and the absolute path that '' resolves to.
for summary_value in (platform, os.getcwd(), cuda_ready, os.path.abspath('')):
    print(summary_value)

colab
/content/drive/MyDrive/DeepLearning/Grp_Small/TL/MagNet_small_modelE_cycle
True
/content/drive/MyDrive/DeepLearning/Grp_Small/TL/MagNet_small_modelE_cycle


In [16]:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt


import NW_LSTM
import NN_DataLoader

In [17]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU execution, set device = torch.device("cpu") instead.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device using ",device)

# Fetch the model from NW_LSTM and move it to the selected device.
model = NW_LSTM.get_global_model().to(device)

# Print the model architecture and its total parameter count.
print(model)
print("Total number of parameters: ", sum(p.numel() for p in model.parameters()))

# Resume from a previously saved checkpoint when one exists.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only host.
try:
    model.load_state_dict(torch.load(model_saved_name, map_location=device))
    print("Pre-train model loaded")
except FileNotFoundError:
    print("No model found, start training from scratch")
except Exception as load_error:
    # Keep the best-effort behaviour (continue with fresh weights) but surface the
    # real cause instead of reporting it as a missing file.
    # NOTE: a later training run may overwrite this checkpoint file.
    print(f"Could not load '{model_saved_name}' ({load_error!r}), start training from scratch")



Device using  cuda
LSTMSeq2One(
  (lstm): LSTM(3, 30, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=30, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=8, bias=True)
  (fc5): Linear(in_features=8, out_features=8, bias=True)
  (fc6): Linear(in_features=8, out_features=1, bias=True)
  (relu): ReLU()
  (leaky_relu): LeakyReLU(negative_slope=0.01)
  (elu): ELU(alpha=1.0)
)
Total number of parameters:  16449
Pre-train model loaded


### Define training parameters

In [18]:
def train_model(epoch_num=700,lr=2e-4,method="forward"):
    """Train the notebook-level ``model`` and checkpoint it when validation improves.

    Relies on names defined by earlier cells: ``model``, ``device``, ``platform``,
    ``dataset_path``, ``model_saved_name``, ``NW_LSTM`` and ``NN_DataLoader``.

    Args:
        epoch_num: Number of epochs to run when ``platform == "colab"``. On any
            other platform this value is ignored and a fixed 10-epoch run is used.
        lr: Initial learning rate handed to the AdamW optimizer.
        method: How training batches are passed through the model:
            ``"forward"`` calls ``model(inputs)``, ``"valid"`` calls
            ``model.valid(inputs)``. Validation always uses ``model(inputs)``.

    Raises:
        ValueError: If ``method`` is neither ``"forward"`` nor ``"valid"``.
    """
    import time

    # Fail fast: an unknown method used to surface only inside the first batch,
    # as a NameError on the never-assigned training outputs.
    if method not in ("forward", "valid"):
        raise ValueError(f"method must be 'forward' or 'valid', got {method!r}")

    # Loss function and optimizer (nn.MSELoss / NW_LSTM.RelativeLoss_abs are
    # alternatives that were tried here).
    loss_fn = NW_LSTM.RelativeLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    # Cosine learning-rate schedule with a fixed T_max of 200 epochs.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200, eta_min=0, last_epoch=-1)

    # Full-length run with a larger validation batch on Colab; short run elsewhere.
    if platform == "colab":
        epochs = epoch_num
        valid_batch_size = 3000
    else:
        epochs = 10
        valid_batch_size = 1000

    train_dataloader = NN_DataLoader.get_dataLoader(
        os.path.normpath(dataset_path + "/train.mat"), batch_size=128)

    # Validation uses only the first batch produced by the validation loader.
    valid_dataloader = NN_DataLoader.get_dataLoader(
        os.path.normpath(dataset_path + "/valid.mat"), batch_size=valid_batch_size)
    valid_inputs, valid_targets = next(iter(valid_dataloader))
    valid_inputs, valid_targets = valid_inputs.to(device), valid_targets.to(device)

    # Resolve the training-time forward callable once instead of per batch.
    run_model = model if method == "forward" else model.valid

    # Baseline: validation loss of the current weights. A checkpoint is written
    # only when a later epoch beats this value.
    with torch.no_grad():
        minimum_loss = loss_fn(model(valid_inputs), valid_targets).item()

    loss = None  # last training-batch loss; stays None if the loader yields nothing

    for epoch in range(epochs):
        epoch_start = time.perf_counter()

        # Train one epoch
        for train_inputs, train_targets in train_dataloader:
            train_inputs, train_targets = train_inputs.to(device), train_targets.to(device)

            optimizer.zero_grad()
            loss = loss_fn(run_model(train_inputs), train_targets)
            loss.backward()
            optimizer.step()

        # Validation is skipped after the very first epoch, as in the original code.
        # NOTE(review): reason for skipping epoch 0 is not evident here - confirm.
        if epoch > 0:
            with torch.no_grad():
                valid_loss = loss_fn(model(valid_inputs), valid_targets).item()

            if valid_loss < minimum_loss:
                minimum_loss = valid_loss
                torch.save(model.state_dict(), model_saved_name)
                print(f"  Model saved , Validation Loss: {valid_loss:.3e}, lr: {optimizer.param_groups[0]['lr']:.3e}")

        # update lr
        scheduler.step()

        # Duration of this epoch (training + validation), used for the estimate below.
        epoch_seconds = time.perf_counter() - epoch_start

        # Progress report every 10 epochs; the training loss shown is the last batch's.
        if (epoch + 1) % 10 == 0:
            train_loss = loss.item() if loss is not None else float("nan")
            print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss:.3e}, "
                  f"Remain time: {epoch_seconds/60 * (epochs - epoch - 1):.1f} min")


### Training loop

In [19]:
#train_model(epoch_num=10,lr=2e-4,method="forward")

In [20]:
# Alternative configuration: train_model(epoch_num=800, lr=10e-4, method="forward")
train_model(
    epoch_num=3000,
    lr=1e-4,
    method="forward",
)

  Model saved , Validation Loss: 2.106e-03, lr: 9.999e-05
  Model saved , Validation Loss: 1.765e-03, lr: 9.990e-05
Epoch 10/3000, Training Loss: 1.631e-03, Remain time: 15.9 min
Epoch 20/3000, Training Loss: 1.366e-03, Remain time: 15.4 min
Epoch 30/3000, Training Loss: 1.220e-03, Remain time: 15.2 min
Epoch 40/3000, Training Loss: 1.675e-03, Remain time: 23.5 min
Epoch 50/3000, Training Loss: 2.373e-03, Remain time: 15.0 min
Epoch 60/3000, Training Loss: 1.033e-03, Remain time: 16.1 min
Epoch 70/3000, Training Loss: 1.405e-03, Remain time: 15.5 min
Epoch 80/3000, Training Loss: 1.299e-03, Remain time: 19.7 min
Epoch 90/3000, Training Loss: 9.782e-04, Remain time: 22.8 min
Epoch 100/3000, Training Loss: 1.249e-03, Remain time: 15.9 min
Epoch 110/3000, Training Loss: 1.109e-03, Remain time: 15.3 min
Epoch 120/3000, Training Loss: 1.035e-03, Remain time: 14.3 min
Epoch 130/3000, Training Loss: 1.493e-03, Remain time: 20.3 min
Epoch 140/3000, Training Loss: 1.023e-03, Remain time: 14.0 m

## GPU monitor
### nvidia-smi -l 3