In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/leap-atmospheric-physics-ai-climsim/sample_submission.csv
/kaggle/input/leap-atmospheric-physics-ai-climsim/train.csv
/kaggle/input/leap-atmospheric-physics-ai-climsim/test.csv


In [2]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import r2_score
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
# 读取数据
train_csv = '/kaggle/input/leap-atmospheric-physics-ai-climsim/train.csv'
test_csv = '/kaggle/input/leap-atmospheric-physics-ai-climsim/test.csv'
subm_spl = '/kaggle/input/leap-atmospheric-physics-ai-climsim/sample_submission.csv'
out_csv = 'submission.csv'

read_chunk_size = 100000 # 一次性读取100000行数据

# 训练参数
num_epochs = 50
max_patience = 3
batch_size = 360
num_workers = 256
lr = 0.0005

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
def graph(acc, loss, title):
    """ 绘制准确率和损失曲线

    Args:
        acc (list): [train, val]/[test]
        loss (list): [train, val]/[test]
        title (str): Title
    """
    assert len(acc) == len(loss), 'Length of acc and loss must be the same'
    global save_run_dir
    if len(acc) == 2:
        plt.subplot(1, 2, 1)
        plt.plot(acc[0], label='Training Accuracy')
        plt.plot(acc[1], label='Validation Accuracy')
        plt.title(title)
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend()
        plt.subplot(1, 2, 2)
        plt.plot(loss[0], label='Training Loss')
        plt.plot(loss[1], label='Validation Loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend()
    elif len(acc) == 1:
        plt.subplot(2, 1, 1)
        plt.plot(acc[0], label='Test Accuracy')
        plt.title(title)
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend()
        plt.subplot(2, 1, 2)
        plt.plot(loss[0], label='Test Loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend()
    plt.show()

In [5]:
import time

class Timer:
    def __init__(self):
        self.t0 = 0
        self.t1 = 0
        self.times = []
        self.infos = []
        
    def start(self, info = 'Run'):
        self.t1 = self.t0
        self.infos.append(f'{len(self.infos)} {info}')
        self.t0 = time.time()
        
    def stop(self):
        self.t1 = time.time()
        t = self.t1 - self.t0
        self.times.append(t)
        print(f'{self.infos[-1]} Time Cost: {t:.3f}s')
        
    def get_stats(self):
        for info, tm in zip(self.infos, self.times):
            print(f'{info}\t{tm:.3f}s')
        print(f'Total: {sum(self.times):.3f} ')
    
    def clear(self, idx=0):
        if idx == 0:
            self.infos.clear()
            self.times.clear()
            return
        info = self.infos.pop(idx - (0 if idx < 0 else 1))
        tm = self.times.pop(idx - (0 if idx < 0 else 1))
        return info, tm
        
timer = Timer()

In [6]:
# 读取数据
timer.start(f'Read dataset chunk size {read_chunk_size}')
train_chunks = pd.read_csv(train_csv, chunksize = read_chunk_size)
train_data = next(train_chunks)
timer.stop()
# train_data = next(train_chunks)
# train_data = next(train_chunks)
cols = train_data.columns

train_data.shape

0 Read dataset chunk size 100000 Time Cost: 39.211s


(100000, 925)

# 划分数据集

1. 划分数据的输入输出
2. 划分 训练集，验证集(，测试集)

In [7]:
# 划分数据集

def split_io(dframe, in_cols = cols[1:557], out_cols = cols[557:]):
    # 划分输入输出
    in_df = dframe[in_cols]
    out_df = dframe[out_cols]
    return in_df, out_df

def split_tvt(dframe, ratio=[0.8], shuffle=False):
    # 设定好 train (和 val) 集的比例，剩余的均归到test/val
    assert sum(ratio) <= 1, "Ratio sum for train and val cannot be bigger than 1"
    assert len(ratio) > 0, "Ratio cannot be empty"
     # 根据是否需要测试集来调整比例
    train_rat = ratio[0]
    val_rat = (1 - train_rat) if len(ratio) == 1 else ratio[1]
    test_rat = (1 - train_rat - val_rat) if len(ratio) == 2 else 0
    
    data_size = dframe.shape[0]
    train_size = int(data_size * train_rat)
    val_size = (data_size - train_size) if len(ratio) == 1 else int(data_size * val_rat)
    
    if shuffle:
        dframe = dframe.sample(frac=1).reset_index(drop=True)
        
    train = dframe.iloc[:train_size]
    val = dframe.iloc[train_size:train_size + val_size]
    
    if test_rat != 0:
        test = dframe.iloc[train_size + val_size:]
        return train, val, test
    else:
        return train, val

In [8]:
# 划分数据集
train_set, val_set, test_set = split_tvt(train_data, [0.7, 0.2], True)
train_in, train_out = split_io(train_set)
val_in, val_out = split_io(val_set)
test_in, test_out = split_io(test_set)

print(f'{val_set.shape = }')
print(f'{test_set.shape = }')
print(f'{train_in.shape = }')
print(f'{train_out.shape = }')
print(f'{val_in.shape = }')
print(f'{val_out.shape = }')
# print(f'{val_out.iloc[0] = }')

del(train_data)
del(train_set)
del(val_set)
del(test_set)

val_set.shape = (20000, 925)
test_set.shape = (10000, 925)
train_in.shape = (70000, 556)
train_out.shape = (70000, 368)
val_in.shape = (20000, 556)
val_out.shape = (20000, 368)


In [9]:
class MLP_Dataset(Dataset):
    def __init__(self, dsin, dsout, transform = transforms.Compose([
        transforms.ToTensor(),
    ])):
        self.dsin = dsin
        self.dsout = dsout
        self.transform =transform
    
    def __len__(self):
        return self.dsin.shape[0]
    
    def __getitem__(self, idx):
        data, targ =  self.dsin.iloc[idx], self.dsout.iloc[idx]
        data, targ = data.to_numpy().reshape((1, 556)), targ.to_numpy().reshape((1, 368))
        data, targ = torch.tensor(data), torch.tensor(targ)
        data, targ = data.to(torch.float32), targ.to(torch.float32)
        return data, targ

In [10]:
train_dataset = MLP_Dataset(train_in, train_out)
# print(f'{train_dataset[0][0].shape = } {train_dataset[0][1].shape = } {len(train_dataset) = }')
train_loader =  DataLoader(train_dataset, batch_size = batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
print(f'{len(train_loader) = }')

val_dataset = MLP_Dataset(val_in, val_out)
val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
print(f'{len(val_loader) = }')

len(train_loader) = 195
len(val_loader) = 56




In [11]:
def get_chunk_loader(chunk, test = False):
    tmr = Timer()
    tmr.start('Load datasets...')
    train_set, val_set = split_tvt(chunk)
    train_in, train_out = split_io(train_set)
    val_in, val_out = split_io(val_set)
    print(f'{train_in.shape = }')
    print(f'{train_out.shape = }')
    print(f'{val_in.shape = }')
    print(f'{val_out.shape = }')
    train_dataset = MLP_Dataset(train_in, train_out)
    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    print(f'{len(train_loader) = }')

    val_dataset = MLP_Dataset(val_in, val_out)
    val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    print(f'{len(val_loader) = }')
    tmr.stop()
    return train_loader, val_loader

# Define MLP

In [12]:
class MLP(nn.Module):
    def __init__(self, layers=5, hidden=144):
        super(MLP, self).__init__()
        self.hidden = hidden
        self.linear = nn.Sequential(
            nn.Linear(556, hidden),
            nn.ReLU(),
            nn.Dropout(),
            *[nn.LazyLinear(hidden),
            nn.LazyBatchNorm1d(),
            nn.ReLU(),
            nn.Dropout(),] * (layers-2),
            nn.LazyLinear(368),
        )
    
    def forward(self, x):
        x = self.linear(x)
        return x

In [13]:
net = MLP(12, 512).to(device)
try:
    weights = torch.load('/kaggle/input/dict.pth', map_location=device)
    net.load_state_dict(weights)
    print("Weights loaded. ")
except:
    print("Weights not loaded. ")
print(net)



# Train Model

In [14]:
def r2score(pred, targ):
    targ_mean = torch.mean(targ)
    ss_total = torch.sum((targ - targ_mean) ** 2)
    ss_residual = torch.sum((targ - pred) ** 2)
    r2 = 1 - (ss_residual / ss_total)
    return r2

criterion = nn.MSELoss()
    
optimizer = optim.Adam(net.parameters(), lr=lr)

In [15]:
lowest_loss = float('inf')

accus, losses = [[],[]], [[],[]]
epoch = 0
last_epoch = 0

In [None]:
try:
    for idx, chunk in enumerate(train_chunks):
    # if True:
        patience = 0
        timer.start(f'Train {num_epochs} epochs on chunk {idx}')
        train_loader, val_loader = get_chunk_loader(chunk)
        while epoch < num_epochs:
            epoch += 1
            t0 = time.time()
            net.train()
            train_loss = 0.0
            train_accu = 0.0
            num_train_batches = 0

            for inp, outp in tqdm(train_loader):
                inp, outp = inp.to(device, non_blocking = True), outp.to(device, non_blocking = True)
                out_h = net(inp)
                crit = criterion(out_h, outp)
                loss = crit.item()
                train_loss += loss
                accu = r2score(out_h, outp)
                train_accu += accu
                crit.backward()
                optimizer.step()
                optimizer.zero_grad()
                num_train_batches += 1
            avg_train_loss = train_loss / num_train_batches
            losses[0].append(avg_train_loss)
            avg_train_accu = train_accu / num_train_batches
            accus[0].append(avg_train_accu.item())

            net.eval()
            val_loss = 0.0
            val_accu = 0.0
            num_val_batches = 0

            with torch.no_grad():
                for inp, outp in tqdm(val_loader):
    #                 inp, outp = [inps.to(device, non_blocking=True) for inps in inp], outp.cuda()
                    inp, outp = inp.to(device, non_blocking = True), outp.to(device, non_blocking = True)
                    out_h = net(inp)
                    crit = criterion(out_h, outp)
                    val_loss += crit.item()
                    accu = r2score(out_h, outp)
                    val_accu += accu
                    num_val_batches += 1
            try:
                last_val_loss = avg_val_loss
            except:
                last_val_loss = lowest_loss
            avg_val_loss = val_loss / num_val_batches
            losses[1].append(avg_val_loss)
            avg_val_accu = val_accu / num_val_batches
            accus[1].append(avg_val_accu.item())

            if avg_val_loss < lowest_loss:
                torch.save(net.state_dict(), 'best.pth')  # 保存模型参数而不是整个模型
                lowest_loss = avg_val_loss

            t1 = time.time()
            print(f'Chunk {idx}:{epoch - last_epoch} | Epoch {epoch}/{num_epochs} > Time Cost: {t1-t0:.2f}s | patience: {patience} \n\t', 
                  f'Train Loss: {avg_train_loss:.3f} | Val Loss: {avg_val_loss:.3f}\n\t',
                  f'Train Accu: {avg_train_accu:.3f} | Val Accu: {avg_val_accu:.3f}')
            if avg_train_loss < avg_val_loss and last_val_loss < avg_val_loss:
                patience += 1
            else:
                patience = 0
            if patience >= max_patience:
                print(f'{max_patience} epochs had val loss bigger than train loss and validation loss increased. Exit for next chunk of data')
                patience = 0
                last_epoch = epoch
                del(chunk)
                break
        else:
            break
except:
    print('Train End')
timer.stop()
torch.save(net.state_dict(), "latest.pth") 
graph(accus, losses, 'Train')



train_in.shape = (80000, 556)
train_out.shape = (80000, 368)
val_in.shape = (20000, 556)
val_out.shape = (20000, 368)
len(train_loader) = 223
len(val_loader) = 56
0 Load datasets... Time Cost: 0.211s


100%|██████████| 223/223 [00:16<00:00, 13.75it/s] 
100%|██████████| 56/56 [00:09<00:00,  5.79it/s]


Chunk 0:1 | Epoch 1/50 > Time Cost: 25.92s | patience: 0 
	 Train Loss: 623.656 | Val Loss: 669.767
	 Train Accu: 0.097 | Val Accu: 0.025


100%|██████████| 223/223 [00:16<00:00, 13.44it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.54it/s]


Chunk 0:2 | Epoch 2/50 > Time Cost: 26.74s | patience: 0 
	 Train Loss: 472.603 | Val Loss: 638.246
	 Train Accu: 0.316 | Val Accu: 0.071


100%|██████████| 223/223 [00:17<00:00, 12.95it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.41it/s]


Chunk 0:3 | Epoch 3/50 > Time Cost: 27.61s | patience: 0 
	 Train Loss: 328.560 | Val Loss: 398.219
	 Train Accu: 0.524 | Val Accu: 0.419


100%|██████████| 223/223 [00:16<00:00, 13.14it/s] 
100%|██████████| 56/56 [00:11<00:00,  4.74it/s]


Chunk 0:4 | Epoch 4/50 > Time Cost: 28.84s | patience: 0 
	 Train Loss: 253.281 | Val Loss: 215.553
	 Train Accu: 0.633 | Val Accu: 0.685


100%|██████████| 223/223 [00:17<00:00, 12.91it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.32it/s]


Chunk 0:5 | Epoch 5/50 > Time Cost: 27.84s | patience: 0 
	 Train Loss: 177.163 | Val Loss: 178.483
	 Train Accu: 0.744 | Val Accu: 0.740


100%|██████████| 223/223 [00:20<00:00, 10.65it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.31it/s]


Chunk 0:6 | Epoch 6/50 > Time Cost: 31.52s | patience: 0 
	 Train Loss: 117.509 | Val Loss: 114.427
	 Train Accu: 0.830 | Val Accu: 0.834


100%|██████████| 223/223 [00:16<00:00, 13.37it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.11it/s]


Chunk 0:7 | Epoch 7/50 > Time Cost: 27.67s | patience: 0 
	 Train Loss: 95.477 | Val Loss: 117.898
	 Train Accu: 0.862 | Val Accu: 0.829


100%|██████████| 223/223 [00:16<00:00, 13.51it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.51it/s]


Chunk 0:8 | Epoch 8/50 > Time Cost: 26.70s | patience: 1 
	 Train Loss: 88.490 | Val Loss: 145.356
	 Train Accu: 0.872 | Val Accu: 0.789


100%|██████████| 223/223 [00:20<00:00, 10.73it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.52it/s]


Chunk 0:9 | Epoch 9/50 > Time Cost: 30.95s | patience: 2 
	 Train Loss: 84.431 | Val Loss: 133.699
	 Train Accu: 0.878 | Val Accu: 0.806


100%|██████████| 223/223 [00:16<00:00, 13.51it/s] 
100%|██████████| 56/56 [00:09<00:00,  5.63it/s]


Chunk 0:10 | Epoch 10/50 > Time Cost: 26.47s | patience: 0 
	 Train Loss: 82.238 | Val Loss: 146.833
	 Train Accu: 0.881 | Val Accu: 0.788


100%|██████████| 223/223 [00:16<00:00, 13.24it/s] 
100%|██████████| 56/56 [00:09<00:00,  5.67it/s]


Chunk 0:11 | Epoch 11/50 > Time Cost: 26.76s | patience: 1 
	 Train Loss: 80.328 | Val Loss: 131.357
	 Train Accu: 0.884 | Val Accu: 0.810


100%|██████████| 223/223 [00:17<00:00, 13.03it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.45it/s]


Chunk 0:12 | Epoch 12/50 > Time Cost: 27.41s | patience: 0 
	 Train Loss: 80.093 | Val Loss: 127.489
	 Train Accu: 0.884 | Val Accu: 0.815


100%|██████████| 223/223 [00:18<00:00, 12.24it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.37it/s]


Chunk 0:13 | Epoch 13/50 > Time Cost: 28.68s | patience: 0 
	 Train Loss: 78.788 | Val Loss: 131.102
	 Train Accu: 0.886 | Val Accu: 0.810


100%|██████████| 223/223 [00:16<00:00, 13.59it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.53it/s]


Chunk 0:14 | Epoch 14/50 > Time Cost: 26.56s | patience: 1 
	 Train Loss: 78.329 | Val Loss: 130.880
	 Train Accu: 0.886 | Val Accu: 0.811


100%|██████████| 223/223 [00:16<00:00, 13.58it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.50it/s]


Chunk 0:15 | Epoch 15/50 > Time Cost: 26.62s | patience: 0 
	 Train Loss: 77.376 | Val Loss: 131.178
	 Train Accu: 0.888 | Val Accu: 0.810


100%|██████████| 223/223 [00:17<00:00, 12.91it/s] 
100%|██████████| 56/56 [00:09<00:00,  5.61it/s]


Chunk 0:16 | Epoch 16/50 > Time Cost: 27.29s | patience: 1 
	 Train Loss: 76.804 | Val Loss: 113.231
	 Train Accu: 0.889 | Val Accu: 0.836


100%|██████████| 223/223 [00:16<00:00, 13.57it/s] 
100%|██████████| 56/56 [00:09<00:00,  5.67it/s]


Chunk 0:17 | Epoch 17/50 > Time Cost: 26.34s | patience: 0 
	 Train Loss: 75.403 | Val Loss: 137.754
	 Train Accu: 0.891 | Val Accu: 0.801


100%|██████████| 223/223 [00:16<00:00, 13.53it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.55it/s]


Chunk 0:18 | Epoch 18/50 > Time Cost: 26.60s | patience: 1 
	 Train Loss: 74.771 | Val Loss: 138.123
	 Train Accu: 0.892 | Val Accu: 0.800


100%|██████████| 223/223 [00:16<00:00, 13.66it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.55it/s]


Chunk 0:19 | Epoch 19/50 > Time Cost: 26.43s | patience: 2 
	 Train Loss: 74.928 | Val Loss: 120.867
	 Train Accu: 0.891 | Val Accu: 0.825


100%|██████████| 223/223 [00:16<00:00, 13.43it/s] 
100%|██████████| 56/56 [00:10<00:00,  5.48it/s]


Chunk 0:20 | Epoch 20/50 > Time Cost: 26.85s | patience: 0 
	 Train Loss: 74.304 | Val Loss: 130.896
	 Train Accu: 0.892 | Val Accu: 0.810


  0%|          | 0/223 [00:00<?, ?it/s]

In [None]:
# 清理
del(inp)
del(outp)
del(train_dataset)
del(train_loader)
del(val_dataset)
del(val_loader)

# Test Model

In [None]:
try:
    del(net)
    print('Latest net released!')
except:
    print('No model trained')

In [None]:
net = Net().to(device)
# net.load_state_dict(torch.load('/kaggle/input/atmospred/pytorch/trained.pth/1/20240502-001.pth', map_location=device))
net.load_state_dict(torch.load('best.pth', map_location=device))
print('Load best model.')

In [None]:
test_dataset = MLP_Dataset(test_in, test_out)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
print(f'{len(test_dataset) = }')

In [None]:
timer.start('Run Test')
net.eval()
test_loss = 0.0
test_accu = 0.0
num_test_batches = 0

with torch.no_grad():
    for i, (inp, outp) in enumerate(tqdm(test_loader)):
        inp, outp = [inps.to(device, non_blocking=True) for inps in inp], outp.to(device)
        out_h = net(inp)
        crit = criterion(out_h, outp)
        test_loss += crit.item()
        test_accu += r2score(out_h, outp)
        num_test_batches += 1

avg_test_loss = test_loss / num_test_batches
avg_test_accu = test_accu / num_test_batches

print(f'Test Loss: {avg_test_loss:.3f} | Test Accu: {avg_test_accu:.3f}')
timer.stop()

In [None]:
# 清理变量
del(test_dataset)
del(test_loader)
del(out_h)
del(inp, outp)