In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for dirname in input_file_paths:
    print(dirname)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/leap-atmospheric-physics-ai-climsim/sample_submission.csv
/kaggle/input/leap-atmospheric-physics-ai-climsim/train.csv
/kaggle/input/leap-atmospheric-physics-ai-climsim/test.csv


In [2]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import r2_score
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
# Data locations
train_csv = '/kaggle/input/leap-atmospheric-physics-ai-climsim/train.csv'
test_csv = '/kaggle/input/leap-atmospheric-physics-ai-climsim/test.csv'
subm_spl = '/kaggle/input/leap-atmospheric-physics-ai-climsim/sample_submission.csv'
out_csv = 'submission.csv'

read_chunk_size = 100000 # number of rows read from the training CSV per chunk

# Training parameters
num_epochs = 50
max_patience = 3
batch_size = 360
# Never ask for more DataLoader worker processes than the host has CPU cores:
# a fixed 256 oversubscribes typical machines and makes PyTorch warn about
# slow or frozen data loading. 256 is kept as the upper bound.
num_workers = min(256, os.cpu_count() or 1)
lr = 0.0005

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
def graph(acc, loss, title):
    """Plot accuracy and loss curves in two sub-plots and show the figure.

    Args:
        acc (list): Accuracy series, either [train, val] or [test].
        loss (list): Loss series, laid out the same way as ``acc``.
        title (str): Title placed on the accuracy sub-plot.

    Raises:
        AssertionError: If ``acc`` and ``loss`` hold a different number of series.
    """
    assert len(acc) == len(loss), 'Length of acc and loss must be the same'

    series_count = len(acc)
    if series_count == 2:
        # Train/validation run: panels side by side.
        grid, prefixes = (1, 2), ('Training', 'Validation')
    elif series_count == 1:
        # Single (test) series: panels stacked vertically.
        grid, prefixes = (2, 1), ('Test',)
    else:
        # Any other count draws nothing; only plt.show() runs below.
        grid, prefixes = None, ()

    if grid is not None:
        panels = ((acc, 'Accuracy'), (loss, 'Loss'))
        for position, (curves, metric) in enumerate(panels, start=1):
            plt.subplot(grid[0], grid[1], position)
            for curve, prefix in zip(curves, prefixes):
                plt.plot(curve, label=f'{prefix} {metric}')
            if position == 1:
                # Only the first (accuracy) panel carries the title.
                plt.title(title)
            plt.ylabel(metric)
            plt.xlabel('Epoch')
            plt.legend()
    plt.show()

In [5]:
import time

class Timer:
    """Labelled stopwatch that records one duration per start()/stop() pair.

    Attributes:
        t0: Wall-clock time of the most recent start().
        t1: Wall-clock time of the most recent stop().
        times: Measured durations in seconds, in recording order.
        infos: Labels of the form '<index> <info>', one per start().
    """

    def __init__(self):
        self.t0 = 0
        self.t1 = 0
        self.times = []
        self.infos = []

    def start(self, info = 'Run'):
        """Begin a measurement labelled ``info``."""
        self.t1 = self.t0
        self.infos.append(f'{len(self.infos)} {info}')
        self.t0 = time.time()

    def stop(self):
        """Finish the pending measurement, record it and print it.

        Raises:
            RuntimeError: If there is no start() still waiting for its stop().
        """
        # Without this guard a stray stop() would append a meaningless duration
        # (and, with no label at all, then crash on infos[-1]), leaving
        # `times` and `infos` out of step for get_stats().
        if len(self.times) >= len(self.infos):
            raise RuntimeError('Timer.stop() called without a matching start()')
        self.t1 = time.time()
        t = self.t1 - self.t0
        self.times.append(t)
        print(f'{self.infos[-1]} Time Cost: {t:.3f}s')

    def get_stats(self):
        """Print every recorded label with its duration, then the total."""
        for info, tm in zip(self.infos, self.times):
            print(f'{info}\t{tm:.3f}s')
        print(f'Total: {sum(self.times):.3f} ')

    def clear(self, idx=0):
        """Remove recorded measurements.

        Args:
            idx: 0 clears everything and returns None. A positive value is a
                1-based position; a negative value counts from the end.

        Returns:
            The removed ``(info, time)`` pair when ``idx`` is non-zero.
        """
        if idx == 0:
            self.infos.clear()
            self.times.clear()
            return
        # Positive positions are 1-based, negative ones are used as-is.
        pos = idx - (0 if idx < 0 else 1)
        info = self.infos.pop(pos)
        tm = self.times.pop(pos)
        return info, tm

# Notebook-wide timer shared by the cells below.
timer = Timer()

In [None]:
# Read the data: open the training CSV as a chunked reader and pull the first chunk.
# Note that next() consumes that chunk, so any later iteration over
# `train_chunks` continues from the chunk after it.
timer.start(f'Read dataset chunk size {read_chunk_size}')
train_chunks = pd.read_csv(train_csv, chunksize = read_chunk_size)
train_data = next(train_chunks)
timer.stop()
# Disabled: each extra call below would advance to the following chunk.
# train_data = next(train_chunks)
# train_data = next(train_chunks)
# Column labels of the chunk; split_io slices them into input/output columns.
cols = train_data.columns

train_data.shape

# 划分数据集

1. 划分数据的输入输出
2. 划分 训练集，验证集(，测试集)

In [None]:
# Dataset splitting helpers

def split_io(dframe, in_cols = cols[1:557], out_cols = cols[557:]):
    """Split a frame column-wise into model inputs and model outputs.

    Args:
        dframe: DataFrame holding both input and output columns.
        in_cols: Labels of the input columns (default: columns 1..556 of `cols`).
        out_cols: Labels of the output columns (default: columns 557 onward).

    Returns:
        Tuple ``(inputs, outputs)`` of DataFrames selected from ``dframe``.
    """
    return dframe[in_cols], dframe[out_cols]

def split_tvt(dframe, ratio=[0.8], shuffle=False, random_state=None):
    """Split a frame row-wise into train / validation (/ test) parts.

    Args:
        dframe: DataFrame to split.
        ratio: ``[train]`` or ``[train, val]`` fractions. With one entry the
            validation part receives every remaining row. With two entries the
            remaining rows form the test part; if the two fractions already add
            up to 1 there is no test part and validation receives the remainder.
            ``ratio`` is only read, never modified.
        shuffle: Shuffle the rows (and reset the index) before splitting.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so a
            shuffled split can be reproduced.

    Returns:
        ``(train, val, test)`` when a test part exists, otherwise ``(train, val)``.

    Raises:
        AssertionError: If ``ratio`` is empty or sums to more than 1.
    """
    # Tolerance for float round-off: 1 - 0.8 - 0.2 is about -5.6e-17, not 0,
    # so an exact comparison against zero misjudges whether a test part exists.
    eps = 1e-9
    assert sum(ratio) <= 1 + eps, "Ratio sum for train and val cannot be bigger than 1"
    assert len(ratio) > 0, "Ratio cannot be empty"

    train_rat = ratio[0]
    val_rat = (1 - train_rat) if len(ratio) == 1 else ratio[1]
    has_test = len(ratio) == 2 and (1 - train_rat - val_rat) > eps

    data_size = dframe.shape[0]
    train_size = int(data_size * train_rat)
    # When train + val cover the whole frame, hand validation all remaining
    # rows so truncation by int() cannot silently drop any.
    remainder_to_val = len(ratio) == 1 or (len(ratio) == 2 and not has_test)
    val_size = (data_size - train_size) if remainder_to_val else int(data_size * val_rat)

    if shuffle:
        dframe = dframe.sample(frac=1, random_state=random_state).reset_index(drop=True)

    train = dframe.iloc[:train_size]
    val = dframe.iloc[train_size:train_size + val_size]

    if has_test:
        test = dframe.iloc[train_size + val_size:]
        return train, val, test
    return train, val

In [None]:
# Split the first chunk: 70% train, 20% validation, remaining rows as test (shuffled).
train_set, val_set, test_set = split_tvt(train_data, [0.7, 0.2], True)
(train_in, train_out), (val_in, val_out), (test_in, test_out) = (
    split_io(train_set),
    split_io(val_set),
    split_io(test_set),
)

print(f'{val_set.shape = }')
print(f'{test_set.shape = }')
print(f'{train_in.shape = }')
print(f'{train_out.shape = }')
print(f'{val_in.shape = }')
print(f'{val_out.shape = }')

# Drop the names of the un-split frames; only the *_in / *_out frames are used from here on.
del train_data, train_set, val_set, test_set

In [None]:
class MLP_Dataset(Dataset):
    """Tabular dataset yielding ``(input, target)`` float32 tensor pairs.

    Both frames are converted to float32 tensors once, at construction, so that
    indexing is a cheap tensor lookup rather than a per-row pandas ``.iloc``.
    Each sample is one-dimensional (``(n_inputs,)`` and ``(n_targets,)``), so a
    default-collated batch is ``(batch, n_features)``.

    Args:
        dsin: DataFrame of input features, one row per sample.
        dsout: DataFrame of targets, row-aligned with ``dsin``.
        transform: Optional callable applied to the input tensor and to the
            target tensor of every sample. Defaults to None: an image
            transform such as ``ToTensor`` would add extra dimensions to a
            feature vector and is not meaningful for tabular rows.

    Raises:
        ValueError: If ``dsin`` and ``dsout`` have a different number of rows.
    """

    def __init__(self, dsin, dsout, transform = None):
        if len(dsin) != len(dsout):
            raise ValueError(
                f'Inputs and targets must have the same number of rows, '
                f'got {len(dsin)} and {len(dsout)}'
            )
        self.dsin = dsin
        self.dsout = dsout
        self.transform = transform
        # copy=True guarantees a fresh, writable array for torch to wrap.
        self._inputs = torch.as_tensor(dsin.to_numpy(dtype='float32', copy=True))
        self._targets = torch.as_tensor(dsout.to_numpy(dtype='float32', copy=True))

    def __len__(self):
        return self._inputs.shape[0]

    def __getitem__(self, idx):
        data, targ = self._inputs[idx], self._targets[idx]
        if self.transform is not None:
            data, targ = self.transform(data), self.transform(targ)
        return data, targ

In [None]:
# Training split: reshuffled on every pass over the loader.
train_dataset = MLP_Dataset(train_in, train_out)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
)
print(f'{len(train_loader) = }')

# Validation split: kept in its original order.
val_dataset = MLP_Dataset(val_in, val_out)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True,
)
print(f'{len(val_loader) = }')

In [None]:
def get_chunk_loader(chunk, test = False):
    """Build train and validation DataLoaders for one chunk of the training CSV.

    The chunk is split with ``split_tvt``'s default ratio and without
    shuffling, then separated into input/output columns by ``split_io``.
    Shapes, dataset sizes and the elapsed time are printed along the way.

    Args:
        chunk: DataFrame holding one chunk of training rows.
        test: Currently unused.

    Returns:
        Tuple ``(train_loader, val_loader)``.
    """
    tmr = Timer()
    tmr.start('Load datasets...')

    train_set, val_set = split_tvt(chunk)
    (train_in, train_out), (val_in, val_out) = split_io(train_set), split_io(val_set)
    print(f'{train_in.shape = }')
    print(f'{train_out.shape = }')
    print(f'{val_in.shape = }')
    print(f'{val_out.shape = }')

    # Options shared by both loaders; only `shuffle` differs between them.
    common_opts = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=True)

    train_dataset = MLP_Dataset(train_in, train_out)
    train_loader = DataLoader(train_dataset, shuffle=True, **common_opts)
    print(f'{len(train_dataset) = }')

    val_dataset = MLP_Dataset(val_in, val_out)
    val_loader = DataLoader(val_dataset, shuffle=False, **common_opts)
    print(f'{len(val_dataset) = }')

    tmr.stop()
    return train_loader, val_loader

In [None]:
class Layer(nn.Module):
    """One hidden block: Linear -> BatchNorm1d -> ReLU -> Dropout.

    The linear and batch-norm modules are lazy, so their input size is
    inferred from the first batch passed through the block.

    Args:
        hidden: Number of output features of the linear layer.
    """

    def __init__(self, hidden):
        super().__init__()
        self.layer = nn.Sequential(
            nn.LazyLinear(hidden),
            nn.LazyBatchNorm1d(),
            nn.ReLU(),
            nn.Dropout(),
        )

    def forward(self, x):
        return self.layer(x)


class MLP(nn.Module):
    """Multi-layer perceptron: ``layers - 1`` hidden blocks plus a linear head.

    Args:
        layers: Total number of linear layers, including the output layer.
        hidden: Width of every hidden block.
        out_features: Size of the output layer (defaults to 368).
    """

    def __init__(self, layers=5, hidden=144, out_features=368):
        super().__init__()
        self.hidden = hidden
        # Build a separate Layer per position. Repeating one instance
        # (`[Layer(hidden)] * n`) would reuse the same weights everywhere, and
        # the lazy linear, once sized for the raw input, would reject the
        # `hidden`-wide activations on its second pass.
        hidden_blocks = [Layer(hidden) for _ in range(layers - 1)]
        self.linear = nn.Sequential(
            *hidden_blocks,
            nn.LazyLinear(out_features),
        )

    def forward(self, x):
        return self.linear(x)

In [None]:
# Instantiate the network with MLP's default constructor arguments.
net = MLP()

In [None]:
# Mean-squared-error objective, optimised with Adam at the configured learning rate.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=net.parameters(), lr=lr)

In [None]:
# Best (lowest) validation loss seen so far; starts at +inf so the first epoch always improves on it.
lowest_loss = float('inf')

# Per-epoch history: index 0 holds the training series, index 1 the validation series.
accus = [[], []]
losses = [[], []]

# Global epoch counter, and its value when the previous chunk was abandoned.
epoch = last_epoch = 0

In [None]:
# Train over the remaining CSV chunks. `epoch` is a global budget shared by all
# chunks: a chunk is trained until either the budget (`num_epochs`) is spent or
# validation loss has got worse `max_patience` times, then the next chunk is loaded.
net = net.to(device)

# Validation loss of the previous epoch. Before the first epoch there is none,
# so start from the best loss seen so far (replaces a bare try/except NameError).
last_val_loss = lowest_loss

for idx, chunk in enumerate(train_chunks):
    if epoch >= num_epochs:
        # Epoch budget exhausted: stop instead of parsing and wrapping further
        # chunks that would never be trained on.
        break
    patience = 0
    timer.start(f'Train {num_epochs} epochs on chunk {idx}')
    train_loader, val_loader = get_chunk_loader(chunk)
    while epoch < num_epochs:
        epoch += 1
        t0 = time.time()

        # ---- training pass ----
        net.train()
        train_loss = 0.0
        train_accu = 0.0
        num_train_batches = 0

        for inp, outp in tqdm(train_loader):
            inp, outp = inp.to(device, non_blocking = True), outp.to(device, non_blocking = True)
            out_h = net(inp)
            crit = criterion(out_h, outp)
            loss = crit.item()
            train_loss += loss
            # r2_score expects (y_true, y_pred) as host arrays: detach from the
            # autograd graph and move to CPU first.
            accu = r2_score(outp.detach().cpu().numpy(), out_h.detach().cpu().numpy())
            train_accu += accu
            crit.backward()
            optimizer.step()
            optimizer.zero_grad()
            num_train_batches += 1
        avg_train_loss = train_loss / num_train_batches
        losses[0].append(avg_train_loss)
        avg_train_accu = train_accu / num_train_batches
        accus[0].append(avg_train_accu)

        # ---- validation pass ----
        net.eval()
        val_loss = 0.0
        val_accu = 0.0
        num_val_batches = 0

        with torch.no_grad():
            for inp, outp in tqdm(val_loader):
                inp, outp = inp.to(device, non_blocking = True), outp.to(device, non_blocking = True)
                out_h = net(inp)
                crit = criterion(out_h, outp)
                val_loss += crit.item()
                accu = r2_score(outp.cpu().numpy(), out_h.cpu().numpy())
                val_accu += accu
                num_val_batches += 1
        avg_val_loss = val_loss / num_val_batches
        losses[1].append(avg_val_loss)
        avg_val_accu = val_accu / num_val_batches
        accus[1].append(avg_val_accu)

        if avg_val_loss < lowest_loss:
            torch.save(net.state_dict(), 'best.pth')  # save the parameters only, not the whole model object
            lowest_loss = avg_val_loss

        t1 = time.time()
        print(f'Chunk {idx} | Epoch {epoch - last_epoch}/{epoch} > Time Cost: {t1-t0:.2f}s | patience: {patience} \n\t', 
              f'Train Loss: {avg_train_loss:.3f} | Val Loss: {avg_val_loss:.3f}\n\t',
              f'Train Accu: {avg_train_accu:.3f} | Val Accu: {avg_val_accu:.3f}')
        # Count an epoch against the chunk when validation loss is above the
        # training loss AND worse than in the previous epoch. The counter is
        # cumulative within a chunk; it is not reset by a good epoch.
        if avg_train_loss < avg_val_loss and last_val_loss < avg_val_loss:
            patience += 1
        last_val_loss = avg_val_loss
        if patience >= max_patience:
            print(f'{max_patience} epochs had val loss bigger than train loss. Exit for next chunk of data')
            patience = 0
            last_epoch = epoch
            del(chunk)
            break
    timer.stop()
    torch.save(net.state_dict(), "latest.pth") 
graph(accus, losses, 'Train')