In [1]:
import polars
import pandas
import numpy as np
import torch
import optuna
import logging
import lightning.pytorch as pl
from torch.utils.data import Dataset, DataLoader, random_split
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
from tqdm import tqdm
from datasets import LEAPDataset, LEAPTestDataset
from ptlit import PTLit
from utils import r2_score

In [2]:
def _load_float32(path):
    """Load the object stored at ``path`` with ``torch.load`` and cast it to float32."""
    return torch.load(path).to(torch.float32)


# The two input tensors handed to the dataset below.
src_seq = _load_float32("../data/src_seq_p.pt")
src_scl = _load_float32("../data/src_scl_p.pt")

# Label tensors and the statistics stored next to them.
labeln = _load_float32("../data/labeln.pt")
labelc = _load_float32("../data/labelc.pt")
labelcc = _load_float32("../data/labelc1e-5.pt")
labelmu = _load_float32("../data/labelc1e-5mu.pt")
labelstd = _load_float32("../data/labelc1e-5std.pt")

# The stored weights are turned into a boolean mask: non-zero entries become True.
mask = _load_float32("../data/weight.pt").bool()

# shuffle=False so that batches come out in dataset order.
train_ds = LEAPTestDataset(src_seq, src_scl)
train_loader = DataLoader(train_ds, shuffle=False, batch_size=4096)

In [45]:
# Checkpoints of the ensemble members, in the order their weights are indexed later.
checkpoint_paths = [
    "../ckpt/kf/lin-c5-0-epoch=53-val_score=0.738.ckpt",
    "../ckpt/te/large-c5-2-epoch=54-val_score=0.747.ckpt",
    "../ckpt/jnet/large-c5-2-epoch=52-val_score=0.746.ckpt",
]
# Every restored model is cast to float32 right after loading.
models = [PTLit.load_from_checkpoint(path).float() for path in checkpoint_paths]

In [7]:
# TensorBoard logs are written below the "logger" directory.
logger = TensorBoardLogger(save_dir="logger")

# Checkpoint callback tracking 'val_loss' (lower is better); save_top_k=-1 puts no
# limit on how many checkpoints are kept.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=-1,
    dirpath='ckpt/',
    filename='te-base-{epoch:02d}-{val_loss:.2f}',
)

# Single-GPU trainer pinned to device index 6.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[6],
    max_epochs=1,
    logger=logger,
    callbacks=[checkpoint_callback],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
torch.set_float32_matmul_precision('high')
# One entry per ensemble member: the result of trainer.predict on the training loader.
pred = [trainer.predict(member, train_loader) for member in models]

In [48]:
# Columns where `mask` is False are the ones that get zeroed.
inverse_mask = torch.logical_not(mask.bool())


def _concat_and_zero_masked(batches):
    """Concatenate one model's prediction batches and zero the masked-out columns."""
    stacked = torch.cat(batches)
    stacked[:, inverse_mask] = 0.
    return stacked


preds = [_concat_and_zero_masked(batches) for batches in pred]

In [51]:
optuna.logging.set_verbosity(optuna.logging.WARNING)
num_models = len(preds)
num_targets = labelc.size(1)
ps_f = torch.zeros_like(preds[0])
alphas = torch.zeros(num_targets, num_models)


def _blend_weights(pick):
    """Build one blend weight per model; the returned weights sum to 1.

    Args:
        pick: callable ``pick(name, upper)`` returning the weight of the
            parameter called ``name`` on a 0..10 scale. ``upper`` is the part
            of the budget of 10 that earlier models have not used yet.

    Returns:
        A list of ``num_models`` floats. The first ``num_models - 1`` come from
        ``pick`` (divided by 10); the last model receives the leftover budget.
    """
    weights = []
    remaining_sum = 10.0
    for j in range(num_models - 1):
        w_j = pick(f'weight_{j}', remaining_sum)
        weights.append(w_j / 10.)
        remaining_sum -= w_j
    # The last model is not a free parameter: it takes what is left of the budget.
    weights.append(remaining_sum / 10.)
    return weights


def _blend_column(weights, i):
    """Return the weighted sum of every model's predictions for target column ``i``."""
    return sum(weight * preds[m][:, i] for m, weight in enumerate(weights))


def objective(trial, i):
    """Optuna objective: score of the blended predictions for target column ``i``.

    The weights are sampled in steps of 0.5 on a 0..10 scale and include the
    remainder weight of the last model, so the blend scored here is exactly the
    blend that is later built from ``study.best_params``. (Previously the last
    model was left out of the scored blend but included in the final one.)

    Args:
        trial: the Optuna trial used to sample the weights.
        i: index of the target column to blend.

    Returns:
        ``r2_score`` of the blended column against ``labeln[:, i]``.
    """
    weights = _blend_weights(
        lambda name, upper: trial.suggest_float(name, 0, upper, step=.5)
    )
    # NOTE(review): the blend is scored against `labeln`, while the test cell
    # de-normalises with `labelstd`/`labelmu` — confirm `labeln` is the space the
    # models actually predict in.
    return r2_score(_blend_column(weights, i), labeln[:, i])


for i in tqdm(range(num_targets)):
    if not mask[i]:
        # Masked-out targets keep all-zero weights and all-zero blended predictions.
        continue
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, i), n_trials=200)
    # Rebuild the full weight vector (including the remainder) from the best trial.
    best_weights = _blend_weights(lambda name, _upper: study.best_params[name])
    alphas[i] = torch.tensor(best_weights)
    ps_f[:, i] = _blend_column(best_weights, i)

100%|██████████| 368/368 [18:23<00:00,  3.00s/it]


tensor(0.6916)


In [5]:
# Test inputs, cast to float32 like the training inputs.
test_seq = torch.load("../data/test_seq_p.pt").float()
test_scl = torch.load("../data/test_scl_p.pt").float()

# shuffle=False so that batches come out in dataset order.
test_ds = LEAPTestDataset(test_seq, test_scl)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=1024)

In [None]:
# One entry per ensemble member: the result of trainer.predict on the test loader.
test_pred = [trainer.predict(member, test_loader) for member in models]

In [11]:
num_models = len(test_pred)
# Concatenate each model's prediction batches into one (rows, targets) tensor.
test_ps_list = [torch.cat(batches) for batches in test_pred]
test_ps_final = torch.zeros_like(test_ps_list[0])

# `alphas` comes from the weight-search cell; fail loudly if it is stale (e.g. it
# was fitted for a different number of models or targets) instead of silently
# blending with the wrong weights.
assert alphas.shape == (test_ps_final.size(1), num_models), (
    f"alphas has shape {tuple(alphas.shape)}, expected "
    f"({test_ps_final.size(1)}, {num_models})"
)

# Accumulate the blend one model at a time. alphas[:, m] holds model m's weight
# for every target and broadcasts across the rows, which replaces the former
# per-column Python loop (and its no-op float64 cast of a single weight).
for m, model_preds in enumerate(test_ps_list):
    test_ps_final += model_preds * alphas[:, m].to(model_preds.dtype)

# Undo the label standardisation, then zero the masked-out target columns.
test_ps_final = test_ps_final * labelstd + labelmu
test_ps_final[:, inverse_mask] = 0.

In [12]:
# Sample submission: `ss2` is the complete file, `ss` only its first row.
ss2 = polars.read_csv("~/leap/data/sample_submission.csv")
ss = polars.read_csv('~/leap/data/sample_submission.csv', n_rows=1)

# Training and test tables.
test_df = polars.read_csv('~/leap/data/test.csv')
df = polars.read_csv('~/leap/data/train.csv')

In [13]:
# Column 0 is skipped; columns 1..556 are treated as sources, the rest as targets.
SRC_COLS = df.columns[1:557]
TGT_COLS = df.columns[557:]

# Build each cast once and apply it with a single with_columns call per frame,
# instead of rebuilding every frame once per column inside a Python loop.
src_as_float = polars.col(SRC_COLS).cast(polars.Float64)
tgt_as_float = polars.col(TGT_COLS).cast(polars.Float64)

df = df.with_columns([src_as_float, tgt_as_float])
test_df = test_df.with_columns([src_as_float])
ss = ss.with_columns([tgt_as_float])
ss2 = ss2.with_columns([tgt_as_float])

In [None]:
# Start from the sample submission and overwrite every column after the first
# with the blended test predictions.
ss = pandas.read_csv("~/leap/data/sample_submission.csv")
ss.iloc[:, 1:] = test_ps_final.numpy()

# The 27 columns ptend_q0002_0 .. ptend_q0002_26 are not taken from the models;
# they are recomputed from the matching state_q0002_* input column.
use_cols = [f"ptend_q0002_{level}" for level in range(27)]

# `test_df` and `ss2` are still polars frames here (the to_pandas() conversion is
# left disabled), so the right-hand side is a polars Series assigned by position.
# NOTE(review): 1200 is presumably the model time step in seconds — confirm.
for col in use_cols:
    state_col = col.replace("ptend", "state")
    ss[col] = - test_df[state_col] * ss2[col] / 1200.

# Keep only the id and target columns, then write the CSV through polars.
submission_cols = ["sample_id"] + TGT_COLS
test_polars = polars.from_pandas(ss[submission_cols])
test_polars.write_csv("../outputs/emc555.csv")

In [20]:
# Upload a submission file to the competition through the Kaggle CLI.
# NOTE(review): this submits ../outputs/emn6.csv, whereas the cell above writes
# ../outputs/emc555.csv — confirm which file is meant before re-running.
! kaggle competitions submit -c leap-atmospheric-physics-ai-climsim -f ../outputs/emn6.csv -m "7361"

100%|██████████████████████████████████████| 4.07G/4.07G [00:46<00:00, 94.7MB/s]
Successfully submitted to LEAP - Atmospheric Physics using AI (ClimSim)