In [1]:
import polars as pl
import gc
import pickle
from pathlib import Path, PosixPath
from tqdm.auto import tqdm

import sys
sys.path.append('..')

from src.utils import seed_everything, get_logger, get_config, TimeUtil
from src.utils.competition_utils import clipping_input
from src.data import DataProvider, FeatureEngineering, Preprocessor, HFPreprocessor, PostProcessor
from src.train import get_dataloader, Trainer

In [2]:
# Run parameters (notebook stand-in for command-line arguments).
config_name = "exp_146"  # experiment config name handed to get_config
run_mode = "hf"  # one of: hf, full, dev, debug

In [3]:
# Load the experiment configuration selected by `config_name` from ../config.
config = get_config(config_name, config_dir=Path('../config'))
# The logger is built from the output path carried by the freshly loaded config.
# NOTE(review): a later cell reassigns config.output_path after this logger
# already exists — confirm the log destination is the intended one.
logger = get_logger(config.output_path)
logger.info(f"Start EXP={config.exp}...")
# Record the run mode chosen in the parameter cell on the config object.
config.run_mode = run_mode

# Seed using the value stored in the config.
seed_everything(config.seed)

[ [32m2024-10-14 06:26:03[0m | [1mINFO ] Start EXP=151...[0m


In [4]:
# Overrides applied for this experiment (they replace values loaded from the config file).
config.run_mode = 'dev'

# Schedule for the first training stage.
config.epochs = config.first_cycle_epochs = 40
# Schedule for the additional training stage.
config.add_epochs = config.add_first_cycle_epochs = 10

# Data locations, relative to the notebook directory.
config.input_path = Path('../data/input')
config.add_path = Path('../data/input/additional')

# Per-experiment output locations, keyed by config.exp; create them up front.
config.output_path = Path(f'../data/output/{config.exp}')
config.oof_path = Path(f'../data/oof/{config.exp}')
config.output_path.mkdir(parents=True, exist_ok=True)
config.oof_path.mkdir(parents=True, exist_ok=True)

In [5]:
# Load the train and test frames through the project DataProvider.
with TimeUtil.timer('Data Loading...'):
    dpr = DataProvider(config)
    train_df, test_df = dpr.load_data()


# Apply the same feature-engineering step to each frame in turn; every name is
# rebound to its result, so the pre-feature-engineering frame is no longer
# referenced by that name afterwards.
with TimeUtil.timer('Feature Engineering...'):
    fer = FeatureEngineering(config)
    train_df = fer.feature_engineering(train_df)
    test_df = fer.feature_engineering(test_df)


with TimeUtil.timer('Scaling and Clipping Features...'):
    # Scale both frames in one call; the preprocessor then exposes the input
    # and target column lists used by the rest of the notebook.
    ppr = Preprocessor(config)
    train_df, test_df = ppr.scaling(train_df, test_df)
    input_cols, target_cols = ppr.input_cols, ppr.target_cols
    if config.task_type == 'grid_pred':
        # For this task type the target columns are removed from the training frame.
        train_df = train_df.drop(target_cols)

    # Fold 0 is held out for validation; every other fold stays in the training frame.
    valid_df = train_df.filter(pl.col('fold') == 0)
    train_df = train_df.filter(pl.col('fold') != 0)

    # The first call returns the clipped validation frame together with a clip
    # dictionary; the second call passes no training frame and reuses that
    # dictionary for the test frame. train_df itself is not reassigned here.
    valid_df, input_clip_dict = clipping_input(train_df, valid_df, input_cols)
    test_df, _ = clipping_input(None, test_df, input_cols, input_clip_dict)

    # Persist the clip dictionary next to the other experiment outputs. The
    # context manager guarantees the file is flushed and closed even if
    # pickling fails (the handle was previously left open).
    with open(config.output_path / 'input_clip_dict.pkl', 'wb') as clip_file:
        pickle.dump(input_clip_dict, clip_file)


# Convert the three frames into the array container used by the data loaders
# (it is indexed by string keys such as 'X_train' further down).
with TimeUtil.timer('Converting to arrays for NN...'):
    array_data = ppr.convert_numpy_array(train_df, valid_df, test_df)
    # The frames are not needed once the arrays exist: drop the names and
    # trigger a collection so their memory can be reclaimed.
    del train_df, valid_df, test_df
    gc.collect()


if config.run_mode == 'hf':
    with TimeUtil.timer('HF Data Preprocessing...'):
        # In 'hf' mode the in-memory training arrays are discarded; only the
        # validation and test entries remain in array_data.
        del array_data['train_ids']
        del array_data['X_train']
        del array_data['y_train']
        gc.collect()

        # Prepare the HF data: shrink the files, then convert them to numpy
        # arrays with unlink_parquet=True.
        hf_ppr = HFPreprocessor(config)
        hf_ppr.shrink_file_size()
        hf_ppr.convert_numpy_array(unlink_parquet=True)



[Data Loading...] done [73.6GB(19.0%)(+73.097GB)] 25.5372 s
[Feature Engineering...] done [83.5GB(14.9%)(+9.832GB)] 17.6414 s
[Scaling and Clipping Features...] done [74.9GB(10.0%)(-8.555GB)] 33.1294 s
[Converting to arrays for NN...] done [100.6GB(20.1%)(+25.668GB)] 116.2043 s


In [6]:
with TimeUtil.timer('Creating Torch DataLoader...'):
    # Training loader: built from the in-memory arrays, except in 'hf' mode
    # where the loader is asked to read with hf_read_type='npy' instead.
    if config.run_mode != 'hf':
        train_loader = get_dataloader(
            config, array_data['train_ids'], array_data['X_train'], array_data['y_train'], is_train=True
        )
    else:
        train_loader = get_dataloader(config, hf_read_type='npy', is_train=True)

    # Validation loader gets ids, inputs and targets; the test loader gets ids
    # and inputs only. Both are created with is_train=False.
    valid_loader = get_dataloader(
        config, array_data['valid_ids'], array_data['X_valid'], array_data['y_valid'], is_train=False
    )
    test_loader = get_dataloader(config, array_data['test_ids'], array_data['X_test'], is_train=False)

    # The array container is no longer referenced by name after this point.
    del array_data
    gc.collect()

[Creating Torch DataLoader...] done [100.6GB(20.1%)(+0.000GB)] 0.1103 s


# Trainer

In [7]:
# First training stage: fit with column-wise mode enabled and keep the three
# values returned by the trainer for logging and for the follow-up stage.
trainer = Trainer(config, logger)
best_score, best_cw_score, best_epochs = trainer.train(train_loader, valid_loader, colwise_mode=True)
logger.info(f"First Training Results: best_score={best_score}, best_cw_score={best_cw_score}, best_epochs={best_epochs}")

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/6750 [00:00<?, ?it/s]

  0%|          | 0/306 [00:00<?, ?it/s]

[ [32m2024-10-14 06:52:18[0m | [1mINFO ] [Valid] : Epoch=0, Loss=0.17575, Score=0.67290, Best Col-Wise Score=0.67290[0m
[ [32m2024-10-14 07:06:24[0m | [1mINFO ] [Train] : Epoch=0, Loss=0.18005, LR=4.99280e-04[0m


  0%|          | 0/6750 [00:00<?, ?it/s]

  0%|          | 0/306 [00:00<?, ?it/s]

[ [32m2024-10-14 07:15:19[0m | [1mINFO ] [Valid] : Epoch=1, Loss=0.16446, Score=0.69192, Best Col-Wise Score=0.69334[0m


KeyboardInterrupt: 

In [None]:
# Best Col-Wise Score logged by the validation passes above:
# 0.67290  (Epoch=0)
# 0.69334  (Epoch=1)

# Additional Training

In [None]:
# Additional Training
# Switch the config over to the "add_*" settings for the second training stage.
config.loss_type = config.add_loss_type
config.epochs = config.add_epochs
config.lr = config.add_lr
config.first_cycle_epochs = config.add_first_cycle_epochs

# Collect the checkpoints in the output directory, ordered by the integer
# obtained from the last '_'-separated piece of the file stem once 'eval' is
# stripped from it; the last entry is the one training resumes from.
trained_weights = sorted(
    config.output_path.glob("model_eval*.pth"),
    key=lambda x: int(x.stem.split('_')[-1].replace('eval', ''))
)
if not trained_weights:
    # Fail with an actionable message instead of an IndexError on [-1] below.
    raise FileNotFoundError(
        f"No 'model_eval*.pth' checkpoint found in {config.output_path}; "
        "run the first training stage before the additional training."
    )

# Resume from the selected checkpoint. `best_score` must already exist from
# the first training cell; it is passed on as the score to improve upon.
trainer = Trainer(config, logger)
best_score, best_cw_score, best_epochs = trainer.train(
    train_loader,
    valid_loader,
    colwise_mode=True,
    retrain=True,
    retrain_weight_name=trained_weights[-1].stem,
    retrain_best_score=best_score,
)
logger.info(f'Additional Training Results: best_score={best_score}, best_cw_score={best_cw_score}, best_epochs={best_epochs}')



# Inference

In [None]:
# Inference
# Predict on the test loader with eval_method="single" and write the result
# straight to submission.csv, i.e. before any post-processing is applied.
pred_df = trainer.test_predict(test_loader, eval_method="single")
pred_df.write_csv(config.output_path / 'submission.csv')

# PostProcess
# Read the out-of-fold predictions stored under config.oof_path and run them,
# together with the test predictions, through the project PostProcessor.
oof_df = pl.read_parquet(config.oof_path / 'oof.parquet')
por = PostProcessor(config, logger)
oof_df, sub_df = por.postprocess(oof_df, pred_df)
# NOTE(review): sub_df is only logged in this cell, not written — confirm
# whether PostProcessor saves it or submission.csv should be refreshed here.
logger.info(f'OOF: {oof_df.shape}, Submission: {sub_df.shape}')

  torch.load(self.config.output_path / f"model{self.save_suffix}_best.pth")


  0%|          | 0/153 [00:00<?, ?it/s]