In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# Load the raw tables; the SAMPLE_ID column is discarded right away.
train = pd.read_csv('train.csv').drop(columns=['SAMPLE_ID'])
test = pd.read_csv('test.csv').drop(columns=['SAMPLE_ID'])

# Components extracted from the ATA timestamp, listed in the order the new
# columns are appended to each frame.
ATA_PARTS = ('year', 'month', 'day', 'hour', 'minute', 'weekday')

# Replace the ATA column with its numeric date/time components in both frames.
for frame in (train, test):
    ata = pd.to_datetime(frame['ATA'])
    for part in ATA_PARTS:
        frame[part] = getattr(ata.dt, part)
    # The parsed timestamp itself is not kept as a feature.
    frame.drop(columns='ATA', inplace=True)

# Encode the categorical columns as integer codes that mean the same thing in
# train and test. The fitted encoder of each column is kept in `encoders`.
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}

# Placeholder category assigned to test values that never occur in train.
UNSEEN_LABEL = '-1'

for feature in tqdm(categorical_features, desc="Encoding features"):
    # Work on the string form of BOTH frames so that the "seen in train?"
    # check compares like with like (e.g. a missing value becomes 'nan' on
    # both sides instead of being flagged as unseen only in test).
    train_labels = train[feature].astype(str)
    test_labels = test[feature].astype(str)

    # Fit once on the train categories plus the placeholder, BEFORE anything
    # is transformed. Adding the placeholder to `classes_` after encoding
    # train would shift the test codes relative to the train codes.
    le = LabelEncoder()
    le.fit(train_labels.tolist() + [UNSEEN_LABEL])

    # Anything the encoder does not know is routed to the placeholder.
    known_labels = set(le.classes_)
    test_labels = test_labels.where(test_labels.isin(known_labels), UNSEEN_LABEL)

    train[feature] = le.transform(train_labels)
    test[feature] = le.transform(test_labels)
    encoders[feature] = le

# # Missing-value handling (disabled: AutoGluon takes care of this preprocessing as well)
# train.fillna(train.mean(), inplace=True)
# test.fillna(train.mean(), inplace=True)


Encoding features: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.05it/s]


In [3]:
!pip install autogluon

Collecting autogluon
  Using cached autogluon-0.8.2-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==0.8.2
  Using cached autogluon.core-0.8.2-py3-none-any.whl (224 kB)
Collecting autogluon.timeseries[all]==0.8.2
  Using cached autogluon.timeseries-0.8.2-py3-none-any.whl (116 kB)
Collecting autogluon.tabular[all]==0.8.2
  Using cached autogluon.tabular-0.8.2-py3-none-any.whl (285 kB)
Collecting autogluon.multimodal==0.8.2
  Using cached autogluon.multimodal-0.8.2-py3-none-any.whl (372 kB)
Collecting autogluon.features==0.8.2
  Using cached autogluon.features-0.8.2-py3-none-any.whl (62 kB)
Collecting autogluon.common==0.8.2
  Using cached autogluon.common-0.8.2-py3-none-any.whl (61 kB)
Collecting ray[default]<2.4,>=2.3
  Using cached ray-2.3.1-cp310-cp310-win_amd64.whl (21.7 MB)
Collecting hyperopt<0.2.8,>=0.2.7
  Using cached hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
Collecting openmim<0.4.0,>=0.3.7
  Using cached openmim-0.3.9-py2.py3-none-any.whl (52 kB)
Collecting nlpaug



In [4]:
from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Wrap the preprocessed training frame for AutoGluon.
train_data = TabularDataset(train)

# Directory where AutoGluon stores the fitted models.
save_path = 'automodel'

# Regression on CI_HOUR, evaluated with mean absolute error.
predictor = TabularPredictor(
    label='CI_HOUR',
    problem_type='regression',
    eval_metric='mean_absolute_error',
    path=save_path,
)

# Train with the 'best_quality' preset under a 10-minute (600 s) budget.
# Kept as the last expression so the cell displays the fitted predictor.
predictor.fit(train_data=train_data, presets='best_quality', time_limit=10 * 60)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "automodel\"
AutoGluon Version:  0.8.2
Python Version:     3.10.8
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   165.51 GB / 510.77 GB (32.4%)
Train Data Rows:    367441
Train Data Columns: 30
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3471.49 MB
	Train Data (Original)  Memory Usage: 79.37 MB (2.3% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitt

[1000]	valid_set's l1: 53.8001
[2000]	valid_set's l1: 51.0345
[3000]	valid_set's l1: 49.2384
[4000]	valid_set's l1: 47.7656
[5000]	valid_set's l1: 46.6918


	Ran out of time, early stopping on iteration 5242. Best iteration is:
	[5242]	valid_set's l1: 46.5197


[1000]	valid_set's l1: 53.2308
[2000]	valid_set's l1: 50.8454
[3000]	valid_set's l1: 48.8834
[4000]	valid_set's l1: 47.4115
[5000]	valid_set's l1: 46.2512


	Ran out of time, early stopping on iteration 5366. Best iteration is:
	[5366]	valid_set's l1: 45.9405


[1000]	valid_set's l1: 53.6147
[2000]	valid_set's l1: 51.0085
[3000]	valid_set's l1: 49.0484
[4000]	valid_set's l1: 47.4275
[5000]	valid_set's l1: 46.3981


	Ran out of time, early stopping on iteration 5437. Best iteration is:
	[5437]	valid_set's l1: 45.9737


[1000]	valid_set's l1: 53.5056
[2000]	valid_set's l1: 50.6571
[3000]	valid_set's l1: 48.8563
[4000]	valid_set's l1: 47.5435
[5000]	valid_set's l1: 46.3855


	Ran out of time, early stopping on iteration 5524. Best iteration is:
	[5521]	valid_set's l1: 45.9303


[1000]	valid_set's l1: 52.3073
[2000]	valid_set's l1: 49.9314
[3000]	valid_set's l1: 48.0838
[4000]	valid_set's l1: 46.7015
[5000]	valid_set's l1: 45.6238


	Ran out of time, early stopping on iteration 5411. Best iteration is:
	[5410]	valid_set's l1: 45.2256


[1000]	valid_set's l1: 52.8485
[2000]	valid_set's l1: 50.3994
[3000]	valid_set's l1: 48.5282
[4000]	valid_set's l1: 47.1111
[5000]	valid_set's l1: 46.0842


	Ran out of time, early stopping on iteration 5472. Best iteration is:
	[5463]	valid_set's l1: 45.6713


[1000]	valid_set's l1: 53.2766
[2000]	valid_set's l1: 50.9645
[3000]	valid_set's l1: 49.258
[4000]	valid_set's l1: 47.8948
[5000]	valid_set's l1: 46.7531


	Ran out of time, early stopping on iteration 5812. Best iteration is:
	[5812]	valid_set's l1: 46.0382


[1000]	valid_set's l1: 53.0121
[2000]	valid_set's l1: 50.5725
[3000]	valid_set's l1: 48.7652
[4000]	valid_set's l1: 47.4708
[5000]	valid_set's l1: 46.3952
[6000]	valid_set's l1: 45.5175


	Ran out of time, early stopping on iteration 6351. Best iteration is:
	[6351]	valid_set's l1: 45.2493
	-45.8186	 = Validation score   (-mean_absolute_error)
	345.03s	 = Training   runtime
	44.02s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ... Training model for up to 3.98s of the 203.62s of remaining time.
	Memory not enough to fit LGBModel folds in parallel. Will do sequential fitting instead. 	Consider decreasing folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling predictor.fit
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	Ran out of time, early stopping on iteration 1. Best iteration is:
	[1]	valid_set's l1: 79.8982
	Time limit exceeded... Skipping LightGBM_BAG_L1.
Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 2.75s of the 202.38s of remaining time.
	Time limit exceeded... Skipping RandomForestMSE_BAG_L1.
Completed 1/20 k-fold bagging repeats ...
Fitting model: WeightedEns

[1000]	valid_set's l1: 39.2999
[2000]	valid_set's l1: 38.8874


	Ran out of time, early stopping on iteration 2587. Best iteration is:
	[2587]	valid_set's l1: 38.7538


[1000]	valid_set's l1: 38.9131
[2000]	valid_set's l1: 38.368


	Ran out of time, early stopping on iteration 2583. Best iteration is:
	[2573]	valid_set's l1: 38.196


[1000]	valid_set's l1: 39.2862
[2000]	valid_set's l1: 38.7537


	Ran out of time, early stopping on iteration 2552. Best iteration is:
	[2548]	valid_set's l1: 38.6043


[1000]	valid_set's l1: 39.5885
[2000]	valid_set's l1: 39.0983


	Ran out of time, early stopping on iteration 2650. Best iteration is:
	[2649]	valid_set's l1: 38.8766


[1000]	valid_set's l1: 38.776
[2000]	valid_set's l1: 38.352


	Ran out of time, early stopping on iteration 2824. Best iteration is:
	[2818]	valid_set's l1: 38.1505


[1000]	valid_set's l1: 39.58
[2000]	valid_set's l1: 39.114


	Ran out of time, early stopping on iteration 2901. Best iteration is:
	[2878]	valid_set's l1: 38.8078


[1000]	valid_set's l1: 39.4437
[2000]	valid_set's l1: 38.8428


	Ran out of time, early stopping on iteration 3027. Best iteration is:
	[3027]	valid_set's l1: 38.6253


[3000]	valid_set's l1: 38.6369
[1000]	valid_set's l1: 39.9258
[2000]	valid_set's l1: 39.4676
[3000]	valid_set's l1: 39.2279


	Ran out of time, early stopping on iteration 3385. Best iteration is:
	[3364]	valid_set's l1: 39.1533
	-38.6459	 = Validation score   (-mean_absolute_error)
	176.88s	 = Training   runtime
	12.25s	 = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 3.31s of the 3.27s of remaining time.
	Memory not enough to fit LGBModel folds in parallel. Will do sequential fitting instead. 	Consider decreasing folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling predictor.fit
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	Ran out of time, early stopping on iteration 1. Best iteration is:
	[1]	valid_set's l1: 78.3544
	Time limit exceeded... Skipping LightGBM_BAG_L2.
Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 2.01s of the 1.97s of remaining time.
	Time limit exceeded... Skipping RandomForestMSE_BAG_L2.
Completed 1/20 k-fold bagging repeats ...
Fitting model: WeightedEnsembl

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x19fc2465600>

In [6]:
# Display the leaderboard: one row per fitted model with its validation score
# and its fit / prediction timings.
predictor.leaderboard()

                 model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0    LightGBMXT_BAG_L2 -38.645944      56.267176  521.910774               12.249727         176.878206            2       True          3
1  WeightedEnsemble_L3 -38.645944      56.276688  521.925996                0.009512           0.015222            3       True          4
2    LightGBMXT_BAG_L1 -45.818565      44.017449  345.032568               44.017449         345.032568            1       True          1
3  WeightedEnsemble_L2 -45.818565      44.025463  345.045965                0.008014           0.013398            2       True          2


Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT_BAG_L2,-38.645944,56.267176,521.910774,12.249727,176.878206,2,True,3
1,WeightedEnsemble_L3,-38.645944,56.276688,521.925996,0.009512,0.015222,3,True,4
2,LightGBMXT_BAG_L1,-45.818565,44.017449,345.032568,44.017449,345.032568,1,True,1
3,WeightedEnsemble_L2,-45.818565,44.025463,345.045965,0.008014,0.013398,2,True,2


In [7]:
# Predict the label for every row of the preprocessed test frame.
preds = predictor.predict(test)

In [8]:
# Fill the CI_HOUR column of the sample submission with the predictions
# (matched on the row index) and write the result without the index column.
submit = pd.read_csv('sample_submission.csv').assign(CI_HOUR=preds)
submit.to_csv('autogluon_submit.csv', index=False)