In [1]:
!pip install autogluon
!pip install numpy pandas
!pip install scikit-learn



In [2]:
from autogluon.tabular import TabularPredictor, TabularDataset
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
train_df = pd.read_csv('2023_smartFarm_AI_hackathon_dataset.csv') # 
# test_data = pd.read_csv('./test data location.csv')
train_data = TabularDataset(train_df)
time_limit = 3600 * 1 # hrs
label = 'HeatingEnergyUsage_cumsum'

# 'r2' for r2 score & 'root_mean_squared_error' for RMSE
eval_metric = 'accuracy' 
r2_metric = 'r2'
rmse_metric = 'root_mean_squared_error'

output_directory = './autogluon_model_log' # output directory
predictor_directory = './autogluon_model_predict' # predictore directory

# autogluon train
predictor = TabularPredictor(
    label=label, 
    eval_metric=rmse_metric, 
    path=output_directory
).fit(
    train_data,
    presets='best_quality', 
    time_limit=time_limit, 
    ag_args_fit={'num_gpus': 0, 'num_cpus': 8}
    # training for a specific model ref: 'https://auto.gluon.ai/stable/api/autogluon.tabular.TabularPredictor.fit.html'
    #hyperparameters={'GBM': {}},
    #num_bag_folds=2,      
    #num_stack_levels=2,
    )

# extra train
# additional_hyperparmeter = 'GBM'
# predictor.fit_extra(hyperparameters=additional_hyperparmeter)

### result (leaderboard)
leaderboard = predictor.leaderboard(silent=False)
print(leaderboard)

log_path = os.path.join(output_directory, 'train.log')
with open(log_path, 'w') as f:
    f.write(str(leaderboard))

print(f"Logs saved to {log_path}")

predictor.save(predictor_directory)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "./autogluon_model_log/"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Jul  5 22:22:52 PDT 2023; root:xnu-8796.141.3~6/RELEASE_ARM64_T8103
Disk Space Avail:   82.63 GB / 245.11 GB (33.7%)
Train Data Rows:    84840
Train Data Columns: 49
Label Column: HeatingEnergyUsage_cumsum
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (4171031.468732149, 0.0, 144977.03704, 572925.17288)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem

                     model      score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3  -11107.687858     127.494137  3207.342537                0.001119           0.491773            3       True         15
1          LightGBM_BAG_L2  -11412.895315      93.466238  2513.165130                3.488126         165.989647            2       True         12
2   RandomForestMSE_BAG_L2  -13255.193648      91.997628  2418.562939                2.019516          71.387456            2       True         13
3          CatBoost_BAG_L2  -14572.137813      90.115980  2765.831546                0.137868         418.656063            2       True         14
4        LightGBMXT_BAG_L2  -14810.196022     121.985377  2969.473661               32.007264         622.298178            2       True         11
5      WeightedEnsemble_L2  -18117.391854      18.424870  1442.120617                0.001496           0.751714

In [7]:
### data preprocess
# input data import
input_data = pd.read_csv('2023_smartFarm_AI_hackathon_dataset.csv')

# 데이터셋을 data, target으로 변수분리
data = input_data.iloc[:,:-1]
target = input_data.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [8]:
### RMSE, R2 evaluateß
# AutoGluon 모델을 저장
loaded_predictor = TabularPredictor.load(output_directory)  # 모델의 저장 경로를 지정

# 테스트 데이터 준비
test_df = TabularDataset(X_test) 
prediction = loaded_predictor.predict(test_df)

# RMSE, R2 계산
rmse = np.sqrt(mean_squared_error(y_test, prediction))
r2score = r2_score(y_test, prediction)

In [9]:
### OUTPUT ###
print("RMSE:", rmse)
print("R2_score:", r2score)

RMSE: 3640.7831230542015
R2_score: 0.9999618204466719
