In [1]:
import optuna


In [15]:
import numpy as np
import pandas as pd
import xgboost as xgb
import os
from sklearn.model_selection import *
import sklearn
import numpy as np
from sklearn.metrics import mean_squared_error
# evaluation


In [21]:
def mdape(model, X, y_true):
    """Return the median absolute percentage error (MdAPE) of ``model`` on ``X``.

    Each sample contributes ``|y_true - y_pred| / |y_true|``; the median of
    those ratios is returned as a percentage.

    Args:
        model: Any object exposing ``predict(X)``.
        X: Feature rows, passed unchanged to ``model.predict``.
        y_true: Observed targets, array-like with one value per prediction.

    Returns:
        The median percentage error (ratio multiplied by 100). A zero in
        ``y_true`` yields an infinite or NaN ratio for that sample, following
        NumPy's division rules.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_predict = np.asarray(model.predict(X), dtype=float)
    # Normalise by the magnitude of the observed value: dividing by the raw
    # prediction (as before) measures error relative to the model's own output
    # and goes negative whenever a prediction is negative.
    abs_pct_error = np.abs(y_true - y_predict) / np.abs(y_true)
    return np.median(abs_pct_error) * 100


In [3]:
def RMSE(clf, X, y_true):
    """Root-mean-squared error of ``clf``'s predictions on ``X`` against ``y_true``.

    Args:
        clf: Any object exposing ``predict(X)``.
        X: Feature rows, passed unchanged to ``clf.predict``.
        y_true: Observed targets to compare the predictions with.

    Returns:
        Square root of ``mean_squared_error(y_true, clf.predict(X))``.
    """
    predictions = clf.predict(X)
    return np.sqrt(mean_squared_error(y_true, predictions))


In [4]:
def load_data(data_path, test_size, window=168):
    """Build sliding-window samples from a PM2.5 series and split them in order.

    Reads the CSV at ``data_path``, takes its ``PM2.5`` column and turns it
    into supervised samples: the features of a sample are ``window``
    consecutive readings and its target is the reading right after them.

    Args:
        data_path: Path of a CSV file that contains a ``PM2.5`` column.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        window: Number of consecutive readings per sample. Defaults to 168,
            the value that used to be hard-coded (presumably one week of
            hourly readings -- confirm against the data).

    Returns:
        ``(X_train, X_valid, y_train, y_valid)`` as returned by
        ``train_test_split``. Features are split from a Python list of 1-D
        arrays, targets from a NumPy array.
    """
    series = pd.read_csv(data_path)['PM2.5'].values

    # A window starting at i uses series[i + window] as its target, so every
    # start index below len(series) - window is valid. The earlier bound of
    # len(series) - (window + 1) silently dropped the most recent sample.
    n_samples = len(series) - window
    X_data = [series[i:i + window] for i in range(n_samples)]
    y_data = [series[i + window] for i in range(n_samples)]

    # shuffle=False keeps the samples in file order, so the validation part is
    # the tail of the series. random_state=10 is kept from the original call.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_data, np.array(y_data), test_size=test_size, random_state=10, shuffle=False)

    return X_train, X_valid, y_train, y_valid


In [16]:
def objective(trial, input_path):
    """Optuna objective: fit one XGBoost regressor and return its validation RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        input_path: CSV path handed to ``load_data``.

    Returns:
        ``RMSE`` of the fitted model on the validation split produced by
        ``load_data(..., test_size=0.2)``.
    """
    # The 200..200 range pins n_estimators while still recording it in
    # trial.params, which save_trial reads back.
    n_estimators = trial.suggest_int('n_estimators', 200, 200)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    # Sampled only so the key exists in trial.params (save_trial reads it);
    # the model below is built with max_features='auto', not this value.
    trial.suggest_int('max_features', 3, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 5)
    # suggest_float supersedes the deprecated suggest_uniform; the parameter
    # name and range are unchanged.
    learning_rate = trial.suggest_float('learning_rate', 0.05, 0.4)

    # NOTE(review): max_features, min_samples_split and min_samples_leaf are
    # scikit-learn tree parameter names and do not appear among XGBoost's
    # documented parameters -- confirm they influence training at all.
    model = xgb.XGBRegressor(n_estimators=n_estimators,
                             max_depth=max_depth,
                             max_features='auto',
                             min_samples_split=min_samples_split,
                             min_samples_leaf=min_samples_leaf,
                             learning_rate=learning_rate,
                             verbosity=0)

    X_train, X_valid, y_train, y_valid = load_data(
        data_path=input_path, test_size=0.2)

    # NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
    # estimator constructor rather than on fit() -- confirm against the
    # installed version before upgrading.
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
              early_stopping_rounds=20)

    return RMSE(model, X_valid, y_valid)


# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# trial = study.best_trial

# print('Accuracy: {}'.format(trial.value))
# print("Best hyperparameters: {}".format(trial.params))


In [17]:
def save_trial(trial, input_path, save_path):
    """Retrain an XGBoost regressor with a trial's hyperparameters and save it.

    Args:
        trial: Optuna trial whose ``params`` hold ``n_estimators``,
            ``max_depth``, ``max_features``, ``min_samples_split``,
            ``min_samples_leaf`` and ``learning_rate``.
        input_path: CSV path handed to ``load_data``.
        save_path: Destination passed to ``model.save_model``.

    Returns:
        None. The fitted model is written to ``save_path``.
    """
    # Create the destination folder before the (slow) training run so that a
    # missing directory cannot make the final save fail.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    params = trial.params
    # NOTE(review): max_features, min_samples_split and min_samples_leaf are
    # scikit-learn tree parameter names and do not appear among XGBoost's
    # documented parameters -- confirm they influence training at all.
    model = xgb.XGBRegressor(n_estimators=params['n_estimators'],
                             max_depth=params['max_depth'],
                             max_features=params['max_features'],
                             min_samples_split=params['min_samples_split'],
                             min_samples_leaf=params['min_samples_leaf'],
                             learning_rate=params['learning_rate'],
                             verbosity=0)

    X_train, X_valid, y_train, y_valid = load_data(
        data_path=input_path, test_size=0.2)

    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
              early_stopping_rounds=20)

    model.save_model(save_path)


In [None]:
# Tune and save one model per training file found in data_path.
data_path = 'data/processed/data-train/input'
# Sorted so every run walks the files in the same order; os.listdir makes no
# ordering guarantee.
for file_train in sorted(os.listdir(data_path)):
    input_path = os.path.join(
        data_path, file_train)

    # Sub-directories cannot be read as CSV, so leave them out.
    if not os.path.isfile(input_path):
        continue

    # splitext strips whatever extension the file has instead of assuming it
    # is exactly four characters long.
    save_dir_model = os.path.join(
        'saved/models', os.path.splitext(file_train)[0] + '.json')

    # Bind this iteration's file into the single-argument callable Optuna expects.
    func = lambda trial: objective(trial, input_path)

    study = optuna.create_study(direction='minimize')
    study.optimize(func, n_trials=100)

    # Retrain with the best hyperparameters found and write the model to disk.
    trial = study.best_trial

    save_trial(trial, input_path, save_dir_model)
    print('Done ', save_dir_model)


In [21]:
!ls

LICENSE    config  docs      notebooks	     requirements.txt  run.sh  src
README.md  data    note.txt  prediction.zip  results	       saved


In [22]:
%ls

LICENSE    config/  docs/     notebooks/      requirements.txt  run.sh*  src/
README.md  data/    note.txt  prediction.zip  results/          saved/


In [23]:
%pwd

'/home/zephy_manjaro/My-Workspace/Code/others/ai4vn2022-air-quality-forecasting'