In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMRegressor, Booster
from sklearn.model_selection import StratifiedKFold, train_test_split

from data import preprocess_data, postprocessing
from func import deviation_metric, get_timestamp

In [None]:
# Stage switches: each flag turns one independent part of the notebook on or off.
hyper_search, training, inference = (
    False,  # run the Optuna hyperparameter search cells
    False,  # run the 5-fold training loop and save one model per fold
    True,   # load saved fold models and write a submission file
)

In [None]:
# Custom eval metric for LightGBM (passed as `eval_metric` to LGBMRegressor.fit)
# Based on https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py
def eval_metric(y_true, y_pred):
    """Score predictions with ``deviation_metric`` on the back-transformed scale.

    Both arrays are passed through ``np.expm1`` before scoring (presumably because
    the target is log1p-transformed when ``log_target`` is enabled -- confirm
    against ``preprocess_data``).

    Returns:
        A ``(name, value, is_higher_better)`` tuple. The last element is ``False``,
        so lower values are treated as better.
    """
    actual = np.expm1(y_true)
    predicted = np.expm1(y_pred)
    score = deviation_metric(actual, predicted)
    return "deviation_metric", score, False

In [None]:
def objective(trial):
    """Optuna objective: fit LightGBM on a stratified holdout split and score it.

    Each trial samples the clustering ``eps`` that is forwarded to
    ``preprocess_data`` together with a set of LightGBM hyperparameters, trains
    with early stopping, and reports the best validation ``deviation_metric``.

    Args:
        trial: The Optuna trial used to sample parameters. The mean best
            iteration found by early stopping is stored on it as the user
            attribute ``es_mean_iter``.

    Returns:
        Mean of the best validation ``deviation_metric`` over the evaluated
        splits (a single 80/20 holdout split as written).
    """
    # Data is re-read on every trial because preprocessing depends on the sampled eps.
    # NOTE(review): the two read_csv calls could be hoisted out of the objective if
    # preprocess_data does not modify its inputs -- confirm before changing.
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    eps = trial.suggest_float('eps', 0.01, 0.05, log=True) # dbscan param
    
    data_kwargs = {'cluster': eps,
                   'clean_floor_num': True,
                   'clean_region_city': True,
                   'remove_type_0': True,
                   'log_target': True,
                   'encode_cat': True}
    
    train_pre, test_pre, num_columns, cat_columns, target = preprocess_data(train, test, **data_kwargs)
    X_columns = num_columns + cat_columns
    
    # If Kfold
    #splitter = StratifiedKFold(n_splits=5)
    #splits = splitter.split(train_pre, train_pre["realty_type"]) # Stratify by realty type for stability
    
    # If holdout set: one (train_idx, val_idx) pair, wrapped in a list so the loop
    # below works unchanged for both the holdout and the K-fold variant.
    indices = np.arange(train_pre.shape[0])
    splits = [train_test_split(indices, test_size=0.2, random_state=42, stratify = train_pre["realty_type"])]
    
    lightgbm_params = {'num_leaves': trial.suggest_int('num_leaves', 16, 2048, log = True), 
                       'max_depth': trial.suggest_int('max_depth', 4, 12),
                       'learning_rate': trial.suggest_float('learning_rate', 0.001, 1, log=True),
                       # Upper bound only; early stopping picks the effective number of trees.
                       'n_estimators': 5000,
                       'min_child_samples': trial.suggest_int('min_child_samples', 10, 1000, log = True), 
                       'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 100, log=True),
                       'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 100, log=True),
                       'subsample': 0.8,
                       'subsample_freq': 1,
                       'importance_type': 'gain'}
    
    eval_scores = []
    iters = []
    
    for train_index, val_index in splits:
        train_split = train_pre.iloc[train_index]
        val_split = train_pre.iloc[val_index]
        
        model = LGBMRegressor(**lightgbm_params)
        
        # NOTE: the early_stopping_rounds / verbose fit keywords were removed in
        # LightGBM 4.0; switch to callbacks when upgrading.
        model.fit(train_split[X_columns], 
                  train_split[target], 
                  eval_set = (val_split[X_columns], val_split[target]), 
                  early_stopping_rounds = 100,
                  eval_metric = eval_metric,
                  verbose = 500,
                  categorical_feature = cat_columns)
        
        # Record the best validation score and the iteration at which it was reached.
        eval_scores.append(model.best_score_["valid_0"]["deviation_metric"])
        iters.append(model.best_iteration_)
        #print(model.best_score_["valid_0"]["deviation_metric"], model.best_iteration_)
        
    trial.set_user_attr("es_mean_iter", np.mean(iters))
    return np.mean(eval_scores)

In [None]:
if hyper_search:
    # Seed the sampler so the sequence of suggested hyperparameters is repeatable
    # between runs. TPESampler is the sampler create_study uses by default for a
    # single-objective study, so only the seeding changes.
    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=20, show_progress_bar = True)

In [None]:
if hyper_search:
    # Report the best hyperparameters first, then the objective value they achieved.
    for best_summary in (study.best_params, study.best_value):
        print(best_summary)

In [None]:
if hyper_search:
    # Show the optimisation history first, then the hyperparameter importances.
    for build_figure in (optuna.visualization.plot_optimization_history,
                         optuna.visualization.plot_param_importances):
        fig = build_figure(study)
        fig.show()

In [None]:
%%time

# Preprocessing options for the final models. 'cluster' is forwarded to
# preprocess_data in the same slot the hyperparameter search uses for its sampled eps.
data_kwargs = {'cluster': 0.0271,
               'clean_floor_num': True,
               'clean_region_city': True,
               'remove_type_0': True,
               'log_target': True,
               'encode_cat': True}

# Loaded and preprocessed regardless of `training`: the inference cell also
# reads test_pre, X_columns and target.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train_pre, test_pre, num_columns, cat_columns, target = preprocess_data(train, test, **data_kwargs)
X_columns = num_columns + cat_columns

# Fixed hyperparameters.
# NOTE(review): presumably taken from an earlier Optuna search -- confirm.
lightgbm_params = {'num_leaves': 169,
                   'max_depth': 5,
                   'learning_rate': 0.01416,
                   # Upper bound only; early stopping picks the effective number of trees.
                   'n_estimators': 5000,
                   'min_child_samples': 23,
                   'reg_alpha': 1e-6,
                   'reg_lambda': 0.000128,
                   'subsample': 0.8,
                   'subsample_freq': 1,
                   'importance_type': 'gain'}

if training:
    # Shuffled 5-fold cross-validation, stratified by realty type.
    splitter = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42)
    splits = splitter.split(train_pre, train_pre["realty_type"])
    
    # One timestamp for the whole run so all fold models share the same suffix.
    timestamp = get_timestamp()
    
    # Make sure the output directory exists up front; otherwise save_model would
    # fail only after the first fold has finished training.
    os.makedirs("models", exist_ok=True)
    
    eval_scores = []
    feature_importances = []

    for i, (train_index, val_index) in enumerate(splits):
        train_split = train_pre.iloc[train_index]
        val_split = train_pre.iloc[val_index]

        model = LGBMRegressor(**lightgbm_params)
        
        # NOTE: the early_stopping_rounds / verbose fit keywords were removed in
        # LightGBM 4.0; switch to callbacks when upgrading.
        model.fit(train_split[X_columns], 
                  train_split[target], 
                  eval_set = (val_split[X_columns], val_split[target]), 
                  early_stopping_rounds = 100,
                  eval_metric = eval_metric,
                  verbose = 500,
                  categorical_feature = cat_columns)

        model.booster_.save_model(f"models/lgbm_fold_{i}_{timestamp}.lgbm")

        # Keep the best validation score and the per-feature importances of this fold.
        eval_scores.append(model.best_score_["valid_0"]["deviation_metric"])
        feature_importances.append(model.feature_importances_)

    print("Avg eval score", np.mean(eval_scores))

In [None]:
# Average the per-fold feature importances and plot them as horizontal bars,
# sorted ascending so the most important feature ends up at the top.
if training:
    df = pd.DataFrame(data=feature_importances, columns=X_columns)
    df_mean = df.mean(axis=0)
    df_mean = df_mean.sort_values(ascending=True)
    df_mean.plot(kind='barh', figsize=(10, 15))

In [None]:
if inference:
    # Fold models to ensemble. The names are hard-coded, including the timestamp
    # of the run that produced them, so they must be edited by hand after retraining.
    modelnames = ["lgbm_fold_0_2021-10-11-14-46.lgbm", "lgbm_fold_1_2021-10-11-14-46.lgbm", "lgbm_fold_2_2021-10-11-14-46.lgbm", "lgbm_fold_3_2021-10-11-14-46.lgbm", "lgbm_fold_4_2021-10-11-14-46.lgbm"]
    timestamp = get_timestamp()
    sample_submission = pd.read_csv('data/test_submission.csv')
    predictions = []
    
    # The feature frame is identical for every fold model, so select it once.
    test_features = test_pre[X_columns]
    
    for modelname in modelnames:
        model = Booster(model_file=f"models/{modelname}")
        # expm1 maps the predictions back from the log scale used for the target.
        predictions.append(np.expm1(model.predict(test_features)))
        
    # Ensemble by taking the per-row median across the fold models.
    # NOTE(review): 0.94 is an undocumented scaling factor applied to the final
    # predictions -- confirm its origin and consider naming it as a constant.
    sample_submission[target] = np.median(np.array(predictions), axis = 0) * 0.94
    
    # Make sure the output directory exists so writing the CSV cannot fail on a
    # fresh checkout.
    os.makedirs("submissions", exist_ok=True)
    sample_submission.to_csv(f'submissions/lgbm_{timestamp}.csv', index = False)