In [None]:
import os
import warnings

import numpy as np
import pandas as pd

import optuna
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, train_test_split

from data import preprocess_data, postprocessing
from func import deviation_metric_one_sample, get_timestamp

In [None]:
# Stage switches: each flag gates the matching `if <flag>:` section further down.
hyper_search: bool = False  # Optuna search over the DBSCAN eps and CatBoost parameters
training: bool = False      # stratified 5-fold training, one saved model per fold
inference: bool = True      # blend the saved fold models into a submission file

In [None]:
# Custom eval metric for CatBoost
# Based on https://catboost.ai/en/docs/concepts/python-usages-examples
class UserDefinedMetric(object):
    """Custom CatBoost eval metric: weighted mean of ``deviation_metric_one_sample``.

    Targets and predictions are passed through ``np.expm1`` before scoring,
    undoing the ``log1p`` transform used on the training side.
    """

    def is_max_optimal(self):
        """Tell CatBoost the metric is minimised (smaller values are better)."""
        return False

    def evaluate(self, approxes, target, weight):
        """Accumulate the weighted error over all samples.

        Args:
            approxes: list of indexed containers (only ``__len__`` and
                ``__getitem__`` are available), one per approx dimension;
                exactly one dimension is expected here.
            target: one-dimensional indexed container of targets.
            weight: one-dimensional indexed container of weights, or ``None``
                (every sample then counts with weight 1.0).

        Returns:
            Pair ``(weighted error sum, weights sum)``.
        """
        assert len(approxes) == 1
        predictions = approxes[0]
        assert len(target) == len(predictions)

        total_error = 0.0
        total_weight = 0.0

        # Plain index loop: the containers only guarantee __len__/__getitem__.
        for idx in range(len(predictions)):
            sample_weight = weight[idx] if weight is not None else 1.0
            total_weight += sample_weight
            # expm1 brings both values back from the log1p scale before scoring.
            total_error += sample_weight * deviation_metric_one_sample(np.expm1(target[idx]), np.expm1(predictions[idx]))

        return total_error, total_weight

    def get_final_error(self, error, weight):
        """Turn the accumulated ``(error, weight)`` pair into the final metric value."""
        # The tiny constant keeps the division defined when the weight sum is zero.
        guarded_weight = weight + 1e-38
        return error / guarded_weight

# Catboost with custom metric is spamming numba warnings
import warnings
warnings.filterwarnings("ignore")

## Optuna hyperparameter search

In [None]:
def objective(trial):
    """Optuna objective: hold-out score of CatBoost for one sampled configuration.

    Each trial samples the DBSCAN ``eps`` passed to ``preprocess_data`` plus the
    CatBoost ``depth``, ``learning_rate``, ``l2_leaf_reg`` and
    ``early_stopping_rounds``, trains on a stratified 80/20 hold-out split and
    scores the model with ``UserDefinedMetric`` on the validation part.

    Args:
        trial: ``optuna.trial.Trial`` used to sample the parameters.

    Returns:
        Mean over the evaluated splits of the last recorded validation metric
        value (lower is better). The mean tree count is stored in the trial
        user attribute ``es_mean_iter``.
    """
    # Data is loaded and preprocessed inside the trial because the clustering
    # eps is itself one of the tuned parameters.
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    eps = trial.suggest_float('eps', 0.01, 0.05, log=True)  # dbscan param

    data_kwargs = {'cluster': eps,
                   'clean_floor_num': True,
                   'clean_region_city': True,
                   'remove_type_0': True,
                   'log_target': True,
                   'encode_cat': True}

    # The preprocessed test frame is not needed for the search.
    train_pre, _, num_columns, cat_columns, target = preprocess_data(train, test, **data_kwargs)
    X_columns = num_columns + cat_columns

    # Single hold-out split, stratified by realty type for stability.
    # For k-fold CV use StratifiedKFold(n_splits=5).split(train_pre, train_pre["realty_type"]).
    indices = np.arange(train_pre.shape[0])
    splits = [train_test_split(indices, test_size=0.2, random_state=42, stratify=train_pre["realty_type"])]

    catboost_params = {'depth': trial.suggest_int('depth', 4, 10),
                       'learning_rate': trial.suggest_float('learning_rate', 0.001, 1, log=True),
                       'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 100, log=True),
                       # suggest_int keeps the value an integer (the old
                       # discrete-uniform suggestion returned a float); the
                       # 50..100 grid with step 10 is unchanged.
                       'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 50, 100, step=10)}

    # Fixed settings shared by every split, so they are built once.
    catboost_kwargs = {'use_best_model': False,
                       'iterations': 5000,
                       'cat_features': cat_columns,
                       'eval_metric': UserDefinedMetric(),
                       'verbose': 500,
                       'subsample': 0.8,
                       'colsample_bylevel': 0.8}

    eval_scores = []
    iters = []

    for train_index, val_index in splits:
        train_split = train_pre.iloc[train_index]
        val_split = train_pre.iloc[val_index]

        train_pool = Pool(train_split[X_columns], label=train_split[target], cat_features=cat_columns)
        val_pool = Pool(val_split[X_columns], label=val_split[target], cat_features=cat_columns)

        model = CatBoostRegressor(**catboost_kwargs,
                                  **catboost_params)

        model.fit(train_pool, eval_set=val_pool)

        # Last recorded validation metric and the number of trees built.
        eval_scores.append(model.evals_result_["validation"]["UserDefinedMetric"][-1])
        iters.append(model.tree_count_)

    trial.set_user_attr("es_mean_iter", np.mean(iters))
    return np.mean(eval_scores)

In [None]:
if hyper_search:
    # TPE is Optuna's default sampler; seeding it makes the sequence of
    # suggested parameters repeatable across notebook re-runs (given that the
    # objective values themselves repeat).
    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=20, show_progress_bar = True)

In [None]:
if hyper_search:
    # Best parameter set on the first line, its objective value on the second.
    print(study.best_params, study.best_value, sep="\n")

In [None]:
if hyper_search:
    # Show the optimisation history first, then the parameter importances.
    for plot_fn in (optuna.visualization.plot_optimization_history,
                    optuna.visualization.plot_param_importances):
        fig = plot_fn(study)
        fig.show()

## Training

In [None]:
%%time

# Preprocessing settings; 'cluster' is the DBSCAN eps value
# (presumably the best value found by the Optuna search -- TODO confirm).
data_kwargs = {'cluster': 0.0184,
               'clean_floor_num': True,
               'clean_region_city': True,
               'remove_type_0': True,
               'log_target': True,
               'encode_cat': True}

# Loading and preprocessing run regardless of the `training` flag: the
# inference cell reuses `test_pre`, `X_columns` and `target` defined here.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train_pre, test_pre, num_columns, cat_columns, target = preprocess_data(train, test, **data_kwargs)
X_columns = num_columns + cat_columns

# Tuned CatBoost parameters (same names as the ones sampled in `objective`).
catboost_params = {'depth': 10,
                   'learning_rate': 0.015,
                   'l2_leaf_reg': 0.5178,
                   'early_stopping_rounds': 100}

# Fixed CatBoost settings.
catboost_kwargs = {'use_best_model': False,
                   'iterations': 5000,
                   'cat_features': cat_columns,
                   'eval_metric': UserDefinedMetric(),
                   'verbose': 500,
                   'subsample': 0.8,
                   'colsample_bylevel': 0.8}

if training:
    # Create the output directory up front so a missing folder cannot make
    # save_model() fail after a fold has already been trained.
    os.makedirs("models", exist_ok=True)

    # 5-fold CV, stratified by realty type.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    splits = splitter.split(train_pre, train_pre["realty_type"])

    # One timestamp per run; it is embedded in every saved model file name.
    timestamp = get_timestamp()

    eval_scores = []
    feature_importances = []

    for i, (train_index, val_index) in enumerate(splits):
        train_split = train_pre.iloc[train_index]
        val_split = train_pre.iloc[val_index]

        train_pool = Pool(train_split[X_columns], label=train_split[target], cat_features=cat_columns)
        val_pool = Pool(val_split[X_columns], label=val_split[target], cat_features=cat_columns)

        model = CatBoostRegressor(**catboost_kwargs,
                                  **catboost_params)

        model.fit(train_pool, eval_set=val_pool)

        model.save_model(f"models/cb_fold_{i}_{timestamp}.cbm")

        # Last recorded validation metric and the per-feature importances of this fold.
        eval_scores.append(model.evals_result_["validation"]["UserDefinedMetric"][-1])
        feature_importances.append(model.get_feature_importance())

    print("Avg eval score", np.mean(eval_scores))

In [None]:
# Average the per-fold feature importances and plot them as horizontal bars
if training:
    # One row per fold, one column per feature.
    df = pd.DataFrame(data=feature_importances, columns=X_columns)
    # Ascending sort of the per-feature means.
    df_mean = df.mean(axis="index").sort_values()
    df_mean.plot.barh(figsize=(10, 15))

## Inference

In [None]:
# Requires `test_pre`, `X_columns` and `target` from the training cell above.
if inference:
    # Saved fold models of one training run; every file name embeds that run's timestamp.
    model_run = "2021-10-10-11-24"
    modelnames = [f"cb_fold_{fold}_{model_run}.cbm" for fold in range(5)]
    timestamp = get_timestamp()
    sample_submission = pd.read_csv('data/test_submission.csv')
    predictions = []

    for modelname in modelnames:
        model = CatBoostRegressor()
        model.load_model(f"models/{modelname}")
        # expm1 maps the predictions back from the log scale (log_target=True in preprocessing).
        predictions.append(np.expm1(model.predict(test_pre[X_columns])))

    # Per-row median across the fold models, scaled by 0.94.
    # NOTE(review): the 0.94 factor is unexplained here -- document where it comes from.
    sample_submission[target] = np.median(np.array(predictions), axis=0) * 0.94

    # Make sure the output directory exists before writing the submission.
    os.makedirs("submissions", exist_ok=True)
    sample_submission.to_csv(f'submissions/cb_{timestamp}.csv', index=False)