#### Catboost Pool Parameter Search
_By Nick Brooks, June 2018_

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
import time
# Wall-clock timestamp (seconds since the epoch) captured when this cell runs.
# NOTE(review): not referenced in any cell visible here — presumably kept for reporting total runtime.
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from IPython.display import display

#CatBoost
import hyperopt 
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from catboost import cv as catcv

# Unsupervised Models
from sklearn.decomposition import PCA, TruncatedSVD, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
import re

# Column names shared across cells: the row-identifier column and the regression target
id_col, target_var = "ID", "target"

In [None]:
def _read_with_leak(data_path, leak_path, nrows=None):
    """Read a CSV indexed by `id_col` and attach the log1p of its leak values.

    Parameters
    ----------
    data_path : str
        CSV holding the main data; `id_col` becomes the index.
    leak_path : str
        CSV holding a "compiled_leak" column.
    nrows : int or None
        Row limit passed to both reads (None reads everything).

    Returns
    -------
    pandas.DataFrame
        The data frame with an extra "log_compiled_leak" column.
    """
    frame = pd.read_csv(data_path, index_col = id_col, nrows=nrows)
    leak = pd.read_csv(leak_path, nrows=nrows)["compiled_leak"]
    # Assigned positionally via `.values`: assumes the leak file lists rows in the
    # same order as the data file — TODO confirm against the leak notebook.
    frame["log_compiled_leak"] = np.log1p(leak.values)
    return frame


def get_data(Debug = False):
    """Load the train and test sets together with their leak features.

    Parameters
    ----------
    Debug : bool
        When truthy, only the first 500 rows of every file are read so the
        notebook can be smoke-tested quickly.

    Returns
    -------
    train : pandas.DataFrame
        Training features (target column removed).
    traindex : pandas.Index
        Index of the training rows.
    test_df : pandas.DataFrame
        Test features.
    testdex : pandas.Index
        Index of the test rows.
    y : pandas.Series
        log1p of the training target.
    """
    print("Load Data")
    # Plain truthiness instead of `is True`, so 1 / numpy booleans also enable debug mode
    nrows = 500 if Debug else None
    train = _read_with_leak(
        "../input/santander-value-prediction-challenge/train.csv",
        "../input/breaking-lb-fresh-start-with-lag-selection/train_leak.csv",
        nrows,
    )
    traindex = train.index
    test_df = _read_with_leak(
        "../input/santander-value-prediction-challenge/test.csv",
        "../input/breaking-lb-fresh-start-with-lag-selection/test_leak.csv",
        nrows,
    )
    testdex = test_df.index
    # The model is trained on the log1p scale of the target
    y = np.log1p(train[target_var]).copy()
    train = train.drop(columns=target_var)
    print('Train shape: {} Rows, {} Columns'.format(*train.shape))
    print('Test shape: {} Rows, {} Columns'.format(*test_df.shape))

    return train, traindex, test_df, testdex, y
train, traindex, test_df, testdex, y = get_data(Debug = False)

In [None]:
print("Combine Train and Test")
# Stack the training rows on top of the test rows in a single frame
df = pd.concat((train, test_df), axis=0)
# The per-split frames are no longer needed; release their memory immediately
del train, test_df
gc.collect()
total_rows, total_cols = df.shape
print('\nAll Data shape: {} Rows, {} Columns'.format(total_rows, total_cols))

## Modeling Stage

In [None]:
# Column labels of the combined frame, i.e. the model features
feat_names = df.columns

# Recover the train / test rows from the combined frame by index label
X = df.loc[traindex]
test_df = df.loc[testdex]
print("Starting Catboost. Train shape: {}, Test shape: {}".format(X.shape, test_df.shape))
print("Feature Num: ", len(feat_names))

# Empty log table; one row is added per CV tuning run
results = pd.DataFrame(columns=["Rounds", "Score", "STDV", "LB", "Parameters"])

## Dimensionality Reduction
The number of components and the reduction method were chosen using my [notebook](https://www.kaggle.com/nicapotato/comparing-dimensional-reduction-methods-lr) comparing dimensionality-reduction methods.

In [None]:
# Optional PCA step, currently disabled. When uncommented it fits a 186-component
# PCA on X and applies the same fitted projection to test_df.
# pca = PCA(random_state=23, n_components = 186)
# X = pca.fit_transform(X)
# test_df = pca.transform(test_df)

## CatBoost CV

The goal here is to use cross-validation to ensure that I have parameters that generalize well.

In [None]:
print("Parameter Tuning / Ideal Boosting Rounds")
# Training configuration shared by the CV run and the submission model.
cat_params = {"eval_metric": 'RMSE',
              "iterations": 4000,        # upper bound on boosting rounds
              "random_seed": 42,
              "logging_level": "Verbose",
              "metric_period": 75,
              "od_type": 'Iter',
              # The literal used to list "od_wait" twice (150, then 100); Python keeps
              # the last duplicate, so 100 is the value that was actually in effect.
              "od_wait": 100
             }
model = CatBoostRegressor(**cat_params)

# Cross-Validation
catpool = Pool(X, y)
cv_data = catcv(catpool, model.get_params(), fold_count=5)

# idxmin() returns the 0-based row label of the best round; the number of
# boosting rounds that produced it is therefore one more than that.
optimal_round = cv_data['test-RMSE-mean'].idxmin()
best_iteration = optimal_round + 1
best_score = cv_data['test-RMSE-mean'].loc[optimal_round]
best_std = cv_data['test-RMSE-std'].loc[optimal_round]
print("Best Iteration: ", best_iteration)
print("Best Score: {} +/- {}".format(best_score, best_std))

# Append Scores
# - "Rounds" records the 1-based round count, matching the printout above and the
#   `iterations` value used for the final model.
# - "Parameters" stores a copy so later edits to `cat_params` cannot rewrite this row.
# - DataFrame.append was removed in pandas 2.0, hence pd.concat.
cv_row = pd.DataFrame([{"Rounds": best_iteration,
                        "Score": best_score,
                        "STDV": best_std,
                        "LB": None,
                        "Parameters": dict(cat_params)}],
                      columns=results.columns)
# Skip the concat on the first run to avoid concatenating with an empty frame
results = cv_row if results.empty else pd.concat([results, cv_row], ignore_index=True)

Now that the ideal parameters are found, I will run them on the full data and submit. There is always more tuning to be done, so fork the notebook and turn some parameter knobs! The results are stored so you can track your progress with the table below.

In [None]:
# Widen text columns so the parameter dict stored in `results` is shown in full.
# The fully-qualified option name is used because abbreviated names are resolved by
# pattern matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 800)
display(results)

## Catboost Submission Model [Optimal Rounds]

In [None]:
print("Train Submission Model")
# Refit on all training rows, using the round count selected by cross-validation
cat_params.update({"iterations": optimal_round + 1})
model = CatBoostRegressor(**cat_params)
model.fit(X, y)

In [None]:
# The model predicts on the log1p scale; expm1 maps predictions back to the raw target scale.
catpred = np.expm1(model.predict(test_df))
# Reuse the shared column-name constants instead of repeating the literals, so the
# submission header always matches the names used when the data was loaded.
submission = pd.DataFrame({id_col: testdex, target_var: catpred})
submission.to_csv('catboost.csv', index=False)
submission.head()