<a href="https://colab.research.google.com/github/paulgureghian/CatBoost/blob/master/CatBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Created by Paul A. Gureghian in Mar 2019.**

**This notebook demonstrates training a gradient-boosted decision tree (GBDT) classifier with the 'CatBoost' library, and compares the training time on CPU against GPU.**

In [1]:
### Install CatBoost
!pip -q install catboost

[K    100% |████████████████████████████████| 60.1MB 595kB/s 
[?25h

In [0]:
### Import packages
import timeit
from catboost.datasets import epsilon
from catboost import CatBoostClassifier

### **Prepare dataset.**

*   **I will use the 'Epsilon' dataset from CatBoost.** 

In [0]:
### Split dataset into 'train' and 'test' sets
train, test = epsilon()

# The column labelled 0 is taken as the target; every column from
# position 1 onward is used as a feature.
X_train = train.iloc[:, 1:]
y_train = train[0]

X_test = test.iloc[:, 1:]
y_test = test[0]

In [4]:
### Define 'train_on_cpu' function
def train_on_cpu():
  """Fit a CatBoostClassifier on the notebook's training split.

  No `task_type` is passed, so CatBoost's default processing unit is used
  (CPU, per the CatBoost documentation). The model is trained for 100
  iterations with a learning rate of 0.03, evaluated against
  (X_test, y_test), and logs progress every 10th iteration.

  Reads the module-level names X_train, y_train, X_test and y_test.

  Returns:
    The fitted CatBoostClassifier, so callers can inspect or reuse it.
    Callers that only time the call (e.g. timeit) can ignore the value.
  """
  model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.03
  )

  model.fit(
      X_train, y_train,
      eval_set=(X_test, y_test),
      verbose=10
  )

  return model
      
# Time exactly one call of train_on_cpu() (number=1); the result is in seconds.
cpu_time = timeit.timeit(
    stmt='train_on_cpu()',
    setup="from __main__ import train_on_cpu",
    number=1,
)

# int() truncates the elapsed time to whole seconds for display.
print('Time to fit model on CPU: {} sec'.format(int(cpu_time)))

0:	learn: 0.6878004	test: 0.6878645	best: 0.6878645 (0)	total: 8.51s	remaining: 14m 2s
10:	learn: 0.6459486	test: 0.6467108	best: 0.6467108 (10)	total: 1m 30s	remaining: 12m 15s
20:	learn: 0.6170315	test: 0.6180805	best: 0.6180805 (20)	total: 2m 49s	remaining: 10m 39s
30:	learn: 0.5947894	test: 0.5960342	best: 0.5960342 (30)	total: 4m 7s	remaining: 9m 10s
40:	learn: 0.5766997	test: 0.5780390	best: 0.5780390 (40)	total: 5m 24s	remaining: 7m 47s
50:	learn: 0.5616803	test: 0.5631138	best: 0.5631138 (50)	total: 6m 39s	remaining: 6m 23s
60:	learn: 0.5485252	test: 0.5499754	best: 0.5499754 (60)	total: 7m 52s	remaining: 5m 2s
70:	learn: 0.5371519	test: 0.5385685	best: 0.5385685 (70)	total: 9m 5s	remaining: 3m 42s
80:	learn: 0.5267357	test: 0.5282210	best: 0.5282210 (80)	total: 10m 17s	remaining: 2m 24s
90:	learn: 0.5175479	test: 0.5190800	best: 0.5190800 (90)	total: 11m 31s	remaining: 1m 8s
99:	learn: 0.5100183	test: 0.5116455	best: 0.5116455 (99)	total: 12m 39s	remaining: 0us

bestTest = 0.5

In [5]:
### Define 'train_on_gpu' function
def train_on_gpu():
  """Fit a CatBoostClassifier on the notebook's training split using the GPU.

  Uses the same settings as the CPU run (100 iterations, learning rate 0.03,
  (X_test, y_test) as the evaluation set, progress logged every 10th
  iteration) but passes task_type='GPU' to CatBoost.

  Reads the module-level names X_train, y_train, X_test and y_test.

  Returns:
    The fitted CatBoostClassifier, so callers can inspect or reuse it.
    Callers that only time the call (e.g. timeit) can ignore the value.
  """
  model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.03,
    task_type='GPU'
  )

  model.fit(
      X_train, y_train,
      eval_set=(X_test, y_test),
      verbose=10
  )

  return model
  
# Time exactly one call of train_on_gpu() (number=1); the result is in seconds.
gpu_time = timeit.timeit(
    stmt='train_on_gpu()',
    setup="from __main__ import train_on_gpu",
    number=1,
)

# int() truncates the elapsed time to whole seconds for display.
print('Time to fit model on GPU: {} sec'.format(int(gpu_time)))

# Ratio of the CPU timing (cpu_time, measured in the earlier cell) to the GPU
# timing, shown with two decimals.
print('GPU speedup over CPU: ' + '%.2f' % (cpu_time / gpu_time) + 'x')

0:	learn: 0.6877673	test: 0.6878639	best: 0.6878639 (0)	total: 329ms	remaining: 32.6s
10:	learn: 0.6457423	test: 0.6464577	best: 0.6464577 (10)	total: 2.54s	remaining: 20.5s
20:	learn: 0.6163274	test: 0.6174585	best: 0.6174585 (20)	total: 4.62s	remaining: 17.4s
30:	learn: 0.5943048	test: 0.5956977	best: 0.5956977 (30)	total: 6.6s	remaining: 14.7s
40:	learn: 0.5763314	test: 0.5778144	best: 0.5778144 (40)	total: 8.57s	remaining: 12.3s
50:	learn: 0.5607703	test: 0.5623939	best: 0.5623939 (50)	total: 10.6s	remaining: 10.2s
60:	learn: 0.5478194	test: 0.5495077	best: 0.5495077 (60)	total: 12.5s	remaining: 8s
70:	learn: 0.5360011	test: 0.5377425	best: 0.5377425 (70)	total: 14.4s	remaining: 5.88s
80:	learn: 0.5258042	test: 0.5275755	best: 0.5275755 (80)	total: 16.3s	remaining: 3.83s
90:	learn: 0.5165443	test: 0.5183855	best: 0.5183855 (90)	total: 18.3s	remaining: 1.81s
99:	learn: 0.5089660	test: 0.5108718	best: 0.5108718 (99)	total: 20s	remaining: 0us
bestTest = 0.5108718359
bestIteration = 99