<a href="https://colab.research.google.com/github/paulgureghian/CatBoost/blob/master/CatBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Created by Paul A. Gureghian in Mar 2019.**

**This notebook demonstrates gradient-boosted decision trees (GBDT) with the 'CatBoost' gradient boosting library, and compares the time needed to train the same model on CPU and on GPU.**

In [0]:
### Install CatBoost
!pip -q install catboost

In [0]:
### Import packages
import timeit
from catboost.datasets import epsilon
from catboost import CatBoostClassifier

### **Prepare dataset.**

*   **I will use the 'Epsilon' dataset from CatBoost.** 

In [0]:
### Split dataset into 'train' and 'test' sets
# epsilon() hands back two objects: the training frame and the test frame.
train, test = epsilon()

# Features: every column from position 1 onward. Target: the column labelled 0.
X_train = train.iloc[:, 1:]
y_train = train[0]

X_test = test.iloc[:, 1:]
y_test = test[0]

In [0]:
### Define 'train_on_cpu' function
def train_on_cpu(iterations=100, learning_rate=0.03, verbose=10):
  """Fit a CatBoostClassifier without a `task_type` override and return it.

  Trains on the notebook-level `X_train` / `y_train` and passes
  `(X_test, y_test)` as the evaluation set, so the training log reports both
  the learn and the test metric.

  Args:
    iterations: Maximum number of boosting iterations.
    learning_rate: Learning rate handed to CatBoost.
    verbose: Forwarded to `fit`; with the default of 10 a progress line is
      printed every 10 iterations.

  Returns:
    The fitted CatBoostClassifier, so it can be inspected or used for
    prediction after the timing run instead of being thrown away.
  """
  model = CatBoostClassifier(iterations=iterations, learning_rate=learning_rate)

  model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=verbose)

  return model
      
# Time a single CPU training run; timeit accepts the callable directly, so no
# string statement or `from __main__ import ...` setup is needed.
cpu_time = timeit.timeit(train_on_cpu, number=1)

# Report whole seconds only.
print('Time to fit model on CPU: {} sec'.format(int(cpu_time)))

0:	learn: 0.6878004	test: 0.6878645	best: 0.6878645 (0)	total: 8.02s	remaining: 13m 14s
10:	learn: 0.6459486	test: 0.6467108	best: 0.6467108 (10)	total: 1m 29s	remaining: 12m 4s
20:	learn: 0.6170315	test: 0.6180805	best: 0.6180805 (20)	total: 2m 45s	remaining: 10m 23s
30:	learn: 0.5947894	test: 0.5960342	best: 0.5960342 (30)	total: 4m 1s	remaining: 8m 56s
40:	learn: 0.5766997	test: 0.5780390	best: 0.5780390 (40)	total: 5m 19s	remaining: 7m 40s
50:	learn: 0.5616803	test: 0.5631138	best: 0.5631138 (50)	total: 6m 32s	remaining: 6m 17s
60:	learn: 0.5485252	test: 0.5499754	best: 0.5499754 (60)	total: 7m 46s	remaining: 4m 57s
70:	learn: 0.5371519	test: 0.5385685	best: 0.5385685 (70)	total: 8m 58s	remaining: 3m 40s
80:	learn: 0.5267357	test: 0.5282210	best: 0.5282210 (80)	total: 10m 9s	remaining: 2m 22s
90:	learn: 0.5175479	test: 0.5190800	best: 0.5190800 (90)	total: 11m 24s	remaining: 1m 7s
99:	learn: 0.5100183	test: 0.5116455	best: 0.5116455 (99)	total: 12m 30s	remaining: 0us

bestTest = 0.

In [0]:
### Define 'train_on_gpu' function
def train_on_gpu(iterations=100, learning_rate=0.03, verbose=10):
  """Fit a CatBoostClassifier with `task_type='GPU'` and return it.

  Trains on the notebook-level `X_train` / `y_train` and passes
  `(X_test, y_test)` as the evaluation set, so the training log reports both
  the learn and the test metric.

  NOTE(review): presumably this needs a GPU-enabled runtime (e.g. a Colab
  GPU accelerator) -- confirm before running elsewhere.

  Args:
    iterations: Maximum number of boosting iterations.
    learning_rate: Learning rate handed to CatBoost.
    verbose: Forwarded to `fit`; with the default of 10 a progress line is
      printed every 10 iterations.

  Returns:
    The fitted CatBoostClassifier, so it can be inspected or used for
    prediction after the timing run instead of being thrown away.
  """
  model = CatBoostClassifier(
      iterations=iterations,
      learning_rate=learning_rate,
      task_type='GPU',
  )

  model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=verbose)

  return model
  
# Time a single GPU training run; the callable is passed straight to timeit.
gpu_time = timeit.timeit(train_on_gpu, number=1)

# Report whole seconds only.
print('Time to fit model on GPU: {} sec'.format(int(gpu_time)))

# Ratio of the CPU wall time (measured in the CPU cell, which must have been
# run first) to the GPU wall time.
gpu_speedup = cpu_time / gpu_time
print('GPU speedup over CPU: ' + '%.2f' % gpu_speedup + 'x')

0:	learn: 0.6877673	test: 0.6878639	best: 0.6878639 (0)	total: 299ms	remaining: 29.6s
10:	learn: 0.6457424	test: 0.6464577	best: 0.6464577 (10)	total: 2.51s	remaining: 20.3s
20:	learn: 0.6163269	test: 0.6174577	best: 0.6174577 (20)	total: 4.62s	remaining: 17.4s
30:	learn: 0.5943042	test: 0.5956971	best: 0.5956971 (30)	total: 6.64s	remaining: 14.8s
40:	learn: 0.5763306	test: 0.5778136	best: 0.5778136 (40)	total: 8.63s	remaining: 12.4s
50:	learn: 0.5607695	test: 0.5623932	best: 0.5623932 (50)	total: 10.6s	remaining: 10.2s
60:	learn: 0.5478189	test: 0.5495073	best: 0.5495073 (60)	total: 12.6s	remaining: 8.06s
70:	learn: 0.5360006	test: 0.5377420	best: 0.5377420 (70)	total: 14.5s	remaining: 5.93s
80:	learn: 0.5258040	test: 0.5275750	best: 0.5275750 (80)	total: 16.5s	remaining: 3.87s
90:	learn: 0.5165431	test: 0.5183841	best: 0.5183841 (90)	total: 18.5s	remaining: 1.83s
99:	learn: 0.5089638	test: 0.5108693	best: 0.5108693 (99)	total: 20.2s	remaining: 0us
bestTest = 0.5108692578
bestIteratio