# CatBoost Regressor

## Part 1 - Data Preprocessing

### Importing the dataset

In [123]:
import pandas as pd
dataset = pd.read_csv('insurance.csv')

In [124]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Checking missing data

In [125]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Handling categorical variables

Sex column

In [126]:
dataset['sex'].unique()

array(['female', 'male'], dtype=object)

In [127]:
dataset['sex'] = dataset['sex'].apply(lambda x: 0 if x == 'female' else 1)

In [128]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


Smoker column

In [129]:
dataset['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [130]:
dataset['smoker'] = dataset['smoker'].apply(lambda x: 0 if x == 'no' else 1)

In [131]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


Region column

In [132]:
dataset['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [133]:
region_dummies = pd.get_dummies(dataset['region'], drop_first = True)

In [134]:
region_dummies

Unnamed: 0,northwest,southeast,southwest
0,False,False,True
1,False,True,False
2,False,True,False
3,True,False,False
4,True,False,False
...,...,...,...
1333,True,False,False
1334,False,False,False
1335,False,True,False
1336,False,False,True


In [135]:
dataset = pd.concat([region_dummies, dataset], axis = 1)

In [136]:
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,region,charges
0,False,False,True,19,0,27.9,0,1,southwest,16884.924
1,False,True,False,18,1,33.77,1,0,southeast,1725.5523
2,False,True,False,28,1,33.0,3,0,southeast,4449.462
3,True,False,False,33,1,22.705,0,0,northwest,21984.47061
4,True,False,False,32,1,28.88,0,0,northwest,3866.8552


In [137]:
dataset.drop(['region'], axis = 1, inplace = True)

In [138]:
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,charges
0,False,False,True,19,0,27.9,0,1,16884.924
1,False,True,False,18,1,33.77,1,0,1725.5523
2,False,True,False,28,1,33.0,3,0,4449.462
3,True,False,False,33,1,22.705,0,0,21984.47061
4,True,False,False,32,1,28.88,0,0,3866.8552


In [139]:
dataset['northwest'] = dataset['northwest'].apply(lambda x:0 if x == False else 1)

In [140]:
dataset['southeast'] = dataset['southeast'].apply(lambda x:0 if x == False else 1)

In [141]:
dataset['southwest'] = dataset['southwest'].apply(lambda x:0 if x == False else 1)

In [142]:
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,charges
0,0,0,1,19,0,27.9,0,1,16884.924
1,0,1,0,18,1,33.77,1,0,1725.5523
2,0,1,0,28,1,33.0,3,0,4449.462
3,1,0,0,33,1,22.705,0,0,21984.47061
4,1,0,0,32,1,28.88,0,0,3866.8552


### Creating the Training Set and the Test Set

Getting the inputs and output

In [143]:
X = dataset.iloc[:, :-1].values

In [144]:
y = dataset.iloc[:, -1].values

In [145]:
X

array([[ 0.  ,  0.  ,  1.  , ..., 27.9 ,  0.  ,  1.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.77,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.  ,  3.  ,  0.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 36.85,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , ..., 25.8 ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 29.07,  0.  ,  1.  ]])

In [146]:
y

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603])

Getting the Training Set and the Test Set

In [147]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Part 2 - Building and training the model

### Building the model

In [148]:
!pip install catboost



In [149]:
import catboost as cb
model = cb.CatBoostRegressor()

### Training the model

In [150]:
model.fit(X_train, y_train)

Learning rate set to 0.041383
0:	learn: 11611.5326660	total: 4.69ms	remaining: 4.68s
1:	learn: 11297.2362282	total: 6.1ms	remaining: 3.04s
2:	learn: 10987.8561010	total: 7.11ms	remaining: 2.36s
3:	learn: 10664.1180964	total: 8.07ms	remaining: 2.01s
4:	learn: 10377.3027972	total: 8.95ms	remaining: 1.78s
5:	learn: 10078.6082882	total: 9.74ms	remaining: 1.61s
6:	learn: 9809.1374130	total: 10.5ms	remaining: 1.49s
7:	learn: 9571.6815432	total: 11.2ms	remaining: 1.39s
8:	learn: 9319.9322507	total: 12.1ms	remaining: 1.33s
9:	learn: 9081.2252419	total: 12.9ms	remaining: 1.27s
10:	learn: 8862.0378680	total: 13.7ms	remaining: 1.23s
11:	learn: 8630.0769266	total: 14.6ms	remaining: 1.2s
12:	learn: 8437.0370569	total: 15.3ms	remaining: 1.16s
13:	learn: 8239.7925079	total: 16.1ms	remaining: 1.13s
14:	learn: 8052.4841061	total: 16.9ms	remaining: 1.11s
15:	learn: 7864.5778395	total: 17.9ms	remaining: 1.1s
16:	learn: 7693.6490256	total: 18.6ms	remaining: 1.08s
17:	learn: 7521.2723681	total: 19.4ms	rema

<catboost.core.CatBoostRegressor at 0x7b0da9fb0c10>

### Inference

In [151]:
y_pred = model.predict(X_test)

## Part 3: Evaluating the model

### R-Squared

In [152]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)

In [153]:
r2

0.8943206977287299

### Adjusted R-Squared

In [154]:
k = X_test.shape[1]
n = len(X_test)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

In [155]:
adj_r2

0.8910564721759494

### k-Fold Cross Validation

In [156]:
from sklearn.model_selection import cross_val_score
r2s = cross_val_score(estimator = model,
                      X = X,
                      y = y,
                      scoring = 'r2',
                      cv = 10)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
4:	learn: 10484.7813211	total: 15.6ms	remaining: 3.1s
5:	learn: 10170.2061515	total: 22.9ms	remaining: 3.79s
6:	learn: 9882.1673821	total: 45.9ms	remaining: 6.51s
7:	learn: 9606.3664267	total: 54.7ms	remaining: 6.78s
8:	learn: 9344.0173099	total: 56.8ms	remaining: 6.26s
9:	learn: 9090.5544258	total: 59ms	remaining: 5.84s
10:	learn: 8860.6987446	total: 61.5ms	remaining: 5.53s
11:	learn: 8619.8829962	total: 66.1ms	remaining: 5.44s
12:	learn: 8410.8987434	total: 80.1ms	remaining: 6.08s
13:	learn: 8194.6867796	total: 82.1ms	remaining: 5.78s
14:	learn: 8000.5011819	total: 84.3ms	remaining: 5.54s
15:	learn: 7808.4554127	total: 86.5ms	remaining: 5.32s
16:	learn: 7633.2294400	total: 88.7ms	remaining: 5.13s
17:	learn: 7458.7752491	total: 92.6ms	remaining: 5.05s
18:	learn: 7297.3863227	total: 94.8ms	remaining: 4.89s
19:	learn: 7146.5091822	total: 98.5ms	remaining: 4.83s
20:	learn: 7002.1975514	total: 101ms	remaining: 4.69s
21:	lear

In [157]:
print("R-Squared: {:.2f} %".format(r2s.mean()*100))
print("Standard Deviation: {:.2f} %".format(r2s.std()*100))

R-Squared: 84.45 %
Standard Deviation: 4.31 %


### Grid Search

In [158]:
from sklearn.model_selection import GridSearchCV
parameters = [{'learning_rate': [0.008,0.009,0.01],
               'depth': [4,7,10],
               'l2_leaf_reg': [2,6,10],
               'random_strength': [0,5,10]}]
grid_search = GridSearchCV(estimator = model,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10)
grid_search.fit(X, y)
best_r2 = grid_search.best_score_
best_parameters = grid_search.best_params_

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
99:	learn: 7859.4825071	total: 311ms	remaining: 2.8s
100:	learn: 7836.4050938	total: 317ms	remaining: 2.82s
101:	learn: 7804.1993964	total: 322ms	remaining: 2.84s
102:	learn: 7780.5796101	total: 328ms	remaining: 2.85s
103:	learn: 7750.2970158	total: 331ms	remaining: 2.85s
104:	learn: 7732.0851732	total: 336ms	remaining: 2.87s
105:	learn: 7713.8135383	total: 342ms	remaining: 2.88s
106:	learn: 7694.6885083	total: 345ms	remaining: 2.88s
107:	learn: 7676.2398418	total: 350ms	remaining: 2.89s
108:	learn: 7637.0231728	total: 352ms	remaining: 2.87s
109:	learn: 7609.1812649	total: 355ms	remaining: 2.87s
110:	learn: 7588.0119258	total: 358ms	remaining: 2.87s
111:	learn: 7562.4503298	total: 359ms	remaining: 2.84s
112:	learn: 7534.5640389	total: 360ms	remaining: 2.83s
113:	learn: 7497.3322744	total: 361ms	remaining: 2.8s
114:	learn: 7466.2678168	total: 362ms	remaining: 2.78s
115:	learn: 7437.5873362	total: 363ms	remaining: 2.76s
116

  _data = np.array(data, dtype=dtype, copy=copy,


99:	learn: 6904.1840176	total: 74.7ms	remaining: 672ms
100:	learn: 6873.7505090	total: 75.2ms	remaining: 670ms
101:	learn: 6844.0589144	total: 75.8ms	remaining: 667ms
102:	learn: 6814.7261927	total: 76.3ms	remaining: 665ms
103:	learn: 6785.6259856	total: 76.8ms	remaining: 662ms
104:	learn: 6756.4223959	total: 81.2ms	remaining: 692ms
105:	learn: 6727.6675079	total: 82.1ms	remaining: 692ms
106:	learn: 6699.5326372	total: 82.6ms	remaining: 690ms
107:	learn: 6671.1012985	total: 84.4ms	remaining: 697ms
108:	learn: 6643.3792437	total: 84.9ms	remaining: 694ms
109:	learn: 6615.9817239	total: 85.6ms	remaining: 693ms
110:	learn: 6588.5133482	total: 86.4ms	remaining: 692ms
111:	learn: 6561.5915778	total: 87.1ms	remaining: 690ms
112:	learn: 6534.8247822	total: 87.8ms	remaining: 689ms
113:	learn: 6508.2096810	total: 88.5ms	remaining: 688ms
114:	learn: 6481.8971728	total: 89.2ms	remaining: 687ms
115:	learn: 6455.8845951	total: 89.9ms	remaining: 685ms
116:	learn: 6430.4868973	total: 90.7ms	remaining:

In [159]:
print("Best R-Squared: {:.2f} %".format(best_r2*100))
print("Best Parameters:", best_parameters)

Best R-Squared: 86.16 %
Best Parameters: {'depth': 4, 'l2_leaf_reg': 2, 'learning_rate': 0.008, 'random_strength': 0}
