<a href="https://colab.research.google.com/github/nitin-khandagale/blogposts/blob/master/catboost_vs_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [0]:
# Load the credit-risk table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='credit_risk.csv')

In [0]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [0]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [0]:
# Preview the first five rows again to check the frame after the column drop.
df.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [0]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [0]:
# (rows, columns) of the frame, for comparison with the null counts above.
df.shape

(1000, 10)

In [0]:
# Frequency of each 'Saving accounts' level; missing entries are excluded
# (dropna=True is the default, spelled out here for clarity).
df['Saving accounts'].value_counts(dropna=True)

little        603
moderate      103
quite rich     63
rich           48
Name: Saving accounts, dtype: int64

In [0]:
# Distinct 'Saving accounts' values in order of first appearance (NaN included).
df.loc[:, 'Saving accounts'].unique()

array([nan, 'little', 'quite rich', 'rich', 'moderate'], dtype=object)

In [0]:
# Features: every column except the 'Risk' label.
x = df.drop(columns='Risk')
# Target: the raw 'Risk' label column.
y = df.loc[:, 'Risk']

In [0]:
from sklearn.preprocessing import LabelEncoder

# Turn the string 'Risk' labels into integer class codes.
# LabelEncoder numbers classes in sorted label order; `le` is kept so the
# mapping can be inspected or inverted later.
le = LabelEncoder()
y = le.fit(df['Risk']).transform(df['Risk'])

In [0]:
# Hold out a test set; the fixed random_state makes the shuffle reproducible.
# No test_size is given, so scikit-learn's default split proportion applies.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

In [0]:
import category_encoders as ce

# Names of the object-dtype (string) feature columns that need encoding.
cat_columns = x.select_dtypes(include='object').columns
cat_columns

Index(['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose'], dtype='object')

In [0]:
# Fit the target encoder on the training rows only, so the encoding never
# sees labels from the held-out test set.
target_enc = ce.TargetEncoder(cols=cat_columns)
target_enc.fit(X=x_train[cat_columns], y=y_train)

TargetEncoder(cols=Index(['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose'], dtype='object'),
              drop_invariant=False, handle_missing='value',
              handle_unknown='value', min_samples_leaf=1, return_df=True,
              smoothing=1.0, verbose=0)

In [0]:
# Replace each raw categorical column by its target-encoded counterpart
# (suffix '_target'). The non-categorical columns keep their position and the
# encoded columns are appended after them, for both splits.
x_train_final = (
    x_train
    .drop(columns=cat_columns)
    .join(target_enc.transform(x_train[cat_columns]).add_suffix('_target'))
)
x_test_final = (
    x_test
    .drop(columns=cat_columns)
    .join(target_enc.transform(x_test[cat_columns]).add_suffix('_target'))
)

In [0]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [0]:
# XGBoost classifier constructed with library defaults; the grid search in
# tuner() overrides the hyperparameters listed in `params`.
my_model = XGBClassifier()

# Hyperparameter grid shared by the grid searches below.
#
# The original literal listed 'max_depth' twice. In a Python dict literal the
# last duplicate key wins, so the earlier `range(2, 10, 1)` entry was silently
# discarded. Only the effective entry is kept, which matches the recorded run
# (15 n_estimators values x 3 depths = 45 candidates).
# NOTE(review): depths of 20-40 are very deep for boosted trees on ~1000 rows;
# if the shallow 2-9 range was the intent, swap it in here deliberately.
# NOTE(review): this grid is also passed to CatBoost further down, and CatBoost
# documents a maximum tree depth of 16. Confirm these depths are valid there.
params = {
  'n_estimators': range(20, 320, 20),
  'max_depth': range(20, 50, 10)}

In [0]:
def tuner(model, x_train, x_test, y_train, y_test, params):
  """Grid-search `model` over `params` and evaluate the best fit on a test set.

  Runs a 10-fold cross-validated GridSearchCV on the training data, then
  scores the refit best estimator on the held-out data.

  Args:
    model: estimator passed to GridSearchCV as `estimator`.
    x_train: training features used to fit the grid search.
    x_test: held-out features used for prediction and scoring.
    y_train: training labels.
    y_test: held-out labels.
    params: parameter grid passed to GridSearchCV as `param_grid`.

  Returns:
    Tuple `(score, mse, rmse, mae)`, where `score` is whatever
    `GridSearchCV.score` reports on the test data and the remaining entries
    are the mean squared error, its square root and the mean absolute error
    between `y_test` and the predictions.
  """
  grid_model = GridSearchCV(estimator=model,
                           param_grid=params,
                           n_jobs=10,
                           cv=10,
                           verbose=True)
  # Use the data handed in by the caller. The previous version read the
  # notebook globals x_train_final / x_test_final and ignored these arguments.
  grid_model.fit(x_train, y_train)
  predictions = grid_model.predict(x_test)

  score = grid_model.score(x_test, y_test)
  mse = mean_squared_error(y_test, predictions)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(y_test, predictions)

  return score, mse, rmse, mae

In [0]:
# Tune and evaluate the XGBoost model on the target-encoded features.
result = tuner(
    model=my_model,
    x_train=x_train_final,
    x_test=x_test_final,
    y_train=y_train,
    y_test=y_test,
    params=params,
)

Fitting 10 folds for each of 45 candidates, totalling 450 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    2.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   26.7s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  1.2min
[Parallel(n_jobs=10)]: Done 450 out of 450 | elapsed:  1.2min finished


In [0]:
# XGBoost metrics, in the order tuner() returns them: (score, mse, rmse, mae).
result

(0.732, 0.268, 0.5176871642217914, 0.268)

In [0]:
from catboost import CatBoostClassifier

In [0]:
# CatBoost classifier constructed with library defaults; it is passed through
# the same tuner() routine as the XGBoost model for comparison.
my_model_cb = CatBoostClassifier()

In [0]:
# Tune and evaluate the CatBoost model with the same data and parameter grid.
result_3 = tuner(
    model=my_model_cb,
    x_train=x_train_final,
    x_test=x_test_final,
    y_train=y_train,
    y_test=y_test,
    params=params,
)

In [0]:
# CatBoost metrics, in the order tuner() returns them: (score, mse, rmse, mae).
result_3

(0.74, 0.26, 0.5099019513592785, 0.26)