In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the dataset from a CSV located relative to the notebook's working
# directory (presumably already preprocessed, going by the file name).
data = pd.read_csv('cleaned_data.csv')

In [4]:
# Distinct-value count per column: low counts point at categorical features,
# and `salary` having exactly two values marks it as a binary target.
data.nunique()

age                  73
workclass             9
fnlwgt            21648
education            16
education-num        16
marital-status        7
occupation           15
relationship          6
race                  5
sex                   2
capital-gain        119
capital-loss         92
hours-per-week       94
country              41
salary                2
dtype: int64

In [5]:
# Separate the two-valued target column ("salary") from the predictors.
y = data["salary"]
X = data.drop(columns="salary")


In [14]:
# Full column index of `data` (this still includes the target column).
cols = data.columns

In [22]:
# Display the column names in order; `salary` is the last entry.
cols

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'country', 'salary'],
      dtype='object')

In [17]:
# Names of the non-numeric (categorical) columns. Taken from the feature frame
# X rather than from `data`, so the target column can never be reported as a
# categorical feature regardless of its dtype.
cat_cols = X.select_dtypes(exclude="number").columns

In [20]:
# Positional indices of the categorical features, expressed relative to X --
# the frame that is actually split and handed to CatBoost.
#
# Everything is recomputed from X instead of reading the previous value of
# `cat_cols`. The earlier version filtered on `cat_cols` and then overwrote it,
# so running the cell a second time compared column names against integers and
# silently produced an empty list.
cat_feature_names = set(X.select_dtypes(exclude="number").columns)
cat_cols = [
    position
    for position, column_name in enumerate(X.columns)
    if column_name in cat_feature_names
]

In [21]:
# Show the categorical column positions that are later passed to CatBoost as
# `cat_features`.
cat_cols

[1, 3, 5, 6, 7, 8, 9, 13]

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [24]:
from catboost import CatBoostClassifier

# Quick smoke-test model: five boosting rounds at a fixed learning rate.
clf = CatBoostClassifier(iterations=5, learning_rate=0.1)

# `cat_features` holds positional column indices; the held-out split is given
# as the evaluation set and per-iteration logging is switched off.
# NOTE(review): the evaluation set is the test split itself -- confirm that
# scores reported on X_test are not meant to be an unbiased estimate.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_cols,
    verbose=False,
)

print(f'CatBoost model is fitted: {clf.is_fitted()}')
print('CatBoost model parameters:')
print(clf.get_params())


CatBoost model is fitted: True
CatBoost model parameters:
{'learning_rate': 0.1, 'iterations': 5}


In [29]:
from catboost import CatBoostClassifier

# Twenty boosting rounds. No learning rate is given, so CatBoost chooses one
# itself (the training log reports the value), and the default per-iteration
# logging is left on.
clf = CatBoostClassifier(iterations=20)

# fit() is the cell's final expression, so the fitted model's repr is shown
# after the training log.
clf.fit(X_train, y_train, eval_set=(X_test, y_test), cat_features=cat_cols)


Learning rate set to 0.388738
0:	learn: 0.5243335	test: 0.5290848	best: 0.5290848 (0)	total: 63.3ms	remaining: 1.2s
1:	learn: 0.4403290	test: 0.4475775	best: 0.4475775 (1)	total: 125ms	remaining: 1.12s
2:	learn: 0.3926565	test: 0.4022230	best: 0.4022230 (2)	total: 184ms	remaining: 1.04s
3:	learn: 0.3634062	test: 0.3740471	best: 0.3740471 (3)	total: 253ms	remaining: 1.01s
4:	learn: 0.3453319	test: 0.3557421	best: 0.3557421 (4)	total: 325ms	remaining: 975ms
5:	learn: 0.3297643	test: 0.3407475	best: 0.3407475 (5)	total: 375ms	remaining: 876ms
6:	learn: 0.3206801	test: 0.3312787	best: 0.3312787 (6)	total: 436ms	remaining: 811ms
7:	learn: 0.3142004	test: 0.3251179	best: 0.3251179 (7)	total: 510ms	remaining: 765ms
8:	learn: 0.3098021	test: 0.3209506	best: 0.3209506 (8)	total: 568ms	remaining: 695ms
9:	learn: 0.3058314	test: 0.3176975	best: 0.3176975 (9)	total: 635ms	remaining: 635ms
10:	learn: 0.3034834	test: 0.3156067	best: 0.3156067 (10)	total: 707ms	remaining: 579ms
11:	learn: 0.3017797	t

<catboost.core.CatBoostClassifier at 0x1fb1f4171d0>

In [31]:
# Per-class probabilities for the test rows: one row per sample, one column
# per class (two columns in the output).
print(clf.predict_proba(X_test))

[[0.97928412 0.02071588]
 [0.95199638 0.04800362]
 [0.65683494 0.34316506]
 ...
 [0.00877222 0.99122778]
 [0.83526678 0.16473322]
 [0.58422268 0.41577732]]


In [33]:
# Hard class labels predicted for the test rows (printed as 0/1).
print(clf.predict(X_test))

[0 0 0 ... 1 0 0]


In [34]:
from catboost import CatBoostClassifier

# Fifty boosting rounds with an explicit seed and learning rate. AUC and
# Accuracy are requested through `custom_loss` so they are computed during
# training in addition to the optimised loss.
clf = CatBoostClassifier(
    iterations=50, random_seed=42, learning_rate=0.5,
    custom_loss=['AUC', 'Accuracy'],
)

# Text logging is off; plot=True renders the interactive metric widget instead.
# fit() stays the last expression so the fitted model's repr is displayed.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_cols,
    plot=True,
    verbose=False,
)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1fb2095f860>