# Карасов Николай Дмитриевич

In [7]:
import numpy as np
import pandas as pd
from catboost import Pool, CatBoostClassifier
from feature_selector import FeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

from utils import *

## Заметки по данным

Столбцы с `x10` по `x42` состоят из списков длины `6`. Очень много списков вида `['nan', 'nan', 'nan', 'nan', 'nan', 'nan']`.

Столбцы с `x8` и `x9` состоят из списков, в которых числа разделены пробелами и знаком переноса строки `\n`.

В столбце `x2` содержатся даты. Будем считать, что это дата подписания договора или что-то подобное. На основе этого столбца построим новый признак **стаж**.

Столбцы `x1`, `x3`, `x4`, `x5`, `x6`, `x7`: очень похоже, что в них закодированы категориальные признаки.

Кроме того, в некоторых столбцах очень много пропусков, такие столбцы мы удалим.

## План действий

Ознакомимся с данными, преобрауем данные, протестируем модель на `train2`, обучив ее на `train1`, а затем обучив на `train1` и, валидируюясь на `train2`, предскажем вероятности для `test`.

## Знакомство с данными

In [99]:
train_1, train_2, test = get_data()
print(train_1.shape, train_2.shape, test.shape)

(91946, 44) (93189, 44) (23386, 43)


In [100]:
train_1.head(3)

Unnamed: 0,id,y,x1,x2,x3,x4,x5,x6,x7,x8,...,x33,x34,x35,x36,x37,x38,x39,x40,x41,x42
0,-9196902500682081904,0,-2201458060675810556,01.09.2008,6312572492787112517,182,-6.347413e+18,2066976859146697559,0,[4770.1499999999996 885.49000000000001 1956.36...,...,"[nan, nan, nan, nan, nan, nan]","[3.0, 2.0, 1.0, nan, nan, 1.0]","[67.64, 67.64, 67.64, 67.64, 67.64, 67.64]","[224.79, 224.79, 224.79, 224.79, 224.79, 224.79]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[80.46, 80.46, 80.46, 80.46, 80.46, 80.46]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
1,-9196902500682081900,0,-2201458060675810556,05.02.2013,4293238556669161936,182,-8.465179e+18,-8648529837587014226,0,[328.93000000000001 3.9700000000000002 996.149...,...,"[nan, nan, nan, nan, nan, nan]","[4.0, nan, nan, 2.0, 3.0, 7.0]","[90.27, 90.27, 90.27, 90.27, 90.27, 90.27]","[249.94, 249.94, 249.94, 249.94, 155.64, 155.64]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[103.09, 103.09, 103.09, 103.09, 103.09, 103.09]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
2,-9196902500682081899,0,-2201458060675810556,01.09.2008,6312572492787112517,175,5.417474e+18,-5480331477604673780,1,[0.28000000000000003 0.39000000000000001 387.1...,...,"[nan, nan, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan, 2.0]","[67.64, 67.64, 67.64, 67.64, 67.64, 67.64]","[196.61, 224.79, 197.55, 224.79, 224.79, 224.79]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[80.46, 80.46, 80.46, 80.46, 80.46, 80.46]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"


Проверим, что все `id` уникальны.

In [101]:
check_unique(train_1, train_2, test)

Train_1
	Все id уникальны: True
Train_2
	Все id уникальны: True
Test
	Все id уникальны: True


В данных имеется очень сильный дисбаланс классов.

In [102]:
train_1.y.value_counts() / train_1.shape[0], train_2.y.value_counts() / train_2.shape[0]

(0    0.974822
 1    0.025178
 Name: y, dtype: float64,
 0    0.975813
 1    0.024187
 Name: y, dtype: float64)

Столбцы с `x10` по `x42` содержат в себе очень много значений `nan`. Преобразуем их так, чтобы было удобнее с ними работать.

- `[nan, nan, nan, nan, nan, nan,] -> pd.NA`

- `[nan, 1, nan, nan, 3, nan,] -> [None, 1, None, None, 3, None]`

Подобным образом преобразуем значения в `x8` и `x9`.

In [103]:
transform_columns_10_42(train_1)
transform_columns_10_42(train_2)

In [104]:
# '[4770.15 885.49 1956.37\n 36936.04 3478.56 2374.61 nan\n ...]'
# -> ([4770.15, 885.49, 1956.37], [36936.04, 3478.56, 2374.61, None], ...)
transform_columns_8_9(train_1)
transform_columns_8_9(train_2)

In [105]:
train_1.head(3)

Unnamed: 0,id,y,x1,x2,x3,x4,x5,x6,x7,x8,...,x33,x34,x35,x36,x37,x38,x39,x40,x41,x42
0,-9196902500682081904,0,-2201458060675810556,01.09.2008,6312572492787112517,182,-6.347413e+18,2066976859146697559,0,"[4770.15, 885.49, 1956.37, 36936.04, 3478.56, ...",...,,"[3.0, 2.0, 1.0, None, None, 1.0]","[67.64, 67.64, 67.64, 67.64, 67.64, 67.64]","[224.79, 224.79, 224.79, 224.79, 224.79, 224.79]",,,"[80.46, 80.46, 80.46, 80.46, 80.46, 80.46]",,,
1,-9196902500682081900,0,-2201458060675810556,05.02.2013,4293238556669161936,182,-8.465179e+18,-8648529837587014226,0,"[328.93, 3.97, 996.15, 774.45, 480.49, 692.47,...",...,,"[4.0, None, None, 2.0, 3.0, 7.0]","[90.27, 90.27, 90.27, 90.27, 90.27, 90.27]","[249.94, 249.94, 249.94, 249.94, 155.64, 155.64]",,,"[103.09, 103.09, 103.09, 103.09, 103.09, 103.09]",,,
2,-9196902500682081899,0,-2201458060675810556,01.09.2008,6312572492787112517,175,5.417474e+18,-5480331477604673780,1,"[0.28, 0.39, 387.13, 449.77, 199.35, 1285.98, ...",...,,"[1.0, None, None, None, None, 2.0]","[67.64, 67.64, 67.64, 67.64, 67.64, 67.64]","[196.61, 224.79, 197.55, 224.79, 224.79, 224.79]",,,"[80.46, 80.46, 80.46, 80.46, 80.46, 80.46]",,,


Преобразуем даты столбца `x2` в стаж.

In [106]:
train_1 = transform_date(train_1)

Мы удалили 38 строк
Минимальный год: 2001, максимальный год: 3201
Отсечем года, которые меньше 1990, больше 2020
Минимальная и максимальная даты: 01.01.2001, 31.12.2016


In [107]:
train_1.x2 = pd.to_datetime(train_1.x2)
date_time = train_1.x2.mode()[0].strftime("%m.%d.%Y")

random_date = pd.to_datetime('2018-01-01')
train_1.x2 = train_1.x2.apply(lambda x: (random_date - x).days)

In [108]:
train_2 = transform_date_test(train_2, date_time, random_date)

In [109]:
train_1.describe()  

Unnamed: 0,id,y,x1,x2,x3,x4,x5,x6,x7
count,91907.0,91907.0,91907.0,91907.0,91907.0,91907.0,78749.0,91907.0,91907.0
mean,4.928208e+17,0.024906,-2.447493e+18,1815.825878,6.273064e+18,161.132601,-1.119024e+18,-1.341517e+18,1.260851
std,5.286733e+18,0.155838,1.681117e+18,885.977612,5.635636e+17,40.81148,6.406306e+18,5.355868e+18,1.73231
min,-9.196903e+18,0.0,-8.986902e+18,30.0,-6.78159e+18,0.0,-8.552464e+18,-9.204708e+18,0.0
25%,-4.12467e+18,0.0,-2.754919e+18,1109.0,6.312572e+18,163.0,-6.347413e+18,-6.335165e+18,0.0
50%,8.753468e+17,0.0,-2.201458e+18,1687.0,6.312572e+18,181.0,-5.150561e+18,-2.297459e+18,1.0
75%,5.044115e+18,0.0,-2.201458e+18,2412.0,6.312572e+18,182.0,6.439227e+18,2.830195e+18,2.0
max,9.162041e+18,1.0,7.60924e+18,6209.0,8.072835e+18,182.0,8.8784e+18,9.202926e+18,15.0


In [110]:
fs_1 = FeatureSelector(data=train_1, labels=train_1['y'])
fs_2 = FeatureSelector(data=train_2, labels=train_2['y'])

In [111]:
fs_1.identify_missing(missing_threshold=0.6)
fs_2.identify_missing(missing_threshold=0.6)

16 features with greater than 0.60 missing values.

16 features with greater than 0.60 missing values.



In [112]:
missing_features_1 = fs_1.ops['missing']
missing_features_2 = fs_2.ops['missing']

missing_features = set()

for f_1, f_2 in zip(missing_features_1, missing_features_2):
    missing_features.add(f_1)
    missing_features.add(f_2)
    print(f_1, f_2)

x10 x10
x11 x11
x12 x12
x23 x23
x24 x24
x25 x25
x27 x27
x29 x29
x30 x30
x31 x31
x32 x32
x33 x33
x37 x37
x40 x40
x41 x41
x42 x42


In [113]:
for column in ['x1', 'x3', 'x4', 'x5', 'x6', 'x7']:
    print(len(train_1[column].value_counts()))

17
9
183
21
577
16


Тогда план действий следующий: удалим `missing_features` и запустим CatBoost. Признаки `'x1', 'x3', 'x4', 'x5', 'x6', 'x7'` будем считать категориальными.

## Оцениваем модель

In [114]:
train_1 = train_1.drop(list(missing_features), axis=1)
train_2 = train_2.drop(list(missing_features), axis=1)

In [115]:
columns_res = []

for column in train_1.columns[11:]:
    train_1 = train_1.dropna(subset=[column])
    
    hist_columns = [column + '_min', column + '_mean', column + '_max', column + '_nan']
    columns_res += hist_columns
    
    train_1[column] = train_1[column].apply(lambda x: build_features_train(x))
    train_1[hist_columns] = pd.DataFrame(train_1[column].tolist(), index= train_1.index)
    
    min_1 = train_1[hist_columns[0]].mean()
    mean_1 = train_1[hist_columns[1]].mean()
    max_1 = train_1[hist_columns[2]].mean()
    nan_1 = train_1[hist_columns[3]].mean()
    
    train_2[column] = train_2[column].apply(lambda x: build_features_test(x, min_1, mean_1, max_1, nan_1))
    train_2[hist_columns] = pd.DataFrame(train_2[column].tolist(), index= train_2.index)

In [116]:
t = train_1.x8.apply(lambda x: len([i for i in x if i is not None]))
train_1 = train_1.drop(t[t == 0].index)

In [117]:
hist_columns_x8 = ['x8_min', 'x8_mean', 'x8_max', 'x8_nan']
hist_columns_x9 = ['x9_min', 'x9_mean', 'x9_max', 'x9_nan']

In [118]:
train_1.x8 = train_1.x8.apply(lambda x: build_features_train(x))

In [119]:
train_1[hist_columns_x8] = pd.DataFrame(train_1.x8.tolist(), index= train_1.index)

In [120]:
min_1 = train_1.x8_min.mean()
mean_1 = train_1.x8_mean.mean()
max_1 = train_1.x8_max.mean()
nan_1 = train_1.x8_nan.mean()

In [121]:
train_2.x8 = train_2.x8.apply(lambda x: build_features_test(x, min_1, mean_1, max_1, nan_1))
train_2[hist_columns_x8] = pd.DataFrame(train_2.x8.tolist(), index=train_2.index)

In [122]:
t = train_1.x9.apply(lambda x: len([i for i in x if i is not None]))
train_1 = train_1.drop(t[t == 0].index)

train_1.x9 = train_1.x9.apply(lambda x: build_features_train(x))
train_1[hist_columns_x9] = pd.DataFrame(train_1.x9.tolist(), index=train_1.index)

min_1 = train_1.x9_min.mean()
mean_1 = train_1.x9_mean.mean()
max_1 = train_1.x9_max.mean()
nan_1 = train_1.x9_nan.mean()

train_2.x9 = train_2.x9.apply(lambda x: build_features_test(x, min_1, mean_1, max_1, nan_1))
train_2[hist_columns_x9] = pd.DataFrame(train_2.x9.tolist(), index=train_2.index)

In [123]:
for column in ['x1', 'x3', 'x4', 'x5', 'x6', 'x7']:
    cur_mode = train_1[column].mode()[0]
    
    train_1[column] = train_1[column].astype(str)
    train_2[column] = train_2[column].astype(str)

In [124]:
labels_1 = train_1['y']
train_1 = train_1.drop(columns = ['y'])

labels_2 = train_2['y']
train_2 = train_2.drop(columns = ['y'])

In [125]:
columns_used = ['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x2'] + columns_res + hist_columns_x8 + hist_columns_x9
train_data = train_1[columns_used]
eval_data = train_2[columns_used]


cat_features = [0, 1, 2, 3, 4, 5]


train_label = labels_1
eval_label = labels_2


train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)

eval_dataset = Pool(data=eval_data,
                    label=eval_label,
                    cat_features=cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=100,
                           learning_rate=1,
                           depth=3,
                           loss_function='MultiClass', 
                           eval_metric='AUC')
# Fit model
model.fit(train_dataset, verbose=False)
# Get predicted classes
preds_class = model.predict(eval_dataset)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_dataset)

In [126]:
roc_auc_score(labels_2, preds_proba[:, 1])

0.8089464285287389

## Итоговое предсказание для теста

In [161]:
train_1, train_2, test = get_data()

In [163]:
train_1 = transform_date(train_1)
train_2 = transform_date(train_2)

Мы удалили 38 строк
Минимальный год: 2001, максимальный год: 3201
Отсечем года, которые меньше 1990, больше 2020
Минимальная и максимальная даты: 01.01.2001, 31.12.2016
Мы удалили 45 строк
Минимальный год: 1899, максимальный год: 3201
Отсечем года, которые меньше 1990, больше 2020
Минимальная и максимальная даты: 01.01.2001, 31.12.2016


In [164]:
train_1.x2 = pd.to_datetime(train_1.x2)
train_2.x2 = pd.to_datetime(train_2.x2)

date_time = train_1.x2.mode()[0].strftime("%m-%d-%Y")

random_date = pd.to_datetime('2018-01-01')
train_1.x2 = train_1.x2.apply(lambda x: (random_date - x).days)
train_2.x2 = train_2.x2.apply(lambda x: (random_date - x).days)

In [165]:
test.head()

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x33,x34,x35,x36,x37,x38,x39,x40,x41,x42
0,-9196902500682081904,-2201458060675810556,2008-09-01 00:00:00,6312572492787112517,183,-6.347413e+18,2066976859146697559,0,[431.63 188.56999999999999 430.73000000000002 ...,[431.63 188.56999999999999 430.73000000000002 ...,...,"[nan, nan, nan, nan, nan, nan]","[1.0, nan, nan, 1.0, nan, 6.0]","[67.64, 67.64, 67.64, 67.64, 67.64, 67.64]","[224.79, 224.79, 224.79, 224.79, 224.79, 224.79]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[80.46, 80.46, 80.46, 80.46, 80.46, 80.46]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
1,-9196902500681081889,-2201458060675810556,2008-09-01 00:00:00,6312572492787112517,183,3.052557e+18,-8367157107954005236,0,[287.63 8051.1199999999999 5301.6700000000001 ...,[287.63 8051.1199999999999 5301.6700000000001 ...,...,"[nan, 0.0, nan, nan, nan, nan]","[nan, nan, nan, 2.0, 1.0, nan]","[67.64, 67.64, 67.64, 67.64, 67.64, 67.64]","[199.65, 199.65, 199.65, 199.65, 199.65, 199.65]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[80.46, 80.46, 80.46, 80.46, 80.46, 80.46]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
2,-9196902500679081891,-2201458060675810556,2011-03-01 00:00:00,6312572492787112517,137,-8.465179e+18,2066976859146697559,2,[45.520000000000003 728.21000000000004 2548.57...,[45.520000000000003 728.21000000000004 2548.57...,...,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, 4.0, nan]","[75.18, 67.64, 75.18, 75.18, 75.18, 75.18]","[195.46, 61.35, 195.46, 222.28, 222.28, 222.28]","[107.87, 107.87, 107.87, 107.87, 107.87, 107.87]","[77.19, 77.19, 77.19, 77.19, 77.19, 77.19]","[88.01, 80.46, 88.01, 88.01, 88.01, 88.01]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
3,-9196902500679081889,-2201458060675810556,2008-09-01 00:00:00,6312572492787112517,183,3.052557e+18,-8367157107954005236,0,[0.01 0.01 0.0 0.01 0.01 0.01 0.0 0.01 0.01 0....,[0.01 0.01 0.0 0.01 0.01 0.01 0.0 0.01 0.01 0....,...,"[nan, nan, nan, nan, nan, nan]","[1.0, 1.0, nan, nan, 2.0, 1.0]","[67.64, 67.64, 67.64, 67.64, 67.64, 67.64]","[199.65, 177.34, 199.65, 199.65, 199.65, 199.65]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[80.46, 80.46, 80.46, 80.46, 80.46, 80.46]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
4,-9196902500677081902,-2201458060675810556,2008-09-01 00:00:00,6312572492787112517,183,-8.465179e+18,-4936635693953926999,0,[9.0600000000000005 88.730000000000004 33.8100...,[9.0600000000000005 88.730000000000004 33.8100...,...,"[nan, nan, nan, nan, nan, nan]","[2.0, nan, nan, 1.0, nan, nan]","[67.64, 67.64, 67.64, 67.64, 67.64, 67.64]","[199.65, 199.65, 199.65, 199.65, 199.65, 199.65]","[nan, nan, nan, nan, nan, nan]","[77.19, 77.19, 77.19, 77.19, 77.19, 77.19]","[80.46, 80.46, 80.46, 80.46, 80.46, 80.46]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"


In [166]:
test.x2 = test.x2.fillna(date_time)
mask = test['x2'].apply(lambda x: max(map(int, x[:10].split('-'))))
test.loc[2020 < mask, 'x2'] = date_time
test.loc[mask < 2000, 'x2'] = date_time
    
test.x2 = pd.to_datetime(test.x2)

test.x2 = test.x2.apply(lambda x: (random_date - x).days)

In [167]:
transform_columns_10_42(train_1)
transform_columns_10_42(train_2)
transform_columns_10_42(test)

In [168]:
train_1 = train_1.drop(list(missing_features), axis=1)
train_2 = train_2.drop(list(missing_features), axis=1)
test = test.drop(list(missing_features), axis=1)

In [169]:
columns_res = []

for column in train_1.columns[11:28]:
    train_1 = train_1.dropna(subset=[column])
    train_2 = train_2.dropna(subset=[column])
    
    hist_columns = [column + '_min', column + '_mean', column + '_max', column + '_nan']
    columns_res += hist_columns
    
    train_1[column] = train_1[column].apply(lambda x: build_features_train(x))
    train_1[hist_columns] = pd.DataFrame(train_1[column].tolist(), index=train_1.index)
    
    min_1 = train_1[hist_columns[0]].mean()
    mean_1 = train_1[hist_columns[1]].mean()
    max_1 = train_1[hist_columns[2]].mean()
    nan_1 = train_1[hist_columns[3]].mean()
    
    train_2[column] = train_2[column].apply(lambda x: build_features_train(x))
    train_2[hist_columns] = pd.DataFrame(train_2[column].tolist(), index=train_2.index)
    
    test[column] = test[column].apply(lambda x: build_features_test(x, min_1, mean_1, max_1, nan_1))
    test[hist_columns] = pd.DataFrame(test[column].tolist(), index=test.index)

In [170]:
transform_columns_8_9(train_1)
t = train_1.x8.apply(lambda x: len([i for i in x if i is not None]))
train_1 = train_1.drop(t[t == 0].index)

train_1.x8 = train_1.x8.apply(lambda x: build_features_train(x))
train_1[hist_columns_x8] = pd.DataFrame(train_1.x8.tolist(), index= train_1.index)

In [171]:
min_1 = train_1.x8_min.mean()
mean_1 = train_1.x8_mean.mean()
max_1 = train_1.x8_max.mean()
nan_1 = train_1.x8_nan.mean()

In [172]:
transform_columns_8_9(train_2)
t = train_2.x8.apply(lambda x: len([i for i in x if i is not None]))
train_2 = train_2.drop(t[t == 0].index)

train_2.x8 = train_2.x8.apply(lambda x: build_features_train(x))
train_2[hist_columns_x8] = pd.DataFrame(train_2.x8.tolist(), index= train_2.index)

In [173]:
transform_columns_8_9(test)

In [174]:
test.x8 = test.x8.apply(lambda x: build_features_test(x, min_1, mean_1, max_1, nan_1))
test[hist_columns_x8] = pd.DataFrame(test.x8.tolist(), index= test.index)

In [175]:
t = train_1.x9.apply(lambda x: len([i for i in x if i is not None]))
train_1 = train_1.drop(t[t == 0].index)
train_1.x9 = train_1.x9.apply(lambda x: build_features_train(x))
train_1[hist_columns_x9] = pd.DataFrame(train_1.x9.tolist(), index= train_1.index)

min_1 = train_1.x9_min.mean()
mean_1 = train_1.x9_mean.mean()
max_1 = train_1.x9_max.mean()
nan_1 = train_1.x9_nan.mean()

In [176]:
t = train_2.x9.apply(lambda x: len([i for i in x if i is not None]))
train_2 = train_2.drop(t[t == 0].index)
train_2.x9 = train_2.x9.apply(lambda x: build_features_train(x))
train_2[hist_columns_x9] = pd.DataFrame(train_2.x9.tolist(), index= train_2.index)

In [177]:
test.x9 = test.x9.apply(lambda x: build_features_test(x, min_1, mean_1, max_1, nan_1))
test[hist_columns_x9] = pd.DataFrame(test.x9.tolist(), index=test.index)

In [178]:
for column in ['x1', 'x3', 'x4', 'x5', 'x6', 'x7']:
    train_1[column] = train_1[column].astype(str)
    train_2[column] = train_2[column].astype(str)
    test[column] = test[column].astype(str)

In [179]:
labels_1 = train_1['y']
train_1 = train_1.drop(columns = ['y'])

labels_2 = train_2['y']
train_2 = train_2.drop(columns = ['y'])

In [180]:
columns_used = ['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x2'] + columns_res + hist_columns_x8 + hist_columns_x9

train_data = train_1[columns_used]
eval_data = train_2[columns_used]
test_data = test[columns_used]

cat_features = [0, 1, 2, 3, 4, 5]

train_label = labels_1
eval_label = labels_2

train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)

eval_dataset = Pool(data=eval_data,
                    label=eval_label,
                    cat_features=cat_features)

test_dataset = Pool(data=test_data,
                    cat_features=cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=100,
                           learning_rate=1,
                           depth=3,
                           loss_function='MultiClass', 
                           eval_metric='AUC')

model.fit(train_dataset, verbose=False)

preds_proba = model.predict_proba(test_dataset)

In [181]:
preds_proba.shape

(23386, 2)

In [182]:
answer = pd.DataFrame(
    {'id': [i for i in test.id],
     'result': [i for i in preds_proba[:, 1]],
    })

answer.set_index('id')

answer.head()

Unnamed: 0,id,result
0,-9196902500682081904,0.013634
1,-9196902500681081889,0.014162
2,-9196902500679081891,0.002414
3,-9196902500679081889,0.011602
4,-9196902500677081902,0.002633


In [183]:
answer.to_csv('submission_karasov.csv', sep=';', index=False)