In [1]:
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

warnings.simplefilter("ignore")

In [2]:
train = pd.read_csv('assignment_2_train.csv')
test = pd.read_csv('assignment_2_test.csv')

In [3]:
train.head(3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,


In [4]:
print("train.shape = {} rows, {} cols".format(*train.shape))
print("test.shape = {} rows, {} cols".format(*train.shape))

train.shape = 180000 rows, 394 cols
test.shape = 180000 rows, 394 cols


In [5]:
target = train['isFraud']
test_target = test['isFraud']

train = train.drop(['isFraud', 'TransactionID'], axis=1)
test = test.drop(['isFraud', 'TransactionID'], axis=1)


**Задание 1:** отобрать только числовые признаки и обучить модель XGBoost с параметром booster = gbtree. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

In [6]:
numerical_features = train.select_dtypes(include=[np.number])
print(f"count of numeric_features {numerical_features.shape[1]}")

numerical_features = numerical_features.columns.tolist()

count of numeric_features 378


In [7]:
x_train_num, x_valid_num, y_train_num, y_valid_num = train_test_split(
    train[numerical_features], target, train_size=0.8, random_state=1
)

print("x_train.shape = {} rows, {} cols".format(*x_train_num.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid_num.shape))

x_train.shape = 144000 rows, 378 cols
x_valid.shape = 36000 rows, 378 cols


In [8]:
dtrain_xgb_nf = xgb.DMatrix(
    data=x_train_num, label=y_train_num
)
dvalid_xgb_nf = xgb.DMatrix(
    data=x_valid_num, label=y_valid_num
)
dtest_xgb_nf = xgb.DMatrix(data=test[numerical_features]
)

In [9]:
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "reg_lambda": 100,
    "max_depth": 4,
    "gamma": 10,
    "nthread": 6,
    "seed": 27
}

In [10]:
model_xgb_nf = xgb.train(
    params=params,
    dtrain=dtrain_xgb_nf,
    num_boost_round=500,
    early_stopping_rounds=50,
    evals=[(dtrain_xgb_nf, "train"), (dvalid_xgb_nf, "valid")],
    verbose_eval=50,
    maximize=True,
)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.64988	valid-auc:0.65040
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[50]	train-auc:0.88183	valid-auc:0.87420
[100]	train-auc:0.90013	valid-auc:0.88967
[150]	train-auc:0.90824	valid-auc:0.89758
[200]	train-auc:0.91004	valid-auc:0.89892
Stopping. Best iteration:
[167]	train-auc:0.91004	valid-auc:0.89892



In [11]:
pred_xgb_nf = model_xgb_nf.predict(dtest_xgb_nf)

**Задание 2:** обработать категориальные признаки любым способом (который вы знаете) и добавить их к данным. Выполнить задание 1.

In [12]:
categorical_features = train.select_dtypes(include=[np.object])
print(f"Categorical Feature Count {categorical_features.shape[1]}")
categorical_features.head(n=2)

Categorical Feature Count 14


Unnamed: 0,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,M6,M7,M8,M9
0,W,discover,credit,,,T,T,T,M2,F,T,,,
1,W,mastercard,credit,gmail.com,,,,,M0,T,T,,,


In [13]:
train_cat = train.copy()
cat_features = train.select_dtypes(include='object').columns.tolist()
cat_features

['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9']

In [14]:
for cat_colname in cat_features:
    train_cat = pd.concat([train_cat, pd.get_dummies(train_cat[cat_colname], prefix=cat_colname)], axis=1)
    train_cat = train_cat.drop([cat_colname], axis=1)


In [15]:
test_cat = test.copy()

for cat_colname in cat_features:
    test_cat = pd.concat([test_cat, pd.get_dummies(test_cat[cat_colname], prefix=cat_colname)], axis=1)
    test_cat = test_cat.drop([cat_colname], axis=1)

In [16]:
print("train_cat.shape = {} rows, {} cols".format(*train_cat.shape))
print("test_cat.shape = {} rows, {} cols".format(*test_cat.shape))

train_cat.shape = 180000 rows, 529 cols
test_cat.shape = 100001 rows, 525 cols


In [17]:
for colname in train_cat.columns:
    if not (colname in test_cat.columns):
        print(colname)
        train_cat.drop([colname], axis=1, inplace=True)

R_emaildomain_centurylink.net
R_emaildomain_frontiernet.net
R_emaildomain_netzero.net
R_emaildomain_twc.com


In [18]:
print("train_cat.shape = {} rows, {} cols".format(*train_cat.shape))
print("test_cat.shape = {} rows, {} cols".format(*test_cat.shape))

train_cat.shape = 180000 rows, 525 cols
test_cat.shape = 100001 rows, 525 cols


In [19]:
x_train_cat, x_valid_cat, y_train_cat, y_valid_cat = train_test_split(
    train_cat, target, train_size=0.8, random_state=1
)

In [20]:
dtrain_xgb_cf = xgb.DMatrix(np.asmatrix(x_train_cat), label=y_train_cat)
dvalid_xgb_cf = xgb.DMatrix(np.asmatrix(x_valid_cat), label=y_valid_cat)
dtest_xgb_cf = xgb.DMatrix(np.asmatrix(test_cat))

In [21]:
model_xgb_cf = xgb.train(
    params=params,
    dtrain=dtrain_xgb_cf,
    num_boost_round=500,
    early_stopping_rounds=50,
    evals=[(dtrain_xgb_cf, "train"), (dvalid_xgb_cf, "valid")],
    verbose_eval=50,
    maximize=True,
)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.64988	valid-auc:0.65040
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[50]	train-auc:0.88876	valid-auc:0.88085
[100]	train-auc:0.90947	valid-auc:0.89821
[150]	train-auc:0.91530	valid-auc:0.90275
Stopping. Best iteration:
[139]	train-auc:0.91530	valid-auc:0.90275



In [22]:
pred_xgb_cf = model_xgb_nf.predict(dtest_xgb_cf)

ValueError: feature_names mismatch: ['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V218', 'V219', 'V220', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V230', 'V231', 'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V238', 'V239', 'V240', 'V241', 'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V248', 'V249', 'V250', 'V251', 'V252', 'V253', 'V254', 'V255', 'V256', 'V257', 'V258', 'V259', 'V260', 'V261', 'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V269', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276', 'V277', 'V278', 'V279', 'V280', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 'V300', 'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320', 'V321', 'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f203', 'f204', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f226', 'f227', 'f228', 'f229', 'f230', 'f231', 'f232', 'f233', 'f234', 'f235', 'f236', 'f237', 'f238', 'f239', 'f240', 'f241', 'f242', 'f243', 'f244', 'f245', 'f246', 'f247', 'f248', 'f249', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f256', 'f257', 'f258', 'f259', 'f260', 'f261', 'f262', 'f263', 'f264', 'f265', 'f266', 'f267', 'f268', 'f269', 'f270', 'f271', 'f272', 'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283', 'f284', 'f285', 'f286', 'f287', 'f288', 'f289', 'f290', 'f291', 'f292', 'f293', 'f294', 'f295', 'f296', 'f297', 'f298', 'f299', 'f300', 'f301', 'f302', 'f303', 'f304', 'f305', 'f306', 'f307', 'f308', 'f309', 'f310', 'f311', 'f312', 'f313', 'f314', 'f315', 'f316', 'f317', 'f318', 'f319', 'f320', 'f321', 'f322', 'f323', 'f324', 'f325', 'f326', 'f327', 'f328', 'f329', 'f330', 'f331', 'f332', 'f333', 'f334', 'f335', 'f336', 'f337', 'f338', 'f339', 'f340', 'f341', 'f342', 'f343', 'f344', 'f345', 'f346', 'f347', 'f348', 'f349', 'f350', 'f351', 'f352', 'f353', 'f354', 'f355', 'f356', 'f357', 'f358', 'f359', 'f360', 'f361', 'f362', 'f363', 'f364', 'f365', 'f366', 'f367', 'f368', 'f369', 'f370', 'f371', 'f372', 'f373', 'f374', 'f375', 'f376', 'f377', 'f378', 'f379', 'f380', 'f381', 'f382', 'f383', 'f384', 'f385', 'f386', 'f387', 'f388', 'f389', 'f390', 'f391', 'f392', 'f393', 'f394', 'f395', 'f396', 'f397', 'f398', 'f399', 'f400', 'f401', 'f402', 'f403', 'f404', 'f405', 'f406', 'f407', 'f408', 'f409', 'f410', 'f411', 'f412', 'f413', 'f414', 'f415', 'f416', 'f417', 'f418', 'f419', 'f420', 'f421', 'f422', 'f423', 'f424', 'f425', 'f426', 'f427', 'f428', 'f429', 'f430', 'f431', 'f432', 'f433', 'f434', 'f435', 'f436', 'f437', 'f438', 'f439', 'f440', 'f441', 'f442', 'f443', 'f444', 'f445', 'f446', 'f447', 'f448', 'f449', 'f450', 'f451', 'f452', 'f453', 'f454', 'f455', 'f456', 'f457', 'f458', 'f459', 'f460', 'f461', 'f462', 'f463', 'f464', 'f465', 'f466', 'f467', 'f468', 'f469', 'f470', 'f471', 'f472', 'f473', 'f474', 'f475', 'f476', 'f477', 'f478', 'f479', 'f480', 'f481', 'f482', 'f483', 'f484', 'f485', 'f486', 'f487', 'f488', 'f489', 'f490', 'f491', 'f492', 'f493', 'f494', 'f495', 'f496', 'f497', 'f498', 'f499', 'f500', 'f501', 'f502', 'f503', 'f504', 'f505', 'f506', 'f507', 'f508', 'f509', 'f510', 'f511', 'f512', 'f513', 'f514', 'f515', 'f516', 'f517', 'f518', 'f519', 'f520', 'f521', 'f522', 'f523', 'f524']
expected V129, V141, V54, V58, V242, V105, V39, V293, V248, V256, V171, V165, V234, V205, V213, V253, V264, V323, V249, V63, V110, D13, V135, V19, V286, V144, V145, V149, V188, V303, V28, V99, V336, C2, dist2, V108, V227, V317, V290, dist1, V17, V272, V197, V279, V130, V310, V94, C5, V82, V121, V190, V164, V244, V334, V128, V89, V337, V175, V43, V12, V180, V44, D15, V6, V152, V78, card1, V64, V103, V157, V277, V51, V139, V91, V280, V282, V119, TransactionAmt, V73, V231, V223, V146, V117, V274, V314, V72, V262, V222, V246, V106, V3, D5, D12, V127, V148, V92, V305, V102, V302, V33, V316, V183, V217, V186, V332, V114, V111, V167, C13, addr2, V225, V147, V170, V236, V132, V47, V40, V138, V276, V251, V81, V309, V257, V46, V75, V214, V288, V259, V265, V55, V178, V14, V319, V27, V24, V93, V166, V69, V8, V156, V4, V10, V172, V263, V113, V289, V313, D14, V5, V299, V45, V270, V77, V107, V20, V86, V52, V1, V196, V134, V287, V35, V41, V200, V182, V252, V325, V285, D4, V50, V80, V150, V60, V153, V315, V179, V34, V210, V61, V339, V159, V255, V70, V29, V30, card2, V97, V338, V154, V162, V11, V143, V232, V240, V254, V224, V215, V258, V16, V84, V267, V68, V209, C6, D11, V331, V48, V136, V273, V49, V294, V25, C3, C8, V296, V65, V193, V62, V327, V137, V318, V109, V37, V221, V32, V335, V98, V206, V306, V307, V125, V239, V281, card3, V104, V184, V230, V233, V187, V295, D3, D8, V250, V212, V284, V174, V26, V95, V155, V271, V176, V235, V56, V333, C14, V18, V241, V218, V324, V328, V198, V36, V131, V126, V308, V181, V161, V297, D2, V115, V300, V42, TransactionDT, V163, V245, V85, V168, V261, V38, V329, V194, V220, V192, C11, V133, C12, V208, C1, V13, C9, V9, D7, V266, V87, card5, V90, V247, V320, V7, V330, V74, V160, V116, V96, V21, V301, V207, V2, V260, V142, V169, V202, V268, V122, V298, V66, V304, V275, V292, V112, V100, V199, V278, V140, V291, D6, V151, V83, D1, V189, V326, V53, V67, V237, V226, V322, V312, V191, V229, V158, V185, V269, V57, V71, V88, V283, V123, C10, V211, V120, V173, V238, V101, V124, V22, V243, V118, addr1, C7, C4, V23, V311, V177, D9, V228, V79, V321, V15, V31, V203, V59, V195, V201, V216, V76, V219, D10, V204 in input data
training data did not have the following fields: f398, f495, f393, f107, f474, f23, f466, f22, f8, f448, f516, f140, f85, f416, f242, f248, f57, f435, f58, f505, f281, f436, f491, f350, f40, f104, f236, f324, f163, f502, f253, f377, f412, f116, f203, f504, f124, f133, f224, f125, f263, f213, f305, f478, f514, f185, f462, f386, f121, f366, f376, f298, f440, f461, f295, f232, f38, f403, f320, f17, f115, f267, f10, f73, f32, f110, f230, f86, f446, f344, f81, f241, f178, f369, f381, f493, f430, f135, f278, f51, f67, f74, f202, f5, f254, f134, f198, f445, f423, f293, f149, f7, f175, f76, f245, f262, f358, f354, f171, f521, f352, f146, f219, f425, f294, f410, f449, f72, f177, f520, f189, f363, f166, f35, f299, f208, f191, f14, f406, f118, f330, f250, f404, f486, f359, f463, f43, f297, f307, f415, f259, f260, f153, f271, f288, f56, f428, f221, f70, f244, f11, f26, f33, f201, f485, f243, f212, f487, f46, f419, f176, f206, f496, f37, f64, f334, f338, f424, f21, f157, f286, f473, f199, f42, f234, f309, f200, f19, f156, f361, f389, f313, f283, f108, f450, f456, f427, f61, f52, f311, f337, f339, f275, f333, f237, f374, f383, f147, f217, f518, f417, f362, f87, f136, f25, f510, f2, f341, f3, f122, f300, f173, f215, f447, f291, f402, f301, f18, f117, f172, f138, f444, f380, f457, f16, f451, f258, f227, f329, f90, f388, f481, f490, f75, f265, f382, f20, f345, f387, f370, f71, f340, f94, f158, f411, f88, f252, f357, f509, f159, f186, f272, f28, f348, f431, f469, f63, f287, f160, f109, f470, f261, f174, f401, f396, f225, f390, f66, f80, f44, f96, f155, f317, f255, f132, f319, f414, f192, f323, f464, f167, f508, f503, f392, f268, f207, f226, f492, f385, f282, f292, f48, f391, f327, f384, f251, f437, f439, f468, f238, f452, f460, f475, f190, f188, f99, f325, f112, f30, f458, f353, f310, f49, f129, f196, f500, f197, f472, f98, f512, f421, f483, f476, f438, f53, f351, f372, f84, f246, f434, f41, f231, f375, f453, f432, f137, f220, f154, f123, f194, f507, f429, f126, f420, f471, f180, f277, f405, f211, f162, f45, f400, f1, f143, f204, f519, f82, f13, f524, f365, f195, f62, f247, f266, f296, f378, f264, f12, f233, f368, f409, f433, f426, f193, f77, f523, f318, f113, f308, f142, f24, f306, f240, f55, f169, f515, f418, f183, f181, f321, f517, f326, f60, f130, f27, f184, f314, f280, f36, f349, f31, f371, f343, f164, f65, f161, f120, f289, f477, f106, f145, f511, f249, f127, f285, f332, f89, f151, f223, f407, f465, f513, f92, f488, f229, f141, f489, f422, f119, f442, f482, f222, f269, f168, f165, f284, f322, f480, f170, f152, f397, f484, f501, f139, f0, f187, f315, f6, f68, f331, f522, f144, f367, f395, f100, f467, f78, f148, f302, f209, f235, f356, f79, f347, f83, f506, f103, f328, f91, f114, f101, f335, f182, f364, f150, f97, f379, f346, f413, f276, f360, f59, f29, f50, f210, f179, f216, f131, f441, f47, f408, f15, f205, f312, f274, f459, f303, f111, f479, f455, f342, f239, f498, f494, f499, f214, f95, f228, f394, f304, f279, f497, f290, f316, f443, f9, f34, f454, f54, f273, f373, f399, f218, f4, f336, f102, f128, f256, f93, f355, f257, f270, f69, f105, f39

In [23]:
set(x_train_cat.columns) - set(test_cat.columns)

set()

In [24]:
test_cat.columns == x_train_cat.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [None]:
fpr, tpr, _ = roc_curve(test_target, pred_xgb_nf)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='test_xgb_num (area = %0.6f)' % roc_auc)

In [None]:
fpr, tpr, _ = roc_curve(test_target, pred_xgb_cf)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='test_xgb_cf (area = %0.6f)' % roc_auc)