In [1]:
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc

import catboost as cb
import matplotlib.pyplot as plt

warnings.simplefilter("ignore")

In [2]:
train = pd.read_csv('assignment_2_train.csv')

In [3]:
test = pd.read_csv('assignment_2_test.csv')

In [4]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
print("train.shape = {} rows, {} cols".format(*train.shape))

print("test.shape = {} rows, {} cols".format(*test.shape))

train.shape = 180000 rows, 394 cols
test.shape = 100001 rows, 394 cols


In [6]:
target = train['isFraud']
train = train.drop(['isFraud', 'TransactionID'], axis=1)

**Задание 1:** отобрать только числовые признаки и обучить модель XGBoost с параметром booster = gbtree. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

In [7]:
numerical_features = train.select_dtypes(include=[np.number])
print(f"count of numeric_features {numerical_features.shape[1]}")

count of numeric_features 378


In [8]:
x_train, x_valid = train_test_split(
    numerical_features, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)
print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))

x_train.shape = 144000 rows, 378 cols
x_valid.shape = 36000 rows, 378 cols


In [9]:
dtrain = xgb.DMatrix(
    data=x_train, label=y_train
)
dvalid = xgb.DMatrix(
    data=x_valid, label=y_valid
)

In [10]:
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "reg_lambda": 100,
    "max_depth": 4,
    "gamma": 10,
    "nthread": 6,
    "seed": 27
}

In [11]:
model_xgb_nf = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    early_stopping_rounds=50,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    verbose_eval=50,
    maximize=True,
)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.64988	valid-auc:0.65040
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[50]	train-auc:0.88183	valid-auc:0.87420
[100]	train-auc:0.90013	valid-auc:0.88967
[150]	train-auc:0.90824	valid-auc:0.89758
[200]	train-auc:0.91004	valid-auc:0.89892
Stopping. Best iteration:
[167]	train-auc:0.91004	valid-auc:0.89892



**Задание 2:** обработать категориальные признаки любым способом (который вы знаете) и добавить их к данным. Выполнить задание 1.

In [12]:
categorical_features = train.select_dtypes(include=[np.object])
print(f"Categorical Feature Count {categorical_features.shape[1]}")
categorical_features.head(n=2)

Categorical Feature Count 14


Unnamed: 0,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,M6,M7,M8,M9
0,W,discover,credit,,,T,T,T,M2,F,T,,,
1,W,mastercard,credit,gmail.com,,,,,M0,T,T,,,


In [13]:
train_cat = train.copy()

for cat_colname in train.select_dtypes(include='object').columns[1:]:
    train_cat = pd.concat([train_cat, pd.get_dummies(train[cat_colname], prefix=cat_colname)], axis=1)

In [14]:
train_cat.head(2)

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,M5_F,M5_T,M6_F,M6_T,M7_F,M7_T,M8_F,M8_T,M9_F,M9_T
0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,...,1,0,0,1,0,0,0,0,0,0
1,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,0,1,0,1,0,0,0,0,0,0


In [16]:
for cat_colname in categorical_features:
    train_cat = train_cat.drop([cat_colname], axis=1)

In [17]:
train_cat.shape

(180000, 524)

In [18]:
x_train, x_valid = train_test_split(
    train_cat, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)
print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))

x_train.shape = 144000 rows, 524 cols
x_valid.shape = 36000 rows, 524 cols


In [19]:
dtrain = xgb.DMatrix(
    data=x_train, label=y_train
)
dvalid = xgb.DMatrix(
    data=x_valid, label=y_valid
)

In [20]:
model_xgb_cf = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    early_stopping_rounds=50,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    verbose_eval=50,
    maximize=True,
)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.64988	valid-auc:0.65040
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[50]	train-auc:0.88777	valid-auc:0.87933
[100]	train-auc:0.90738	valid-auc:0.89644
[150]	train-auc:0.91448	valid-auc:0.90209
[200]	train-auc:0.91460	valid-auc:0.90220
Stopping. Best iteration:
[152]	train-auc:0.91460	valid-auc:0.90220



**Задание 4:** для числовых признаков обучить модель LightGBM. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

In [21]:
x_train, x_valid = train_test_split(
    numerical_features, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)

In [22]:
dtrain = lgb.Dataset(
    data=x_train, label=y_train
)
dvalid = lgb.Dataset(
    data=x_valid, label=y_valid
)

In [23]:
params = {
    "boosting_type": "goss",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.01,
    "n_estimators": 10000,
    "n_jobs": 6,
    "seed": 27
}

In [24]:
model_lgbm_nf = lgb.train(
    params=params,
    train_set=dtrain,
    num_boost_round=10000,
    valid_sets=[dtrain, dvalid],
    categorical_feature="auto",
    early_stopping_rounds=50,
    verbose_eval=250
)

[LightGBM] [Info] Number of positive: 4139, number of negative: 139861
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31591
[LightGBM] [Info] Number of data points in the train set: 144000, number of used features: 376
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028743 -> initscore=-3.520195
[LightGBM] [Info] Start training from score -3.520195
Training until validation scores don't improve for 50 rounds
[250]	training's auc: 0.918873	valid_1's auc: 0.900764
[500]	training's auc: 0.944222	valid_1's auc: 0.919666
[750]	training's auc: 0.956513	valid_1's auc: 0.928522
[1000]	training's auc: 0.96491	valid_1's auc: 0.93473
[1250]	training's auc: 0.971182	valid_1's auc: 0.93837
[1500]	training's auc: 0.976402	valid_1's auc: 0.941549
[1750]	training's auc: 0.980631	valid_1's auc: 0.944279
[2000]	training's auc: 0.983645	valid_1's auc: 0.946121
[2250]	trai

**Задание 5:** обработать категориальные признаки любым способом (который вы знаете) и добавить их к данным. Выполнить задание 4.

In [25]:
x_train, x_valid = train_test_split(
    train_cat, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)

In [26]:
dtrain = lgb.Dataset(
    data=x_train, label=y_train
)
dvalid = lgb.Dataset(
    data=x_valid, label=y_valid
)

In [27]:
model_lgbm_cf = lgb.train(
    params=params,
    train_set=dtrain,
    num_boost_round=10000,
    valid_sets=[dtrain, dvalid],
    categorical_feature="auto",
    early_stopping_rounds=50,
    verbose_eval=250
)

[LightGBM] [Info] Number of positive: 4139, number of negative: 139861
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31819
[LightGBM] [Info] Number of data points in the train set: 144000, number of used features: 490
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028743 -> initscore=-3.520195
[LightGBM] [Info] Start training from score -3.520195
Training until validation scores don't improve for 50 rounds
[250]	training's auc: 0.924914	valid_1's auc: 0.908456
[500]	training's auc: 0.950504	valid_1's auc: 0.926256
[750]	training's auc: 0.962976	valid_1's auc: 0.936219
[1000]	training's auc: 0.97083	valid_1's auc: 0.941402
[1250]	training's auc: 0.977093	valid_1's auc: 0.944932
[1500]	training's auc: 0.981555	valid_1's auc: 0.947695
[1750]	training's auc: 0.985105	valid_1's auc: 0.950126
[2000]	training's auc: 0.987622	valid_1's auc: 0.951798
[2250]	tr

**Задание 6:** обработать категориальные признаки встроенным методом в LightGBM. Выполнить задание 4. Сделать выводы о качестве работы алгоритма, по сравнению с пунктом 5.

In [28]:
train_cat_cat = train.copy()

In [29]:
for c in categorical_features:
    train_cat_cat[c] = train_cat_cat[c].astype('category')

In [30]:
x_train, x_valid = train_test_split(
    train_cat_cat, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)

In [31]:
cat_features = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

In [32]:
dtrain = lgb.Dataset(
    data=x_train, label=y_train, categorical_feature = cat_features
)
dvalid = lgb.Dataset(
    data=x_valid, label=y_valid, categorical_feature = cat_features
)

In [33]:
model_lgbm_cat = lgb.train(
    params=params,
    train_set=dtrain,
    num_boost_round=10000,
    valid_sets=[dtrain, dvalid],
    categorical_feature=cat_features,
    early_stopping_rounds=50,
    verbose_eval=250
)

[LightGBM] [Info] Number of positive: 4139, number of negative: 139861
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31755
[LightGBM] [Info] Number of data points in the train set: 144000, number of used features: 390
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028743 -> initscore=-3.520195
[LightGBM] [Info] Start training from score -3.520195
Training until validation scores don't improve for 50 rounds
[250]	training's auc: 0.926908	valid_1's auc: 0.909239
[500]	training's auc: 0.951757	valid_1's auc: 0.927188
[750]	training's auc: 0.964871	valid_1's auc: 0.936713
[1000]	training's auc: 0.972807	valid_1's auc: 0.94183
[1250]	training's auc: 0.978737	valid_1's auc: 0.945283
[1500]	training's auc: 0.983219	valid_1's auc: 0.948374
[1750]	training's auc: 0.986667	valid_1's auc: 0.950623
[2000]	training's auc: 0.989152	valid_1's auc: 0.952264
[2250]	tr

При обработке категорий встроенным методом, качество обучения выше на 0,002 на трейне и 0,004 на вылидации

**Задание 7:** для числовых признаков обучить модель CatBoost. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

In [34]:
x_train, x_valid = train_test_split(
    numerical_features, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)
train_pool = cb.Pool(x_train, y_train)
valid_pool = cb.Pool(x_valid, y_valid)

In [35]:
cb_params = {
    "n_estimators": 10000,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 200,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
}

In [36]:
model_cat_nf = cb.CatBoostClassifier(**cb_params)
model_cat_nf.fit(train_pool, eval_set=valid_pool)

0:	test: 0.5988007	best: 0.5988007 (0)	total: 107ms	remaining: 17m 52s
200:	test: 0.8722220	best: 0.8722220 (200)	total: 12.1s	remaining: 9m 51s
400:	test: 0.8839140	best: 0.8839140 (400)	total: 24.1s	remaining: 9m 37s
600:	test: 0.8854287	best: 0.8854292 (597)	total: 35.3s	remaining: 9m 11s
800:	test: 0.8859115	best: 0.8859124 (794)	total: 46.1s	remaining: 8m 49s
1000:	test: 0.8876637	best: 0.8876637 (1000)	total: 57.3s	remaining: 8m 35s
1200:	test: 0.8904952	best: 0.8904952 (1200)	total: 1m 8s	remaining: 8m 23s
1400:	test: 0.8912365	best: 0.8912365 (1400)	total: 1m 19s	remaining: 8m 9s
1600:	test: 0.8931860	best: 0.8931860 (1600)	total: 1m 30s	remaining: 7m 56s
1800:	test: 0.8961684	best: 0.8961684 (1800)	total: 1m 42s	remaining: 7m 46s
2000:	test: 0.8981880	best: 0.8981880 (2000)	total: 1m 53s	remaining: 7m 35s
2200:	test: 0.9001578	best: 0.9001578 (2200)	total: 2m 5s	remaining: 7m 23s
2400:	test: 0.9017867	best: 0.9017867 (2400)	total: 2m 16s	remaining: 7m 11s
2600:	test: 0.9034171

<catboost.core.CatBoostClassifier at 0x17246358148>

In [37]:
x_train, x_valid = train_test_split(
    train_cat, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)

train_pool = cb.Pool(x_train, y_train)
valid_pool = cb.Pool(x_valid, y_valid)

In [38]:
model_cat_cf = cb.CatBoostClassifier(**cb_params)
model_cat_cf.fit(train_pool, eval_set=valid_pool)

0:	test: 0.6010368	best: 0.6010368 (0)	total: 50.3ms	remaining: 8m 22s
200:	test: 0.8785992	best: 0.8785992 (200)	total: 12.5s	remaining: 10m 11s
400:	test: 0.8901979	best: 0.8902089 (398)	total: 24.9s	remaining: 9m 55s
600:	test: 0.8934740	best: 0.8934753 (592)	total: 36.8s	remaining: 9m 35s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8934755068
bestIteration = 643

Shrink model to first 644 iterations.


<catboost.core.CatBoostClassifier at 0x172463ef988>

**Задание 9:** обработать категориальные признаки встроенным методом в CatBoost. Выполнить задание 7. Сделать выводы о качестве работы алгоритма, по сравнению с пунктом 8.

In [39]:
train_cat_str = train.copy()

In [40]:
for c in categorical_features:
    train_cat_str[c] = train_cat_str[c].astype('str')

In [41]:
x_train, x_valid = train_test_split(
    train_cat_str, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)


train_pool = cb.Pool(x_train, y_train, cat_features = cat_features)
valid_pool = cb.Pool(x_valid, y_valid, cat_features = cat_features)

In [42]:
model_cat_cat = cb.CatBoostClassifier(**cb_params)
model_cat_cat.fit(train_pool, eval_set=valid_pool)

0:	test: 0.7531850	best: 0.7531850 (0)	total: 258ms	remaining: 42m 56s
200:	test: 0.8773895	best: 0.8773895 (200)	total: 45.3s	remaining: 36m 46s
400:	test: 0.8891383	best: 0.8891383 (400)	total: 1m 29s	remaining: 35m 40s
600:	test: 0.8923420	best: 0.8923460 (568)	total: 2m 12s	remaining: 34m 32s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8923460154
bestIteration = 568

Shrink model to first 569 iterations.


<catboost.core.CatBoostClassifier at 0x1724645dd88>

Модель показала результат лучше на предобработанных категориях

**Задание 10:** построить ROC-кривую для всех построенных алгоритмов на обучающей и тестовой выборке. Сделать выводы о работе алгоритмов с точки зрения качества на тестовой выборке и с точки зрения переобучения.

In [43]:
test_target = test['isFraud']
test = test.drop(['isFraud', 'TransactionID'], axis=1)

In [48]:
numerical_features = numerical_features.columns.tolist()

In [49]:
test_numerical_features = test[numerical_features]


In [51]:
test_cat = test.copy()
for cat_colname in cat_features:
    test_cat = pd.concat([test_cat, pd.get_dummies(test[cat_colname], prefix=cat_colname)], axis=1)


In [54]:
test_cat

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,M5_F,M5_T,M6_F,M6_T,M7_F,M7_T,M8_F,M8_T,M9_F,M9_T
0,7415038,226.000,12473,555.0,150.0,226.0,299.0,87.0,116.0,,...,0,1,1,0,1,0,1,0,0,1
1,7415054,3072.000,15651,417.0,150.0,226.0,330.0,87.0,,,...,0,0,0,1,0,0,0,0,0,0
2,7415081,319.950,13844,583.0,150.0,226.0,126.0,87.0,9.0,,...,0,0,0,1,1,0,1,0,0,1
3,7415111,171.000,11556,309.0,150.0,226.0,181.0,87.0,3.0,,...,0,1,0,1,1,0,0,1,0,1
4,7415112,107.950,10985,555.0,150.0,226.0,231.0,87.0,0.0,,...,0,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,10091528,368.990,13964,496.0,150.0,224.0,299.0,87.0,,,...,0,0,1,0,1,0,1,0,0,1
99997,10091533,445.330,10616,583.0,150.0,226.0,472.0,87.0,,,...,0,0,0,1,0,0,0,0,0,0
99998,10091544,15.226,9803,583.0,150.0,226.0,,,,,...,0,0,0,0,0,0,0,0,0,0
99999,10091549,34.742,16062,500.0,185.0,137.0,284.0,60.0,,,...,0,0,0,0,0,0,0,0,0,0


In [53]:
for cat_colname in categorical_features:
    test_cat = test_cat.drop([cat_colname], axis=1)

In [69]:

test_cat_cat = test.copy()

for c in categorical_features:
    test_cat_cat[c] = test_cat_cat[c].astype('category')
    

test_cat_str = test.copy()

for c in categorical_features:
    test_cat_str[c] = test_cat_str[c].astype('str')
    
    

In [70]:
test_cat_cat

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,7415038,226.000,W,12473,555.0,150.0,visa,226.0,credit,299.0,...,,,,,,,,,,
1,7415054,3072.000,W,15651,417.0,150.0,visa,226.0,debit,330.0,...,,,,,,,,,,
2,7415081,319.950,W,13844,583.0,150.0,visa,226.0,credit,126.0,...,,,,,,,,,,
3,7415111,171.000,W,11556,309.0,150.0,visa,226.0,debit,181.0,...,,,,,,,,,,
4,7415112,107.950,W,10985,555.0,150.0,visa,226.0,debit,231.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,10091528,368.990,W,13964,496.0,150.0,mastercard,224.0,debit,299.0,...,,,,,,,,,,
99997,10091533,445.330,W,10616,583.0,150.0,visa,226.0,credit,472.0,...,,,,,,,,,,
99998,10091544,15.226,C,9803,583.0,150.0,visa,226.0,credit,,...,,,,,,,,,,
99999,10091549,34.742,C,16062,500.0,185.0,mastercard,137.0,credit,284.0,...,,,,,,,,,,


In [71]:
test_xgb_nf = xgb.DMatrix(data=test_numerical_features)
test_xgb_nf

<xgboost.core.DMatrix at 0x172773902c8>

In [72]:
test_xgb_nf = xgb.DMatrix(data=test_numerical_features)
test_xgb_cf = xgb.DMatrix(data=test_cat)

test_lgbm_nf = lgb.Dataset(data=test_numerical_features)
test_lgbm_cf = lgb.Dataset(data=test_cat)
test_lgbm_cat = lgb.Dataset(data=test_cat_cat, categorical_feature = cat_features)

test_cat_nf = cb.Pool(test_numerical_features)
test_cat_cf = cb.Pool(test_cat)
test_cat_cat = cb.Pool(test_cat_str, cat_features = cat_features)

In [76]:
# fig = plt.figure(figsize=(25, 15))

# pred = model_xgb_nf.predict(test_xgb_nf)
# fpr, tpr, _ = roc_curve(test_target, pred)
# roc_auc = auc(fpr, tpr)
# plt.plot(fpr, tpr, label='test_xgb_num (area = %0.6f)' % roc_auc)

# # pred = model_xgb_cf.predict(test_xgb_cf)
# # fpr, tpr, _ = roc_curve(test_target, pred)
# # roc_auc = auc(fpr, tpr)
# # plt.plot(fpr, tpr, label='dtest_xgb_num_cat (area = %0.6f)' % roc_auc)


# pred = model_cat_nf.predict(test_cat_nf)
# fpr, tpr, _ = roc_curve(test_target, pred)
# roc_auc = auc(fpr, tpr)
# plt.plot(fpr, tpr, label='test_cat_num (area = %0.6f)' % roc_auc)

# # pred = model_cat_cf.predict(test_cat_cf)
# # fpr, tpr, _ = roc_curve(test_target, pred)
# # roc_auc = auc(fpr, tpr)
# # plt.plot(fpr, tpr, label='test_cat_cat (area = %0.6f)' % roc_auc)

# # pred = model_cat_cat.predict(test_cat_cat)
# # fpr, tpr, _ = roc_curve(test_target, pred)
# # roc_auc = auc(fpr, tpr)
# # plt.plot(fpr, tpr, label='test_cat_categorical (area = %0.6f)' % roc_auc)


# pred = model_lgbm_cf.predict(test_lgbm_cf)
# fpr, tpr, _ = roc_curve(test_target, pred)
# roc_auc = auc(fpr, tpr)
# plt.plot(fpr, tpr, label='test_lgbm_cat (area = %0.6f)' % roc_auc)

# # pred = model_lgbm_cat.predict(test_lgbm_cat)
# # fpr, tpr, _ = roc_curve(test_target, pred)
# # roc_auc = auc(fpr, tpr)
# # plt.plot(fpr, tpr, label='test_lgbm_categorical (area = %0.6f)' % roc_auc)

# # pred = model_lgbm_nf.predict(test_lgbm_nf)
# # fpr, tpr, _ = roc_curve(test_target, pred)
# # roc_auc = auc(fpr, tpr)
# # plt.plot(fpr, tpr, label='test_lgbm_num (area = %0.6f)' % roc_auc)



# plt.plot([0, 1], [0, 1], color='red', linestyle='*')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.legend(loc="lower right")
# plt.show()