In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, recall_score, accuracy_score
from sklearn. linear_model import LogisticRegression

from imblearn.over_sampling import RandomOverSampler

from catboost import CatBoostClassifier, Pool

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

### Загрузка обработанных датасетов

In [3]:
data_train_preprocessed = pd.read_csv('content/data_train_preprocessed.csv')
data_test_preprocessed = pd.read_csv('content/data_test_preprocessed.csv')

In [4]:
id_test = data_test_preprocessed['id']
data_test_preprocessed = data_test_preprocessed.drop(['id', 'Unnamed: 0'], axis=1)
data_train_preprocessed = data_train_preprocessed.drop(['Unnamed: 0'], axis=1)

In [5]:
data_train_preprocessed.shape, data_test_preprocessed.shape

((6000000, 26), (4000000, 25))

### Подготовка данных для обучения модели

In [6]:
data_train_preprocessed['label'].value_counts()

0    5829020
1     170980
Name: label, dtype: int64

In [7]:
X_1 = data_train_preprocessed.drop(['label'], axis=1)
y_1 = data_train_preprocessed['label']

##### *Разделим массивы на случайные обучающие и тестовые подмножества*

In [8]:
X_train_1, X_holdout_1, y_train_1, y_holdout_1 = train_test_split(X_1, y_1, test_size=0.3, random_state=17)

##### *Сбалансируем распределение классов*

In [9]:
ros = RandomOverSampler(random_state=42)
X_resampled_1, y_resampled_1 = ros.fit_resample(X_train_1, y_train_1)

In [10]:
X_resampled_1.shape

(8160932, 25)

# Обучаемся на CatBoost / Модель №1

In [11]:
cat_features = ['index_oper',
                'type',
                'priority',
                'is_privatecategory',
                'class',
                'is_in_yandex',
                'is_return',
                'mailtype',
                'mailctg',
                'directctg',
                'postmark',
                'oper_type',
                'oper_attr']

text_features = ['name_mfi']

In [12]:
train_dataset = Pool(data=X_resampled_1,
                     label=y_resampled_1,
                     cat_features=cat_features,
                     text_features=text_features)

eval_dataset = Pool(data=X_holdout_1,
                     label=y_holdout_1,
                     cat_features=cat_features,
                     text_features=text_features)

In [13]:
model_1 = CatBoostClassifier(
    random_seed=63,
    iterations=300,
    task_type="CPU",
    learning_rate=0.3
)

model_1.fit(
    train_dataset,
    verbose=50,
    eval_set=eval_dataset,
)

0:	learn: 0.2802487	test: 0.2260985	best: 0.2260985 (0)	total: 17.3s	remaining: 1h 26m 24s
50:	learn: 0.0722478	test: 0.1071197	best: 0.1071197 (50)	total: 16m 21s	remaining: 1h 19m 52s
100:	learn: 0.0690481	test: 0.1042050	best: 0.1042033 (95)	total: 29m 55s	remaining: 58m 58s
150:	learn: 0.0668737	test: 0.1041729	best: 0.1040256 (144)	total: 43m 37s	remaining: 43m 2s
200:	learn: 0.0659865	test: 0.1040187	best: 0.1035579 (185)	total: 54m 12s	remaining: 26m 41s
250:	learn: 0.0649676	test: 0.1047910	best: 0.1035579 (185)	total: 1h 5m 50s	remaining: 12m 51s
299:	learn: 0.0642826	test: 0.1043925	best: 0.1035579 (185)	total: 1h 15m 52s	remaining: 0us

bestTest = 0.1035578909
bestIteration = 185

Shrink model to first 186 iterations.


<catboost.core.CatBoostClassifier at 0x257c1211a30>

In [15]:
model_1.save_model("content/CatBoost_1")

### Предсказывание значений на тестовой выборке

In [17]:
test_pool_1 = Pool(data_test_preprocessed,
                   cat_features=cat_features,
                   text_features=text_features)

In [18]:
predictions_1 = model_1.predict(test_pool_1)
    
sub_test_pred_model_1 = pd.DataFrame({'label': predictions_1})
sub_test_pred_model_1.index = id_test

In [19]:
sub_test_pred_model_1['label'].value_counts()

0    3765693
1     234307
Name: label, dtype: int64

In [20]:
predictions_2 = model_1.predict_proba(test_pool_1)

pred = []
for i in range(0, len(predictions_2)):
    pred.append(predictions_2[i][1])
    
test_pred_model_1 = pd.DataFrame({'label': pred})
test_pred_model_1.index = id_test

### Подсчёт метрик для Модели №1

In [22]:
holdout_pool_1 = Pool(X_holdout_1,
                      cat_features=cat_features,
                      text_features=text_features)

y_pred_hold_1 = model_1.predict(holdout_pool_1)

In [23]:
print("F1: ", f1_score(y_holdout_1, y_pred_hold_1) * 100)
print("Accuracy: ", accuracy_score(y_holdout_1, y_pred_hold_1) * 100)

score_recall_1 = recall_score(y_holdout_1, y_pred_hold_1, average = "macro" )
score_auc_1 = roc_auc_score(y_holdout_1, y_pred_hold_1, multi_class='ovo')

print("Recall: ", score_recall_1 * 100)
print("ROC_AUC: ", score_auc_1 * 100)
print("Score: ", 0.1 * score_recall_1 + 0.9 * score_auc_1)

F1:  57.3963538486461
Accuracy:  96.20122222222223
Recall:  92.96412528015206
ROC_AUC:  92.96412528015206
Score:  0.9296412528015205


### Столбец значений для второго уровня обучения

In [25]:
predictions_proba_1 = model_1.predict_proba(Pool(X_resampled_1,
                                                 cat_features=cat_features,
                                                 text_features=text_features))

In [26]:
pred_1 = []

for i in range(0, len(predictions_proba_1)):
    pred_1.append(predictions_proba_1[i][1])

In [27]:
stack_model_1 = pd.DataFrame({'label': pred_1})

### Загрузка исходных выборок

In [28]:
data_train = pd.read_csv('content/train_dataset_train.csv')
data_test = pd.read_csv('content/test_dataset_test.csv')

In [29]:
col_obj = data_train.select_dtypes(include=['object']).columns.values
col_obj = list(set(col_obj) ^ set(["id", "label"]))

In [30]:
X_2 = data_train.drop(col_obj, axis = 1)
y_2 = data_train[["label"]]

In [31]:
data_test = data_test.drop(['id',
                             'is_privatecategory',
                             'is_in_yandex',
                             'oper_type + oper_attr',
                             'name_mfi',
                             'index_oper',
                             'type',
                             'is_return'], axis=1)

In [32]:
X_train_2, X_holdout_2, y_train_2, y_holdout_2 = train_test_split(X_2, y_2, test_size=0.3, random_state=17)

##### *Сбалансируем распределение классов*

In [33]:
X_resampled_2, y_resampled_2 = ros.fit_resample(X_train_2, y_train_2)

# Обучаемся на CatBoost / Модель №2

In [35]:
train_dataset_2 = Pool(data=X_resampled_2,
                     label=y_resampled_2)

eval_dataset_2 = Pool(data=X_holdout_2,
                     label=y_holdout_2)

In [36]:
from catboost import CatBoostClassifier

model_2 = CatBoostClassifier(
    learning_rate=0.5,
    loss_function='MultiClassOneVsAll',
    iterations=100, 
    random_seed=63,
)

model_2.fit(
    train_dataset_2,
    verbose=False,
    eval_set=eval_dataset_2,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x259b7a0ecd0>

In [44]:
model_2.save_model("content/CatBoost_2")

### Предсказывание значений на тестовой выборке

In [38]:
test_pool_2 = Pool(data_test)

In [41]:
predictions_3 = model_2.predict(test_pool_2)

pred_2 = []

for i in range(0, len(predictions_3)):
    pred_2.append(predictions_3[i])
    
sub_test_pred_model_2 = pd.DataFrame({'label': pred_2})
sub_test_pred_model_2.index = id_test

In [42]:
sub_test_pred_model_2['label'].value_counts()

[0]    3220995
[1]     779005
Name: label, dtype: int64

In [43]:
predictions_2 = model_2.predict_proba(test_pool_2)

pred = []
for i in range(0, len(predictions_2)):
    pred.append(predictions_2[i][1])
    
test_pred_model_2 = pd.DataFrame({'label': pred})
test_pred_model_2.index = id_test

### Подсчёт метрик для Модели №2

In [46]:
holdout_pool_2 = Pool(X_holdout_2)

y_pred_hold_2 = model_2.predict(holdout_pool_2)

In [47]:
print("F1: ", f1_score(y_holdout_2, y_pred_hold_2) * 100)
print("Accuracy: ", accuracy_score(y_holdout_2, y_pred_hold_2) * 100)

score_recall_1 = recall_score(y_holdout_2, y_pred_hold_2, average = "macro" )
score_auc_1 = roc_auc_score(y_holdout_2, y_pred_hold_2, multi_class='ovo')

print("Recall: ", score_recall_1 * 100)
print("ROC_AUC: ", score_auc_1 * 100)
print("Score: ", 0.1 * score_recall_1 + 0.9 * score_auc_1)

F1:  24.01857680584354
Accuracy:  83.02144444444444
Recall:  88.29710878205105
ROC_AUC:  88.29710878205107
Score:  0.8829710878205107


### Столбец значений для второго уровня обучения

In [49]:
predictions_proba_2 = model_2.predict_proba(Pool(X_resampled_2))

In [50]:
pred_3 = []

for i in range(0, len(predictions_proba_2)):
    pred_3.append(predictions_proba_2[i][1])

In [59]:
stack_model_2 = pd.DataFrame({'label': pred_3})

# Stacking Gradient Boosting / Обучение на Meta-Learner / Модель №3

In [62]:
train_stack = pd.DataFrame({'X_1': stack_model_1['label'], 'X_2': stack_model_2['label']})
train_stack['label'] = y_resampled_2['label']

In [63]:
X_train_final, X_holdout_final, y_train_final, y_holdout_final = train_test_split(train_stack[['X_1', 'X_2']], train_stack['label'], test_size=0.2, random_state=17)

In [64]:
model_final = LogisticRegression()

model_final.fit(X_train_final, y_train_final)

### Финальное предсказывание на Meta-Leaner

In [65]:
test_stack = pd.DataFrame({'X_1': test_pred_model_2['label'], 'X_2': test_pred_model_1['label']})

In [66]:
final_predictions = pd.DataFrame({'label': model_final.predict(test_stack)})
final_predictions.index = id_test

In [67]:
final_predictions.value_counts()

label
0        3518906
1         481094
dtype: int64

In [68]:
final_predictions.to_csv("final_predictions.csv", line_terminator='\n')