# Решение трека №1

* Участник: Мелентьев Никита
* Команда: 101-team

------------------------------

**Используемые файлы:**
* Папка ./data - с данными
* Папка ./results - с результатами
* Файл utils.py - код с вспомогательными функциями
* Файл main.ipynb - скрипт для обучения модели и расчета сабмита

## Imports

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import tqdm

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, Pool, cv

%load_ext autoreload
%autoreload 2
import sys
sys.path.append('./')
import utils

from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
pd.set_option('max_columns', 100)
pd.set_option('max_rows', 100)

## Data Load

**Target**

In [2]:
target_train_df = pd.read_csv('./data/target_train.csv')
sub_df = pd.read_csv('./data/sample_submission.csv')

print('Train shape:', target_train_df.shape[0])
print('Test shape:', sub_df.shape[0])

# Проверим что трейн / тест не пересекаются по id
assert len(set(sub_df.NPLV.unique()).intersection(target_train_df.NPLV.unique())) == 0

target_train_df.head(3)

Train shape: 2063
Test shape: 780


Unnamed: 0,NPLV,TST,C
0,510008,1690,0.06
1,510009,1683,0.097
2,510010,1662,0.091


**Chronom**

In [3]:
chronom_train_df = pd.read_csv('./data/chronom_train.csv',index_col = 0)
chronom_test_df = pd.read_csv('./data/chronom_test.csv',index_col = 0)

# Типизация
chronom_train_df.VR_NACH = pd.to_datetime(chronom_train_df.VR_NACH)
chronom_test_df.VR_NACH = pd.to_datetime(chronom_test_df.VR_NACH)
chronom_train_df.VR_KON = pd.to_datetime(chronom_train_df.VR_KON)
chronom_test_df.VR_KON = pd.to_datetime(chronom_test_df.VR_KON)

# Уберем баг с 2011
chronom_train_df = chronom_train_df[chronom_train_df.VR_NACH.dt.year == 2021].copy()
chronom_test_df = chronom_test_df[chronom_test_df.VR_NACH.dt.year == 2021].copy()

# Закодируем марку
le = LabelEncoder()
chronom_train_df['NOP_le'] = le.fit_transform(chronom_train_df['NOP'])
le_dict = dict(zip(le.classes_, np.arange(len(le.classes_))))
chronom_test_df['NOP_le'] = chronom_test_df['NOP'].map(lambda x: le_dict.get(x, -1)).astype('int64')

chronom_train_df.head(3)

Unnamed: 0,NPLV,TYPE_OPER,NOP,VR_NACH,VR_KON,O2,NOP_le
35162,510008,межпл.прост.,межпл. простой,2021-01-01 03:01:07,2021-01-01 03:08:11,,33
35163,510008,межпл.прост.,Осмотр конвертера,2021-01-01 03:01:19,2021-01-01 03:03:43,,17
35164,510008,межпл.прост.,Наведение гарнисажа,2021-01-01 03:03:43,2021-01-01 03:05:23,,8


**Produv**

In [4]:
produv_train_df = pd.read_csv('./data/produv_train.csv')
produv_test_df = pd.read_csv('./data/produv_test.csv')

# Типизация
produv_train_df.SEC = pd.to_datetime(produv_train_df.SEC)
produv_test_df.SEC = pd.to_datetime(produv_test_df.SEC)

# Сортировка
produv_train_df = produv_train_df.sort_values(['NPLV', 'SEC']).reset_index(drop = True)
produv_test_df = produv_test_df.sort_values(['NPLV', 'SEC']).reset_index(drop = True)

# Проверка совпадения id с train / test
assert len(set(produv_train_df.NPLV.unique()).intersection(target_train_df.NPLV.unique())) == target_train_df.NPLV.nunique()
assert produv_train_df.NPLV.nunique() == target_train_df.NPLV.nunique()
assert len(set(produv_test_df.NPLV.unique()).intersection(sub_df.NPLV.unique())) == sub_df.NPLV.nunique()
assert produv_test_df.NPLV.nunique() == sub_df.NPLV.nunique()

# Оставим только время продувки (на основе хронометража)
produv_train_df = produv_train_df.merge(
    chronom_train_df[chronom_train_df.NOP == 'Продувка'][['NPLV', 'VR_NACH', 'VR_KON']],
    on = ['NPLV'], how = 'left', validate = 'm:1'
)
produv_test_df = produv_test_df.merge(
    chronom_test_df[chronom_test_df.NOP == 'Продувка'][['NPLV', 'VR_NACH', 'VR_KON']],
    on = ['NPLV'], how = 'left', validate = 'm:1'
)
produv_train_df = produv_train_df[(produv_train_df.SEC >= produv_train_df.VR_NACH)&
               (produv_train_df.SEC <= produv_train_df.VR_KON)]\
    .drop(['VR_NACH', 'VR_KON'], 1)
produv_test_df = produv_test_df[(produv_test_df.SEC >= produv_test_df.VR_NACH)&
               (produv_test_df.SEC <= produv_test_df.VR_KON)]\
    .drop(['VR_NACH', 'VR_KON'], 1)

# расход в минуту -> расход в секунду
produv_train_df['RAS'] = produv_train_df['RAS'] / 60
produv_test_df['RAS'] = produv_test_df['RAS'] / 60
produv_train_df = utils.min_to_sec(produv_train_df)
produv_test_df = utils.min_to_sec(produv_test_df)

# новые признаки
produv_train_df['RAS_MUL_POL'] = produv_train_df['RAS'] * produv_train_df['POL']
produv_test_df['RAS_MUL_POL'] = produv_test_df['RAS'] * produv_test_df['POL']
produv_train_df['RAS_REL_POL'] = produv_train_df['RAS'] / produv_train_df['POL']
produv_test_df['RAS_REL_POL'] = produv_test_df['RAS'] / produv_test_df['POL']

produv_train_df.head(3)

Unnamed: 0,NPLV,SEC,RAS,POL,RAS_MUL_POL,RAS_REL_POL
0,510008.0,2021-01-01 03:18:26,6.366667,3.92,24.957333,1.62415
1,510008.0,2021-01-01 03:18:27,6.366667,3.92,24.957333,1.62415
2,510008.0,2021-01-01 03:18:28,6.366667,3.92,24.957333,1.62415


**Lom**

In [5]:
lom_train_df = pd.read_csv('./data/lom_train.csv')
lom_test_df = pd.read_csv('./data/lom_test.csv')

# Для категориальных фичей
lom_train_df['VDL'] = 'VDL_' + lom_train_df['VDL'].astype('str')
lom_test_df['VDL'] = 'VDL_' + lom_test_df['VDL'].astype('str')

# Проверка на совпадение id с трейн / тестом
assert len(set(lom_train_df.VDL.unique()).union(lom_test_df.VDL.unique())) == len(lom_train_df.VDL.unique())
assert lom_train_df[lom_train_df.duplicated(['NPLV', 'VDL'])].shape[0] == 0

lom_train_df.head(3)

Unnamed: 0,NPLV,VDL,NML,VES
0,510008,VDL_4,К,56500
1,510008,VDL_8,О,16700
2,510008,VDL_13,КП,3000


**Plavki info**

In [6]:
plavki_train_df = pd.read_csv('./data/plavki_train.csv')
plavki_test_df = pd.read_csv('./data/plavki_test.csv')

# Типизация
plavki_train_df['plavka_VR_NACH'] = pd.to_datetime(plavki_train_df['plavka_VR_NACH'])
plavki_train_df['plavka_VR_KON'] = pd.to_datetime(plavki_train_df['plavka_VR_KON'])
plavki_test_df['plavka_VR_NACH'] = pd.to_datetime(plavki_test_df['plavka_VR_NACH'])
plavki_test_df['plavka_VR_KON'] = pd.to_datetime(plavki_test_df['plavka_VR_KON'])

# Удаление багов с дуплями
plavki_train_df = plavki_train_df[~plavki_train_df.duplicated(['NPLV'], keep = 'first')].reset_index(drop = True)
plavki_test_df = plavki_test_df[~plavki_test_df.duplicated(['NPLV'], keep = 'first')].reset_index(drop = True)

# Для категориальных фичей кодируем марку
le = LabelEncoder()
plavki_train_df['plavka_NMZ'] = le.fit_transform(plavki_train_df['plavka_NMZ'])
le_dict = dict(zip(le.classes_, np.arange(len(le.classes_))))
plavki_test_df['plavka_NMZ'] = plavki_test_df['plavka_NMZ'].map(lambda x: le_dict.get(x, -1))

plavki_train_df.head(3)

Unnamed: 0,NPLV,plavka_VR_NACH,plavka_VR_KON,plavka_NMZ,plavka_NAPR_ZAD,plavka_STFUT,plavka_TIPE_FUR,plavka_ST_FURM,plavka_TIPE_GOL,plavka_ST_GOL
0,510008,2021-01-01 03:08:11,2021-01-01 03:51:10,42,МНЛЗ,971,цилиндрическая,11,5 сопловая,11
1,510009,2021-01-01 04:00:44,2021-01-01 05:07:28,42,МНЛЗ,972,цилиндрическая,12,5 сопловая,12
2,510010,2021-01-01 05:12:29,2021-01-01 06:00:53,58,Изл,973,цилиндрическая,13,5 сопловая,13


**sip**

In [7]:
sip_train_df = pd.read_csv('./data/sip_train.csv')
sip_test_df = pd.read_csv('./data/sip_test.csv')

# Типизация
sip_train_df['DAT_OTD'] = pd.to_datetime(sip_train_df['DAT_OTD'])
sip_test_df['DAT_OTD'] = pd.to_datetime(sip_test_df['DAT_OTD'])

# Для категориальных фичей кодируем вид
sip_train_df['VDSYP'] = 'VDSYP_' + sip_train_df['VDSYP'].astype('str')
sip_test_df['VDSYP'] = 'VDSYP_' + sip_test_df['VDSYP'].astype('str')

# Оставим только время продувки (на основе хронометража)
sip_train_df = sip_train_df.merge(
    chronom_train_df[chronom_train_df.NOP == 'Продувка'][['NPLV','VR_KON']],
    on = ['NPLV'], how = 'left', validate = 'm:1'
)
sip_test_df = sip_test_df.merge(
    chronom_test_df[chronom_test_df.NOP == 'Продувка'][['NPLV', 'VR_KON']],
    on = ['NPLV'], how = 'left', validate = 'm:1'
)
sip_train_df = sip_train_df[sip_train_df['DAT_OTD'] <= sip_train_df['VR_KON']].copy()
sip_test_df = sip_test_df[sip_test_df['DAT_OTD'] <= sip_test_df['VR_KON']].copy()

sip_train_df.head(3)

Unnamed: 0,NPLV,VDSYP,NMSYP,VSSYP,DAT_OTD,VR_KON
0,510008,VDSYP_346,Уголь ТО,570,2021-01-01 03:03:53,2021-01-01 03:37:55
1,510008,VDSYP_346,Уголь ТО,220,2021-01-01 03:04:10,2021-01-01 03:37:55
2,510008,VDSYP_408,изв_ЦОИ,7300,2021-01-01 03:08:17,2021-01-01 03:37:55


**chugun**

In [8]:
chugun_train_df = pd.read_csv('./data/chugun_train.csv')
chugun_test_df = pd.read_csv('./data/chugun_test.csv')

# Типизация
chugun_train_df['DATA_ZAMERA'] = pd.to_datetime(chugun_train_df['DATA_ZAMERA'])
chugun_test_df['DATA_ZAMERA'] = pd.to_datetime(chugun_test_df['DATA_ZAMERA'])

chugun_train_df.head(3)

Unnamed: 0,NPLV,VES,T,SI,MN,S,P,CR,NI,CU,V,TI,DATA_ZAMERA
0,510008,263700.0,1396.0,0.44,0.22,0.023,0.097,0.03,0.01,0.03,0.103,0.084,2021-01-01 03:15:03
1,510009,264500.0,1419.0,0.68,0.2,0.017,0.087,0.02,0.01,0.03,0.084,0.096,2021-01-01 04:23:48
2,510010,263800.0,1384.0,0.56,0.26,0.017,0.096,0.03,0.01,0.03,0.115,0.11,2021-01-01 05:21:40


**gas**



In [9]:
gas_train_df = pd.read_csv('./data/gas_train.csv')
gas_test_df = pd.read_csv('./data/gas_test.csv')

# Типизация
gas_train_df['Time'] = pd.to_datetime(gas_train_df['Time'])
gas_test_df['Time'] = pd.to_datetime(gas_test_df['Time'])

# Оставим только время продувки (на основе хронометража)
gas_train_df = gas_train_df.merge(
    chronom_train_df[chronom_train_df.NOP == 'Продувка'][['NPLV','VR_NACH', 'VR_KON']],
    on = ['NPLV'], how = 'left', validate = 'm:1'
)
gas_test_df = gas_test_df.merge(
    chronom_test_df[chronom_test_df.NOP == 'Продувка'][['NPLV', 'VR_NACH', 'VR_KON']],
    on = ['NPLV'], how = 'left', validate = 'm:1'
)
gas_train_df = gas_train_df[gas_train_df['Time'] <= gas_train_df['VR_KON']].copy()
gas_test_df = gas_test_df[gas_test_df['Time'] <= gas_test_df['VR_KON']].copy()

# Новая фича
gas_train_df['T_rel'] = gas_train_df['T'] / gas_train_df['V'] 
gas_test_df['T_rel'] = gas_test_df['T'] / gas_test_df['V'] 
for col in ['O2', 'N2', 'H2', 'CO2', 'CO', 'AR']:
    gas_train_df[col + '_V'] = gas_train_df[col] / 100 * gas_train_df['V']
for col in ['O2', 'N2', 'H2', 'CO2', 'CO', 'AR']:
    gas_test_df[col + '_V'] = gas_test_df[col] / 100 * gas_train_df['V']

gas_train_df.head(3)

Unnamed: 0,NPLV,Time,V,T,O2,N2,H2,CO2,CO,AR,T фурмы 1,T фурмы 2,O2_pressure,VR_NACH,VR_KON,T_rel,O2_V,N2_V,H2_V,CO2_V,CO_V,AR_V
0,510008,2021-01-01 03:08:11.437,218263.34375,262.847229,18.722993,80.132247,0.087755,0.163878,0.009229,0.893243,0.0,0.0,13.085938,2021-01-01 03:18:25,2021-01-01 03:37:55,0.001204,40865.430838,174899.321661,191.537228,357.684632,20.143579,1949.621201
1,510008,2021-01-01 03:08:12.437,218263.34375,262.847229,18.732721,80.138406,0.087959,0.14898,0.00839,0.892948,0.0,0.0,13.085938,2021-01-01 03:18:25,2021-01-01 03:37:55,0.001204,40886.663258,174912.764694,191.982663,325.167847,18.312344,1948.977795
2,510008,2021-01-01 03:08:13.437,218369.359375,262.152771,18.742449,80.144565,0.088163,0.134082,0.007551,0.892653,0.0,0.0,13.085938,2021-01-01 03:18:25,2021-01-01 03:37:55,0.001201,40927.765603,175011.173489,192.521565,292.79321,16.489115,1949.28074


## Features

In [186]:
# Какие фичи в целом собирать для данных
used_features = [
    
    # Суммы весов
    'VDL_SUM','VDSYP_SUM',
    
    # Фичи продувки
    'NPLV_SUM_RAS', 'NPLV_MEDIAN_RAS', 'NPLV_MEAN_RAS', 'NPLV_STD_RAS', 'NPLV_MIN_RAS', 'NPLV_MAX_RAS',
    'NPLV_SUM_POL', 'NPLV_MEDIAN_POL', 'NPLV_MEAN_POL', 'NPLV_STD_POL', 'NPLV_MIN_POL', 'NPLV_MAX_POL',
    'NPLV_SUM_RAS_MUL_POL', 'NPLV_MEDIAN_RAS_MUL_POL', 'NPLV_MEAN_RAS_MUL_POL', 'NPLV_STD_RAS_MUL_POL', 'NPLV_MIN_RAS_MUL_POL', 'NPLV_MAX_RAS_MUL_POL',
    'NPLV_SUM_RAS_REL_POL', 'NPLV_MEDIAN_RAS_REL_POL', 'NPLV_MEAN_RAS_REL_POL', 'NPLV_STD_RAS_REL_POL', 'NPLV_MIN_RAS_REL_POL', 'NPLV_MAX_RAS_REL_POL',
    'NPLV_total_seconds', 'NPLV_TIME_SINCE_LAST', 
    
    # Фичи плавки
    'plavka_NAPR_ZAD', 'plavka_TIPE_FUR', 'plavka_TIPE_GOL',
    'plavka_ST_FURM', 'plavka_ST_GOL', 'plavka_NMZ',
    
    # Фичи чугуна
    'SI', 'MN', 'S', 'P', 'CR', 'NI', 'CU', 'V', 'TI', 'CHUGUN_SINCE_ZAME_SECONDS', 'T', #'VES',
    
    # Комбинированные фичи
    'VES_CHUGUN_REL_LOM',  #'VES_CHUGUN_SUM_LOM', 
    'VES_CHUGUN_REL_SIP', #'VES_CHUGUN_SUM_SIP', 'VES_CHUGUN_SUM_SIP_LOM',
    'T_CHUGUN_REL_GAS', 'T_GAS_DELTA_SINCE_ZAMER',
    
    # Фичи газа
    'T_last_gas',  'V_last_gas', 'O2_last_gas', 'N2_last_gas', 'H2_last_gas', 'CO2_last_gas', 'CO_last_gas', 'AR_last_gas',
    'O2_mean_gas', 'N2_mean_gas', 'H2_mean_gas', 'CO2_mean_gas', 'CO_mean_gas', 'AR_mean_gas', 'T фурмы 1_mean_gas', 'T фурмы 2_mean_gas', 'O2_pressure_mean_gas',
    'V_sum_gas', 'T_rel_last_gas', 'T_mean_gas', 'T_rel_mean_gas', 'V_mean_gas',
    'O2_V_SUM_PRODUVKA', 'N2_V_SUM_PRODUVKA', 'H2_V_SUM_PRODUVKA', 'CO2_V_SUM_PRODUVKA', 'CO_V_SUM_PRODUVKA', 'AR_V_SUM_PRODUVKA',
    'MAIN_FURMA',
    
    # Другие фичи
    'PLAVKI_O2', 'TIME_FROM_PLAVKI_TO_PRODUV', 'TIME_FROM_START_TO_PRODUV', 'OPERATIONS_COUNT_TO_PRODUV',
    
    # Другие фичи
    '0_CHRONO_NOP', '4_CHRONO_NOP', '5_CHRONO_NOP', '6_CHRONO_NOP', '7_CHRONO_NOP', '8_CHRONO_NOP', 
    '9_CHRONO_NOP', '10_CHRONO_NOP', '11_CHRONO_NOP', '12_CHRONO_NOP', '13_CHRONO_NOP', '14_CHRONO_NOP', 
    '16_CHRONO_NOP', '17_CHRONO_NOP', '19_CHRONO_NOP', '20_CHRONO_NOP', '21_CHRONO_NOP', '22_CHRONO_NOP', 
    '24_CHRONO_NOP', '26_CHRONO_NOP', '27_CHRONO_NOP', '29_CHRONO_NOP', '30_CHRONO_NOP', '32_CHRONO_NOP', 
    '33_CHRONO_NOP',
    
] + list(lom_train_df.VDL.unique()) + [x + '_REL' for x in list(lom_train_df.VDL.unique())]\
    + list(sip_train_df.VDSYP.unique()) + [x + '_REL' for x in list(sip_train_df.VDSYP.unique())] # фичи лома / материалов

cat_features = ['plavka_NAPR_ZAD', 'plavka_TIPE_FUR', 'plavka_TIPE_GOL']
target_feature_1 = 'TST'
target_feature_2 = 'C'

**Соберем фичи**

In [187]:
# TRAIN
train_df = target_train_df.copy()
train_df = utils.prepare_features(train_df, produv_train_df, lom_train_df, 
                            plavki_train_df, sip_train_df, chugun_train_df, 
                            gas_train_df, chronom_train_df, used_features)\
    [used_features + ['NPLV', target_feature_1, target_feature_2]]
assert train_df.shape[0] == target_train_df.shape[0]

# TEST
test_df = sub_df.copy()
test_df = utils.prepare_features(test_df, produv_test_df, lom_test_df, 
                           plavki_test_df, sip_test_df, chugun_test_df, 
                           gas_test_df, chronom_test_df, used_features)\
    [used_features + ['NPLV', target_feature_1, target_feature_2]]
assert test_df.shape[0] == sub_df.shape[0]

train_df.head()

Unnamed: 0,VDL_SUM,VDSYP_SUM,NPLV_SUM_RAS,NPLV_MEDIAN_RAS,NPLV_MEAN_RAS,NPLV_STD_RAS,NPLV_MIN_RAS,NPLV_MAX_RAS,NPLV_SUM_POL,NPLV_MEDIAN_POL,NPLV_MEAN_POL,NPLV_STD_POL,NPLV_MIN_POL,NPLV_MAX_POL,NPLV_SUM_RAS_MUL_POL,NPLV_MEDIAN_RAS_MUL_POL,NPLV_MEAN_RAS_MUL_POL,NPLV_STD_RAS_MUL_POL,NPLV_MIN_RAS_MUL_POL,NPLV_MAX_RAS_MUL_POL,NPLV_SUM_RAS_REL_POL,NPLV_MEDIAN_RAS_REL_POL,NPLV_MEAN_RAS_REL_POL,NPLV_STD_RAS_REL_POL,NPLV_MIN_RAS_REL_POL,NPLV_MAX_RAS_REL_POL,NPLV_total_seconds,NPLV_TIME_SINCE_LAST,plavka_NAPR_ZAD,plavka_TIPE_FUR,plavka_TIPE_GOL,plavka_ST_FURM,plavka_ST_GOL,plavka_NMZ,SI,MN,S,P,CR,NI,CU,V,TI,CHUGUN_SINCE_ZAME_SECONDS,T,VES_CHUGUN_REL_LOM,VES_CHUGUN_REL_SIP,T_CHUGUN_REL_GAS,T_GAS_DELTA_SINCE_ZAMER,T_last_gas,...,16_CHRONO_NOP,17_CHRONO_NOP,19_CHRONO_NOP,20_CHRONO_NOP,21_CHRONO_NOP,22_CHRONO_NOP,24_CHRONO_NOP,26_CHRONO_NOP,27_CHRONO_NOP,29_CHRONO_NOP,30_CHRONO_NOP,32_CHRONO_NOP,33_CHRONO_NOP,VDL_4,VDL_8,VDL_13,VDL_23,VDL_61,VDL_20,VDL_48,VDL_49,VDL_63,VDL_3,VDL_4_REL,VDL_8_REL,VDL_13_REL,VDL_23_REL,VDL_61_REL,VDL_20_REL,VDL_48_REL,VDL_49_REL,VDL_63_REL,VDL_3_REL,VDSYP_346,VDSYP_408,VDSYP_171,VDSYP_442,VDSYP_104,VDSYP_119,VDSYP_397,VDSYP_346_REL,VDSYP_408_REL,VDSYP_171_REL,VDSYP_442_REL,VDSYP_104_REL,VDSYP_119_REL,VDSYP_397_REL,NPLV,TST,C
0,76200.0,20970.0,16240.172271,13.840876,13.892363,1.676171,1.233333,16.083333,1335.281393,0.861364,1.142242,0.773493,0.77,7.07,17905.438713,11.81243,15.316885,8.895464,7.938,58.016,17283.995024,15.856279,14.785282,4.286615,0.174446,18.565401,1168.0,,2,1,1,11,11,42,0.44,0.22,0.023,0.097,0.03,0.01,0.03,0.103,0.084,203.0,1396.0,3.46063,12.575107,4.683509,-28.125,788.888855,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,56500.0,16700.0,3000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.74147,0.21916,0.03937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2950.0,14080.0,980.0,2960.0,0.0,0.0,0.0,0.140677,0.671435,0.046733,0.141154,0.0,0.0,0.0,510008,1690,0.06
1,78600.0,23780.0,17030.273545,13.256468,13.857017,2.100884,1.133333,17.1,1659.173624,0.840625,1.350019,0.918433,0.63,4.76,23516.694578,10.98375,19.134821,14.981412,5.394667,60.363,16857.240034,15.620293,13.716225,5.661529,0.238095,24.62963,1228.0,4110.0,2,1,1,12,12,42,0.68,0.2,0.017,0.087,0.02,0.01,0.03,0.084,0.096,188.0,1419.0,3.36514,11.122792,10.826226,-116.168983,849.652771,...,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,49800.0,22800.0,6000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.633588,0.290076,0.076336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2930.0,18830.0,960.0,0.0,1060.0,0.0,0.0,0.123213,0.791842,0.04037,0.0,0.044575,0.0,0.0,510009,1683,0.097
2,76300.0,24070.0,16757.740094,13.368033,13.679788,1.796618,1.2,17.15,1549.260352,0.802623,1.264702,0.899005,0.65,4.080235,21319.255239,10.653685,17.403474,13.320364,4.896,62.591667,17660.629291,16.731747,14.41684,5.472545,0.294118,23.153846,1224.0,3494.0,3,1,1,13,13,58,0.56,0.26,0.017,0.096,0.03,0.01,0.03,0.115,0.11,210.0,1384.0,3.457405,10.959701,6.540766,-233.635453,843.75,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,45900.0,22400.0,2000.0,6000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.601573,0.293578,0.026212,0.078637,0.0,0.0,0.0,0.0,0.0,0.0,2990.0,16080.0,1050.0,2960.0,990.0,0.0,0.0,0.124221,0.668052,0.043623,0.122975,0.04113,0.0,0.0,510010,1662,0.091
3,84100.0,17930.0,14200.966667,13.054092,13.435162,1.83384,1.283333,16.466667,1518.655,1.094894,1.43676,1.013042,0.79,11.18,19639.962032,14.752152,18.580853,10.38861,10.322667,61.344,12346.529289,12.035864,11.680728,4.084131,0.114788,16.88172,1056.0,3522.0,3,1,1,14,14,48,0.48,0.27,0.018,0.091,0.03,0.01,0.02,0.112,0.11,175.0,1401.0,3.13912,14.723926,5.065278,-19.313794,828.472229,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,51900.0,29200.0,3000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.617122,0.347206,0.035672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3620.0,14310.0,0.0,0.0,0.0,0.0,0.0,0.201896,0.798104,0.0,0.0,0.0,0.0,0.0,510011,1609,0.41
4,76100.0,26470.0,16786.875,13.056206,13.548729,1.778447,1.033333,16.583333,1418.405,0.8,1.144798,0.758379,0.67,5.0,19231.082959,11.064545,15.521455,11.153227,5.166667,60.888,18146.94262,16.904762,14.646443,4.701221,0.206667,23.432836,1238.0,3734.0,1,1,1,15,15,38,0.47,0.23,0.018,0.096,0.02,0.01,0.03,0.083,0.07,184.0,1422.0,3.459921,9.94711,8.406145,-236.735032,813.194397,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,64000.0,6100.0,6000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.840999,0.080158,0.078844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2930.0,17480.0,1000.0,3010.0,2050.0,0.0,0.0,0.110691,0.66037,0.037779,0.113714,0.077446,0.0,0.0,510012,1682,0.12


In [188]:
# similarity by plavka
comb_df = train_df.append(test_df, ignore_index = True)
for i, row in tqdm.tqdm(comb_df.drop(['TST', 'C'], 1)[250:].iterrows(), total = comb_df[250:].shape[0]):
    
    prev_df = train_df[:i].copy() 
    prev_df['diff'] = (np.abs(prev_df.drop(['TST', 'C'], 1) - row) / prev_df).sum(axis = 1)
    prev_df.sort_values('diff', ascending = True, inplace = True)
    
    comb_df.loc[i, 'TST_SIMILAR'] = np.median(prev_df.TST.values[:10])
    comb_df.loc[i, 'C_SIMILAR'] = np.median(prev_df.C.values[:10])

train_df['TST_SIMILAR'] = comb_df[:train_df.shape[0]]['TST_SIMILAR'].values
train_df['C_SIMILAR'] = comb_df[:train_df.shape[0]]['C_SIMILAR'].values
test_df['TST_SIMILAR'] = comb_df[train_df.shape[0]:]['TST_SIMILAR'].values
test_df['C_SIMILAR'] = comb_df[train_df.shape[0]:]['C_SIMILAR'].values

100%|██████████| 2593/2593 [01:08<00:00, 38.03it/s]


## Model

In [230]:
# Общие параметры модели
params = {
    "iterations": 10000,
    "loss_function": "MAE",
    'learning_rate':0.01,
    "verbose": 100,
    'random_seed':42,
}
nfolds = 5

# Для каждой модели отдельный датасет
train_df_1 = train_df.copy()
train_df_2 = train_df[~pd.isnull(train_df['C'])].copy()
train_size_1_start = int(train_df_1.shape[0] * 0.5)
train_size_1 = int(train_df_1.shape[0] * 0.8)
train_size_2_start = int(train_df_2.shape[0] * 0.5)
train_size_2 = int(train_df_2.shape[0] * 0.8)

**model 1 - TST**

In [231]:
# Фичи модели
used_features_1 = [x for x in used_features if 'HAS_DODUVKA' not in x ] + ['TST_SIMILAR']
cat_features_1 = [x for x in cat_features if x in used_features_1]

# Данные модели
X_1 = train_df_1[used_features_1].copy()
X_test_1 = test_df[used_features_1].copy()
y_1 = train_df_1[target_feature_1]

# Данные holdout train / val
X_train_1 = X_1[train_size_1_start:train_size_1].copy()
y_train_1 = y_1[train_size_1_start:train_size_1].copy()
X_val_1 = X_1[train_size_1:].copy()
y_val_1 = y_1[train_size_1:].copy()

# Обучение
model_1 = CatBoostRegressor(
    iterations=params['iterations'],
    learning_rate=params['learning_rate'],
    loss_function=params['loss_function'],
    verbose=False,
    cat_features = cat_features_1,
)
model_1.fit(X_train_1, y_train_1,
         eval_set=Pool(X_val_1, y_val_1, cat_features=cat_features_1),
         early_stopping_rounds=100,
         use_best_model=True,);
y_train_pred_1 = model_1.predict(X_train_1)
y_val_pred_1 = model_1.predict(X_val_1)
y_test_pred_1 = model_1.predict(X_test_1)
    
# Ошибки модели
print('Train MAE TST:', mean_absolute_error(y_train_1, y_train_pred_1))
print('VAL MAE TST:', mean_absolute_error(y_val_1, y_val_pred_1))
print('Best it TEST:', model_1.best_iteration_)

Train MAE TST: 13.997919400666028
VAL MAE TST: 17.61488254762921
Best it TEST: 362


In [None]:
Train MAE TST: 17.072706207773805
VAL MAE TST: 17.162533175835517
Best it TEST: 355

In [191]:
# utils.plot_feature_importance(model_1.get_feature_importance(), used_features_2,'CATBOOST ')

In [232]:
# Фичи модели
used_features_2 = [x for x in used_features if 'HAS_DODUVKA' not in x] + ['C_SIMILAR']
cat_features_2 = [x for x in cat_features if x in used_features_2]

# Данные модели
X_2 = train_df_2[used_features_2].copy()
X_test_2 = test_df[used_features_2].copy()
y_2 = train_df_2[target_feature_2]

# Данные holdout train / val
X_train_2 = X_2[train_size_2_start:train_size_2].copy()
y_train_2 = y_2[train_size_2_start:train_size_2].copy()
X_val_2 = X_2[train_size_2:].copy()
y_val_2 = y_2[train_size_2:].copy()

# Обучение
model_2 = CatBoostRegressor(
    iterations=params['iterations'],
    learning_rate=params['learning_rate'],
    loss_function=params['loss_function'],
    verbose=False,
    cat_features = cat_features_2,
)
model_2.fit(X_train_2, y_train_2,
         eval_set=Pool(X_val_2, y_val_2, cat_features=cat_features_2),
         early_stopping_rounds=100,
         use_best_model=True);
y_train_pred_2 = model_2.predict(X_train_2)
y_val_pred_2 = model_2.predict(X_val_2)
y_test_pred_2 = model_2.predict(X_test_2)

# Ошибки модели
print('Train MAE C:', mean_absolute_error(y_train_2, y_train_pred_2))
print('VAL MAE C:', mean_absolute_error(y_val_2, y_val_pred_2))
print('Best it 2:', model_2.best_iteration_)

Train MAE C: 0.01856929574368948
VAL MAE C: 0.030618935641587092
Best it 2: 756


In [None]:
Train MAE C: 0.016355316471977598
VAL MAE C: 0.024884619996303734
Best it 2: 918

In [183]:
# utils.plot_feature_importance(model_2.get_feature_importance(), used_features_2,'CATBOOST ')

## Predict

**Полное обучение моделей**

In [233]:
print('Model 1 iterations:', model_1.best_iteration_)
model_1_full = CatBoostRegressor(
    iterations=model_1.best_iteration_,
    learning_rate=params['learning_rate'],
    loss_function=params['loss_function'],
    verbose=False,
    cat_features = cat_features_1,
)
model_1_full.fit(X_1, y_1);
y_test_pred_1_full = model_1_full.predict(X_test_1)

# full train
print('Model 2 iterations:', model_2.best_iteration_)
model_2_full = CatBoostRegressor(
    iterations=model_2.best_iteration_,
    learning_rate=params['learning_rate'],
    loss_function=params['loss_function'],
    verbose=False,
    cat_features = cat_features_2,
)
model_2_full.fit(X_2, y_2);
y_test_pred_2_full = model_2_full.predict(X_test_2)

Model 1 iterations: 362
Model 2 iterations: 756


**Прогноз теста**

In [234]:
# predict
save_df = sub_df.copy()
save_df['TST'] = y_test_pred_1_full
save_df['C'] = y_test_pred_2_full
assert save_df.shape[0] == sub_df.shape[0]
save_df.to_csv('./results/submission_final.csv', index = False)
save_df.head(3)

Unnamed: 0,NPLV,TST,C
0,512324,1636.599355,0.043263
1,512327,1632.740396,0.084497
2,512328,1644.564815,0.100819
