In [1]:
import pandas as pd
import numpy as np
import ipywidgets


import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

pd.options.display.float_format ='{:,.3f}'.format

import seaborn as sns


from sklearn.metrics import make_scorer
from catboost import CatBoostRegressor



In [2]:
from module import *

In [3]:
random_seed=43
random_state=43

# Read data

In [4]:
gas_train = pd.read_csv('data_task1/gas_train.csv') 
chronom_train = pd.read_csv('data_task1/chronom_train.csv', index_col=0)
chugun_train = pd.read_csv('data_task1/chugun_train.csv')
lom_train = pd.read_csv('data_task1/lom_train.csv')
plavki_train = pd.read_csv('data_task1/plavki_train.csv')
produv_train = pd.read_csv('data_task1/produv_train.csv')
sip_train = pd.read_csv('data_task1/sip_train.csv')
target_train = pd.read_csv('data_task1/target_train.csv')

# Data preparation

## Lom  and Chugun - Base

In [5]:
cat_features=[]

In [6]:
lom_train.groupby('NPLV').sum().VES

NPLV
510008    76200
510009    78600
510010    76300
510011    84100
510012    76100
          ...  
512318    73600
512319    76600
512320    64200
512321    66200
512322    76100
Name: VES, Length: 2063, dtype: int64

In [7]:
target_train.dropna(inplace=True)
wide_lom_train = lom_train.loc[:, ['NPLV', 'NML', 'VES']].pivot_table(index='NPLV', columns='NML', values='VES')
wide_lom_train.fillna(0, inplace=True)
wide_lom_train['total_lom_ves'] = lom_train.groupby('NPLV').sum().VES
data = pd.merge(target_train, chugun_train, how='left', on='NPLV')
data = pd.merge(data, wide_lom_train, how='left', on='NPLV')
data = data.drop('DATA_ZAMERA', axis=1)

## Plavki

In [8]:
plavki_train.drop(columns = ['plavka_VR_NACH', 'plavka_VR_KON'], inplace=True)
plavki_train= plavki_train[np.logical_not(plavki_train.NPLV.duplicated())]

In [9]:
cat_features += ['plavka_TIPE_GOL', 'plavka_TIPE_FUR', 'plavka_NMZ', 'plavka_NAPR_ZAD']
data = pd.merge(data, plavki_train, how='left', on='NPLV')


## Sip

In [10]:
wide_sip_train_ves = sip_train.groupby(['NPLV', 'VDSYP'])\
                          .sum().reset_index()\
                          .pivot_table(index='NPLV', columns='VDSYP', values='VSSYP')\
                          .fillna(0).reset_index()

In [11]:
wide_sip_train_count = sip_train.groupby(['NPLV', 'VDSYP'])\
                          .count().reset_index()\
                          .pivot_table(index='NPLV', columns='VDSYP', values='VSSYP')\
                          .fillna(0).reset_index()

In [12]:
wide_sip_train = pd.merge(wide_sip_train_count, wide_sip_train_ves, on='NPLV', suffixes=('_sip_count', '_sip_ves'))

In [13]:
data = pd.merge(data, wide_sip_train, how='left', on='NPLV')

## Produv

In [14]:
produv_train['VR_KON'] = (pd.merge(produv_train, chronom_train[chronom_train.NOP=='Продувка'], how='left', on='NPLV')).VR_KON

In [15]:
produv_before_VRKON = produv_train.loc[produv_train.SEC < produv_train.VR_KON]

In [16]:
produv_features = pd.DataFrame(data.NPLV)
produv_features['O2_in'] = produv_before_VRKON.groupby(['NPLV']).sum().apply(lambda x: x/30).reset_index().RAS

In [17]:
produv_features['mean_POL'] = produv_before_VRKON.groupby('NPLV').mean().reset_index().POL

In [18]:
produv_features['max_POL'] = produv_before_VRKON.groupby('NPLV').max().reset_index().POL

In [19]:
data = pd.merge(data, produv_features, how='left', on='NPLV')

## Chorom

## Gas

# Testing

In [20]:
data['1RANDOM'] = np.random.normal(0, 1, size=data.shape[0])

## Data for TST

In [21]:
data_TST = data.reindex(sorted(data.columns), axis=1)
data_TST.set_index('NPLV', inplace=True)
cat_features_TST = ['plavka_NMZ', 'plavka_NAPR_ZAD', 'plavka_TIPE_FUR', 'plavka_TIPE_GOL']

## Data for C


In [22]:
data_C = data.reindex(sorted(data.columns), axis=1)
data_C.set_index('NPLV', inplace=True)
cat_features_C = ['plavka_NMZ', 'plavka_NAPR_ZAD', 'plavka_TIPE_FUR', 'plavka_TIPE_GOL']

## log for C


In [23]:
data_C['log_C'] =  data_C.C.apply(np.log)


# Cross val

# C

In [73]:
X_train_C = data_C.drop(columns = ['TST', 'C', 'log_C'])
y_train_C = data_C.log_C


In [74]:
iterations=2000
task_type="GPU"
verbose=0
cv = 4
catboost_regressor_C = CatBoostRegressor(iterations=iterations,
                                         cat_features=cat_features_C, 
                                         random_seed=random_seed, 
                                         task_type=task_type,
                                         verbose=verbose)

scorer_C = make_scorer(metric_C)

scorer_C_log = make_scorer(metric_C_log)

In [75]:
cross_val_C = cross_val_score(catboost_regressor_C, X_train_C, y_train_C, scoring=scorer_C_log, cv = cv)

In [76]:
cross_val_C.mean()

0.6253951230526078

In [77]:
catboost_regressor_C.fit(X_train_C, y_train_C, verbose=verbose, plot=False)

<catboost.core.CatBoostRegressor at 0x7eff74a31bd0>

In [78]:
feature_imps = list(zip(X_train_C,catboost_regressor_C.feature_importances_))
feature_imps = sorted(feature_imps, key = lambda x: x[1])
feature_imps.reverse()
features_drop_C = []
drop_treshold_C = 1.0
print('Feature importances')
for feat, imp in feature_imps:
    if imp < drop_treshold_C:
        features_drop_C.append(feat)
    print(feat, ': ', imp)

Feature importances
408_sip_ves :  18.511438170277827
plavka_NMZ :  15.069359362864692
VES :  14.575715761112955
O2_in :  8.756920535467104
plavka_STFUT :  4.3784952142928555
346_sip_ves :  3.3168458195453763
T :  2.6914731635991633
442_sip_ves :  2.5463722158759845
plavka_NAPR_ZAD :  2.3820864266049475
total_lom_ves :  2.250336135262688
mean_POL :  2.1408924303206347
max_POL :  2.0463413204105168
P :  1.8500706833119818
SI :  1.8338295875805757
V :  1.6279243388490547
ЛЧ   :  1.5362243677788898
plavka_TIPE_GOL :  1.378957845159364
plavka_ST_FURM :  1.3776432642572827
plavka_ST_GOL :  1.2771282799798283
S :  1.0706417140076299
1RANDOM :  1.0592469966288909
408_sip_count :  1.0495683023293305
К    :  0.989697656958091
TI :  0.6734909048223094
О    :  0.6626615766297825
442_sip_count :  0.5815816889957148
КШС8 :  0.4508614869687762
ОК   :  0.4329123392501343
MN :  0.4107539305691424
171_sip_ves :  0.40709449783762847
346_sip_count :  0.4046214505443694
plavka_TIPE_FUR :  0.39312251555124

In [79]:
X_train_C_droped = data_C.drop(columns = ['TST', 'C', 'log_C', '1RANDOM']+features_drop_C)
y_train_C = data_C.log_C

In [80]:
cat_features_C_droped = list(set(cat_features_C).difference(set(features_drop_C)))

In [81]:
catboost_regressor_C_droped = CatBoostRegressor(iterations=iterations,
                                         cat_features=cat_features_C_droped, 
                                         random_seed=random_seed, 
                                         task_type=task_type,
                                         verbose=verbose)

In [82]:
cross_val_C_droped = cross_val_score(catboost_regressor_C_droped, X_train_C_droped, y_train_C, scoring=scorer_C_log, cv = cv)
cross_val_C_droped.mean()

KeyboardInterrupt: 

In [83]:
catboost_regressor_C_droped.fit(X_train_C_droped, y_train_C)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

<catboost.core.CatBoostRegressor at 0x7effb0156b50>

# TST 

In [35]:
X_train_TST = data_TST.drop(columns = ['TST', 'C'])
y_train_TST = data_TST.TST

In [36]:
cv = 4
iterations=2000
task_type="GPU"
verbose=0

catboost_regressor_TST = CatBoostRegressor(iterations=iterations,
                                           cat_features=cat_features_TST, 
                                           random_seed=random_seed, 
                                           task_type=task_type,
                                           verbose=verbose)

scorer_TST = make_scorer(metric_TST)

In [37]:
cross_val_TST = cross_val_score(catboost_regressor_TST, X_train_TST, y_train_TST, scoring=scorer_TST, cv = cv)
cross_val_TST.mean()

0.5269276360352224

In [38]:
catboost_regressor_TST.fit(X_train_TST, y_train_TST, verbose=verbose, plot=False)

<catboost.core.CatBoostRegressor at 0x7effb01831d0>

In [39]:
feature_imps = list(zip(X_train_TST,catboost_regressor_TST.feature_importances_))
feature_imps = sorted(feature_imps, key = lambda x: x[1])
feature_imps.reverse()
features_drop_TST = []
drop_treshold_TST = 1.3
print('Feature importances')
for feat, imp in feature_imps:
    if imp < drop_treshold_TST:
        features_drop_TST.append(feat)
    print(feat, ': ', imp)

Feature importances
O2_in :  11.175814075543526
total_lom_ves :  6.452560786600407
plavka_NAPR_ZAD :  6.238212988184643
T :  6.133104458148579
plavka_NMZ :  5.477989281669864
VES :  4.930881737123529
346_sip_ves :  4.80766939777147
К    :  3.4692337147284946
plavka_STFUT :  3.3545413881837964
442_sip_ves :  3.3047268122969027
max_POL :  2.9872286486031525
mean_POL :  2.86951058624672
КП   :  2.666600995395054
plavka_ST_GOL :  2.6202002631315278
408_sip_ves :  2.2526884825705893
plavka_ST_FURM :  2.1605569802632814
О    :  2.1561721823525497
У2КШ :  2.099509645837586
MN :  1.9475917327086967
408_sip_count :  1.8753882752018447
plavka_TIPE_GOL :  1.8055322678487753
ОК   :  1.6827177537553504
SI :  1.5531047856456837
V :  1.4059988166197936
1RANDOM :  1.309237382502922
346_sip_count :  1.2651201318690575
P :  1.2634708232451208
442_sip_count :  1.2292560397876167
TI :  1.2217794658034635
S :  1.1840426072139265
25КШ :  1.0774748910102807
171_sip_count :  1.0592244826298707
171_sip_ves :  

In [69]:
features_drop_TST

['ОК  ',
 'SI',
 'V',
 '1RANDOM',
 '346_sip_count',
 'P',
 '442_sip_count',
 'TI',
 'S',
 '25КШ',
 '171_sip_count',
 '171_sip_ves',
 'ЛЧ  ',
 'CR',
 'КШС8',
 '104_sip_count',
 '104_sip_ves',
 'plavka_TIPE_FUR',
 'CU',
 '397_sip_ves',
 'СК  ',
 '397_sip_count',
 'NI',
 '119_sip_ves',
 '119_sip_count',
 'НБ  ']

In [40]:
X_train_TST_droped = data_TST.drop(columns = ['TST', 'C',  '1RANDOM']+features_drop_TST)
y_train_TST = data_TST.TST

In [41]:
cat_features_TST_droped = list(set(cat_features_TST).difference(set(features_drop_TST)))

In [42]:
catboost_regressor_TST_droped = CatBoostRegressor(iterations=iterations,
                                         cat_features=cat_features_TST_droped, 
                                         random_seed=random_seed, 
                                         task_type=task_type,
                                         verbose=verbose)

In [43]:
cross_val_TST_droped = cross_val_score(catboost_regressor_TST_droped, X_train_TST_droped, y_train_TST, scoring=scorer_TST, cv = cv)
cross_val_TST_droped.mean()

0.5283942951757357

In [64]:
catboost_regressor_TST_droped.fit( X_train_TST_droped, y_train_TST)

<catboost.core.CatBoostRegressor at 0x7effb016bb10>

# data to predict

In [44]:
chronom_test = pd.read_csv('data_task1/chronom_test.csv', index_col=0)
chugun_test = pd.read_csv('data_task1/chugun_test.csv')
lom_test = pd.read_csv('data_task1/lom_test.csv')

plavki_test = pd.read_csv('data_task1/plavki_test.csv')



produv_test = pd.read_csv('data_task1/produv_test.csv')
sip_test = pd.read_csv('data_task1/sip_test.csv')
submission = pd.read_csv('data_task1/sample_submission.csv')

## Lom  and Chugun

In [45]:
wide_lom_test = lom_test.loc[:, ['NPLV', 'NML', 'VES']].pivot_table(index='NPLV', columns='NML', values='VES')
wide_lom_test.fillna(0, inplace=True)
wide_lom_test['total_lom_ves'] = lom_test.groupby('NPLV').sum().VES
submission = pd.merge(submission, chugun_test, how='left', on='NPLV')
submission = pd.merge(submission, wide_lom_test, how='left', on='NPLV')
submission = submission.drop('DATA_ZAMERA', axis=1)
submission['НБ  '] = 0.


In [46]:
y_pred = pd.read_csv('data_task1/sample_submission.csv', index_col = 'NPLV')

## Plavki

In [47]:
plavki_test.drop(columns = ['plavka_VR_NACH', 'plavka_VR_KON'], inplace=True)
plavki_test= plavki_test[np.logical_not(plavki_test.NPLV.duplicated())]

In [48]:
submission = pd.merge(submission, plavki_test, how='left', on='NPLV')

## Sip

In [49]:
wide_sip_test_ves = sip_test.groupby(['NPLV', 'VDSYP'])\
                          .sum().reset_index()\
                          .pivot_table(index='NPLV', columns='VDSYP', values='VSSYP')\
                          .fillna(0).reset_index()

In [50]:
wide_sip_test_count = sip_test.groupby(['NPLV', 'VDSYP'])\
                          .count().reset_index()\
                          .pivot_table(index='NPLV', columns='VDSYP', values='VSSYP')\
                          .fillna(0).reset_index()

In [51]:
wide_sip_test = pd.merge(wide_sip_test_count, wide_sip_test_ves, on='NPLV', suffixes=('_sip_count', '_sip_ves'))

In [52]:
submission = pd.merge(submission, wide_sip_test, how='left', on='NPLV')

## Produv

In [53]:
produv_test['VR_KON'] = (pd.merge(produv_test, chronom_test[chronom_test.NOP=='Продувка'], how='left', on='NPLV')).VR_KON

In [54]:
produv_before_VRKON = produv_test.loc[produv_test.SEC < produv_test.VR_KON]

In [55]:
produv_features = pd.DataFrame(submission.NPLV)
produv_features['O2_in'] = produv_before_VRKON.groupby(['NPLV']).sum().apply(lambda x: x/30).reset_index().RAS

In [56]:
produv_features['mean_POL'] = produv_before_VRKON.groupby('NPLV').mean().reset_index().POL

In [57]:
produv_features['max_POL'] = produv_before_VRKON.groupby('NPLV').max().reset_index().POL

In [58]:
submission = pd.merge(submission, produv_features, how='left', on='NPLV')

In [59]:
submission.drop(columns=set(submission.columns).difference(set(data.columns)), inplace=True)

# Predict C

In [60]:
submission_C = submission.reindex(sorted(submission.columns), axis=1)
submission_C.set_index('NPLV', inplace=True)

In [85]:
X_test_C_droped = submission_C.drop(columns = ['TST', 'C']+features_drop_C)

In [86]:
y_pred.C = np.exp(catboost_regressor_C_droped.predict(X_test_C_droped))

# Predict TST

In [67]:
submission_TST = submission.reindex(sorted(submission.columns), axis=1)
submission_TST.set_index('NPLV', inplace=True)

In [71]:
X_test_TST_droped = submission_TST.drop(columns = ['TST', 'C']+features_drop_TST)

In [72]:
y_pred.TST = catboost_regressor_TST_droped.predict(X_test_TST_droped)

In [88]:
y_pred.to_csv('submissions/main_15.csv')