# MKB competition

In [1]:
import numpy as np
import pandas as pd
import sklearn.preprocessing as prp

# Show every column when a frame is displayed (the datasets are wide).
pd.set_option('display.max_columns', None)

# Both competition files share one format: cp1251-encoded, ';'-separated.
train, test = (
    pd.read_csv(path, encoding='cp1251', delimiter=';')
    for path in ('train_dataset_hackathon_mkb.csv', 'test_dataset_hackathon_mkb.csv')
)
print(train.shape, test.shape)

(17891, 124) (7330, 123)


## генерация признаков
- тут немного, я попробовал много вариантов, но оставил только те, которые дали прирост скора. 
- как оказалось, этого недостаточно для высокого скора (решение из топ-2 генерировало 200+ фичей)

In [2]:
def _noisy_count_encode(column, rng, noise_scale=0.05):
    """Replace each value of ``column`` by the (slightly jittered) size of its category.

    Missing values form one category of their own. One noise term is drawn per
    category, not per row, so equal values always receive the same code.
    """
    codes, uniques = pd.factorize(column)  # missing values get code -1
    slots = codes + 1                      # shift so that "missing" lands in slot 0
    counts = np.bincount(slots, minlength=len(uniques) + 1).astype(float)
    noisy_counts = counts + noise_scale * rng.randn(len(counts))
    return pd.Series(noisy_counts[slots], index=column.index)


def make_features(data, random_state=None):
    """Build the model feature frame from a raw competition frame.

    Steps, in order:
      * ``CITIZENSHIP_NAME`` and ``SEX_NAME`` are mapped to small integer codes.
      * Every date column is parsed and a ``<name>_dayofweek`` column is added.
      * ``vari_diff`` is the number of days between a row's ``SIGN_DATE`` and the
        previous row of the same ``id_client`` (0 for the first row or when a
        date is missing); ``vari_diff_mean`` is its per-client mean.
      * Categorical columns, date columns and ``id_client`` are replaced by
        their frequency within ``data`` plus a small Gaussian jitter.
      * Every value that is still missing becomes -1.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. It must contain ``id_client``, ``CITIZENSHIP_NAME``,
        ``SEX_NAME`` and the categorical/date columns listed in the body.
        The input is not modified; a copy is transformed and returned.
    random_state : int or None, optional
        Seed for the jitter added to the frequency codes. ``None`` (default)
        draws from NumPy's global random state, as the function always did.

    Returns
    -------
    pandas.DataFrame
        Transformed copy of ``data`` without missing values.
    """
    data = data.copy()  # keep the caller's frame intact so the call can be repeated
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Missing or unlisted citizenship -> -1.
    citizenship_codes = {'Российская Федерация': 4, 'Таджикистан': 3, 'Казахстан': 2, 'Армения': 1}
    data['CITIZENSHIP_NAME'] = data['CITIZENSHIP_NAME'].map(citizenship_codes).fillna(-1)

    # Encode sex from SEX_NAME itself (it used to be derived from the already
    # encoded CITIZENSHIP_NAME, which turned every value into -1).
    # Missing -> 0; an unlisted label stays NaN and becomes -1 in the final fill.
    sex = data['SEX_NAME']
    data['SEX_NAME'] = sex.map({'мужской': 1, 'женский': -1}).where(sex.notna(), 0)

    cat_names = ['OKFS_GROUP', 'OKOPF_GROUP', 'OKOGU_GROUP'] + ['WORKERSRANGE', 'OKVED_CODE'] + ['OKATO_FED', 'OKTMO_FED']
    date_names = ['SIGN_DATE', 'DATEFIRSTREG', 'TAXREG_REGDATE', 'TAXREGPAY_REGDATE', 'BIRTHDATE']
    for name in date_names:
        # Unparseable dates become NaT instead of raising.
        data[name] = pd.to_datetime(data[name], format='%d%b%Y:%H:%M:%S', errors='coerce')
        data[name + '_dayofweek'] = data[name].dt.dayofweek

    # Days since the client's previous row. A grouped shift keeps the original
    # row index, unlike groupby(...).apply(...), whose result index depends on
    # the pandas version and may not be assignable back as a column.
    previous_sign_date = data.groupby('id_client')['SIGN_DATE'].shift()
    data['vari_diff'] = (data['SIGN_DATE'] - previous_sign_date).dt.days.fillna(0)
    data['vari_diff_mean'] = data.groupby('id_client')['vari_diff'].transform('mean')
    # NOTE: log1p transforms of these two columns (and of the OKATO/OKTMO codes)
    # were experimented with earlier and are intentionally not applied.

    # Frequency-encode; the counts come from the frame passed in, so train and
    # test are each encoded with their own frequencies.
    for name in cat_names + date_names + ['id_client']:
        data[name] = _noisy_count_encode(data[name], rng)

    return data.fillna(-1)

In [3]:
# Build the model inputs; train is transformed first, then test, each on its own.
data_train, data_test = (make_features(frame) for frame in (train, test))

## Модель: lightgbm
был немножко тюнинг с GridSearchCV

In [6]:
# Split the label off the training frame. `pop` mutates `data_train`, so the
# guard keeps this cell re-runnable once TARGET has already been removed
# (the previously extracted `y` is then kept as is).
if 'TARGET' in data_train.columns:
    y = data_train.pop('TARGET').values
# Give the test frame exactly the training columns, in the same order.
data_test = data_test[data_train.columns]

In [8]:
import lightgbm as lgb
import xgboost as xgb

# LightGBM classifier; `fit` returns the estimator itself, so it can be chained.
model = lgb.LGBMClassifier(
    num_leaves=30,
    learning_rate=0.05,
    n_estimators=500,
).fit(data_train, y)
# Keep the second column of predict_proba (the class at index 1).
pred = model.predict_proba(data_test)[:, 1]

In [10]:
# Submission file: contract id plus predicted probability, ';'-separated, no index.
df = pd.DataFrame({
    'id_contract': data_test['id_contract'].values,
    'TARGET': pred,
})
df.to_csv('lightgbm8.csv', sep=';', index=False)

Какие фичи дали прирост (+), а какие нет (-):
1. vari_diff +
2. log-operation -
3. 'OKATO_FED','OKTMO_FED' log???
4. dayofweek +
5. MinMaxScaler of F --
6. 'id_client', 'id_contract' +

## Модель: catboost

In [7]:
# TARGET is already gone from `data_train` if the LightGBM section above was
# executed first; an unconditional `pop` would then raise KeyError. Only split
# the label off when it is still present, otherwise reuse the existing `y`.
if 'TARGET' in data_train.columns:
    y = data_train.pop('TARGET').values
# Give the test frame exactly the training columns, in the same order.
data_test = data_test[data_train.columns]

In [8]:
import catboost as cb
# CatBoost classifier that tracks AUC during training; `fit` returns the model.
model = cb.CatBoostClassifier(
    iterations=300,
    depth=6,
    learning_rate=0.1,
    custom_loss='AUC',
    eval_metric='AUC',
).fit(data_train, y)
# Keep the second column of predict_proba (the class at index 1).
pred = model.predict_proba(data_test)[:, 1]

0:	total: 74.8ms	remaining: 22.4s
1:	total: 90ms	remaining: 13.4s
2:	total: 105ms	remaining: 10.4s
3:	total: 119ms	remaining: 8.83s
4:	total: 134ms	remaining: 7.91s
5:	total: 148ms	remaining: 7.25s
6:	total: 165ms	remaining: 6.89s
7:	total: 179ms	remaining: 6.55s
8:	total: 194ms	remaining: 6.28s
9:	total: 208ms	remaining: 6.04s
10:	total: 222ms	remaining: 5.84s
11:	total: 237ms	remaining: 5.69s
12:	total: 255ms	remaining: 5.64s
13:	total: 271ms	remaining: 5.54s
14:	total: 288ms	remaining: 5.48s
15:	total: 303ms	remaining: 5.38s
16:	total: 318ms	remaining: 5.29s
17:	total: 332ms	remaining: 5.2s
18:	total: 346ms	remaining: 5.12s
19:	total: 360ms	remaining: 5.04s
20:	total: 374ms	remaining: 4.97s
21:	total: 387ms	remaining: 4.89s
22:	total: 402ms	remaining: 4.84s
23:	total: 419ms	remaining: 4.81s
24:	total: 433ms	remaining: 4.76s
25:	total: 447ms	remaining: 4.71s
26:	total: 461ms	remaining: 4.66s
27:	total: 477ms	remaining: 4.63s
28:	total: 494ms	remaining: 4.61s
29:	total: 509ms	remainin

In [9]:
# Submission file: contract id plus predicted probability, ';'-separated, no index.
df = pd.DataFrame({
    'id_contract': data_test['id_contract'].values,
    'TARGET': pred,
})
df.to_csv('catboost1.csv', sep=';', index=False)