## Результаты
| Model                    | P@5    |
|--------------------------|--------|
| Random                   | 0.0093 |
| Catboost                 | 0.1057 |
| Catboost + fe            | 0.1026 |
| Catboost + add_data      | 0.1091 |
| Catboost + add_data + fe | 0.1015 |


In [1]:
import os
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier

In [2]:
def fix_seed(seed_value):
    os.environ['PYTHONHASHSEED']=str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)

seed_value = 7575
fix_seed(seed_value)

In [116]:
# class Pipeline:
#     def __init__(self, train_path, val_path, test_path=None, data_loader=None, data_prep=None, model=None, **kwargs):
#         """
#
#         :param train_path:
#         :param val_path:
#         :param test_path:
#         :param data_loader: implements method load
#         :param data_prep: implements methods fit transform
#         :param model: implements methods fit, predict_proba
#         """
#         if data_loader:
#             self.data_loader = data_loader()
#         else:
#             self.train_df = pd.read_csv(train_path)
#             self.val_df = pd.read_csv(val_path)
#             if test_path:
#                 self.test_df = pd.read_csv(test_path)
#
#         self.data_prep = data_prep()
#         self.model = model(**kwargs)
#
#     def fit(self, **fit_params):
#         train_df


# train_df = pd.read_csv('data/train_catugra.csv', sep=',',  index_col=0)
# val_df = pd.read_csv('data/test_catugra.csv', sep=',', index_col=0)

## Оценка сырых данных

In [16]:
train_df = pd.read_csv('data/train.csv', sep=';')
val_df = pd.read_csv('data/test.csv', sep=';')

In [119]:
# random modeling
N = 5
top5p = int(val_df.shape[0] * 0.05)
random_res = []

label_val_df = val_df[['label']]

for i in range(N):
    preds = label_val_df.label.sample(val_df.shape[0]).reset_index(drop=True)
    label_val_df['pred'] = preds
    random_res.append(label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p)

np.mean(random_res), random_res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


(0.009347996857816182,
 [0.008771929824561403,
  0.00968840010473946,
  0.010081173081958628,
  0.008379156847342237,
  0.009819324430479183])

In [120]:
## Simple modeling

cat_cols = train_df.select_dtypes(include=['object']).columns.values
train_df[cat_cols] = train_df[cat_cols].fillna('NaN')
val_df[cat_cols] = val_df[cat_cols].fillna('NaN')


ctb = CatBoostClassifier(verbose=200, task_type='GPU', random_seed=seed_value)
ctb.fit(train_df.drop('label', axis=1), train_df.label, cat_features=cat_cols)
label_val_df = val_df[['label']]

preds = ctb.predict_proba(val_df.drop('label', axis=1))[:,1]
label_val_df['pred'] = preds

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

Learning rate set to 0.02466
0:	learn: 0.6470023	total: 361ms	remaining: 6m
200:	learn: 0.0918462	total: 29.8s	remaining: 1m 58s
400:	learn: 0.0897373	total: 54.2s	remaining: 1m 20s
600:	learn: 0.0884431	total: 1m 44s	remaining: 1m 9s
800:	learn: 0.0874101	total: 2m 22s	remaining: 35.5s
999:	learn: 0.0865075	total: 3m 3s	remaining: 0us


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.10578685519769573

# Data Preparation

In [36]:
def add_agg_features(df):
    df = df.copy()
    subj_feat = df[['subject_name'] + [f'f{i}' for i in range(1, 31)]].\
        groupby('subject_name').\
        agg({f"f{i}":[min, max, np.mean, np.std, sum, np.median, pd.Series.skew, pd.Series.kurtosis] for i in range(1,31)})

    features_names_subject = np.array([col + '_subject_' + stat for col in [f'f{i}' for i in range(1, 31)]
                                       for stat in ['min', 'max', 'mean', 'std','sum', 'median', 'skew', 'kurtosis']]).flatten().tolist()

    # city_feat = df[['city_name'] + [f'f{i}' for i in range(1, 31)]].groupby('city_name').agg(
    #     {f"f{i}": [min, max, np.mean, np.std, sum, np.median, pd.Series.skew, pd.Series.kurtosis] for i in range(1, 31)})
    #
    # features_city_subject = np.array([col + '_city_' + stat for col in [f'f{i}' for i in range(1, 31)]
    #                                    for stat in ['min', 'max', 'mean', 'std', 'sum', 'median', 'skew','kurtosis']]).flatten().tolist()

    # subjt_feat = df[['subject_type'] + [f'f{i}' for i in range(1, 31)]].groupby('subject_type').agg(
    #     {f"f{i}": [min, max, np.mean, np.std, sum, np.median, pd.Series.skew, pd.Series.kurtosis] for i in range(1, 31)})
    #
    # features_t_subject = np.array([col + '_subjt_' + stat for col in [f'f{i}' for i in range(1, 31)]
    #                                   for stat in ['min', 'max', 'mean', 'std', 'sum', 'median', 'skew','kurtosis']]).flatten().tolist()

    subj_df = pd.DataFrame(subj_feat.values, columns=features_names_subject)
    subj_df['subject_name'] = subj_feat.index

    # city_df = pd.DataFrame(city_feat.values, columns=features_city_subject)
    # city_df['city_name'] = city_feat.index

    # subjt_df = pd.DataFrame(subjt_feat.values, columns=features_t_subject)
    # subjt_df['subject_type'] = subjt_feat.index
    df = df.merge(subj_df, on='subject_name')#.merge(city_df, on='city_name')#.merge(subjt_df, on='subject_type')
    return df

In [38]:
train_df = pd.read_csv('data/train.csv', sep=';')
val_df = pd.read_csv('data/test.csv', sep=';')


cat_cols = train_df.select_dtypes(include=['object']).columns.values
train_df[cat_cols] = train_df[cat_cols].fillna('NaN')
val_df[cat_cols] = val_df[cat_cols].fillna('NaN')

# добавляем фич
train_df = add_agg_features(train_df)
val_df = add_agg_features(val_df)

## Simple modeling
ctb = CatBoostClassifier(verbose=200, task_type='GPU', random_seed=seed_value)
ctb.fit(train_df.drop('label', axis=1), train_df.label, cat_features=cat_cols)
label_val_df = val_df[['label']]

preds = ctb.predict_proba(val_df.drop('label', axis=1))[:,1]
label_val_df['pred'] = preds

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

Learning rate set to 0.02466
0:	learn: 0.6462937	total: 257ms	remaining: 4m 16s
200:	learn: 0.0916607	total: 11.6s	remaining: 46s
400:	learn: 0.0892890	total: 24.4s	remaining: 36.4s
600:	learn: 0.0878918	total: 40.7s	remaining: 27s
800:	learn: 0.0867803	total: 1m 3s	remaining: 15.8s
999:	learn: 0.0858101	total: 1m 28s	remaining: 0us


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.1050013092432574

In [10]:
# пробуем с данными от кати

train_df = pd.read_csv('data/train_catugra.csv', sep=',',  index_col=0)
val_df = pd.read_csv('data/test_catugra.csv', sep=',', index_col=0)

cat_cols = train_df.select_dtypes(include=['object']).columns.values
train_df[cat_cols] = train_df[cat_cols].fillna('NaN')
val_df[cat_cols] = val_df[cat_cols].fillna('NaN')


ctb = CatBoostClassifier(verbose=200, task_type='GPU', random_seed=seed_value)
ctb.fit(train_df.drop('label', axis=1), train_df.label, cat_features=cat_cols)
label_val_df = val_df[['label']]

preds = ctb.predict_proba(val_df.drop('label', axis=1))[:,1]
label_val_df['pred'] = preds

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

Learning rate set to 0.02466
0:	learn: 0.6459351	total: 228ms	remaining: 3m 47s
200:	learn: 0.0917587	total: 18.2s	remaining: 1m 12s
400:	learn: 0.0895619	total: 42.9s	remaining: 1m 4s
600:	learn: 0.0882322	total: 1m 10s	remaining: 47s
800:	learn: 0.0871200	total: 1m 32s	remaining: 22.9s
999:	learn: 0.0861629	total: 2m 13s	remaining: 0us


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.10919088766692851

In [11]:
train_df = add_agg_features(train_df)
val_df = add_agg_features(val_df)

ctb = CatBoostClassifier(verbose=200, task_type='GPU', random_seed=seed_value)
ctb.fit(train_df.drop('label', axis=1), train_df.label, cat_features=cat_cols)
label_val_df = val_df[['label']]

preds = ctb.predict_proba(val_df.drop('label', axis=1))[:,1]
label_val_df['pred'] = preds

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

Learning rate set to 0.02466
0:	learn: 0.6480979	total: 1.65s	remaining: 27m 30s
200:	learn: 0.0916542	total: 39.2s	remaining: 2m 35s
400:	learn: 0.0894033	total: 1m 22s	remaining: 2m 3s
600:	learn: 0.0880655	total: 1m 38s	remaining: 1m 5s
800:	learn: 0.0870913	total: 1m 59s	remaining: 29.7s
999:	learn: 0.0861618	total: 2m 43s	remaining: 0us


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.10159727677402461

In [31]:
# пробуем данные сереги и кати

train_df = pd.read_csv('data/train_catugra_v2.csv', sep=',')
val_df = pd.read_csv('data/test_catugra_v2.csv', sep=',', index_col=0)

drop_cols = ['total_cases', 'new_cases']

tarif_cols = ["district"]


def prepare_data(df):
    # df = df.drop(drop_cols, axis=1)
    # for col in [f'f{i}' for i in range(1,31)]:
    #     mask = df[col].isna()
    #     df[col][mask] = np.nan
    #     df[col] = df[col].fillna(df.groupby(tarif_cols[-1])[col].transform('mean'))
    return df

train_df = prepare_data(train_df)
val_df = prepare_data(val_df)

In [32]:
val_df

Unnamed: 0,label,period,subject_type,subject_name,city_name,hex,hex_lat,hex_lon,f1,f2,...,rural_district,services_district,communication_district,customer_district,mobile_customer_district,mobile_th_district,pc_district,inter_district,good_inter_district,amount_inf_district
0,1,2020-11-01,Город,Москва,Москва,8611aa6b7ffffff,55.656639,37.774902,0.00292,0.00211,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
1,1,2020-11-01,Город,Москва,Москва,8611aa70fffffff,55.879910,37.583383,0.00265,0.00355,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
2,1,2020-11-01,Город,Москва,Москва,8611aa627ffffff,55.855726,37.669858,0.00046,0.00059,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
3,1,2020-11-01,Город,Москва,Москва,861181b2fffffff,55.496542,37.542584,0.00304,0.00243,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
4,1,2020-11-01,Город,Москва,Москва,8611aa4cfffffff,55.613025,37.529612,0.00183,0.00163,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152761,0,2021-06-01,Область,Магаданская,Магадан,8617142efffffff,59.652400,150.759792,0.00024,0.00042,...,27.0,25.5,3.7,19.0,98.7,8184.0,67.3,81.3,75.7,3681.0
152762,0,2021-06-01,Область,Магаданская,Магадан,861710967ffffff,59.544997,150.861279,0.00044,0.00044,...,27.0,25.5,3.7,19.0,98.7,8184.0,67.3,81.3,75.7,3681.0
152763,0,2021-06-01,Область,Магаданская,Магадан,861710967ffffff,59.544997,150.861279,0.00023,0.00106,...,27.0,25.5,3.7,19.0,98.7,8184.0,67.3,81.3,75.7,3681.0
152764,0,2021-06-01,Область,Магаданская,Магадан,861710967ffffff,59.544997,150.861279,0.00022,0.00020,...,27.0,25.5,3.7,19.0,98.7,8184.0,67.3,81.3,75.7,3681.0


In [33]:
train_df

Unnamed: 0,label,period,subject_type,subject_name,city_name,hex,hex_lat,hex_lon,f1,f2,...,rural_district,services_district,communication_district,customer_district,mobile_customer_district,mobile_th_district,pc_district,inter_district,good_inter_district,amount_inf_district
0,1,2020-05-01,Город,Москва,Москва,8611aa7a7ffffff,55.729458,37.516569,0.00101,0.00103,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
1,1,2020-05-01,Город,Москва,Москва,8611aa01fffffff,55.975851,37.237085,0.00000,0.00027,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
2,1,2020-05-01,Город,Москва,Москва,861181b6fffffff,55.622721,37.695121,0.00339,0.00313,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
3,1,2020-05-01,Город,Москва,Москва,8611aa017ffffff,55.941586,37.157487,0.00048,0.00054,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
4,1,2020-05-01,Город,Москва,Москва,8611aa637ffffff,55.797494,37.676200,0.00164,0.00179,...,17.6,28.7,3.3,27.9,110.7,44834.4,76.2,81.7,79.6,18037.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294253,0,2020-12-01,Область,Магаданская,Ола,8617154afffffff,59.577462,151.294079,0.00045,0.00046,...,27.0,25.5,3.7,19.0,98.7,8184.0,67.3,81.3,75.7,3681.0
294254,0,2020-12-01,Область,Магаданская,Ягодное,861735767ffffff,62.514463,149.591693,0.00048,0.00048,...,27.0,25.5,3.7,19.0,98.7,8184.0,67.3,81.3,75.7,3681.0
294255,0,2020-12-01,Область,Магаданская,Магадан,8617142cfffffff,59.598694,150.810630,,,...,27.0,25.5,3.7,19.0,98.7,8184.0,67.3,81.3,75.7,3681.0
294256,0,2020-12-01,Автономный Округ,Чукотский,Певек,8604ebc5fffffff,69.703205,170.185894,0.00001,0.00002,...,27.0,25.5,3.7,19.0,98.7,8184.0,67.3,81.3,75.7,3681.0


In [34]:
train_df = pd.read_csv('data/train_catugra_v2.csv', sep=',')
val_df = pd.read_csv('data/test_catugra_v2.csv', sep=',', index_col=0)

cat_cols = train_df.select_dtypes(include=['object']).columns.values
train_df[cat_cols] = train_df[cat_cols].fillna('NaN')
val_df[cat_cols] = val_df[cat_cols].fillna('NaN')

In [36]:
from sklearn.model_selection import StratifiedKFold

train_df = pd.read_csv('data/train_catugra_v2.csv', sep=',')
val_df = pd.read_csv('data/test_catugra_v2.csv', sep=',', index_col=0)

cat_cols = train_df.select_dtypes(include=['object']).columns.values
train_df[cat_cols] = train_df[cat_cols].fillna('NaN')
val_df[cat_cols] = val_df[cat_cols].fillna('NaN')

models = []
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed_value)

for train_ids, test_ids in skf.split(train_df.drop('label', axis=1), train_df.label):
    x_tr = train_df.drop('label', axis=1).iloc[train_ids]
    x_ts = train_df.drop('label', axis=1).iloc[test_ids]
    y_tr = train_df.label[train_ids]
    y_ts = train_df.label[test_ids]

    ctb = CatBoostClassifier(verbose=200, task_type='GPU', random_seed=seed_value, eval_metric='AUC',)
    ctb.fit(x_tr, y_tr, cat_features=cat_cols, eval_set=(x_ts, y_ts), use_best_model=True, early_stopping_rounds=150)
    models.append(ctb)

label_val_df = val_df[['label']]

preds = np.mean([ctb.predict_proba(val_df.drop('label', axis=1))[:,1] for ctb in models], axis=0)
label_val_df['pred'] = preds
top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7834525	best: 0.7834525 (0)	total: 201ms	remaining: 3m 21s
200:	test: 0.8457316	best: 0.8457509 (199)	total: 18s	remaining: 1m 11s
400:	test: 0.8518673	best: 0.8518673 (400)	total: 36.1s	remaining: 54s
600:	test: 0.8540120	best: 0.8540120 (600)	total: 58s	remaining: 38.5s
800:	test: 0.8552468	best: 0.8552814 (772)	total: 1m 20s	remaining: 19.9s
999:	test: 0.8561128	best: 0.8561454 (993)	total: 1m 44s	remaining: 0us
bestTest = 0.8561453819
bestIteration = 993
Shrink model to first 994 iterations.
Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7877047	best: 0.7877047 (0)	total: 236ms	remaining: 3m 55s
200:	test: 0.8394053	best: 0.8394053 (200)	total: 28.2s	remaining: 1m 52s
400:	test: 0.8435685	best: 0.8435685 (400)	total: 47.9s	remaining: 1m 11s
600:	test: 0.8449746	best: 0.8450038 (578)	total: 1m 11s	remaining: 47.6s
800:	test: 0.8456100	best: 0.8457433 (775)	total: 1m 34s	remaining: 23.6s
bestTest = 0.8457432985
bestIteration = 775
Shrink model to first 776 iterations.
Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7705998	best: 0.7705998 (0)	total: 223ms	remaining: 3m 42s
200:	test: 0.8258160	best: 0.8258160 (200)	total: 21.9s	remaining: 1m 27s
400:	test: 0.8306485	best: 0.8306695 (399)	total: 47.2s	remaining: 1m 10s
600:	test: 0.8319824	best: 0.8320035 (585)	total: 1m 7s	remaining: 44.7s
800:	test: 0.8335217	best: 0.8335313 (799)	total: 1m 27s	remaining: 21.8s
999:	test: 0.8343971	best: 0.8345859 (960)	total: 1m 47s	remaining: 0us
bestTest = 0.8345859349
bestIteration = 960
Shrink model to first 961 iterations.
Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7476742	best: 0.7476742 (0)	total: 207ms	remaining: 3m 26s
200:	test: 0.8456572	best: 0.8456572 (200)	total: 19.2s	remaining: 1m 16s
400:	test: 0.8496324	best: 0.8496734 (397)	total: 40.1s	remaining: 59.8s
600:	test: 0.8511034	best: 0.8511829 (592)	total: 59.8s	remaining: 39.7s
800:	test: 0.8518252	best: 0.8518252 (800)	total: 1m 20s	remaining: 19.9s
999:	test: 0.8524050	best: 0.8525307 (931)	total: 1m 40s	remaining: 0us
bestTest = 0.852530688
bestIteration = 931
Shrink model to first 932 iterations.
Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7215285	best: 0.7215285 (0)	total: 214ms	remaining: 3m 33s
200:	test: 0.8287566	best: 0.8287566 (200)	total: 19.5s	remaining: 1m 17s
400:	test: 0.8365884	best: 0.8365884 (400)	total: 40s	remaining: 59.7s
600:	test: 0.8396222	best: 0.8396222 (600)	total: 1m 1s	remaining: 40.8s
800:	test: 0.8416092	best: 0.8416092 (800)	total: 1m 41s	remaining: 25.3s
999:	test: 0.8427416	best: 0.8427902 (988)	total: 1m 59s	remaining: 0us
bestTest = 0.842790246
bestIteration = 988
Shrink model to first 989 iterations.
Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7652357	best: 0.7652357 (0)	total: 195ms	remaining: 3m 14s
200:	test: 0.8297311	best: 0.8297311 (200)	total: 17.8s	remaining: 1m 10s
400:	test: 0.8350317	best: 0.8350716 (397)	total: 37.7s	remaining: 56.2s
600:	test: 0.8377247	best: 0.8377306 (599)	total: 56.9s	remaining: 37.8s
800:	test: 0.8384243	best: 0.8384310 (799)	total: 1m 16s	remaining: 18.9s
999:	test: 0.8390075	best: 0.8390610 (997)	total: 1m 35s	remaining: 0us
bestTest = 0.8390610218
bestIteration = 997
Shrink model to first 998 iterations.
Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7342900	best: 0.7342900 (0)	total: 301ms	remaining: 5m
200:	test: 0.8339884	best: 0.8339884 (200)	total: 18.6s	remaining: 1m 14s
400:	test: 0.8395171	best: 0.8395171 (400)	total: 37.5s	remaining: 56.1s
600:	test: 0.8416912	best: 0.8417659 (586)	total: 57.1s	remaining: 37.9s
800:	test: 0.8426311	best: 0.8427375 (782)	total: 1m 17s	remaining: 19.2s
999:	test: 0.8428524	best: 0.8429367 (909)	total: 1m 36s	remaining: 0us
bestTest = 0.8429366946
bestIteration = 909
Shrink model to first 910 iterations.
Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7405195	best: 0.7405195 (0)	total: 326ms	remaining: 5m 26s
200:	test: 0.8326367	best: 0.8326367 (200)	total: 18.5s	remaining: 1m 13s
400:	test: 0.8396336	best: 0.8396336 (400)	total: 37.1s	remaining: 55.5s
600:	test: 0.8420189	best: 0.8421022 (593)	total: 56.6s	remaining: 37.6s
800:	test: 0.8435405	best: 0.8435638 (791)	total: 1m 16s	remaining: 19.1s
999:	test: 0.8443286	best: 0.8443515 (998)	total: 1m 36s	remaining: 0us
bestTest = 0.8443514705
bestIteration = 998
Shrink model to first 999 iterations.
Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7431302	best: 0.7431302 (0)	total: 202ms	remaining: 3m 22s
200:	test: 0.8344687	best: 0.8345066 (199)	total: 19.1s	remaining: 1m 15s
400:	test: 0.8400874	best: 0.8400874 (400)	total: 38.7s	remaining: 57.9s
600:	test: 0.8418689	best: 0.8418884 (594)	total: 58.4s	remaining: 38.8s
800:	test: 0.8433756	best: 0.8433756 (800)	total: 1m 18s	remaining: 19.5s
999:	test: 0.8438882	best: 0.8439429 (971)	total: 1m 38s	remaining: 0us
bestTest = 0.8439428806
bestIteration = 971
Shrink model to first 972 iterations.
Learning rate set to 0.044317


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7402733	best: 0.7402733 (0)	total: 220ms	remaining: 3m 39s
200:	test: 0.8291143	best: 0.8291469 (199)	total: 18.4s	remaining: 1m 13s
400:	test: 0.8353809	best: 0.8353809 (400)	total: 38.3s	remaining: 57.2s
600:	test: 0.8364229	best: 0.8364387 (598)	total: 57.8s	remaining: 38.4s
800:	test: 0.8369712	best: 0.8371354 (780)	total: 1m 17s	remaining: 19.2s
999:	test: 0.8372980	best: 0.8373291 (987)	total: 1m 37s	remaining: 0us
bestTest = 0.8373290896
bestIteration = 987
Shrink model to first 988 iterations.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.10631055250065462

In [61]:
from lightgbm import LGBMClassifier
from category_encoders import CatBoostEncoder

def precA5(y_true, y_pred):
    return "p@5",\
           pd.DataFrame(
               np.hstack([y_true[...,None], y_pred[...,None]]),
               columns=['true', 'pred']
           ).\
               sort_values('pred',ascending=False).\
               iloc[:int(y_true.shape[0]*0.05)].\
               true.sum()/y_true.shape[0], True


train_df = pd.read_csv('data/train_catugra_v2.csv', sep=',')
val_df = pd.read_csv('data/test_catugra_v2.csv', sep=',', index_col=0)

cat_cols = train_df.select_dtypes(include=['object']).columns.values
train_df[cat_cols] = train_df[cat_cols].fillna('NaN')
val_df[cat_cols] = val_df[cat_cols].fillna('NaN')

enc = CatBoostEncoder()
train_df[cat_cols] = enc.fit_transform(train_df[cat_cols], y=train_df.label)
val_df[cat_cols] = enc.transform(val_df[cat_cols])

lgb = LGBMClassifier(**study.best_params)

lgb.fit(train_df.drop('label', axis=1), train_df.label, eval_set=(val_df.drop('label', axis=1), val_df.label), eval_metric=precA5, early_stopping_rounds=100)

label_val_df = val_df[['label']]
preds = lgb.predict_proba(val_df.drop('label', axis=1))[:,1]
label_val_df['pred'] = preds

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response



[1]	valid_0's binary_logloss: 0.102206	valid_0's p@5: 0.00190487
[2]	valid_0's binary_logloss: 0.102091	valid_0's p@5: 0.00181323
[3]	valid_0's binary_logloss: 0.102173	valid_0's p@5: 0.00181323
[4]	valid_0's binary_logloss: 0.102371	valid_0's p@5: 0.00181323
[5]	valid_0's binary_logloss: 0.102675	valid_0's p@5: 0.00181323
[6]	valid_0's binary_logloss: 0.103117	valid_0's p@5: 0.00111281
[7]	valid_0's binary_logloss: 0.103721	valid_0's p@5: 0.00111281
[8]	valid_0's binary_logloss: 0.104444	valid_0's p@5: 0.00111281
[9]	valid_0's binary_logloss: 0.104185	valid_0's p@5: 0.00198343
[10]	valid_0's binary_logloss: 0.105032	valid_0's p@5: 0.00198343
[11]	valid_0's binary_logloss: 0.105602	valid_0's p@5: 0.00144666
[12]	valid_0's binary_logloss: 0.106196	valid_0's p@5: 0.000798607
[13]	valid_0's binary_logloss: 0.106893	valid_0's p@5: 0.000962256
[14]	valid_0's binary_logloss: 0.107921	valid_0's p@5: 0.000962256
[15]	valid_0's binary_logloss: 0.109019	valid_0's p@5: 0.000962256
[16]	valid_0's 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.03626603822990312

In [71]:
train_feat

Unnamed: 0,",label,period,subject_type,subject_name,city_name,hex,hex_lat,hex_lon,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,mean_income_subject,subject_population,district,children_subject,rural_subject,services_subject,communication_subject,district_population,mean_income_district,children_district,rural_district,services_district,communication_district"
0,"0,1,2020-05-01,Город,Москва,Москва,8611aa7a7ff..."
1,"1,1,2020-05-01,Город,Москва,Москва,8611aa01fff..."
2,"2,1,2020-05-01,Город,Москва,Москва,861181b6fff..."
3,"3,1,2020-05-01,Город,Москва,Москва,8611aa017ff..."
4,"4,1,2020-05-01,Город,Москва,Москва,8611aa637ff..."
...,...
294253,"294253,0,2020-12-01,Область,Магаданская,Ола,86..."
294254,"294254,0,2020-12-01,Область,Магаданская,Ягодно..."
294255,"294255,0,2020-12-01,Область,Магаданская,Магада..."
294256,"294256,0,2020-12-01,Автономный Округ,Чукотский..."


In [28]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML


train_feat = pd.read_csv('data/train_catugra_merged_table_with_covid.csv', sep=';')
test_feat = pd.read_csv('data/test_catugra_merged_table_with_covid.csv', sep=';')


In [29]:
def process_period(try_set):
    try_set = try_set.drop(['total_cases', 'new_cases'], axis=1)

    try_set['period'] = pd.to_datetime(try_set['period'])
    try_set['year'] = try_set['period'].dt.year
    try_set['month'] = try_set['period'].dt.month
    try_set['day'] = try_set['period'].dt.day
    try_set['cos_month'] = np.cos(try_set['period'].dt.day)
    drop_cols = ['period','day']
    try_set.drop(drop_cols, axis=1, inplace=True)
    return try_set

train_feat = process_period(train_feat)
test_feat = process_period(test_feat)

In [31]:
from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task
from sklearn.metrics import roc_auc_score
import pickle


def roc_auc_my(y_true, y_pred):
    return 2 * roc_auc_score(y_true, y_pred) - 1

lama = TabularUtilizedAutoML(
    timeout = 2500,
    general_params = {'nested_cv': False, 'use_algos': [["lgb", "lgb_tuned", "cb", "cb_tuned"]]},
    reader_params = {'cv': 10, 'random_state': 7575, 'stratify':"label"},
    task = Task(
        name = 'binary',
        metric = roc_auc_my
    )
)
oof_pred = lama.fit_predict(
    train_feat,
    roles = {'target': 'label'}, verbose=2
)

[00:56:08] Start automl [1mutilizator[0m with listed constraints:
[00:56:08] - time: 2500.00 seconds
[00:56:08] - CPU: 4 cores
[00:56:08] - memory: 16 GB

[00:56:08] [1mIf one preset completes earlier, next preset configuration will be started[0m

[00:56:08] Start 0 automl preset configuration:
[00:56:08] [1mC:\Users\Alexandr\AppData\Roaming\Python\Python38\site-packages\lightautoml\automl\presets\tabular_configs\conf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[00:56:08] Stdout logging level is INFO2.
[00:56:08] Task: binary

[00:56:08] Start automl preset with listed constraints:
[00:56:08] - time: 2500.00 seconds
[00:56:08] - CPU: 4 cores
[00:56:08] - memory: 16 GB

[00:56:08] [1mTrain data shape: (294258, 58)[0m

[00:56:41] Layer [1m1[0m train process start. Time left 2467.34 secs
[00:57:16] Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
[00:57:16] ===== Start working with [1mfold 

In [32]:
label_val_df = test_feat[['label']]
label_val_df['pred'] = lama.predict(test_feat.drop('label', axis=1)).data

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = lama.predict(test_feat.drop('label', axis=1)).data


0.10120450379680544

In [18]:
test_feat

Unnamed: 0_level_0,new_cases,period,Апгрейд,Игровой,Технологии доступа,Технологии доступа PRO,Технологии контроля,label,subject_type,subject_name,...,children_subject,rural_subject,services_subject,communication_subject,district_population,mean_income_district,children_district,rural_district,services_district,communication_district
total_cases,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1624648.0,18381.0,2020-11-01,500.0,890.0,500.0,890.0,740.0,1,Город,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
1624648.0,18381.0,2020-11-01,500.0,890.0,500.0,890.0,740.0,1,Город,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
1624648.0,18381.0,2020-11-01,500.0,890.0,500.0,890.0,740.0,1,Город,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
1624648.0,18381.0,2020-11-01,500.0,890.0,500.0,890.0,740.0,1,Город,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
1624648.0,18381.0,2020-11-01,500.0,890.0,500.0,890.0,740.0,1,Город,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4503291.0,9057.0,2021-04-01,-1.0,890.0,450.0,-1.0,740.0,0,Область,Курганская,...,19.4,37.7,19.1,3.2,12329.5,42555.9,20.6,18.3,26.0,3.0
4503291.0,9057.0,2021-04-01,-1.0,890.0,450.0,-1.0,740.0,0,Область,Курганская,...,19.4,37.7,19.1,3.2,12329.5,42555.9,20.6,18.3,26.0,3.0
4503291.0,9057.0,2021-04-01,-1.0,890.0,450.0,-1.0,740.0,0,Область,Курганская,...,19.4,37.7,19.1,3.2,12329.5,42555.9,20.6,18.3,26.0,3.0
4503291.0,9057.0,2021-04-01,-1.0,890.0,450.0,-1.0,740.0,0,Область,Курганская,...,19.4,37.7,19.1,3.2,12329.5,42555.9,20.6,18.3,26.0,3.0
