## Загрузим нужные библиотеки

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/AI/pochta/
!ls

/content/drive/MyDrive/AI/pochta
catboost_info	scale_pos_weight34.csv	train_dataset_train.csv
experiments	submissions		training.ipynb
sample_sub.csv	test_dataset_test.csv	Untitled5.ipynb


In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm
import os
import pickle
import logging
!pip install catboost -qqq
import catboost as cb
import numpy as np
from catboost.utils import get_gpu_device_count
from catboost import CatBoostClassifier

!pip install omegaconf -qqq
from omegaconf import OmegaConf

%matplotlib inline

In [4]:
params = {
    'data': 
 {
        'n_splits': 5,
 },

    'cb_params': 
 {
        'n_estimators' : 200, #5000
        'max_depth' : 4,
        'task_type': 'CPU',
        # 'border_count' : 64,
#        'l2_leaf_reg' : 2,
#        'bagging_temperature' : 20,
#        'rsm' : .5,
        'use_best_model': True,
#        'task_type': 'GPU',
        'random_state': 42,
        'early_stopping_rounds': 25,
        # 'scale_pos_weight': 34
 }
}


cfg = OmegaConf.create(params)
# !mkdir experiments/
cfg.exp_name = 'opers'
exp_dir = os.path.join('experiments/', cfg.exp_name)
if not os.path.exists(exp_dir):
    os.mkdir(exp_dir)

with open(f"experiments/{cfg.exp_name}/config.yaml", "w") as f:
    OmegaConf.save(cfg, f)

In [None]:
df_train = pd.read_csv('train_dataset_train.csv').drop(columns=['id'])
df_train.sample(5)

In [6]:
def reduce_mem_usage(func, verbose=True):
    def wrapped(payments):
        df = func(payments)
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        start_mem = df.memory_usage().sum() / 1024**2    
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)  
        end_mem = df.memory_usage().sum() / 1024**2
        if verbose: print('Mem. usage decreased from {:5.2f} to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
        return df
    return wrapped

## Конвертация фичей

In [7]:
def convert_yes_no(x):
    return 1 if x == 'Y' else 0

@reduce_mem_usage
def convert_types(data):
    df = data.copy()
    df['oper_type'] = df['oper_type + oper_attr'].apply(lambda x: x.split('_')[0])
    df['oper_attr'] = df['oper_type + oper_attr'].apply(lambda x: x.split('_')[1])

    index_oper = df['index_oper']

    new_values = []
    for num in index_oper:
        try:
            new_values.append(int(float(num)))
        except:
            new_values.append(-1)
    df['index_oper'] = new_values
    df['index_oper'] = df['index_oper'].astype('str')

    df['is_return'] = df['is_return'].apply(convert_yes_no)
    df['is_in_yandex'] = df['is_in_yandex'].apply(convert_yes_no)
    df['is_privatecategory'] = df['is_privatecategory'].apply(convert_yes_no)

    obj_cols = ['mailctg', 'mailtype', 'mailrank', 'directctg', 
            'postmark', 'priority', 'class', 'oper_attr', 
            'oper_type', 'oper_type + oper_attr']

    df[obj_cols] = df[obj_cols].astype('str')      
    return df    

df_train = convert_types(df_train)      

Mem. usage decreased from 1373.29 to 812.53 Mb (40.8% reduction)


In [9]:
test = convert_types(pd.read_csv('test_dataset_test.csv'))

  exec(code_obj, self.user_global_ns, self.user_ns)


Mem. usage decreased from 915.53 to 553.13 Mb (39.6% reduction)


In [14]:
cols = [ 'index_oper', 'type', 'priority',
       'is_privatecategory', 'class', 'is_in_yandex', 'is_return', 'weight',
       'mailtype', 'mailctg', 'mailrank', 'directctg', 'transport_pay',
       'postmark', 'name_mfi', 'weight_mfi', 'price_mfi',
       'dist_qty_oper_login_1', 'total_qty_oper_login_1',
       'total_qty_oper_login_0', 'total_qty_over_index_and_type',
       'total_qty_over_index', 'is_wrong_sndr_name', 'is_wrong_rcpn_name',
       'is_wrong_phone_number', 'is_wrong_address']

In [11]:
x = train_test.groupby(cols)['oper_type + oper_attr'].unique().reset_index()
df = train_test.merge(x, on=cols, how='left').rename(
    columns={'oper_type + oper_attr_y': 'unique_oper',
             'oper_type + oper_attr_x': 'oper_type + oper_attr',}
    )
del x

In [12]:
most_freq_oper = train_test['oper_type + oper_attr'].value_counts().index[:50]

In [23]:
with open(f'experiments/{cfg.exp_name}/df_test.pkl', 'wb') as f:
    pickle.dump(test, f)

with open(f'experiments/{cfg.exp_name}/df_train.pkl', 'wb') as f:
    pickle.dump(df_train, f)

In [23]:
df_train = train_test[~train_test['label'].isna()]

In [20]:
oper_cols = []
for col in most_freq_oper:
    oper_cols.append(f'is_oper_{col}')
    df_train[f'is_oper_{col}'] = np.zeros(len(df_train))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [21]:
unique_oper = df_train['unique_oper']
is_oper_list = []
for idx in tqdm(range(len(df_train))):
    uniq_opers = unique_oper[idx]
    opers = []
    for col in most_freq_oper:
        if col in uniq_opers:
            opers.append(1)
        else:
            opers.append(0)
    is_oper_list.append(opers)
    # df.loc[idx, oper_cols] = opers
is_oper_list = np.array(is_oper_list)
df_train[oper_cols] = is_oper_list
df_train[oper_cols]= df_train[oper_cols].astype(bool)
with open(f'experiments/{cfg.exp_name}/df_train_ex.pkl', 'wb') as f:
    pickle.dump(df_train, f)

100%|██████████| 6000000/6000000 [24:46<00:00, 4036.81it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [22]:
del train_test, df
import gc
gc.collect()

113

In [7]:
# with open(f'experiments/{cfg.exp_name}/df_train.pkl', 'rb') as f:
#     df_train = pickle.load(f)

In [9]:
df_train = df_train.drop(columns=['oper_type + oper_attr', 'oper_type', 'oper_attr'])
cat_cols = df_train.columns[df_train.dtypes == 'object'].tolist()

## Обучение

### Train test split

In [10]:
X_train, X_val, y_train, y_val = train_test_split(df_train.drop(columns=['label']), df_train['label'], test_size=0.2, stratify=df_train['label'])


In [33]:
del df_train, train_test, test
import gc
gc.collect()

191

In [None]:
train_pool = cb.Pool(X_train, y_train,
                cat_features=cat_cols)
val_pool = cb.Pool(X_val, y_val,
                cat_features=cat_cols)

In [None]:
feature_weights = np.ones(X_train.shape[1])
feature_weights[0] = 0.2
feature_weights[-2] = 0.2
feature_weights[-1] = 0.2
feature_weights = dict(zip(X_train.columns.tolist(), feature_weights))

In [1]:
# max_ctr_complexity 

In [11]:
model = CatBoostClassifier(**cfg.cb_params, cat_features=cat_cols.copy())
model.fit(X_train, y_train, verbose=100, eval_set=(X_val, y_val))
# model.fit(train_pool, verbose=100, eval_set=val_pool)
recall_score(y_val, model.predict(X_val), average="macro")

Learning rate set to 0.5
0:	learn: 0.0912935	test: 0.0914970	best: 0.0914970 (0)	total: 4.26s	remaining: 14m 7s
100:	learn: 0.0418833	test: 0.0415847	best: 0.0415847 (100)	total: 5m 58s	remaining: 5m 51s
199:	learn: 0.0411102	test: 0.0408014	best: 0.0408014 (199)	total: 11m 40s	remaining: 0us

bestTest = 0.04080142865
bestIteration = 199



0.8084771291709709

In [5]:
# with open(f'experiments/{cfg.exp_name}/model.cb', 'wb') as f:
#     pickle.dump(model, f)
with open(f'experiments/{cfg.exp_name}/model.cb', 'rb') as f:
    model = pickle.load(f)

In [12]:
model.get_feature_importance(val_pool,
                       prettified=True).head(15)

NameError: ignored

### Batch training

In [11]:
X_train, X_val, y_train, y_val = train_test_split(df_train.drop(columns=['label']), df_train['label'], test_size=0.2, stratify=df_train['label'])

In [12]:
val_pool = cb.Pool(X_val, y_val,
                    cat_features=cat_cols)

In [None]:
clf = CatBoostClassifier(**cfg.cb_params, cat_features=cat_cols.copy())

scores = []
models = []
num_parts = list(range(0, len(X_train)+1, len(X_train)//5))
slices = list(zip(num_parts, num_parts[1:]))
for batch_idx in range(len(slices)):
    start, end = slices[batch_idx][0], slices[batch_idx][1]
    X_batch_train = X_train[start:end]
    y_batch_train = y_train[start:end]

    train_pool = cb.Pool(X_batch_train, y_batch_train,
                    cat_features=cat_cols)

    if batch_idx == 0:
        clf.fit(train_pool, eval_set=val_pool, verbose=100) 
    else:
        clf.fit(train_pool,      
                eval_set=val_pool,
                verbose=100,
                init_model=f'experiments/{cfg.exp_name}/{batch_idx-1}.cb') 
    clf.save_model(f'experiments/{cfg.exp_name}/{batch_idx}.cb') 

    recall = recall_score(y_val, clf.predict(X_val), average="macro")
    print(f'Score on #{batch_idx} fold: ', recall)
    scores.append(recall)

In [15]:
recall_score(y_val, clf.predict(X_val), average="macro")

0.8937988505601142

In [None]:
scores = []
models = []
num_parts = list(range(0, len(X_train)+1, len(X_train)//5))
slices = list(zip(num_parts, num_parts[1:]))
for batch_idx in range(len(slices)):
    start, end = slices[batch_idx][0], slices[batch_idx][1]
    X_batch_train = X_train[start:end]
    y_batch_train = y_train[start:end]

    train_pool = cb.Pool(X_batch_train, y_batch_train,
                        cat_features=cat_cols)
    if batch_idx != 0:
        train_pool.set_baseline(prev_model.predict(prev_pool))
        val_pool.set_baseline([0] * len(X_val))
    model = CatBoostClassifier(**cfg.cb_params, cat_features=cat_cols.copy())


    model.fit(train_pool, verbose=100, eval_set=val_pool)
    recall = recall_score(y_val, model.predict(X_val), average="macro")
    print(f'Score on #{batch_idx} fold: ', recall)
    scores.append(recall)

    models.append(model)
    with open(f'experiments/{cfg.exp_name}/{batch_idx}.cb', 'wb') as f:
        pickle.dump(model, f)

    prev_model = model
    prev_pool = train_pool

scores = np.array(scores)
print(f'Result score: {np.mean(scores)} +- {np.std(scores)}')

In [35]:
meta_pred = meta_model.predict(X_val)
meta_pred.shape

(1200000,)

In [32]:
from catboost import (CatBoostRegressor, Pool, sum_models,)

meta_model = sum_models(models)
recall_score(y_val, meta_model.predict(X_val), average="macro")

ValueError: ignored

## Test

In [10]:
test = convert_types(pd.read_csv('test_dataset_test.csv'))

  exec(code_obj, self.user_global_ns, self.user_ns)


Mem. usage decreased from 915.53 to 553.13 Mb (39.6% reduction)


In [10]:
del df_train
import gc
gc.collect()

195

In [6]:
with open(f'experiments/{cfg.exp_name}/df_test.pkl', 'rb') as f:
    test = pickle.load(f)

In [7]:
test

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,is_oper_14_1001,is_oper_4_3,is_oper_1017_1001,is_oper_1017_-1,is_oper_8_20,is_oper_12_9,is_oper_1017_1000,is_oper_14_18,is_oper_154_-1,is_oper_41_-1
0,7815282,8_13,102976,ММПО,7503.0,0,0.0,0,0,85.0,...,False,False,False,False,False,False,False,False,False,False
1,8443555,8_2,238753,ГОПС,7503.0,0,4.0,1,0,21.0,...,False,False,False,False,False,False,False,False,False,False
2,6352559,1020_-1,618254,ГОПС,7503.0,0,4.0,1,0,388.0,...,False,False,False,False,False,False,False,False,False,False
3,4921420,1016_-1,102968,ТИ,7503.0,0,0.0,0,0,1203.0,...,False,False,False,False,False,False,False,False,False,False
4,1420440,1018_-1,102971,Цех,7506.0,0,0.0,0,0,956.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3999995,9030800,1017_1001,655003,ГОПС,7503.0,0,4.0,1,0,53.0,...,False,False,False,False,False,False,False,False,False,False
3999996,6275048,1023_-1,102976,ММПО,7503.0,0,0.0,0,0,286.0,...,False,False,False,False,False,False,False,False,False,False
3999997,4701757,1022_-1,102969,ТИ,7503.0,0,0.0,0,0,733.0,...,False,False,False,False,False,False,False,False,False,False
3999998,611153,39_-1,104006,Участок,7503.0,0,0.0,0,0,265.0,...,False,False,False,False,False,False,False,False,False,False


In [8]:
test = test.drop(columns=['oper_type + oper_attr', 'oper_type', 'oper_attr'])
cat_cols = test.columns[test.dtypes == 'object'].tolist()

In [9]:
most_freq_oper = df_train.columns[df_train.columns.str.startswith('is_oper')]
len(most_freq_oper)

50

In [11]:
for col in most_freq_oper:
    test[f'{col}'] = np.zeros(len(test))

In [12]:
test['unique_oper']

6000000    [1004_-1, 1019_-1, 8_6, 1023_-1, 1020_-1, 14_1...
6000001                                           [8_2, 2_1]
6000002                         [2_1, 8_2, 1020_-1, 1019_-1]
6000003                              [8_0, 1016_-1, 1022_-1]
6000004    [1020_-1, 1_-1, 1022_-1, 1018_-1, 1041_-1, 104...
                                 ...                        
9999995    [8_2, 1019_-1, 1020_-1, 4_3, 1022_-1, 8_20, 10...
9999996    [1004_-1, 1019_-1, 14_10, 8_0, 8_6, 1022_-1, 1...
9999997                                       [8_0, 1022_-1]
9999998                                              [39_-1]
9999999    [1023_-1, 1014_-1, 8_13, 9_-1, 1019_-1, 1004_-...
Name: unique_oper, Length: 4000000, dtype: object

In [13]:
oper_cols = most_freq_oper

In [15]:
test = test.reset_index().drop(columns=['index'])

In [17]:
unique_oper = test['unique_oper']
test = test.drop(columns=['oper_type + oper_attr', 'oper_type', 'oper_attr', 'unique_oper'])
is_oper_list = []
for idx in tqdm(range(len(test))):
    uniq_opers = unique_oper[idx]
    opers = []
    for col in most_freq_oper:
        if col in uniq_opers:
            opers.append(1)
        else:
            opers.append(0)
    is_oper_list.append(opers)
    # df.loc[idx, oper_cols] = opers
is_oper_list = np.array(is_oper_list)
test[oper_cols] = is_oper_list
del is_oper_list
test[oper_cols]= test[oper_cols].astype(bool)

100%|██████████| 4000000/4000000 [15:13<00:00, 4380.52it/s]


In [40]:
test[oper_cols].describe()

Unnamed: 0,is_oper_1020_-1,is_oper_1022_-1,is_oper_1019_-1,is_oper_8_0,is_oper_1_-1,is_oper_1001_-1,is_oper_1018_-1,is_oper_1004_-1,is_oper_1042_-1,is_oper_1041_-1,...,is_oper_14_1001,is_oper_4_3,is_oper_1017_1001,is_oper_1017_-1,is_oper_8_20,is_oper_12_9,is_oper_1017_1000,is_oper_14_18,is_oper_154_-1,is_oper_41_-1
count,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,...,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0
mean,0.4522565,0.4254695,0.2943975,0.2428685,0.2520733,0.2444898,0.2385645,0.167966,0.224771,0.220909,...,0.00314225,0.00146575,0.001176,0.00348775,0.0011415,0.000684,0.0017555,0.00163425,0.0011535,0.00020675
std,0.4977154,0.4944141,0.4557715,0.4288163,0.4342031,0.4297843,0.426206,0.3738361,0.4174315,0.4148593,...,0.05596764,0.03825705,0.03427269,0.0589541,0.03376681,0.02614445,0.0418619,0.04039282,0.03394363,0.01437732
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
cat_cols = test.columns[test.dtypes == 'object'].tolist()

In [13]:
test_pred = model.predict_proba(cb.Pool(test, cat_features=cat_cols))[:, 1]

In [16]:
(test_pred > 0.003).sum()

772369

In [11]:
test_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [17]:
sub = pd.DataFrame(test['id'].astype('int'))
sub['label'] = (test_pred > 0.003).astype('int')
sub_path = 'submissions'
if not os.path.exists(sub_path):
    os.mkdir(sub_path)
sub.to_csv(f'{sub_path}/0_003.csv', index=False)