## Результаты
| Model                    | P@5    |
|--------------------------|--------|
| Random                   | 0.0093 |
| Catboost                 | 0.1057 |
| Catboost + fe            | 0.1026 |
| Catboost + add_data      | 0.1091 |
| Catboost + add_data + fe | 0.1015 |


In [5]:
import os
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier

In [6]:
def fix_seed(seed_value):
    os.environ['PYTHONHASHSEED']=str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)

seed_value = 7575
fix_seed(seed_value)

In [116]:
# class Pipeline:
#     def __init__(self, train_path, val_path, test_path=None, data_loader=None, data_prep=None, model=None, **kwargs):
#         """
#
#         :param train_path:
#         :param val_path:
#         :param test_path:
#         :param data_loader: implements method load
#         :param data_prep: implements methods fit transform
#         :param model: implements methods fit, predict_proba
#         """
#         if data_loader:
#             self.data_loader = data_loader()
#         else:
#             self.train_df = pd.read_csv(train_path)
#             self.val_df = pd.read_csv(val_path)
#             if test_path:
#                 self.test_df = pd.read_csv(test_path)
#
#         self.data_prep = data_prep()
#         self.model = model(**kwargs)
#
#     def fit(self, **fit_params):
#         train_df


# train_df = pd.read_csv('data/train_catugra.csv', sep=',',  index_col=0)
# val_df = pd.read_csv('data/test_catugra.csv', sep=',', index_col=0)

## Оценка сырых данных

In [16]:
train_df = pd.read_csv('data/train.csv', sep=';')
val_df = pd.read_csv('data/test.csv', sep=';')

In [119]:
# random modeling
N = 5
top5p = int(val_df.shape[0] * 0.05)
random_res = []

label_val_df = val_df[['label']]

for i in range(N):
    preds = label_val_df.label.sample(val_df.shape[0]).reset_index(drop=True)
    label_val_df['pred'] = preds
    random_res.append(label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p)

np.mean(random_res), random_res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


(0.009347996857816182,
 [0.008771929824561403,
  0.00968840010473946,
  0.010081173081958628,
  0.008379156847342237,
  0.009819324430479183])

In [120]:
## Simple modeling

cat_cols = train_df.select_dtypes(include=['object']).columns.values
train_df[cat_cols] = train_df[cat_cols].fillna('NaN')
val_df[cat_cols] = val_df[cat_cols].fillna('NaN')


ctb = CatBoostClassifier(verbose=200, task_type='GPU', random_seed=seed_value)
ctb.fit(train_df.drop('label', axis=1), train_df.label, cat_features=cat_cols)
label_val_df = val_df[['label']]

preds = ctb.predict_proba(val_df.drop('label', axis=1))[:,1]
label_val_df['pred'] = preds

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

Learning rate set to 0.02466
0:	learn: 0.6470023	total: 361ms	remaining: 6m
200:	learn: 0.0918462	total: 29.8s	remaining: 1m 58s
400:	learn: 0.0897373	total: 54.2s	remaining: 1m 20s
600:	learn: 0.0884431	total: 1m 44s	remaining: 1m 9s
800:	learn: 0.0874101	total: 2m 22s	remaining: 35.5s
999:	learn: 0.0865075	total: 3m 3s	remaining: 0us


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.10578685519769573

# Data Preparation

In [36]:
def add_agg_features(df):
    df = df.copy()
    subj_feat = df[['subject_name'] + [f'f{i}' for i in range(1, 31)]].\
        groupby('subject_name').\
        agg({f"f{i}":[min, max, np.mean, np.std, sum, np.median, pd.Series.skew, pd.Series.kurtosis] for i in range(1,31)})

    features_names_subject = np.array([col + '_subject_' + stat for col in [f'f{i}' for i in range(1, 31)]
                                       for stat in ['min', 'max', 'mean', 'std','sum', 'median', 'skew', 'kurtosis']]).flatten().tolist()

    # city_feat = df[['city_name'] + [f'f{i}' for i in range(1, 31)]].groupby('city_name').agg(
    #     {f"f{i}": [min, max, np.mean, np.std, sum, np.median, pd.Series.skew, pd.Series.kurtosis] for i in range(1, 31)})
    #
    # features_city_subject = np.array([col + '_city_' + stat for col in [f'f{i}' for i in range(1, 31)]
    #                                    for stat in ['min', 'max', 'mean', 'std', 'sum', 'median', 'skew','kurtosis']]).flatten().tolist()

    # subjt_feat = df[['subject_type'] + [f'f{i}' for i in range(1, 31)]].groupby('subject_type').agg(
    #     {f"f{i}": [min, max, np.mean, np.std, sum, np.median, pd.Series.skew, pd.Series.kurtosis] for i in range(1, 31)})
    #
    # features_t_subject = np.array([col + '_subjt_' + stat for col in [f'f{i}' for i in range(1, 31)]
    #                                   for stat in ['min', 'max', 'mean', 'std', 'sum', 'median', 'skew','kurtosis']]).flatten().tolist()

    subj_df = pd.DataFrame(subj_feat.values, columns=features_names_subject)
    subj_df['subject_name'] = subj_feat.index

    # city_df = pd.DataFrame(city_feat.values, columns=features_city_subject)
    # city_df['city_name'] = city_feat.index

    # subjt_df = pd.DataFrame(subjt_feat.values, columns=features_t_subject)
    # subjt_df['subject_type'] = subjt_feat.index
    df = df.merge(subj_df, on='subject_name')#.merge(city_df, on='city_name')#.merge(subjt_df, on='subject_type')
    return df

In [38]:
train_df = pd.read_csv('data/train.csv', sep=';')
val_df = pd.read_csv('data/test.csv', sep=';')


cat_cols = train_df.select_dtypes(include=['object']).columns.values
train_df[cat_cols] = train_df[cat_cols].fillna('NaN')
val_df[cat_cols] = val_df[cat_cols].fillna('NaN')

# добавляем фич
train_df = add_agg_features(train_df)
val_df = add_agg_features(val_df)

## Simple modeling
ctb = CatBoostClassifier(verbose=200, task_type='GPU', random_seed=seed_value)
ctb.fit(train_df.drop('label', axis=1), train_df.label, cat_features=cat_cols)
label_val_df = val_df[['label']]

preds = ctb.predict_proba(val_df.drop('label', axis=1))[:,1]
label_val_df['pred'] = preds

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

Learning rate set to 0.02466
0:	learn: 0.6462937	total: 257ms	remaining: 4m 16s
200:	learn: 0.0916607	total: 11.6s	remaining: 46s
400:	learn: 0.0892890	total: 24.4s	remaining: 36.4s
600:	learn: 0.0878918	total: 40.7s	remaining: 27s
800:	learn: 0.0867803	total: 1m 3s	remaining: 15.8s
999:	learn: 0.0858101	total: 1m 28s	remaining: 0us


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.1050013092432574

In [10]:
# пробуем с данными от кати

train_df = pd.read_csv('data/train_catugra.csv', sep=',',  index_col=0)
val_df = pd.read_csv('data/test_catugra.csv', sep=',', index_col=0)

cat_cols = train_df.select_dtypes(include=['object']).columns.values
train_df[cat_cols] = train_df[cat_cols].fillna('NaN')
val_df[cat_cols] = val_df[cat_cols].fillna('NaN')


ctb = CatBoostClassifier(verbose=200, task_type='GPU', random_seed=seed_value)
ctb.fit(train_df.drop('label', axis=1), train_df.label, cat_features=cat_cols)
label_val_df = val_df[['label']]

preds = ctb.predict_proba(val_df.drop('label', axis=1))[:,1]
label_val_df['pred'] = preds

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

Learning rate set to 0.02466
0:	learn: 0.6459351	total: 228ms	remaining: 3m 47s
200:	learn: 0.0917587	total: 18.2s	remaining: 1m 12s
400:	learn: 0.0895619	total: 42.9s	remaining: 1m 4s
600:	learn: 0.0882322	total: 1m 10s	remaining: 47s
800:	learn: 0.0871200	total: 1m 32s	remaining: 22.9s
999:	learn: 0.0861629	total: 2m 13s	remaining: 0us


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.10919088766692851

In [11]:
train_df = add_agg_features(train_df)
val_df = add_agg_features(val_df)

ctb = CatBoostClassifier(verbose=200, task_type='GPU', random_seed=seed_value)
ctb.fit(train_df.drop('label', axis=1), train_df.label, cat_features=cat_cols)
label_val_df = val_df[['label']]

preds = ctb.predict_proba(val_df.drop('label', axis=1))[:,1]
label_val_df['pred'] = preds

top5p = int(label_val_df.shape[0] * 0.05)
res_response = label_val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

Learning rate set to 0.02466
0:	learn: 0.6480979	total: 1.65s	remaining: 27m 30s
200:	learn: 0.0916542	total: 39.2s	remaining: 2m 35s
400:	learn: 0.0894033	total: 1m 22s	remaining: 2m 3s
600:	learn: 0.0880655	total: 1m 38s	remaining: 1m 5s
800:	learn: 0.0870913	total: 1m 59s	remaining: 29.7s
999:	learn: 0.0861618	total: 2m 43s	remaining: 0us


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_val_df['pred'] = preds


0.10159727677402461