In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import StratifiedKFold, KFold

from catboost import CatBoostClassifier, Pool

import matplotlib.pyplot as plt
import gc
from pathlib import Path
from tqdm.notebook import tqdm

import seaborn as sns
sns.set_theme()

In [2]:
# Directory holding the input parquet files: the current working directory.
data_path = Path()

In [4]:
# Categorical column names (a later cell reassigns this name without "start_cluster").
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

test = pd.read_parquet(data_path / "test_data.pqt", engine="pyarrow")

# ids that have a "month_4" row go to test_ful, every other id goes to test_cut.
full_users = set(test.loc[test["date"] == "month_4", "id"])
is_full_user = test["id"].isin(full_users)
test_ful = test[is_full_user]
test_cut = test[~is_full_user]

train = pd.read_parquet(data_path / "train_data.pqt", engine="pyarrow")

In [5]:
class StartClusterFeaturesForLast(BaseEstimator, TransformerMixin):
    """Lagged ``start_cluster`` features for ids that have at least two rows.

    For every ``id`` the first and second ``start_cluster`` values (in the row
    order of the input frame) are returned as ``prev2_start_cluster`` and
    ``prev_start_cluster``.
    """

    def __init__(self):
        pass

    def fit(self, train, y=None):
        """Do nothing and return ``self``; the transformer is stateless.

        ``y`` is accepted and ignored so that ``fit`` can also be called with a
        target, as scikit-learn does for supervised pipelines.
        """
        return self

    def transform(self, df):
        """Build one row of lagged cluster features per ``id``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the columns ``id`` and ``start_cluster``.
            NOTE(review): rows of each id are assumed to already be in
            chronological order -- nothing here sorts them.

        Returns
        -------
        pandas.DataFrame
            Columns ``prev2_start_cluster`` and ``prev_start_cluster``, default
            integer index, one row per id in the order produced by
            ``groupby("id")`` (sorted group keys). The ``id`` itself is not
            included.

        Raises
        ------
        IndexError
            If some id has fewer than two rows.
        """
        clusters = []

        # Only start_cluster is read, so group that single column instead of
        # materialising a full sub-frame for every id.
        for _, history in df.groupby("id")["start_cluster"]:
            cc = history.tolist()
            clusters.append([cc[0], cc[1]])

        return pd.DataFrame(clusters, columns=["prev2_start_cluster", "prev_start_cluster"])

# NOTE: the "С" in the class name is a Cyrillic letter; the name is kept as is
# because other cells reference it.
class StartClusterFeaturesForLastСut(BaseEstimator, TransformerMixin):
    """Single lagged ``start_cluster`` feature (``prev_start_cluster``) per id.

    The value is taken from the row right before the last row of each id, so
    histories of different length all yield the same lag. The previous
    implementation always took the first row, which for a three-row history is
    two rows before the last one, but for a two-row history only one.
    """

    def __init__(self):
        pass

    def fit(self, train, y=None):
        """Do nothing and return ``self``; the transformer is stateless.

        ``y`` is accepted and ignored so that ``fit`` can also be called with a
        target, as scikit-learn does for supervised pipelines.
        """
        return self

    def transform(self, df):
        """Build one ``prev_start_cluster`` value per ``id``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the columns ``id`` and ``start_cluster``.
            NOTE(review): rows of each id are assumed to already be in
            chronological order -- nothing here sorts them.

        Returns
        -------
        pandas.DataFrame
            Single column ``prev_start_cluster``, default integer index, one row
            per id in the order produced by ``groupby("id")`` (sorted group
            keys). The ``id`` itself is not included.
        """
        clusters = []

        # Only start_cluster is read, so group that single column instead of
        # materialising a full sub-frame for every id.
        for _, history in df.groupby("id")["start_cluster"]:
            cc = history.tolist()
            # Second-to-last row; an id with a single row has no earlier row,
            # so its only value is used (same result as before for that case).
            clusters.append([cc[-2] if len(cc) > 1 else cc[0]])

        return pd.DataFrame(clusters, columns=["prev_start_cluster"])


# Lagged-cluster features for train and for both test subsets.
# The transformers are stateless, so each call uses a fresh instance.
cluster_feat_ful, cluster_feat_test_ful = (
    StartClusterFeaturesForLast().transform(frame) for frame in (train, test_ful)
)
cluster_feat_cut, cluster_feat_test_cut = (
    StartClusterFeaturesForLastСut().transform(frame) for frame in (train, test_cut)
)

In [6]:
# start_cluster values (of the target month) kept for the "cut" training set.
CUT_START_CLUSTERS = ['{}', '{α}', '{other}', '{α, η}', '{α, γ}', '{α, λ}', '{α, ψ}']


def _attach_cluster_features(last_month_rows, cluster_features):
    """Concatenate target-month rows with lagged-cluster features column-wise.

    The feature frames carry no ``id`` column: they hold one row per id in
    sorted-id order (that is how ``groupby("id")`` emits groups). The join is
    therefore purely positional, and the two checks below fail loudly in the
    situations where it would otherwise silently pair rows of different ids.
    """
    last_month_rows = last_month_rows.copy().reset_index(drop=True)
    assert len(last_month_rows) == len(cluster_features), (
        "row count mismatch between target-month rows and cluster features"
    )
    assert last_month_rows["id"].is_monotonic_increasing, (
        "target-month rows must be ordered by id to line up with cluster features"
    )
    return pd.concat([last_month_rows, cluster_features], axis=1)


train_month_3 = train[train.date == "month_3"]
train_ful = _attach_cluster_features(train_month_3, cluster_feat_ful)
train_cut = _attach_cluster_features(train_month_3, cluster_feat_cut)

test_ful = _attach_cluster_features(test_ful[test_ful.date == "month_6"], cluster_feat_test_ful)
test_cut = _attach_cluster_features(test_cut[test_cut.date == "month_6"], cluster_feat_test_cut)


y_train_ful = train_ful["end_cluster"].copy()

# Filter the "cut" training rows once and derive the target from the filtered
# frame, so features and target share the same 0..n-1 index.
train_cut = train_cut[train_cut["start_cluster"].isin(CUT_START_CLUSTERS)].reset_index(drop=True)
y_train_cut = train_cut["end_cluster"].copy()

In [7]:
# The target month's own start_cluster is not used as a model feature.
train_ful = train_ful.drop(columns=["start_cluster"])
train_cut = train_cut.drop(columns=["start_cluster"])
test_ful = test_ful.drop(columns=["start_cluster"])
test_cut = test_cut.drop(columns=["start_cluster"])

In [8]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment",
    "index_city_code", "ogrn_month", "ogrn_year",
]

def catboost_prep(X, value=-100000):
    """
    Prepare a feature frame for CatBoost without touching the caller's frame.

    1) category type -> object type
    2) missing values are replaced by the out-of-bound sentinel ``value``
    3) the columns "id", "date", "end_cluster", "index" are removed
    4) the categorical feature list is built from the names in ``cat_cols``
       that are present in the result plus every remaining object column

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame; it is copied, not modified.
    value : scalar, default -100000
        Replacement for missing values.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The prepared frame and its categorical column names, in column order.
    """
    # Work on a copy: the dtype conversion below must not leak into the input.
    X = X.copy()
    for col, t in zip(X.dtypes.index, X.dtypes):
        if t == "category":
            X[col] = X[col].astype("object")
    X = X.fillna(value)
    X = X.replace([None], [value])

    service_columns = {"id", "date", "end_cluster", "index"}
    features = [f for f in X.columns.values if f not in service_columns]
    X = X[features]

    # Keep only names that exist in X (a missing name would be rejected when
    # fitting) and follow the column order so the result is deterministic.
    declared = set(cat_cols)
    cat_features = [
        col for col in X.columns
        if col in declared or X[col].dtype == "object"
    ]
    return X, cat_features

# Model-ready frames and categorical column lists for every train/test subset;
# the missing-value sentinel is spelled out (it equals the function default).
X_train_ful, cat_cols_train_ful = catboost_prep(train_ful, value=-100000)
X_train_cut, cat_cols_train_cut = catboost_prep(train_cut, value=-100000)
X_test_ful, cat_cols_test_ful = catboost_prep(test_ful, value=-100000)
X_test_cut, cat_cols_test_cut = catboost_prep(test_cut, value=-100000)

In [9]:
# Full-history models: stratified 5-fold CV repeated for several shuffle seeds;
# one CatBoost model is kept per fold.
KFold_random_state = [42, 228, 777] #, 2024, 911, 69]
fitted_models_ful = []

for ind_k, random_state in tqdm(enumerate(KFold_random_state), total=len(KFold_random_state)):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    for idx_train, idx_valid in tqdm(cv.split(X_train_ful, y_train_ful), total=cv.n_splits):
        # cv.split yields positional indices, so X and y are both sliced with
        # .iloc; a label lookup on y is only right while its index is 0..n-1.
        X_t, y_t = X_train_ful.iloc[idx_train], y_train_ful.iloc[idx_train]
        X_v, y_v = X_train_ful.iloc[idx_valid], y_train_ful.iloc[idx_valid]

        model = CatBoostClassifier(
            # iterations=10000,
            task_type="GPU",
            # patience grows slightly with every repeat
            early_stopping_rounds=100 + ind_k*5
            # early_stopping_rounds=250 + ind_k*10  # for 10000 iterations
            )
        model.fit(X_t, y_t,
                  eval_set=(X_v, y_v),
                  use_best_model=True,
                  cat_features=cat_cols_train_ful,
                  logging_level='Silent')

        fitted_models_ful.append(model)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
# Short-history ("cut") models: stratified 5-fold CV repeated for several
# shuffle seeds; one CatBoost model is kept per fold.
KFold_random_state = [42, 228, 777] #, 2024, 911, 69]
fitted_models_cut = []

for ind_k, random_state in tqdm(enumerate(KFold_random_state), total=len(KFold_random_state)):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    for idx_train, idx_valid in tqdm(cv.split(X_train_cut, y_train_cut), total=cv.n_splits):
        # cv.split yields positional indices, so X and y are both sliced with
        # .iloc; a label lookup on y is only right while its index is 0..n-1.
        X_t, y_t = X_train_cut.iloc[idx_train], y_train_cut.iloc[idx_train]
        X_v, y_v = X_train_cut.iloc[idx_valid], y_train_cut.iloc[idx_valid]

        model = CatBoostClassifier(
            # iterations=10000,
            task_type="GPU",
            # patience grows slightly with every repeat
            early_stopping_rounds=100 + ind_k*5
            # early_stopping_rounds=250 + ind_k*10  # for 10000 iterations
            )
        model.fit(X_t, y_t,
                  eval_set=(X_v, y_v),
                  use_best_model=True,
                  cat_features=cat_cols_train_cut,
                  logging_level='Silent')

        fitted_models_cut.append(model)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]



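Ниже (в закомментированных ячейках) — чуть более простой вариант: одна 5-фолдовая кросс-валидация без повторов с разными random_state. По качеству он лишь немного уступает основному подходу.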

In [9]:
# #ful train
# from sklearn.model_selection import StratifiedKFold

# n_classes = len(np.unique(y_train_ful))
# meta_train_ful = np.zeros((X_train_ful.shape[0], n_classes))

# cv = StratifiedKFold(n_splits=5, shuffle=True)
# fitted_models_ful = []

# for idx_train, idx_valid in tqdm(cv.split(X_train_ful, y_train_ful), total=cv.n_splits):
#     X_t, y_t = X_train_ful.iloc[idx_train], y_train_ful[idx_train]
#     X_v, y_v = X_train_ful.iloc[idx_valid], y_train_ful[idx_valid]

#     model = CatBoostClassifier(
# #         iterations=10000,
#         task_type="GPU",
#         )
#     model.fit(X_t, y_t,
#               eval_set=(X_v, y_v),
#               use_best_model=True,
#               cat_features=cat_cols_train_ful,
#               logging_level='Silent')
    
#     fitted_models_ful.append(model)
    
#     meta_train_ful[idx_valid, :] = model.predict_proba(X_v)


# classes = fitted_models_ful[0].classes_
# meta_train_ful = pd.concat([pd.DataFrame(meta_train_ful, columns=classes), train_ful["id"]], axis=1)
# meta_train_ful.to_parquet("/kaggle/working/meta_train.pqt", index=False)

  0%|          | 0/5 [00:00<?, ?it/s]

In [33]:
# #cut train
# from sklearn.model_selection import StratifiedKFold

# cv = StratifiedKFold(n_splits=5, shuffle=True)
# fitted_models_cut = []

# for idx_train, idx_valid in tqdm(cv.split(X_train_cut, y_train_cut), total=cv.n_splits):
#     X_t, y_t = X_train_cut.iloc[idx_train], y_train_cut[idx_train]
#     X_v, y_v = X_train_cut.iloc[idx_valid], y_train_cut[idx_valid]

#     model = CatBoostClassifier(
# #         iterations=10000,
#         task_type="GPU",
#         )
#     model.fit(X_t, y_t,
#               eval_set=(X_v, y_v),
#               use_best_model=True,
#               cat_features=cat_cols_train_cut,
#               logging_level='Silent')

#     fitted_models_cut.append(model)

  0%|          | 0/5 [00:00<?, ?it/s]



In [11]:
# Class labels of the first full-history model; used below as the column labels
# of the predicted-probability frames.
classes = fitted_models_ful[0].classes_

In [12]:
# Average the class probabilities of all full-history fold models.
# Each model's output is labelled with that model's own classes_ and then put
# into the reference `classes` order (absent classes get probability 0), so the
# arrays being averaged always have identical columns.
models_pred_proba = []
for clf in tqdm(fitted_models_ful, total=len(fitted_models_ful)):
    pred = pd.DataFrame(clf.predict_proba(X_test_ful), columns=clf.classes_)
    models_pred_proba.append(pred.reindex(columns=classes, fill_value=0.0).to_numpy())

test_pred_proba_ful = np.mean(models_pred_proba, axis=0)

  0%|          | 0/5 [00:00<?, ?it/s]

In [13]:
# Average the class probabilities of all "cut" fold models.
# These models are fitted on a filtered subset of the training rows, so their
# classes_ may be a subset of the reference `classes`. Each output is labelled
# with the model's own classes_ and put into the `classes` order (absent
# classes get probability 0) before averaging.
models_pred_proba = []
for clf in tqdm(fitted_models_cut, total=len(fitted_models_cut)):
    pred = pd.DataFrame(clf.predict_proba(X_test_cut), columns=clf.classes_)
    models_pred_proba.append(pred.reindex(columns=classes, fill_value=0.0).to_numpy())

test_pred_proba_cut = np.mean(models_pred_proba, axis=0)

  0%|          | 0/5 [00:00<?, ?it/s]

In [14]:
# Probability columns (one per class) next to the matching test ids, for each
# subset separately, then both subsets stacked into one frame.
proba_frame_ful = pd.DataFrame(test_pred_proba_ful, columns=classes)
proba_frame_cut = pd.DataFrame(test_pred_proba_cut, columns=classes)

test_pred_proba_df_ful = pd.concat([proba_frame_ful, test_ful["id"]], axis=1)
test_pred_proba_df_cut = pd.concat([proba_frame_cut, test_cut["id"]], axis=1)

test_pred_proba_df = pd.concat([test_pred_proba_df_ful, test_pred_proba_df_cut])

In [15]:
# Move "id" to the first column, order rows by id and write the submission file.
subm = (
    test_pred_proba_df
    .set_index("id")
    .reset_index()
    .sort_values("id")
    .reset_index(drop=True)
)
subm.to_csv("/kaggle/working/submission.csv", index=False)