In [1]:
import os
import sys
import warnings

sys.path.append("..")

from src.pickle_manager import open_pickle
from src.regression_models import metrics_print
from src.classification_models import classification_metrics_print

warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Load the preprocessed train/test split through the project's pickle helper.
dataset_dir = os.path.join("..", "data", "pkls")
data = open_pickle(dataset_dir, "processed_dataset.pkl")
X_train, X_test, y_train, y_test = (
    data[split_name] for split_name in ("X_train", "X_test", "y_train", "y_test")
)

# Store the target next to the features so it can be grouped on later.
# NOTE(review): this writes into the very objects held by `data` (no copy is made).
for frame, target in ((X_train, y_train), (X_test, y_test)):
    frame["lifetime"] = target

In [3]:
# Collapse one-hot encoded columns back into a single categorical feature.
def reverse_one_hot(df, prefix):
    """Decode a one-hot encoded feature into one column of category names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding indicator columns named ``"<prefix>_<category>"``.
    prefix : str
        Name of the original (pre-encoding) feature.

    Returns
    -------
    pandas.Series
        For every row, the ``<category>`` part of the indicator column with
        the largest value (the first such column on ties), indexed like ``df``.

    Raises
    ------
    ValueError
        If ``df`` has no column starting with ``"<prefix>_"``.
    """
    token = prefix + "_"
    # Match only columns that *start* with "<prefix>_". A substring match would
    # also pick up the decoded "<prefix>" column itself (breaking a re-run of
    # the cell) and any unrelated column that merely contains the prefix.
    indicator_cols = [
        col for col in df.columns if isinstance(col, str) and col.startswith(token)
    ]
    if not indicator_cols:
        raise ValueError(f"No one-hot columns found for prefix {prefix!r}")

    winning_cols = df[indicator_cols].idxmax(axis=1)
    # Slice the prefix off instead of str.replace, which would also remove any
    # later occurrence of "<prefix>_" inside the category name.
    return winning_cols.map(lambda col: col[len(token):])


# Rebuild the two categorical features from their one-hot columns,
# for the training frame first and then the test frame.
for prefix in ("Основной вид деятельности", "Регион"):
    for frame in (X_train, X_test):
        frame[prefix] = reverse_one_hot(frame, prefix)

## Регрессия

In [4]:
# Per-group lifetime statistics (min / mean / max) computed on the training set,
# grouped by 'Тип субъекта', 'Основной вид деятельности' and 'Регион'.
group_keys = ["Тип субъекта", "Основной вид деятельности", "Регион"]
lifetime_by_group = X_train.groupby(group_keys)["lifetime"]
pred_df = lifetime_by_group.agg(["min", "mean", "max"])

# Overall training mean of the target.
constant_value = X_train["lifetime"].mean()


def get_prediction(x, pred_df, constant_value):
    """Return the baseline regression prediction for one row.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas.Series)
        Must provide 'Тип субъекта', 'Основной вид деятельности' and 'Регион'.
    pred_df : pandas.DataFrame
        Group statistics indexed by those three keys, with a "mean" column.
    constant_value
        Value returned when the row's group is absent from ``pred_df``.

    Returns
    -------
    The "mean" entry of the row's group, or ``constant_value`` for an unseen group.

    Raises
    ------
    KeyError
        If the row itself lacks one of the three key columns.
    """
    # Read the key outside the try block: a missing feature column is a setup
    # error and must surface, not be mistaken for an unseen group.
    group_key = (x["Тип субъекта"], x["Основной вид деятельности"], x["Регион"])
    try:
        return pred_df.loc[group_key, "mean"]
    except KeyError:
        # Group combination not present in the statistics table.
        return constant_value


# Compute the row-wise baseline predictions for both splits.
y_pred_train, y_pred_test = (
    split.apply(get_prediction, axis=1, args=(pred_df, constant_value))
    for split in (X_train, X_test)
)

In [5]:
# Collect the regression metrics of the baseline under the "baseline" key.
# Judging by the cell output, the helper prints r2 / MAE / RMSE for both
# splits and returns the filled dictionary, which the cell then displays.
metrics_dict = {}
y_true_train, y_true_test = X_train["lifetime"], X_test["lifetime"]
metrics_print(
    y_true_train, y_pred_train, y_true_test, y_pred_test, metrics_dict, "baseline"
)

Train 
r2 score: 0.13701415847510268
MAE: 41.458226551106925
RMSE: 52.977589657456726
Test 
r2 score: 0.13206008322036067
MAE: 41.64982747703339
RMSE: 53.19466428984099


{'baseline': {'train': {'r2': 0.13701415847510268,
   'mae': 41.458226551106925,
   'rmse': 52.977589657456726},
  'test': {'r2': 0.13206008322036067,
   'mae': 41.64982747703339,
   'rmse': 53.19466428984099}}}

## Классификация

In [6]:
def classify_lifetime(lifetime, upper_bounds=(12, 24, 48, 120)):
    """Map a numeric lifetime to an ordinal class label.

    With the default bounds the classes are::

        0: lifetime <= 12
        1: 12 < lifetime <= 24
        2: 24 < lifetime <= 48
        3: 48 < lifetime <= 120
        4: lifetime > 120

    Parameters
    ----------
    lifetime : number
        Value to classify.
    upper_bounds : sequence of numbers, optional
        Inclusive upper edges of every class except the last, in ascending
        order. Defaults to the thresholds listed above.

    Returns
    -------
    int or None
        Index of the first bound that ``lifetime`` does not exceed, or
        ``len(upper_bounds)`` when it exceeds all of them. ``None`` for NaN.
    """
    # NaN fails every comparison; return None explicitly rather than letting
    # it fall through to the last class.
    if lifetime != lifetime:
        return None
    for label, upper in enumerate(upper_bounds):
        if lifetime <= upper:
            return label
    return len(upper_bounds)

In [7]:
# Replace the numeric target with its class label in both splits.
# NOTE: the "lifetime" column is overwritten, so this cell is not idempotent -
# running it a second time would classify the labels themselves.
for split in (X_train, X_test):
    split["lifetime"] = split["lifetime"].apply(classify_lifetime)

In [8]:
# Most frequent lifetime class per group in the training set, grouped by
# 'Тип субъекта', 'Основной вид деятельности' and 'Регион'.
group_keys = ["Тип субъекта", "Основной вид деятельности", "Регион"]
class_by_group = X_train.groupby(group_keys)["lifetime"]
# mode() may return several values on ties; the first one is kept.
mode_df = class_by_group.agg(lambda labels: labels.mode()[0])

# Most frequent class over the whole training set.
default_value = X_train.lifetime.mode()[0]


def get_prediction_classification(x):
    """Return the baseline class prediction for one row.

    Uses the notebook-level ``mode_df`` (per-group class, indexed by
    'Тип субъекта', 'Основной вид деятельности', 'Регион') and
    ``default_value``.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas.Series)
        Must provide the three key columns.

    Returns
    -------
    The ``mode_df`` entry of the row's group, or ``default_value`` when the
    group is not present in ``mode_df``.

    Raises
    ------
    KeyError
        If the row itself lacks one of the three key columns.
    """
    # Read the key outside the try block: a missing feature column is a setup
    # error and must surface, not be mistaken for an unseen group.
    group_key = (x["Тип субъекта"], x["Основной вид деятельности"], x["Регион"])
    try:
        return mode_df.loc[group_key]
    except KeyError:
        # Group combination not present in the lookup table.
        return default_value


# Compute the row-wise baseline class predictions for both splits.
y_pred_train, y_pred_test = (
    split.apply(get_prediction_classification, axis=1) for split in (X_train, X_test)
)

In [9]:
# Collect the classification metrics of the baseline under the "baseline" key.
# Judging by the cell output, the helper prints accuracy / precision / recall /
# F1 for both splits and returns the filled dictionary, which the cell displays.
metrics_dict_classification = {}
y_true_train, y_true_test = X_train["lifetime"], X_test["lifetime"]
classification_metrics_print(
    y_true_train,
    y_pred_train,
    y_true_test,
    y_pred_test,
    metrics_dict_classification,
    "baseline",
)

Train 
Accuracy: 0.3232137630455634
Precision: 0.3235301504942581
Recall: 0.3232137630455634
F1 Score: 0.31537628496520337
Test 
Accuracy: 0.31690989938657343
Precision: 0.316719879017582
Recall: 0.31690989938657343
F1 Score: 0.3090081752079128


{'baseline': {'train': {'accuracy': 0.3232137630455634,
   'precision': 0.3235301504942581,
   'recall': 0.3232137630455634,
   'f1': 0.31537628496520337},
  'test': {'accuracy': 0.31690989938657343,
   'precision': 0.316719879017582,
   'recall': 0.31690989938657343,
   'f1': 0.3090081752079128}}}