In [1]:
import os
import sys
import warnings

# Add the parent directory to the import path so the `src` package imported
# below can be found. The path is relative, so this assumes the kernel's
# working directory is the notebook's own folder. Must run before the imports.
sys.path.append("..")

from src.pickle_manager import open_pickle
from src.regression_models import metrics_print
from src.classification_models import classification_metrics_print

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Load the pre-split dataset; the first four positions of the loaded object
# are unpacked as X_train, X_test, y_train, y_test.
data = open_pickle(
    os.path.join("..", "data", "pkls"),
    "processed_dataset_regression_first_year.pkl",
)
X_train, X_test, y_train, y_test = (data[position] for position in range(4))

# Attach the target to each feature frame positionally (`.values` ignores the
# index), so the frames can be grouped on features while aggregating the target.
X_train["lifetime"] = y_train.values
X_test["lifetime"] = y_test.values

In [3]:
# Collapse one-hot encoded features back into a single categorical column
def reverse_one_hot(df, prefix):
    """Recover the category label encoded by a family of one-hot columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding indicator columns named ``"<prefix>_<category>"``.
    prefix : str
        Name of the original categorical feature.

    Returns
    -------
    pandas.Series
        For every row, the ``<category>`` part of the indicator column that
        holds the row-wise maximum.

    Raises
    ------
    ValueError
        If ``df`` has no column whose name starts with ``"<prefix>_"``.

    Notes
    -----
    Only columns that *start with* ``"<prefix>_"`` are considered. A column
    named exactly ``prefix`` (such as the one this function's result is usually
    stored in) is therefore ignored, which keeps the call safe to repeat.

    ``idxmax`` resolves ties to the first column, so a row whose indicators are
    all zero is labelled with the first category.
    NOTE(review): if the encoding dropped a reference category, such rows are
    mislabelled -- confirm against the preprocessing step.
    """
    marker = prefix + "_"
    indicator_columns = [
        column
        for column in df.columns
        if isinstance(column, str) and column.startswith(marker)
    ]
    if not indicator_columns:
        raise ValueError(f"no one-hot columns found for prefix {prefix!r}")

    strongest_column = df[indicator_columns].idxmax(axis=1)
    # Slice off the leading marker only; a category that itself contains
    # "<prefix>_" must keep that text.
    return strongest_column.map(lambda name: name[len(marker):])


# Rebuild one categorical column per one-hot family on both splits.
for prefix in ("Основной вид деятельности", "Регион"):
    X_train[prefix], X_test[prefix] = (
        reverse_one_hot(X_train, prefix),
        reverse_one_hot(X_test, prefix),
    )

## Регрессия

In [5]:
# Per-group lifetime statistics on the training split, grouped by
# 'Тип субъекта', 'Основной вид деятельности', 'Регион'
pred_df = (
    X_train.groupby(["Тип субъекта", "Основной вид деятельности", "Регион"])["lifetime"]
    .agg(["min", "mean", "max"])
)
# Overall training mean of the target, passed to get_prediction as the fallback.
constant_value = X_train["lifetime"].mean()


def get_prediction(x, pred_df, constant_value):
    """Return the group-mean prediction for one row.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row providing the keys 'Тип субъекта', 'Основной вид деятельности'
        and 'Регион'.
    pred_df : pandas.DataFrame
        Table indexed by those three keys, with a ``"mean"`` column.
    constant_value
        Value returned when the lookup raises ``KeyError``.

    Returns
    -------
    The ``"mean"`` entry of the row's group, or ``constant_value`` on
    ``KeyError``.
    """
    group_columns = ("Тип субъекта", "Основной вид деятельности", "Регион")
    try:
        # Building the key stays inside the try block, so a KeyError from the
        # row itself also falls back to the constant.
        group_key = tuple(x[column] for column in group_columns)
        return pred_df.loc[group_key, "mean"]
    except KeyError:
        return constant_value


# Compute the baseline predictions row by row for both splits
y_pred_train = X_train.apply(get_prediction, axis=1, args=(pred_df, constant_value))
y_pred_test = X_test.apply(get_prediction, axis=1, args=(pred_df, constant_value))

In [6]:
# Evaluate the group-mean baseline on both splits; "baseline" is the label
# under which this model appears in the displayed metrics.
metrics_dict = {}
metrics_print(
    X_train["lifetime"], y_pred_train, X_test["lifetime"], y_pred_test, metrics_dict, "baseline"
)

Train 
r2 score: 0.13698933606905472
MAE: 41.47056368836765
RMSE: 52.981976655735615
Test 
r2 score: 0.13208774454647154
MAE: 41.62097158490592
RMSE: 53.1813228721454


{'baseline': {'train': {'r2': 0.13698933606905472,
   'mae': 41.47056368836765,
   'rmse': 52.981976655735615},
  'test': {'r2': 0.13208774454647154,
   'mae': 41.62097158490592,
   'rmse': 53.1813228721454}}}

## Классификация

In [7]:
def classify_lifetime(lifetime, short_max=24, medium_max=120):
    """Bucket a lifetime value into one of three ordinal classes.

    Parameters
    ----------
    lifetime : number
        Value to classify (unit not shown here; presumably months -- TODO confirm).
    short_max : number, default 24
        Inclusive upper bound of class 0.
    medium_max : number, default 120
        Inclusive upper bound of class 1.

    Returns
    -------
    int or None
        ``0`` when ``lifetime <= short_max``,
        ``1`` when ``short_max < lifetime <= medium_max``,
        ``2`` when ``lifetime > medium_max``,
        ``None`` when no comparison holds (e.g. ``lifetime`` is NaN).

    Raises
    ------
    ValueError
        If ``short_max`` is greater than ``medium_max``.
    """
    if short_max > medium_max:
        raise ValueError("short_max must not exceed medium_max")

    if lifetime <= short_max:
        return 0
    # Reaching this point already implies lifetime > short_max (or NaN).
    if lifetime <= medium_max:
        return 1
    if lifetime > medium_max:
        return 2
    # NaN fails every comparison above; make the missing class explicit.
    return None

In [8]:
# Replace the numeric target with its class label on both splits.
# NOTE: this overwrites "lifetime" in place, so the cell is not safe to run
# twice without reloading the data.
X_train["lifetime"], X_test["lifetime"] = (
    X_train["lifetime"].apply(classify_lifetime),
    X_test["lifetime"].apply(classify_lifetime),
)

In [9]:
# Группировка на основе 'Тип субъекта', 'Основной вид деятельности', 'Регион'
mode_df = X_train.groupby(["Тип субъекта", "Основной вид деятельности", "Регион"])[
    "lifetime"
].agg(lambda x: x.mode()[0])
default_value = X_train["lifetime"].mode()[0]


def get_prediction_classification(x):
    """Return the group-mode class for one row.

    Looks the row's ('Тип субъекта', 'Основной вид деятельности', 'Регион')
    key up in the notebook-level ``mode_df`` and returns the stored value.
    Returns the notebook-level ``default_value`` when the lookup raises
    ``KeyError``.
    """
    try:
        group_key = (x["Тип субъекта"], x["Основной вид деятельности"], x["Регион"])
        predicted_class = mode_df.loc[group_key]
    except KeyError:
        predicted_class = default_value
    return predicted_class


# Получим предсказание
y_pred_train = X_train.apply(get_prediction_classification, axis=1)
y_pred_test = X_test.apply(get_prediction_classification, axis=1)

In [10]:
# Evaluate the group-mode baseline on both splits; "baseline" is the label
# under which this model appears in the displayed metrics.
metrics_dict_classification = {}
classification_metrics_print(
    X_train["lifetime"],
    y_pred_train,
    X_test["lifetime"],
    y_pred_test,
    metrics_dict_classification,
    "baseline",
)

Train 
Accuracy: 0.5444169054527875
Precision: 0.5459237018083843
Recall: 0.5444169054527875
F1 Score: 0.5029948398551273
Test 
Accuracy: 0.5411402438696711
Precision: 0.5369373194007406
Recall: 0.5411402438696711
F1 Score: 0.49928618965770927


{'baseline': {'train': {'accuracy': 0.5444169054527875,
   'precision': 0.5459237018083843,
   'recall': 0.5444169054527875,
   'f1': 0.5029948398551273},
  'test': {'accuracy': 0.5411402438696711,
   'precision': 0.5369373194007406,
   'recall': 0.5411402438696711,
   'f1': 0.49928618965770927}}}