In [1]:
!pip install catboost
!pip install xgboost
!pip install dash

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
from typing import Any, Dict, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import (
    mean_absolute_percentage_error, 
    mean_squared_log_error, 
    roc_auc_score
)


def calc_all_metrics(data: Any) -> Dict[str, float]:
    def is_credit_issued(x: Any):
        ratio = x['__price_predict'] / x['__price_doc']
        if x['__priority'] <= 0:
            value = 0.0
        elif 0.9 < ratio < 1.0:
            value = x['__price_predict']
        elif 1.0 <= ratio < 1.1:
            value = x['__price_doc']
        else:
            value = 0.0

        return value

    def calc_profit(x: pd.DataFrame) -> np.array:
        if x['is_credit'] == 0.0:
            return 0.0
        if x['__churn'] == 1:
            return -x['debt'] * 2.0
        if x['debt'] < 5:
            return x['debt'] * 0.3
        if x['debt'] < 9:
            return x['debt'] * 0.4
        if x['debt'] >= 9:
            return x['debt'] * 0.5

    max_account = 25e3
    
    s = (
        data[['__priority', '__churn', '__churn_prob', '__price_doc', '__price_predict']]
        .sort_values('__priority', ascending=False)
        .copy(True)
    )

    s['debt'] = s.apply(is_credit_issued, axis=1)
    s['debt_cum'] = s['debt'].cumsum()
    s['is_credit'] = 0
    s.loc[(s['debt'] > 0) & (s['debt_cum'] <= max_account), 'is_credit'] = 1
    s['profit'] = s.apply(calc_profit, axis=1)

    total_profit = round(s['profit'].sum(), 2)
    good_credits_count = int(s['is_credit'].sum())
    good_credits_debt = int(s[s['is_credit'] == 1]['debt'].sum())
    bad_credits_count = s[s['is_credit'] == 1]['__churn'].sum()

    return {
        'total_profit': int(total_profit),
        'issue_amount': good_credits_debt,
        'bad_loans': round(bad_credits_count / (good_credits_count + bad_credits_count) * 100.0, 1),
        'churn_auc': round(roc_auc_score(y_true=s['__churn'], y_score=s['__churn_prob']), 3),
        'price_nmsle': round(
            -mean_squared_log_error(y_true=s['__price_doc'], y_pred=s['__price_predict']),
            3,
        ),
    }


METRICS_DESC = {
    'total_profit': 'Итоговая полученная прибыль (Ключевая метрика), млн руб.',
    'issue_amount': 'Итоговая выданная сумма (25 000 максимум), млн руб.',
    'bad_loans': 'Доля выданных кредитов с задолженностью, %',
    'churn_auc': 'Метрика ROC AUC по модели предсказания задолженности',
    'price_nmsle': 'Метрика Negative Mean Squared Logarithmic Error по модели предсказания стоимости',
}

In [3]:
import datetime

RANDOM_STATE = 42

now = datetime.datetime.now().strftime('%Y-%m-%d_%H%M')
SUBMISSION_PATH = f'../data/submissions/ml_pandas_submission.csv'
SUBMISSION_PATH

'../data/submissions/ml_pandas_submission.csv'

In [4]:
import pandas as pd
data = pd.read_csv('../data/train.csv')
submission = pd.read_csv('../data/test.csv')
data.shape, submission.shape

((20483, 61), (9988, 59))

In [5]:
data.loc[:, data.isnull().any()].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20483 entries, 0 to 20482
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   max_floor                    10911 non-null  float64
 1   state                        8469 non-null   float64
 2   railroad_station_walk_km     20473 non-null  float64
 3   0_17_all                     18080 non-null  float64
 4   build_count_wood             17420 non-null  float64
 5   life_sq                      16290 non-null  float64
 6   cafe_sum_1000_min_price_avg  15887 non-null  float64
 7   metro_km_walk                20473 non-null  float64
 8   total_trans_amt              18080 non-null  float64
 9   cafe_sum_1500_min_price_avg  17485 non-null  float64
 10  floor                        20316 non-null  float64
 11  num_room                     10911 non-null  float64
 12  build_year                   8901 non-null   float64
 13  build_count_mix 

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20483 entries, 0 to 20482
Data columns (total 61 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   max_floor                              10911 non-null  float64
 1   state                                  8469 non-null   float64
 2   marital_status                         20483 non-null  object 
 3   big_market_raion                       20483 non-null  object 
 4   total_revolving_bal                    20483 non-null  int64  
 5   market_count_1500                      20483 non-null  int64  
 6   leisure_count_3000                     20483 non-null  int64  
 7   total_ct_chng_q4_q1                    20483 non-null  float64
 8   water_1line                            20483 non-null  object 
 9   railroad_station_walk_km               20473 non-null  float64
 10  culture_objects_top_25                 20483 non-null  object 
 11  co

In [7]:
from sklearn.impute import SimpleImputer

# Выделяем метки и удаляем их из данных
churn = data["__churn"]
price = data["__price_doc"]
data = data.drop(["__churn", "__price_doc"], axis=1)

data = data.drop(["timestamp"], axis=1)
submission = submission.drop(["timestamp"], axis=1)

# Удаляем столбцы с большим количеством пропусков
threshold = 0.4
missing_fraction = data.isnull().mean()
columns_to_drop = missing_fraction[missing_fraction > threshold].index

data = data.drop(columns=columns_to_drop)
submission = submission.drop(columns=columns_to_drop)

# Разделяем столбцы на числовые и категориальные
numeric_cols = data.select_dtypes(include=["float64", "int64"]).columns
categorical_cols = data.select_dtypes(include=["object"]).columns

# Заполняем числовые данные медианой
num_imputer = SimpleImputer(strategy="median")
data[numeric_cols] = num_imputer.fit_transform(data[numeric_cols])
submission[numeric_cols] = num_imputer.transform(submission[numeric_cols])

# Заполняем категориальные данные самым частым значением
cat_imputer = SimpleImputer(strategy="most_frequent")
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])
submission[categorical_cols] = cat_imputer.transform(submission[categorical_cols])

In [8]:
from sklearn.preprocessing import StandardScaler

price_scaler = StandardScaler()
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
submission[numeric_cols] = scaler.transform(submission[numeric_cols])

price_scaled = price_scaler.fit_transform(price.values.reshape(-1, 1))


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20483 entries, 0 to 20482
Data columns (total 54 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   marital_status                         20483 non-null  object 
 1   big_market_raion                       20483 non-null  object 
 2   total_revolving_bal                    20483 non-null  float64
 3   market_count_1500                      20483 non-null  float64
 4   leisure_count_3000                     20483 non-null  float64
 5   total_ct_chng_q4_q1                    20483 non-null  float64
 6   water_1line                            20483 non-null  object 
 7   railroad_station_walk_km               20483 non-null  float64
 8   culture_objects_top_25                 20483 non-null  object 
 9   contacts_count_12_mon                  20483 non-null  float64
 10  0_17_all                               20483 non-null  float64
 11  tr

In [10]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd


encoder = OneHotEncoder(sparse_output=False, drop="first", handle_unknown='ignore')
encoded_data = pd.DataFrame(encoder.fit_transform(data[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols))
data = pd.concat([data.drop(categorical_cols, axis=1), encoded_data], axis=1)

encoded_submission = pd.DataFrame(encoder.transform(submission[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols))
submission = pd.concat([submission.drop(categorical_cols, axis=1), encoded_submission], axis=1)




In [11]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

model = RandomForestClassifier()
model.fit(data, churn)
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': data.columns,
    'Importance': feature_importances
})
important_features = np.array(data.columns)[feature_importances > 0.01]
data = data[important_features]
submission = submission[important_features]

In [12]:
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df.head(10)

Unnamed: 0,Feature,Importance
5,contacts_count_12_mon,0.186275
3,total_ct_chng_q4_q1,0.100685
10,total_trans_ct,0.075175
33,total_relationship_count,0.064241
25,total_trans_amt,0.056144
26,months_inactive_12_mon,0.04184
0,total_revolving_bal,0.0399
22,total_amt_chng_q4_q1,0.036898
16,avg_utilization_ratio,0.032963
31,avg_open_to_buy,0.022829


In [13]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, mean_squared_error

X_train, X_val, y_churn_train, y_churn_val, y_price_train, y_price_val = train_test_split(
    data, churn, price_scaled, test_size=0.5, random_state=RANDOM_STATE
)

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

churn_model = LogisticRegression()
churn_model.fit(X_train, y_churn_train)

price_model =  xgb.XGBRegressor(n_estimators = 200, max_depth = 4)
price_model.fit(X_train, y_price_train)

y_price_pred = price_model.predict(X_val)
y_churn_pred = churn_model.predict(X_val)
y_price_pred_tr = price_model.predict(X_train)
y_churn_pred_tr = churn_model.predict(X_train)

In [15]:
"""
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

cat_features = [i for i in X_train.columns if X_train[i].dtype == 'object']

# Создание и обучение модели для классификации вероятности задолженности
churn_model = CatBoostClassifier(
    iterations= 6000,
    learning_rate=0.01,
    depth=6,
    cat_features=cat_features,
    verbose=200,
    random_state=42
)

churn_model.fit(X_train, y_churn_train, eval_set=(X_val, y_churn_val), early_stopping_rounds=50)

y_churn_pred = churn_model.predict_proba(X_val)[:, 1]
y_churn_pred_tr = churn_model.predict_proba(X_train)[:, 1]
roc_auc = roc_auc_score(y_churn_val, y_churn_pred)
print(f'ROC AUC Score for Churn Prediction: {roc_auc:.4f}')

# Создание и обучение модели для предсказания цены квартиры
price_model = CatBoostRegressor(
    iterations=6000,
    learning_rate=0.001,
    depth=6,
    cat_features=cat_features,
    verbose=200,
    random_state=42
)
price_model.fit(X_train, y_price_train, eval_set=(X_val, y_price_val), early_stopping_rounds=50)

# Оценка производительности регрессии
y_price_pred = price_model.predict(X_val)
y_price_pred_tr = price_model.predict(X_train)
rmse = np.sqrt(mean_squared_error(price_scaler.inverse_transform(y_price_val.reshape(-1, 1)), price_scaler.inverse_transform(y_price_pred.reshape(-1, 1))))
print(f'RMSE for Price Prediction: {rmse:.4f}')

# Предсказание на тестовом наборе
"""

"\nfrom catboost import CatBoostClassifier, CatBoostRegressor, Pool\n\ncat_features = [i for i in X_train.columns if X_train[i].dtype == 'object']\n\n# Создание и обучение модели для классификации вероятности задолженности\nchurn_model = CatBoostClassifier(\n    iterations= 6000,\n    learning_rate=0.01,\n    depth=6,\n    cat_features=cat_features,\n    verbose=200,\n    random_state=42\n)\n\nchurn_model.fit(X_train, y_churn_train, eval_set=(X_val, y_churn_val), early_stopping_rounds=50)\n\ny_churn_pred = churn_model.predict_proba(X_val)[:, 1]\ny_churn_pred_tr = churn_model.predict_proba(X_train)[:, 1]\nroc_auc = roc_auc_score(y_churn_val, y_churn_pred)\nprint(f'ROC AUC Score for Churn Prediction: {roc_auc:.4f}')\n\n# Создание и обучение модели для предсказания цены квартиры\nprice_model = CatBoostRegressor(\n    iterations=6000,\n    learning_rate=0.001,\n    depth=6,\n    cat_features=cat_features,\n    verbose=200,\n    random_state=42\n)\nprice_model.fit(X_train, y_price_train, ev

In [16]:
submission['__churn_prob'] = churn_model.predict_proba(submission)[:, 1]
submission['__price_predict'] = price_scaler.inverse_transform(price_model.predict(submission.drop(['__churn_prob'], axis = 1)).reshape(-1, 1))
submission.loc[submission['__price_predict'] < 0.01, '__price_predict'] = 0.01
#submission.loc[submission['__churn_prob'] < 0.01, '__churn_prob'] = 0.01
# Создание приоритета
#submission['__priority'] = (submission['__churn_prob'] * submission['__price_predict']).rank(ascending=False)

  submission.loc[submission['__price_predict'] < 0.01, '__price_predict'] = 0.01


In [17]:
train = pd.DataFrame()
test = pd.DataFrame()

test['__churn'] = y_churn_val
test['__price_doc'] = price_scaler.inverse_transform(y_price_val.reshape(-1, 1))
test['__price_predict'] = price_scaler.inverse_transform(y_price_pred.reshape(-1, 1))
test['__churn_prob'] = y_churn_pred
test.loc[test['__price_predict'] < 0.01, '__price_predict'] = 0.01
#test.loc[test['__churn_prob'] < 0.01, '__churn_prob'] = 0.01
#test['__priority'] = (test['__churn_prob'] * test['__price_predict']).rank(ascending=False)


train['__churn'] = y_churn_train
train['__price_doc'] = price_scaler.inverse_transform(y_price_train.reshape(-1, 1))
train['__price_predict'] = price_scaler.inverse_transform(y_price_pred_tr.reshape(-1, 1))
train['__churn_prob'] = y_churn_pred_tr
train.loc[train['__price_predict'] < 0.01, '__price_predict'] = 0.01
#train.loc[train['__churn_prob'] < 0.01, '__churn_prob'] = 0.01
#train['__priority'] = (train['__churn_prob'] * train['__price_predict']).rank(ascending=False)

  test.loc[test['__price_predict'] < 0.01, '__price_predict'] = 0.01
  train.loc[train['__price_predict'] < 0.01, '__price_predict'] = 0.01


In [None]:
def alg(x, min_treshholdm, price_scaler, price_importance):
    return - (x['__price_predict'] ** price_importance) * np.log(min_treshholdm + (1 - min_treshholdm)* np.tanh((x['__price_predict']*price_scaler)) + x['__churn_prob'])

train['__priority'] = train.apply(lambda x: alg(x , 0.75, 0, 0.1), axis=1)
test['__priority'] = test.apply(lambda x: alg(x , 0.75, 0, 0.1), axis=1)
submission['__priority'] = submission.apply(lambda x: alg(x , 0.75, 0, 0.1), axis=1)

In [19]:
score = pd.concat([
    pd.Series(calc_all_metrics(train), name='train'),
    pd.Series(calc_all_metrics(test), name='test'),
], axis=1)

# добавляем колонку с описанием метрики
score['desc'] = score.index.map(METRICS_DESC)
score

Unnamed: 0,train,test,desc
total_profit,6618.0,6166.0,Итоговая полученная прибыль (Ключевая метрика)...
issue_amount,24993.0,21091.0,"Итоговая выданная сумма (25 000 максимум), млн..."
bad_loans,5.2,4.3,"Доля выданных кредитов с задолженностью, %"
churn_auc,0.838,0.851,Метрика ROC AUC по модели предсказания задолже...
price_nmsle,-0.078,-0.148,Метрика Negative Mean Squared Logarithmic Erro...


In [20]:
mysub = submission[['__price_predict', '__churn_prob', '__priority']]
mysub.to_csv(SUBMISSION_PATH, index=False)

if mysub.shape != (9988, 3):
    raise ValueError('Неправильный размер submission файла')

In [21]:
mysub.iloc[1]

__price_predict    7.230572
__churn_prob       0.004380
__priority         0.343517
Name: 1, dtype: float64

In [22]:
%run ../dashboard/app.py