### Импорты 

In [6]:
import pandas as pd
import numpy as np
import pickle


from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Определение фичей по типам

In [7]:
# Columns handed to CatBoost as categorical features (raw categoricals plus
# the two KMeans cluster-id columns).
cat_features = [
    'home_ownership',
    'years_in_current_job',
    'tax_liens',
    'number_of_credit_problem',
    'bankruptcies',
    'purpose',
    'term',
    'num_features_clustered_by_3',
    'num_features_clustered_by_4',
]

# Numeric columns: raw values, ratio features, then the five PCA components.
num_features = [
    'income',
    'number_of_open_accounts',
    'years_of_credit_history',
    'maximum_open_credit',
    'current_loan_amount',
    'current_credit_balance',
    'monthly_debt',
    'credit_score',
    'mean_credit_score_per_history',
    'current_credit_balance_per_income',
    'months_to_pay_per_month_of_credit_history',
    'income_per_credit_history',
    'number_of_credit_problem_per_history',
    'number_of_bankruptcies_per_history',
    'monthly_income_per_monthly_debt',
    'current_loan_amount_per_income',
    *(f'PCA_5_{component}' for component in range(5)),
]

# Name of the label column.
target = 'credit_default'

### Обучение модели

В ходе исследования были созданы фичи, показывающие различные отношения признаков, а также фичи, полученные с помощью алгоритма PCA. При отборе признаков любое изменение или удаление фичей, казавшихся лишними, ухудшало метрику F1, поэтому принято решение оставить всё «как есть».

In [10]:
# Load the training table; per its file name it already contains the engineered
# features, with categorical columns left unencoded.
data = pd.read_csv('data/train_with_all_features_made_without_cat_encoding.csv')

In [11]:
# Separate the label column from the feature matrix
y = data[target]
x = data.drop(columns=target)

In [12]:
data['credit_default'].value_counts()  # the classes are visibly imbalanced

0    5387
1    2113
Name: credit_default, dtype: int64

In [13]:
# Cast the count-like columns that are listed in cat_features to int
x = x.astype({
    'tax_liens': int,
    'number_of_credit_problem': int,
    'bankruptcies': int,
})

In [27]:
# Already-tuned hyperparameters
params = dict(depth=7, l2_leaf_reg=9, learning_rate=0.03, n_estimators=200)

# Train the model; the second class (label 1, the rarer one) gets the larger weight
catboost = CatBoostClassifier(
    **params,
    class_weights=[2.1, 5.3],
    random_seed=1,
    silent=True,
)

catboost.fit(x, y, cat_features=cat_features)

<catboost.core.CatBoostClassifier at 0x7f7c85393bb0>

In [28]:
# Persist the fitted model to disk
with open('research/catboost_grid_search_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(catboost))

### Предсказание на тестовой выборке

In [30]:
test = pd.read_csv('data/course_project_test.csv')  # load the test dataset
test.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
0,Rent,,4 years,0.0,9.0,12.5,220968.0,0.0,70.0,0.0,debt consolidation,Short Term,162470.0,105906.0,6813.0,
1,Rent,231838.0,1 year,0.0,6.0,32.7,55946.0,0.0,8.0,0.0,educational expenses,Short Term,78298.0,46037.0,2318.0,699.0
2,Home Mortgage,1152540.0,3 years,0.0,10.0,13.7,204600.0,0.0,,0.0,debt consolidation,Short Term,200178.0,146490.0,18729.0,7260.0
3,Home Mortgage,1220313.0,10+ years,0.0,16.0,17.0,456302.0,0.0,70.0,0.0,debt consolidation,Short Term,217382.0,213199.0,27559.0,739.0
4,Home Mortgage,2340952.0,6 years,0.0,11.0,23.6,1207272.0,0.0,,0.0,debt consolidation,Long Term,777634.0,425391.0,42605.0,706.0


In [19]:
test.info()  # inspect missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                2500 non-null   object 
 1   Annual Income                 1987 non-null   float64
 2   Years in current job          2414 non-null   object 
 3   Tax Liens                     2500 non-null   float64
 4   Number of Open Accounts       2500 non-null   float64
 5   Years of Credit History       2500 non-null   float64
 6   Maximum Open Credit           2500 non-null   float64
 7   Number of Credit Problems     2500 non-null   float64
 8   Months since last delinquent  1142 non-null   float64
 9   Bankruptcies                  2497 non-null   float64
 10  Purpose                       2500 non-null   object 
 11  Term                          2500 non-null   object 
 12  Current Loan Amount           2500 non-null   float64
 13  Cur

In [31]:
def process_df(x_test, random_state=1):
    """Clean a raw test frame and add the engineered features used by the model.

    The input is expected to carry the raw column names ('Home Ownership',
    'Annual Income', ...). The function works on a copy, so the caller's frame
    is left untouched and the function can be called repeatedly on the same
    input.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Raw frame with the original column names, including
        'Months since last delinquent' (which is dropped).
    random_state : int, default 1
        Seed passed to KMeans and PCA so the cluster-id and component columns
        are reproducible between runs.

    Returns
    -------
    pandas.DataFrame
        A new frame with snake_case column names, imputed values, the two
        KMeans cluster-id columns, eight ratio features and five PCA columns,
        in that order after the renamed raw columns.

    Raises
    ------
    KeyError
        If one of the expected raw columns is missing.
    ValueError
        If 'Term' holds anything other than 'Short Term' / 'Long Term'.

    NOTE(review): the scaler, KMeans and PCA are fitted on the frame passed in,
    not on the training data, so cluster ids and component axes are not
    guaranteed to line up with the training features - confirm, and ideally
    reuse the transformers fitted on the training set.
    """
    # Never mutate the caller's frame: the drop/rename below would otherwise
    # make a second call on the same object fail with KeyError.
    x_test = x_test.copy()

    # --- cleaning and imputation -------------------------------------------
    x_test['Home Ownership'] = (
        x_test['Home Ownership'].replace('Have Mortgage', 'Home Mortgage').astype('object')
    )
    x_test['Annual Income'] = x_test['Annual Income'].fillna(0)
    x_test['Years in current job'] = (
        x_test['Years in current job'].fillna('unknown').astype('object')
    )
    x_test = x_test.drop(columns=['Months since last delinquent'])
    # Count columns stay numeric here (they are cast to int further down) so
    # that the ratio features built from them come out as float, not object.
    x_test['Bankruptcies'] = x_test['Bankruptcies'].fillna(0)

    # Merge rare purposes into broader categories
    x_test['Purpose'] = x_test['Purpose'].replace({
        'small business': 'business loan',
        'vacation': 'take a trip',
        'renewable energy': 'other',
    }).astype('object')

    # Unmapped or missing terms become NaN and make the int8 cast raise.
    x_test['Term'] = x_test['Term'].map({'Short Term': 0, 'Long Term': 1}).astype('int8')

    # Implausibly large loan amounts are replaced by the column mean (the mean
    # is taken over all rows, including the ones being replaced).
    loan_outliers = x_test['Current Loan Amount'] > 40000000
    x_test.loc[loan_outliers, 'Current Loan Amount'] = x_test['Current Loan Amount'].mean()

    # Scores above 5000 are divided by 10; remaining gaps get the median.
    inflated_scores = x_test['Credit Score'] > 5000
    x_test.loc[inflated_scores, 'Credit Score'] = x_test.loc[inflated_scores, 'Credit Score'] / 10
    x_test['Credit Score'] = x_test['Credit Score'].fillna(x_test['Credit Score'].median())

    columns_to_rename = {
        'Home Ownership': 'home_ownership',
        'Annual Income': 'income',
        'Years in current job': 'years_in_current_job',
        'Tax Liens': 'tax_liens',
        'Number of Open Accounts': 'number_of_open_accounts',
        'Years of Credit History': 'years_of_credit_history',
        'Maximum Open Credit': 'maximum_open_credit',
        'Number of Credit Problems': 'number_of_credit_problem',
        'Bankruptcies': 'bankruptcies',
        'Purpose': 'purpose',
        'Term': 'term',
        'Current Loan Amount': 'current_loan_amount',
        'Current Credit Balance': 'current_credit_balance',
        'Monthly Debt': 'monthly_debt',
        'Credit Score': 'credit_score',
        'Credit Default': 'credit_default',
    }
    x_test = x_test.rename(columns=columns_to_rename)

    # Raw numeric columns that feed the clustering and PCA features
    base_num_features = [
        'income', 'number_of_open_accounts', 'years_of_credit_history',
        'maximum_open_credit', 'current_loan_amount', 'current_credit_balance',
        'monthly_debt', 'credit_score',
    ]

    # --- feature engineering -----------------------------------------------
    # The base numeric columns are not modified below, so one standardised
    # matrix serves both KMeans fits and the PCA.
    scaled = StandardScaler().fit_transform(x_test[base_num_features])

    x_test['num_features_clustered_by_3'] = (
        KMeans(n_clusters=3, random_state=random_state).fit_predict(scaled)
    )
    x_test['num_features_clustered_by_4'] = (
        KMeans(n_clusters=4, random_state=random_state).fit_predict(scaled)
    )

    x_test['income_per_credit_history'] = x_test['income'] * x_test['years_of_credit_history']
    x_test['mean_credit_score_per_history'] = x_test['credit_score'] / x_test['years_of_credit_history']
    x_test['number_of_credit_problem_per_history'] = x_test['number_of_credit_problem'] / x_test['years_of_credit_history']
    x_test['number_of_bankruptcies_per_history'] = x_test['bankruptcies'] / x_test['years_of_credit_history']
    x_test['current_credit_balance_per_income'] = x_test['current_credit_balance'] / x_test['income']
    x_test['monthly_income_per_monthly_debt'] = (x_test['income'] / 12) / x_test['monthly_debt']
    x_test['current_loan_amount_per_income'] = x_test['current_loan_amount'] / x_test['income']
    x_test['months_to_pay_per_month_of_credit_history'] = (
        (x_test['current_loan_amount'] / x_test['monthly_debt'])
        / (x_test['years_of_credit_history'] * 12)
    )

    # Division by zero yields +inf; cap it with fixed values.
    # NOTE(review): the constants look like maxima taken from the training
    # data - confirm their origin.
    inf_caps = {
        'current_credit_balance_per_income': 2.1767,
        'monthly_income_per_monthly_debt': 1427,
        'current_loan_amount_per_income': 54.787312,
        'months_to_pay_per_month_of_credit_history': 165.114387,
    }
    for column, cap in inf_caps.items():
        # Assign back instead of inplace=True on a column selection, which is
        # not guaranteed to update the frame under pandas Copy-on-Write.
        x_test[column] = x_test[column].replace(np.inf, cap)

    # 0 / 0 produces NaN in these two ratios; fill with the column median.
    for column in ('current_credit_balance_per_income', 'monthly_income_per_monthly_debt'):
        x_test[column] = x_test[column].fillna(x_test[column].median())

    count_columns = ['tax_liens', 'number_of_credit_problem', 'bankruptcies']
    x_test[count_columns] = x_test[count_columns].astype(int)

    components = PCA(n_components=5, random_state=random_state).fit_transform(scaled)
    for i in range(components.shape[1]):
        x_test[f'PCA_5_{i}'] = components[:, i]

    return x_test






In [32]:
# Hand process_df its own copy so the raw `test` frame stays intact and this
# cell can be re-run safely.
test_processed = process_df(test.copy())

In [33]:
test_processed.info()  # check the result of the preprocessing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 30 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   home_ownership                             2500 non-null   object 
 1   income                                     2500 non-null   float64
 2   years_in_current_job                       2500 non-null   object 
 3   tax_liens                                  2500 non-null   int64  
 4   number_of_open_accounts                    2500 non-null   float64
 5   years_of_credit_history                    2500 non-null   float64
 6   maximum_open_credit                        2500 non-null   float64
 7   number_of_credit_problem                   2500 non-null   int64  
 8   bankruptcies                               2500 non-null   int64  
 9   purpose                                    2500 non-null   object 
 10  term                    

In [23]:
# Preview the first rows of the processed test frame
test_processed.head()

Unnamed: 0,home_ownership,income,years_in_current_job,tax_liens,number_of_open_accounts,years_of_credit_history,maximum_open_credit,number_of_credit_problem,bankruptcies,purpose,...,number_of_bankruptcies_per_history,current_credit_balance_per_income,monthly_income_per_monthly_debt,current_loan_amount_per_income,months_to_pay_per_month_of_credit_history,PCA_5_0,PCA_5_1,PCA_5_2,PCA_5_3,PCA_5_4
0,Rent,0.0,4 years,0,9.0,12.5,220968.0,0,0,debt consolidation,...,0.0,2.1767,0.0,54.787312,0.15898,-1.635416,-0.001191,0.778274,0.471258,-0.320359
1,Rent,231838.0,1 year,0,6.0,32.7,55946.0,0,0,educational expenses,...,0.0,0.198574,8.334699,0.337727,0.086081,-1.342081,-0.281763,0.795822,-1.616833,1.013115
2,Home Mortgage,1152540.0,3 years,0,10.0,13.7,204600.0,0,0,debt consolidation,...,0.0,0.127102,5.128144,0.173684,0.065013,-0.58444,-0.207081,-0.02923,0.154884,-0.004013
3,Home Mortgage,1220313.0,10+ years,0,16.0,17.0,456302.0,0,0,debt consolidation,...,0.0,0.174708,3.690001,0.178136,0.038666,0.475219,-0.102777,-0.217966,-0.355043,-1.089592
4,Home Mortgage,2340952.0,6 years,0,11.0,23.6,1207272.0,0,0,debt consolidation,...,0.0,0.181717,4.57879,0.332187,0.06445,1.955064,-0.739666,-0.753796,-0.661283,0.704344


In [34]:
# NOTE(review): assumes test_processed has the same feature columns as the training frame `x` - confirm.
prediction = catboost.predict(test_processed)  # predict classes for the test set

In [35]:
# Number of rows predicted as class 1 and as class 0
np.count_nonzero(prediction == 1), np.count_nonzero(prediction == 0)


(1001, 1499)

In [36]:
pred_series = pd.Series(prediction)
pred_series.to_csv('MMogilyov_predictions.csv', index=False)  # save predictions to a file