In [None]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path

from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV

import catboost as catb

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from math import isnan as isnan

import warnings
#warnings.filterwarnings('ignore')

In [None]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print per-split classification reports followed by the test confusion matrix.

    Args:
        y_train_true: true labels of the training split.
        y_train_pred: predicted labels for the training split.
        y_test_true: true labels of the test split.
        y_test_pred: predicted labels for the test split.

    Returns:
        None. Everything is written to stdout.
    """
    sections = (
        ('TRAIN\n\n', y_train_true, y_train_pred),
        ('TEST\n\n', y_test_true, y_test_pred),
    )
    for header, actual, predicted in sections:
        print(header + classification_report(actual, predicted))

    # Rows are the true test labels, columns are the predicted ones.
    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)

In [None]:
def evaluate_preds(model, X_train, X_test, y_train, y_test):
    """Predict on the train and test features and print the resulting reports.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train, X_test: feature sets of the two splits.
        y_train, y_test: true labels of the two splits.
    """
    # Train is predicted first, then test — same call order as before.
    train_predictions, test_predictions = (
        model.predict(features) for features in (X_train, X_test)
    )

    get_classification_report(y_train, train_predictions, y_test, test_predictions)

In [None]:
def show_proba_calibration_plots(y_predicted_probs, y_true_labels):
    """Show how precision, recall and F1 react to the probability threshold.

    Left panel: the three metrics for thresholds 0.1 .. 0.9 (step 0.1) with a
    table of the same numbers sorted by F1. Right panel: histograms of the
    predicted probabilities, split by true label (0 vs 1).

    Args:
        y_predicted_probs: predicted probabilities; must support element-wise
            ``> threshold`` comparison (e.g. a NumPy array).
        y_true_labels: true labels, parallel to ``y_predicted_probs``.
    """
    # Column 0 holds the probability, column 1 the true label.
    paired = np.array(list(zip(y_predicted_probs, y_true_labels)))

    thresholds, precisions, recalls, f1_scores = [], [], [], []
    for cutoff in np.linspace(0.1, 0.9, 9):
        # Hard 0/1 predictions for this cutoff, shared by all three metrics.
        hard_labels = list(map(int, y_predicted_probs > cutoff))
        thresholds.append(cutoff)
        precisions.append(precision_score(y_true_labels, hard_labels))
        recalls.append(recall_score(y_true_labels, hard_labels))
        f1_scores.append(f1_score(y_true_labels, hard_labels))

    scores_table = pd.DataFrame({'f1': f1_scores,
                                 'precision': precisions,
                                 'recall': recalls,
                                 'probability': thresholds})
    scores_table = scores_table.sort_values('f1', ascending=False).round(3)

    figure = plt.figure(figsize=(15, 5))

    metrics_ax = figure.add_subplot(121)
    for values, caption in ((precisions, 'Precision'), (recalls, 'Recall'), (f1_scores, 'F1')):
        metrics_ax.plot(thresholds, values, label=caption, linewidth=4)
    metrics_ax.set_ylabel('Scores')
    metrics_ax.set_xlabel('Probability threshold')
    metrics_ax.set_title('Probabilities threshold calibration')
    metrics_ax.legend(bbox_to_anchor=(0.25, 0.25))
    # The table is anchored below the axes via the negative bbox offset.
    metrics_ax.table(cellText=scores_table.values,
                     colLabels=scores_table.columns,
                     colLoc='center', cellLoc='center', loc='bottom', bbox=[0, -1.3, 1, 1])

    hist_ax = figure.add_subplot(122)
    probabilities, labels = paired[:, 0], paired[:, 1]
    hist_ax.hist(probabilities[labels == 0],
                 label='Another class', color='royalblue', alpha=1)
    hist_ax.hist(probabilities[labels == 1],
                 label='Main class', color='darkcyan', alpha=0.8)
    hist_ax.set_ylabel('Number of examples')
    hist_ax.set_xlabel('Probabilities')
    hist_ax.set_title('Probability histogram')
    hist_ax.legend(bbox_to_anchor=(1, 1))

    plt.show()

In [None]:
def show_learning_curve_plot(estimator, X, y, cv=3, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot F1 learning curves (train vs. cross-validation) for ``estimator``.

    Args:
        estimator: estimator passed on to ``learning_curve``.
        X, y: features and target used for the curve.
        cv: cross-validation setting forwarded to ``learning_curve``.
        n_jobs: parallelism forwarded to ``learning_curve``.
        train_sizes: training-set sizes forwarded to ``learning_curve``.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        scoring='f1',
        train_sizes=train_sizes,
        n_jobs=n_jobs,
    )

    # Per training size: mean and standard deviation across the CV folds.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, caption)
        for scores, colour, caption in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Cross-validation score"),
        )
    ]

    plt.figure(figsize=(15, 8))
    # Shaded +/- one standard deviation bands first, then the mean lines.
    for centre, spread, colour, _ in curves:
        plt.fill_between(train_sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, caption in curves:
        plt.plot(train_sizes, centre, 'o-', color=colour, label=caption)

    plt.title(f"Learning curves ({type(estimator).__name__})")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid()
    plt.show()

In [None]:
def show_feature_importances(feature_names, feature_importances, get_top=None):
    """Draw a horizontal bar chart of feature importances, most important first.

    Args:
        feature_names: feature names.
        feature_importances: importance values, parallel to ``feature_names``.
        get_top: if not None, how many of the most important feature names to return.

    Returns:
        A list with the ``get_top`` most important feature names when
        ``get_top`` is given, otherwise None.
    """
    importance_table = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    importance_table = importance_table.sort_values('importance', ascending=False)

    # Figure height grows with the number of features (one bar per feature).
    plt.figure(figsize=(20, len(importance_table) * 0.355))

    # x/y are passed by keyword: recent seaborn releases accept only `data`
    # positionally, so the former positional call raises a TypeError there.
    sns.barplot(x=importance_table['importance'], y=importance_table['feature'])

    plt.xlabel('Importance')
    plt.title('Importance of features')
    plt.show()

    if get_top is not None:
        return importance_table['feature'][:get_top].tolist()

In [None]:
def balance_df_by_target(df, target_name, method='over'):
    """Return a shuffled frame in which the classes of ``target_name`` are rebalanced.

    The input frame is not modified. The 'over' and 'under' strategies only
    look at the most and the least frequent class, i.e. they are meant for a
    binary target.

    Args:
        df: DataFrame that contains the target column.
        target_name: name of the target column.
        method: resampling strategy:
            'over'  - append whole copies of the minority-class rows,
                      ``int(major / minor) - 1`` times;
            'under' - keep the minority class plus an equally sized random
                      sample (without replacement) of the majority class;
            'tomek' - ``imblearn.under_sampling.TomekLinks``;
            'smote' - ``imblearn.over_sampling.SMOTE``.

    Returns:
        The resampled DataFrame with its rows shuffled.

    Raises:
        AssertionError: if ``method`` is not one of the supported names.
    """
    # Message (Russian): "Invalid sampling method".
    assert method in ['over', 'under', 'tomek', 'smote'], 'Неверный метод сэмплирования'

    target_counts = df[target_name].value_counts()

    # idxmax/idxmin give the class LABELS. argmax/argmin give positions inside
    # value_counts(), which only coincide with the labels when the classes are
    # literally 0 and 1 and class 0 is the majority.
    major_class_name = target_counts.idxmax()
    minor_class_name = target_counts.idxmin()
    major_count = int(target_counts.loc[major_class_name])
    minor_count = int(target_counts.loc[minor_class_name])

    if method == 'over':
        disbalance_coeff = int(major_count / minor_count) - 1
        minor_rows = df[df[target_name] == minor_class_name]
        # Each sample draws all minority rows, i.e. one full shuffled copy.
        extra_copies = [minor_rows.sample(minor_count) for _ in range(disbalance_coeff)]
        if extra_copies:
            # DataFrame.append no longer exists in pandas 2.x; concat once instead.
            df = pd.concat([df, *extra_copies], ignore_index=True)

    elif method == 'under':
        # With equal counts (or a single class) both labels are the same and
        # there is nothing to drop; resampling would discard the other class.
        if major_class_name != minor_class_name:
            minor_rows = df[df[target_name] == minor_class_name]
            major_rows = df[df[target_name] == major_class_name]
            # Without replacement, so no majority row is duplicated.
            df = pd.concat([minor_rows, major_rows.sample(minor_count)], ignore_index=True)

    elif method == 'tomek':
        # Optional dependency, imported only when this strategy is requested.
        from imblearn.under_sampling import TomekLinks
        tl = TomekLinks()
        # fit_resample is the current imbalanced-learn API (fit_sample was removed).
        X_tomek, y_tomek = tl.fit_resample(df.drop(columns=target_name), df[target_name])
        df = pd.concat([X_tomek, y_tomek], axis=1)

    elif method == 'smote':
        # Optional dependency, imported only when this strategy is requested.
        from imblearn.over_sampling import SMOTE
        smote = SMOTE()
        X_smote, y_smote = smote.fit_resample(df.drop(columns=target_name), df[target_name])
        df = pd.concat([X_smote, y_smote], axis=1)

    # Shuffle the rows so the appended / selected rows are not grouped together.
    return df.sample(frac=1)

In [None]:
# Root directory under which both the input data and the output folder live.
DATA_ROOT = Path('/kaggle/')

# Input files: labelled training data, unlabelled test data and the
# sample submission that provides the layout of the file to submit.
TRAIN_DATASET_PATH = DATA_ROOT / 'input/gb-credit-default/train.csv'
TEST_DATASET_PATH = DATA_ROOT / 'input/gb-credit-default/test.csv'
SAMPLE_PREDICTION_PATH = DATA_ROOT / 'input/gb-credit-default/sample_submission.csv'

# Output locations.
# NOTE(review): none of the four output constants is referenced in the visible
# cells; the submission is written to the relative file 'gb_submit.csv' instead.
OUTPUT_PATH = DATA_ROOT / 'output/'
SCALER_FILE_PATH = OUTPUT_PATH / 'scaler.pkl'
MODEL_FILE_PATH = OUTPUT_PATH / 'model.pkl'
PREDICTION_PATH = OUTPUT_PATH / 'submission.csv'

In [None]:
# Load the test set: the rows that are predicted and submitted at the end.
test_df = pd.read_csv(TEST_DATASET_PATH)

In [None]:
# Load the labelled training data and display its summary statistics.
df = pd.read_csv(TRAIN_DATASET_PATH)
df.describe()

In [None]:
# Number of missing values per column in the training data.
df.isna().sum()

In [None]:
# Number of missing values per column in the test data.
test_df.isna().sum()

In [None]:
# Columns of the training data stored with object dtype (non-numeric).
df.select_dtypes(include='object').columns.tolist()

In [None]:
# Columns of the test data stored with object dtype (non-numeric).
test_df.select_dtypes(include='object').columns.tolist()

## Заполнение пропусков

In [None]:
# Histograms of the training columns; the trailing ';' hides the axes repr.
df.hist(figsize=(16, 16), bins=20, grid=False);

In [None]:
# Same histograms for the test data, for a visual comparison with the training set.
test_df.hist(figsize=(16, 16), bins=20, grid=False);

In [None]:
# Numeric gaps: replace with the median of the respective column.
for column in ('Annual Income', 'Credit Score'):
    median = df[column].median()
    df.loc[df[column].isna(), column] = median

# Remaining gaps: replace with the constant 0.
for column in ('Years in current job', 'Bankruptcies', 'Home Ownership'):
    df.loc[df[column].isna(), column] = 0

# Every 0 in 'Home Ownership' (including the ones just filled in) becomes 'No'.
df.loc[(df['Home Ownership'] == 0), 'Home Ownership'] = 'No'

# No recorded delinquency: use the whole credit history length, converted to months.
no_delinquency = df['Months since last delinquent'].isna()
df.loc[no_delinquency, 'Months since last delinquent'] = df['Years of Credit History'] * 12

# Show the remaining missing values per column.
df.isna().sum()

In [None]:
# Numeric gaps in the test set are filled with the TRAINING medians, so the
# model sees the same fill value at prediction time as during fitting and no
# test-set statistic leaks into preprocessing. The training frame may already
# be imputed at this point; filling gaps with the median leaves the median
# unchanged, so df[...].median() is still the original training median.
for column in ('Annual Income', 'Credit Score'):
    median = df[column].median()
    test_df.loc[test_df[column].isna(), column] = median

# Remaining gaps: replace with the constant 0, exactly as for the training data.
for column in ('Years in current job', 'Bankruptcies', 'Home Ownership'):
    test_df.loc[test_df[column].isna(), column] = 0

# Every 0 in 'Home Ownership' (including the ones just filled in) becomes 'No'.
test_df.loc[(test_df['Home Ownership'] == 0), 'Home Ownership'] = 'No'

# No recorded delinquency: use the whole credit history length, converted to months.
no_delinquency = test_df['Months since last delinquent'].isna()
test_df.loc[no_delinquency, 'Months since last delinquent'] = test_df['Years of Credit History'] * 12

# Show the remaining missing values per column.
test_df.isna().sum()

## Приведение типов

In [None]:
# Names of the object-dtype columns of the training data; reused later when
# the CatBoost parameters are assembled. Echoed as the cell output.
df_object_features = list(df.select_dtypes(include='object').columns)
df_object_features

In [None]:
# Cast the four non-numeric columns to str. 'Years in current job' had its
# gaps filled with the integer 0 above, so this makes its values uniformly strings.
df = df.astype({'Home Ownership': str, 'Years in current job': str, 'Purpose': str, 'Term': str})

In [None]:
# Names of the object-dtype columns of the test data, echoed as the cell output
# so they can be compared with the training list.
test_df_object_features = list(test_df.select_dtypes(include='object').columns)
test_df_object_features

In [None]:
# Apply the same str cast as for the training data so both frames share dtypes.
test_df = test_df.astype({'Home Ownership': str, 'Years in current job': str, 'Purpose': str, 'Term': str})

## Построение модели

In [None]:
# Separate the target column from the features.
target_name = 'Credit Default'
X = df.drop(columns=target_name)
y = df[target_name]
# Hold out 20% of the labelled rows; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified, so the class ratio may differ between
# the two parts — consider stratify=y if the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        shuffle=True,
                                                        random_state=23)

In [None]:
# Ratio of class-0 to class-1 rows in the training split; used as the
# weight of class 1 when the model parameters are assembled.
class_counts = y_train.value_counts()
disbalance = class_counts[0] / class_counts[1]
disbalance

In [None]:
# CatBoost parameters shared by every model built in this notebook.
frozen_params = {
    # Class 1 is weighted by count(class 0) / count(class 1) to offset the imbalance.
    'class_weights': [1, disbalance],
    'silent': True,
    'random_state': 21,
    # The object-dtype columns (Home Ownership, Years in current job, Purpose,
    # Term) hold a small set of category labels, not free text, so they are
    # declared as categorical features. `text_features` would route them
    # through CatBoost's tokenizer/dictionary pipeline, which is meant for
    # natural-language columns.
    'cat_features': df_object_features,
    'eval_metric': 'F1',
    # Stop when the eval metric has not improved for 20 consecutive iterations.
    'early_stopping_rounds': 20,
}

In [None]:
# Build the classifier from the shared parameters plus the tree count and depth.
model_cat = catb.CatBoostClassifier(**frozen_params, iterations=300, max_depth=7)
# The hold-out split is passed as eval_set, so it drives the eval metric and
# the early stopping configured in frozen_params.
model_cat.fit(X_train, y_train, plot=True, eval_set=(X_test, y_test))

# NOTE(review): the same hold-out split is used both for early stopping above and
# for the report below, so the TEST figures are likely somewhat optimistic.
evaluate_preds(model_cat, X_train, X_test, y_train, y_test)

nan

## Применение модели

In [None]:
# Predict class labels for the preprocessed test set and echo them.
prediction = model_cat.predict(test_df)
prediction

In [None]:
# Load the sample submission, which serves as the template for the output file.
submit = pd.read_csv(SAMPLE_PREDICTION_PATH)
submit.head()

In [None]:
# Write the predictions into the target column of the template.
# Assumes the template rows are in the same order as test_df — TODO confirm.
submit[target_name] = prediction
submit.head()

In [None]:
# Save the submission without the index column, relative to the working directory.
# NOTE(review): PREDICTION_PATH is defined in the paths cell but not used here.
submit.to_csv('gb_submit.csv', index=False)

nan