In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
df = pd.read_csv("C:/Users/Ilsaf/Desktop/credit_scoring_sample.csv")

In [3]:
df.head()

Unnamed: 0,SeriousDlqin2yrs,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0,64,0,0.249908,0,0,8158.0,0.0
1,0,58,0,3870.0,0,0,,0.0
2,0,41,0,0.456127,0,0,6666.0,0.0
3,0,43,0,0.00019,0,0,10500.0,2.0
4,1,49,0,0.27182,0,0,400.0,0.0


In [4]:
df["MonthlyIncome"] = df["MonthlyIncome"].fillna(df["MonthlyIncome"].median())
df["NumberOfDependents"] = df["NumberOfDependents"].fillna(df["NumberOfDependents"].median())

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45063 entries, 0 to 45062
Data columns (total 8 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SeriousDlqin2yrs                      45063 non-null  int64  
 1   age                                   45063 non-null  int64  
 2   NumberOfTime30-59DaysPastDueNotWorse  45063 non-null  int64  
 3   DebtRatio                             45063 non-null  float64
 4   NumberOfTimes90DaysLate               45063 non-null  int64  
 5   NumberOfTime60-89DaysPastDueNotWorse  45063 non-null  int64  
 6   MonthlyIncome                         45063 non-null  float64
 7   NumberOfDependents                    45063 non-null  float64
dtypes: float64(3), int64(5)
memory usage: 2.8 MB


In [6]:
X = df.drop("SeriousDlqin2yrs", axis=1)
y = df["SeriousDlqin2yrs"]

In [7]:
X.head()

Unnamed: 0,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,64,0,0.249908,0,0,8158.0,0.0
1,58,0,3870.0,0,0,5166.0,0.0
2,41,0,0.456127,0,0,6666.0,0.0
3,43,0,0.00019,0,0,10500.0,2.0
4,49,0,0.27182,0,0,400.0,0.0


In [8]:
y.head()

0    0
1    0
2    0
3    0
4    1
Name: SeriousDlqin2yrs, dtype: int64

In [9]:
bad_age = df.query("SeriousDlqin2yrs == 1")["age"].reset_index(drop=True)

In [10]:
len(bad_age)

10026

In [11]:
np.random.seed(0)

In [12]:
mean_sample = [bad_age.sample(len(bad_age), replace=True).mean() for _ in range(1000)]

In [13]:
pd.Series(mean_sample).quantile(0.05)

45.71379413524836

In [14]:
pd.Series(mean_sample).quantile(0.95)

46.127004787552366

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [119]:
lr = LogisticRegression(random_state=0, class_weight="balanced")
parameters = {"C": (0.0001, 0.001, 0.01, 0.1, 1, 10)}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

In [120]:
gs = GridSearchCV(lr, parameters, n_jobs=-1, scoring="roc_auc", cv=skf)

In [121]:
gs.fit(X, y)

In [122]:
gs.best_params_

{'C': 0.001}

In [126]:
gs.cv_results_['std_score_time'][1]

0.0011659059537481487

In [45]:
gs.feature_names_in_

array(['age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
       'NumberOfTimes90DaysLate', 'NumberOfTime60-89DaysPastDueNotWorse',
       'MonthlyIncome', 'NumberOfDependents'], dtype=object)

In [46]:
lr = gs.best_estimator_

In [47]:
lr.coef_

array([[-1.36555066e-02,  4.46831599e-01, -6.49418826e-06,
         3.90380174e-01,  2.16037978e-01, -1.14779682e-05,
         1.91725732e-01]])

In [48]:
lr.intercept_

array([0.11948203])

In [49]:
X.head()

Unnamed: 0,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,64,0,0.249908,0,0,8158.0,0.0
1,58,0,3870.0,0,0,5166.0,0.0
2,41,0,0.456127,0,0,6666.0,0.0
3,43,0,0.00019,0,0,10500.0,2.0
4,49,0,0.27182,0,0,400.0,0.0


In [50]:
from sklearn.preprocessing import normalize, MinMaxScaler

In [51]:
normalize([np.array([1, 2, 10, 500, -5])])

array([[ 0.00199948,  0.00399896,  0.0199948 ,  0.9997401 , -0.0099974 ]])

In [52]:
new_X = (X - X.min()) / (X.max() - X.min())

In [53]:
new_X

Unnamed: 0,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0.500000,0.000000,7.655510e-07,0.0,0.0,0.004547,0.0
1,0.430233,0.000000,1.185509e-02,0.0,0.0,0.002880,0.0
2,0.232558,0.000000,1.397268e-06,0.0,0.0,0.003716,0.0
3,0.255814,0.000000,5.820329e-10,0.0,0.0,0.005853,0.2
4,0.325581,0.000000,8.326747e-07,0.0,0.0,0.000223,0.0
...,...,...,...,...,...,...,...
45058,0.116279,0.000000,2.526406e-06,0.0,0.0,0.001672,0.1
45059,0.325581,0.000000,2.000355e-02,0.0,0.0,0.000000,0.5
45060,0.197674,0.000000,1.457659e-06,0.0,0.0,0.001672,0.2
45061,0.302326,0.010204,1.486322e-06,0.0,0.0,0.006533,0.5


In [76]:
lr = LogisticRegression(random_state=0, class_weight="balanced")
parameters = {"C": (0.0001, 0.001, 0.01, 0.1, 1, 10)}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

In [77]:
gs = GridSearchCV(lr, parameters, n_jobs=-1, scoring="roc_auc", cv=skf)

In [78]:
gs.fit(new_X, y)

In [79]:
be = gs.best_estimator_

In [80]:
be.coef_[0].argmax()

1

In [81]:
be.coef_[0]

array([ -2.61149115,  45.95491065,  -2.91138877,  40.21441851,
         5.11616919, -13.78999788,   0.71430096])

In [82]:
gs.feature_names_in_[1]

'NumberOfTime30-59DaysPastDueNotWorse'

In [83]:
feature_df = pd.DataFrame({"feature": new_X.columns, "coefficient": be.coef_[0]})

In [84]:
feature_df

Unnamed: 0,feature,coefficient
0,age,-2.611491
1,NumberOfTime30-59DaysPastDueNotWorse,45.954911
2,DebtRatio,-2.911389
3,NumberOfTimes90DaysLate,40.214419
4,NumberOfTime60-89DaysPastDueNotWorse,5.116169
5,MonthlyIncome,-13.789998
6,NumberOfDependents,0.714301


In [85]:
f = np.array([np.exp(coef) for coef in feature_df["coefficient"]])

In [86]:
f / np.sum(f)

array([8.06281061e-22, 9.96797105e-01, 5.97368861e-22, 3.20289503e-03,
       1.83048723e-18, 1.12647789e-26, 2.24315674e-20])

In [87]:
xxx = X.copy()
xxx["age"] = xxx["age"] + 20
new_X2 = (xxx - xxx.min()) / (xxx.max() - xxx.min())

In [88]:
new_X2

Unnamed: 0,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0.500000,0.000000,7.655510e-07,0.0,0.0,0.004547,0.0
1,0.430233,0.000000,1.185509e-02,0.0,0.0,0.002880,0.0
2,0.232558,0.000000,1.397268e-06,0.0,0.0,0.003716,0.0
3,0.255814,0.000000,5.820329e-10,0.0,0.0,0.005853,0.2
4,0.325581,0.000000,8.326747e-07,0.0,0.0,0.000223,0.0
...,...,...,...,...,...,...,...
45058,0.116279,0.000000,2.526406e-06,0.0,0.0,0.001672,0.1
45059,0.325581,0.000000,2.000355e-02,0.0,0.0,0.000000,0.5
45060,0.197674,0.000000,1.457659e-06,0.0,0.0,0.001672,0.2
45061,0.302326,0.010204,1.486322e-06,0.0,0.0,0.006533,0.5


In [91]:
be.score(X, y)

0.7776224396955373

In [92]:
gs.best_score_

0.7838703187956915

In [93]:
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
    oob_score=True,
    class_weight="balanced",
)

## Будем искать лучшие параметры среди следующего набора
parameters = {
    "max_features": [1, 2, 4],
    "min_samples_leaf": [3, 5, 7, 9],
    "max_depth": [5, 10, 15],
}

gs2 = GridSearchCV(rf, parameters, n_jobs=-1, scoring="roc_auc", cv=skf)

In [94]:
gs2.fit(new_X, y)

In [96]:
gs2.best_score_ - gs.best_score_

0.04406550483663518

In [97]:
gs2.best_params_

{'max_depth': 15, 'max_features': 1, 'min_samples_leaf': 9}

In [101]:
be2 = gs2.best_estimator_

In [103]:
be2.feature_importances_

array([0.14519692, 0.28149458, 0.06784855, 0.24659682, 0.14992738,
       0.08821249, 0.02072327])

In [104]:
be2.feature_names_in_

array(['age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
       'NumberOfTimes90DaysLate', 'NumberOfTime60-89DaysPastDueNotWorse',
       'MonthlyIncome', 'NumberOfDependents'], dtype=object)

In [114]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV

parameters = {
    "max_features": [2, 3, 4],
    "max_samples": [0.5, 0.7, 0.9],
    "base_estimator__C": [0.0001, 0.001, 0.01, 1, 10, 100],
}
l = LogisticRegression()
bc = BaggingClassifier(estimator=l, n_estimators=100, random_state=42)

In [115]:
rs = RandomizedSearchCV(bc, parameters, n_iter=20, n_jobs=-1, cv=5, random_state=1)

In [116]:
rs.fit(X, y)

  clone(base_estimator).set_params(**self.best_params_)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [117]:
rs.best_params_

{'max_samples': 0.5, 'max_features': 4, 'base_estimator__C': 10}

In [118]:
rs.best_score_


0.7868095076357872