In [4]:
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('seaborn-dark')
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
import matplotlib as plt
plt.rc('font', size=14)

def plot_2d_separator(classifier, X, fill=False, line=True, ax=None, eps=None):
    if eps is None:
        
        eps = 1.0
    x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
    y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
    xx = np.linspace(x_min, x_max, 100)
    yy = np.linspace(y_min, y_max, 100)
    
    X1, X2 = np.meshgrid(xx, yy)
    X_grid = np.c_[X1.ravel(), X2.ravel()]
    try:
        decision_values = classifier.decision_function(X_grid)
        levels = [0]
        fill_levels = [decision_values.min(), 0, decision_values.max()]
    except:
        decision_values = classifier.predict_proba(X_grid)[:, 1]
        levels = [.5]
        fill_levels = [0, .5, 1]
        
    if ax is None:
        ax = plt.gca()
    if fill:
        ax.contourf(X1, X2, decision_values.reshape(X1.shape),
                    levels=fill_levels, colors=['cyan', 'pink'])
    if line:
        ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels,
                   colors="black")
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())

## Загружаем данные

train = pd.read_csv('/home/paniquex/PycharmProjects/IML_homeworks/credit_scoring/train.csv')
test = pd.read_csv('/home/paniquex/PycharmProjects/IML_homeworks/credit_scoring/test.csv')

print(train.shape, test.shape)

y = train.pop('плохой_клиент')
train.shape, y.shape

## Предобработка

## Очень много пропусков в доходе: 

print(train.columns)
print(train.info())

print(test.info())

## Интересный человек №21595

train.sort_values(by='недвижимость', ascending=False).head()

train.describe()

train['доход'][train["доход"] < 1].value_counts()

## Заполним пропуски в доходе средним значением по всей таблице, а значение семьи = 0 

train.sort_values(by='доход', ascending=False).head(10)

mean_income_train = train['доход'][train['доход'] < 3800000].mean()
mean_income_test = test['доход'][test['доход'] < 3800000].mean()
print(mean_income_train, mean_income_test)
train['доход'].fillna(mean_income_train, inplace=True)
test['доход'].fillna(mean_income_test, inplace=True)

### Семья

train['семья'].fillna(1, inplace=True)
test['семья'].fillna(1, inplace=True)

(112500, 11) (37500, 10)
Index(['линии', 'возраст', 'поведение_30-59_дней', 'Debt_Ratio', 'доход',
       'число_кредитов', 'поведение_90_дней', 'недвижимость',
       'поведение_60-89_дней', 'семья'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112500 entries, 0 to 112499
Data columns (total 10 columns):
линии                   112500 non-null float64
возраст                 112500 non-null int64
поведение_30-59_дней    112500 non-null int64
Debt_Ratio              112500 non-null float64
доход                   90274 non-null float64
число_кредитов          112500 non-null int64
поведение_90_дней       112500 non-null int64
недвижимость            112500 non-null int64
поведение_60-89_дней    112500 non-null int64
семья                   109549 non-null float64
dtypes: float64(4), int64(6)
memory usage: 8.6 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37500 entries, 0 to 37499
Data columns (total 10 columns):
линии                   37500 non-

6662.379910051621 6693.8207701283545


In [5]:
print(train.shape, test.shape)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
strCV = StratifiedKFold(n_splits = 5, random_state = 10)
model_rand_forest_line = RandomForestClassifier(n_estimators = 40)
rand_forest_line_grid_param = {'max_depth' : range(8, 14), 'max_features' : range(3, 9), 
                               'criterion' : ['entropy'], 
                              'min_samples_split': [30],
                              'min_samples_leaf' : [30],
                               'min_impurity_decrease' : [0.00001, 0.0004, 0.1],
                                'min_weight_fraction_leaf' : [0.00005, 0.0001, 0.01],
                                 'min_impurity_split' : [0.00004, 0.1, 0.001, 1, 0.0001]
                              }
                                            
model_rand_forest_line_grid = GridSearchCV(estimator=model_rand_forest_line, param_grid=rand_forest_line_grid_param, cv=strCV, verbose=1, n_jobs=-1)
#model_rand_forest_line_grid.fit(train, y)

In [7]:
model_rand_forest_line_grid.best_params_
{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 5,
 'min_impurity_decrease': 1e-05,
 'min_impurity_split': 0.001,
 'min_samples_leaf': 30,
 'min_samples_split': 30,
 'min_weight_fraction_leaf': 0.0001}
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size = 0.8, random_state = 20)

model_rand_forest_new = RandomForestClassifier(n_estimators=250, criterion= 'entropy', min_impurity_decrease=1e-05,
 min_weight_fraction_leaf = 0.0001,
 min_impurity_split = 0.001,
 max_depth= 8,
 max_features= 5,
 class_weight = {1 : 1, 0 : 3},
 min_samples_leaf= 30,
 min_samples_split= 30,
 oob_score= False,
 random_state = 20)
model_rand_forest_new.fit(X_train, y_train)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [10]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import warnings
warnings.simplefilter('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg' 
from pylab import rcParams
rcParams['figure.figsize'] = 8, 5
plt.figure(figsize=(6,5))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (recall)")
fpr9, tpr9, _= roc_curve(y_test, model_rand_forest_new.predict_proba(X_test)[:, 1] )
auc9 = roc_auc_score(y_test, model_rand_forest_new.predict_proba(X_test)[:, 1])
plt.plot(fpr9, tpr9, label=("auc=%.4f" % auc9), linewidth=2,
color='#990000')
plt.legend(loc="best")


NameError: name 'y_test' is not defined

<Figure size 432x360 with 1 Axes>

In [825]:

model_rand_forest = RandomForestClassifier(n_estimators=150, criterion= 'entropy', min_impurity_decrease=0.00001,
 min_weight_fraction_leaf = 0.00005,
 min_impurity_split = 0.00004,
 max_depth= 9,
 max_features= 4,
 class_weight = {1:4, 0:1},
 min_samples_leaf= 30,
 min_samples_split= 30,
 oob_score= False,
 random_state = 20)

train_with_new_features =  train.copy()
train_log_reg_data = train_with_new_features.copy()
##большие значения у линий - выбросы, уберем их



train_log_reg_data['линии'][train_log_reg_data['линии'] >= 10] = 1
feature_1 = (train_log_reg_data['поведение_30-59_дней'] >= 1) #| (train_log_reg_data['поведение_60-89_дней'] >= 1) \
#             | (train_log_reg_data['поведение_90_дней'] >= 1)
train_log_reg_data['feature_1'] = feature_1
train_log_reg_data['поведение_90_дней'][train_log_reg_data['поведение_90_дней'] > 35] = 1
train_log_reg_data['поведение_60-89_дней'][train_log_reg_data['поведение_60-89_дней'] > 35] = 1
train_log_reg_data['поведение_30-59_дней'][train_log_reg_data['поведение_30-59_дней'] > 35] = 1
# 
# 
# 
# 
# feature_2 = (train_log_reg_data['недвижимость'] >= 1) 
# train_log_reg_data['feature_2'] = feature_2
# 
# feature_3 = (train_log_reg_data['число_кредитов'] >= 15)
# train_log_reg_data['feature_3'] = feature_3

# feature_4 = (train_log_reg_data['доход'] <= train_log_reg_data['доход'].quantile(q=0.35))
# train_log_reg_data['feature_4'] = feature_4
# 
# feature_5 = (train_log_reg_data['Debt_Ratio'] <= train_log_reg_data['Debt_Ratio'].quantile(q=0.5))
# train_log_reg_data['feature_5'] = feature_5
# 
# feature_6 = (train_log_reg_data['линии'] <= train_log_reg_data['линии'].quantile(q=0.05))
# train_log_reg_data['feature_6'] = feature_6
# 
# feature_7 = (train_log_reg_data['поведение_60-89_дней'] >= 1)
# train_log_reg_data['feature_7'] = feature_7
# 
# feature_8 = (train_log_reg_data['поведение_90_дней'] >= 1)
# train_log_reg_data['feature_8'] = feature_8
# 
# feature_9 = (train_log_reg_data['возраст'] >= train_log_reg_data['возраст'].quantile(q=0.6))
# train_log_reg_data['feature_9'] = feature_9
# 
feature_10 = (3*train_log_reg_data['поведение_60-89_дней'] + train_log_reg_data['поведение_30-59_дней']) > train_log_reg_data['поведение_90_дней']
train_log_reg_data['feature_10'] = feature_10

X_train, X_test, y_train, y_test = train_test_split(train_log_reg_data, y, train_size = 0.2, random_state=20)

from sklearn.ensemble import GradientBoostingClassifier
passiveAggre = GradientBoostingClassifier(loss='exponential',learning_rate=0.1,  min_impurity_decrease=0.00001,
 min_weight_fraction_leaf = 0.00005,
 min_impurity_split = 0.00004,
 max_depth= 4,
 max_features= 4,
 min_samples_leaf= 30,
 min_samples_split= 3,
 random_state = 20)
rand_forest_line_grid_param = {'max_depth' : range(2, 7), 'max_features' : range(3, 7),
                               'learning_rate' : [1, 0.1, 0.01],
                               'loss' : ['exponential', 'deviance'], 
                              'min_samples_split': [30],
                              'min_samples_leaf' : [30],
                               'min_impurity_decrease' : [0.00001, 0.1],
                                'min_weight_fraction_leaf' : [0.00005, 0.01],
                                 'min_impurity_split' : [0.00004, 0.001, 1, 0.0001]
                              }
strCV = StratifiedKFold(n_splits = 4, random_state = 10)                                            
#model_boosting_forest_line_grid = GridSearchCV(estimator=passiveAggre, param_grid=rand_forest_line_grid_param, cv=strCV, verbose=1, n_jobs=-1)
#model_boosting_forest_line_grid.fit(train_log_reg_data, y)

#model_rand_forest.fit(X_train, y_train)


In [521]:
model_boosting_forest_line_grid.best_params_
{'learning_rate': 0.1,
 'loss': 'exponential',
 'max_depth': 5,
 'max_features': 4,
 'min_impurity_decrease': 0.1,
 'min_impurity_split': 4e-05,
 'min_samples_leaf': 30,
 'min_samples_split': 30,
 'min_weight_fraction_leaf': 5e-05}

{'learning_rate': 0.1,
 'loss': 'exponential',
 'max_depth': 5,
 'max_features': 4,
 'min_impurity_decrease': 0.1,
 'min_impurity_split': 4e-05,
 'min_samples_leaf': 30,
 'min_samples_split': 30,
 'min_weight_fraction_leaf': 5e-05}

In [900]:
X_train, X_test, y_train, y_test = train_test_split(train, y, train_size = 0.8, random_state=20)
assiveAggre_best = GradientBoostingClassifier(loss='deviance',learning_rate=0.1,  min_impurity_decrease=0.00001, subsample=0.7, n_estimators=200,
 min_weight_fraction_leaf = 0.00005,
 min_impurity_split = 0.00004,
 max_depth= 4,
 max_features= 4,
 min_samples_leaf= 150,
 min_samples_split= 760,                                           
 random_state = 20)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatur


degrees = [1, 4, 15]
#polynomial_features = PolynomialFeatures(degree=2, interaction_only=True,
#                                             include_bias=False)
#pipeline = Pipeline([("polynomial_features", polynomial_features),
#                         ("assiveAggre_best", assiveAggre_best)])
#train_poly = train_log_reg_data.copy()

assiveAggre_best.fit(X_train, y_train)



GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=4, max_leaf_nodes=None,
              min_impurity_decrease=1e-05, min_impurity_split=4e-05,
              min_samples_leaf=150, min_samples_split=760,
              min_weight_fraction_leaf=5e-05, n_estimators=200,
              presort='auto', random_state=20, subsample=0.7, verbose=0,
              warm_start=False)

In [899]:
rcParams['figure.figsize'] = 8, 5
#plt.figure(figsize=(6,5))
#plt.xlabel("False Positive Rate")
#plt.ylabel("True Positive Rate (recall)")
#fpr9, tpr9, _= roc_curve(y, model_rand_forest_new.predict_proba(train_log_reg_data)[:, 1] )
auc9 = roc_auc_score(y_test, boost_slow.predict_proba(X_test)[:, 1])
#plt.plot(fpr9, tpr9, label=("auc=%.4f" % auc9), linewidth=2,
#color='#990000')

#plt.legend(loc="best")
print("auc=%.4f" % auc9)
#auc=0.8715
#auc=0.8618

auc=0.8692


In [887]:
model_rand_forest_prediction = assiveAggre_best.predict_proba(test)[:, 1]
pd.DataFrame({'id' : np.arange(37500), 'a' : model_rand_forest_prediction}).to_csv('/home/paniquex/PycharmProjects/IML_homeworks/credit_scoring/solution_boost_from_sklearn_with_gridsearch1.csv', index=False)

In [150]:
rand_forest_line_grid_param_new = {'max_depth' : [9], 'max_features' : [3], 
                               'criterion' : ['entropy'], 
                              'min_samples_split': [35, 40, 100],
                              'min_samples_leaf' : [10],
                              'min_impurity_decrease' : [0, 0.01, 0.5, 1],
                              'min_weight_fraction_leaf' : [0., 0.000001, 0.00001, 0.5],
                              'n_estimators' : [50]}
                                            
model_rand_forest_line_grid_new = GridSearchCV(estimator=model_rand_forest_line, scoring='roc_auc', param_grid=rand_forest_line_grid_param_new, cv=strCV, verbose=1, n_jobs=-1)

In [21]:
model_rand_forest_line_grid_new.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [22]:
model_rand_forest_line_grid_new.best_params_
{'criterion': 'entropy',
 'max_depth': 9,
 'max_features': 3,
 'min_impurity_decrease': 0,
 'min_samples_leaf': 20,
 'min_samples_split': 30,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50}

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [23]:
rcParams['figure.figsize'] = 8, 5
plt.figure(figsize=(6,5))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (recall)")
fpr9, tpr9, _= roc_curve(y_test, model_rand_forest_line_grid_new.predict_proba(X_test)[:, 1] )
auc9 = roc_auc_score(y_test, model_rand_forest_line_grid_new.predict_proba(X_test)[:, 1])
plt.plot(fpr9, tpr9, label=("auc=%.3f" % auc9), linewidth=2,
color='#990000')
plt.legend(loc="best")

NameError: name 'y_test' is not defined

<Figure size 432x360 with 1 Axes>

In [138]:
model_rand_forest_new = RandomForestClassifier(n_estimators=100, criterion= 'entropy',
 max_depth= 10,
 max_features= 10,
 min_samples_leaf= 20,
 min_samples_split= 30,
 oob_score= False,
 random_state = 20)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=30,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=20, verbose=0, warm_start=False)

In [140]:
rcParams['figure.figsize'] = 8, 5
#plt.figure(figsize=(6,5))
#plt.xlabel("False Positive Rate")
#plt.ylabel("True Positive Rate (recall)")
#fpr9, tpr9, _= roc_curve(y, model_rand_forest_new.predict_proba(train_log_reg_data)[:, 1] )
auc9 = roc_auc_score(y_test, model_rand_forest_new.predict_proba(X_test)[:, 1])
#plt.plot(fpr9, tpr9, label=("auc=%.4f" % auc9), linewidth=2,
#color='#990000')
#plt.legend(loc="best")
print("auc=%.4f" % auc9)
#auc=0.8742

auc=0.8567


## Предобработка данных для использования логистической регрессии

In [26]:
import numpy as np
train_log = train.copy()
test_log = test.copy()

from sklearn.preprocessing import StandardScaler
#col_names = ['линии', 'доход', 'Debt_Ratio']
#features = train_log[col_names]
#scaler = StandardScaler(with_mean = False, with_std = True).fit(features.values)
#features = scaler.transform(features.values)
#train_log[col_names] = features
#from sklearn.model_selection import train_test_split
#X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(train_log, y, train_size = 0.2)
train_log.describe()

Unnamed: 0,линии,возраст,поведение_30-59_дней,Debt_Ratio,доход,число_кредитов,поведение_90_дней,недвижимость,поведение_60-89_дней,семья
count,112500.0,112500.0,112500.0,112500.0,112500.0,112500.0,112500.0,112500.0,112500.0,112500.0
mean,6.057904,52.292702,0.425538,350.965444,6662.38,8.464267,0.271627,1.01968,0.245333,0.762071
std,257.959993,14.765029,4.246085,1864.557746,13796.56,5.149137,4.223498,1.130324,4.208686,1.099412
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.02984,41.0,0.0,0.175069,3900.0,5.0,0.0,0.0,0.0,0.0
50%,0.154015,52.0,0.0,0.366336,6600.0,8.0,0.0,1.0,0.0,0.0
75%,0.559389,63.0,0.0,0.863569,7400.0,11.0,0.0,2.0,0.0,1.0
max,50708.0,109.0,98.0,326442.0,3008750.0,58.0,98.0,54.0,98.0,13.0


In [27]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
train_with_out_60_30 = train_log.copy()
train_with_out_60_30.pop('поведение_30-59_дней')
train_with_out_60_30.pop('поведение_60-89_дней')
log_reg_first = LogisticRegression()
rfecv = RFECV(estimator=log_reg_first, cv=5, scoring='accuracy', n_jobs=-1, step=1)
rfecv.fit(train, y)

KeyboardInterrupt: 

In [399]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=200, penalty='l2', C=0.01, random_state=20, class_weight=  'balanced', tol=0.00004, solver = 'sag')
train_with_new_features =  train.copy()
train_log_reg_data = train_with_new_features.copy()
##большие значения у линий - выбросы, уберем их



train_log_reg_data['линии'][(train_log_reg_data['линии'] >= 10)] = 0.3
train_log_reg_data['доход'][train_log_reg_data['доход'] > train_log_reg_data['доход'].quantile(q=0.85)] = train_log_reg_data['доход'].quantile(q=0.4)
train_log_reg_data['Debt_Ratio'][train_log_reg_data['Debt_Ratio'] > train_log_reg_data['Debt_Ratio'].quantile(q=0.8)] = train_log_reg_data['Debt_Ratio'].quantile(q=0.4)

feature_1 = (train_log_reg_data['поведение_30-59_дней'] >= 1) #| (train_log_reg_data['поведение_60-89_дней'] >= 1) \
            # | (train_log_reg_data['поведение_90_дней'] >= 1)
train_log_reg_data['feature_1'] = feature_1
train_log_reg_data['поведение_90_дней'][train_log_reg_data['поведение_90_дней'] > 35] = 1
train_log_reg_data['поведение_60-89_дней'][train_log_reg_data['поведение_60-89_дней'] > 35] = 1
train_log_reg_data['поведение_30-59_дней'][train_log_reg_data['поведение_30-59_дней'] > 35] = 1




feature_2 = (train_log_reg_data['недвижимость'] >= 1) 
train_log_reg_data['feature_2'] = feature_2

feature_3 = (train_log_reg_data['число_кредитов'] >= 15)
train_log_reg_data['feature_3'] = feature_3

feature_4 = (train_log_reg_data['доход'] <= train_log_reg_data['доход'].quantile(q=0.35))
train_log_reg_data['feature_4'] = feature_4

feature_5 = (train_log_reg_data['Debt_Ratio'] <= train_log_reg_data['Debt_Ratio'].quantile(q=0.5))
train_log_reg_data['feature_5'] = feature_5

feature_6 = (train_log_reg_data['линии'] <= train_log_reg_data['линии'].quantile(q=0.05))
train_log_reg_data['feature_6'] = feature_6

feature_7 = (train_log_reg_data['поведение_60-89_дней'] >= 1)
train_log_reg_data['feature_7'] = feature_7

feature_8 = (train_log_reg_data['поведение_90_дней'] >= 1)
train_log_reg_data['feature_8'] = feature_8

feature_9 = (train_log_reg_data['возраст'] >= train_log_reg_data['возраст'].quantile(q=0.6))
train_log_reg_data['feature_9'] = feature_9

feature_10 = (3*train_log_reg_data['поведение_60-89_дней'] + 5 * train_log_reg_data['поведение_30-59_дней']) > train_log_reg_data['поведение_90_дней']
train_log_reg_data['feature_10'] = feature_10



# train_log_reg_data.pop('поведение_30-59_дней')
# train_log_reg_data.pop('поведение_60-89_дней')
# train_log_reg_data.pop('поведение_90_дней')
# train_log_reg_data['feature_1'][train_log_reg_data['feature_1'] == True] = 1
# train_log_reg_data['feature_1'][train_log_reg_data['feature_1'] == False] = 0
# train_log_reg_data['feature_2'][train_log_reg_data['feature_2'] == True] = 1
# train_log_reg_data['feature_2'][train_log_reg_data['feature_2'] == False] = 0
# train_log_reg_data['feature_3'][train_log_reg_data['feature_3'] == True] = 1
# train_log_reg_data['feature_3'][train_log_reg_data['feature_3'] == False] = 0
# train_log_reg_data['feature_4'][train_log_reg_data['feature_4'] == True] = 1
# train_log_reg_data['feature_4'][train_log_reg_data['feature_4'] == False] = 0
# train_log_reg_data['feature_5'][train_log_reg_data['feature_5'] == True] = 1
# train_log_reg_data['feature_5'][train_log_reg_data['feature_5'] == False] = 0
# 
train_log_reg_data['feature_1'] = train_log_reg_data['feature_1'].astype(bool)
train_log_reg_data['feature_2'] = train_log_reg_data['feature_2'].astype(bool)
# train_log_reg_data['feature_3'] = train_log_reg_data['feature_3'].astype(bool)
train_log_reg_data['семья'] = train_log_reg_data['семья'].astype(int)


col_names = ['линии', 'доход', 'Debt_Ratio']
features = train_log_reg_data[col_names]
features = StandardScaler().fit_transform(features)
train_log_reg_data[col_names] = features


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, y, train_size = 0.8, random_state=20)
grid_rfecv_params = {'tol' : [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001],
                     'C' : [1, 0.5, 0.1, 0.01, 0.001, 0.0001],
                     'fit_intercept' : [True, False],
                      'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                      'max_iter' : [100, 200, 300, 500]
                       }
log_reg_with_grid = LogisticRegression(penalty='l2', C=1, fit_intercept= True,  max_iter= 300, solver= 'liblinear', tol=0.0001, class_weight='balanced', )
from sklearn.ensemble import GradientBoostingClassifier
passiveAggre = GradientBoostingClassifier(learning_rate=0.1)
passiveAggre.fit(X_train, y_train)
#grid_rfecv = GridSearchCV(estimator=logreg, param_grid=grid_rfecv_params, n_jobs=-1, scoring='roc_auc', cv=StratifiedKFold(n_splits=5), verbose=1)
rfecv = RFECV(estimator=passiveAggre, cv=StratifiedKFold(n_splits=4, random_state=20), step=2, scoring='roc_auc', verbose=1)
#train_log_reg_data.info()
rfecv.fit(X_train, y_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112500 entries, 0 to 112499
Data columns (total 20 columns):
линии                   112500 non-null float64
возраст                 112500 non-null int64
поведение_30-59_дней    112500 non-null int64
Debt_Ratio              112500 non-null float64
доход                   112500 non-null float64
число_кредитов          112500 non-null int64
поведение_90_дней       112500 non-null int64
недвижимость            112500 non-null int64
поведение_60-89_дней    112500 non-null int64
семья                   112500 non-null int64
feature_1               112500 non-null bool
feature_2               112500 non-null bool
feature_3               112500 non-null bool
feature_4               112500 non-null bool
feature_5               112500 non-null bool
feature_6               112500 non-null bool
feature_7               112500 non-null bool
feature_8               112500 non-null bool
feature_9               112500 non-null bool
feature_10        

Fitting estimator with 8 features.


Fitting estimator with 6 features.


Fitting estimator with 4 features.


Fitting estimator with 2 features.


Fitting estimator with 10 features.


Fitting estimator with 8 features.


Fitting estimator with 6 features.


Fitting estimator with 4 features.


Fitting estimator with 2 features.


Fitting estimator with 10 features.


Fitting estimator with 8 features.


Fitting estimator with 6 features.


Fitting estimator with 4 features.


Fitting estimator with 2 features.


Fitting estimator with 10 features.


Fitting estimator with 8 features.


Fitting estimator with 6 features.


Fitting estimator with 4 features.


Fitting estimator with 2 features.


RFECV(cv=StratifiedKFold(n_splits=4, random_state=20, shuffle=False),
   estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
   n_jobs=1, scoring='roc_auc', step=2, verbose=1)

In [401]:
rcParams['figure.figsize'] = 8, 5
plt.figure(figsize=(6,5))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (recall)")
fpr9, tpr9, _= roc_curve(y_test, rfecv.predict_proba(X_test)[:, 1] )
auc9 = roc_auc_score(y_test, rfecv.predict_proba(X_test)[:, 1])
plt.plot(fpr9, tpr9, label=("auc=%.4f" % auc9), linewidth=2,
color='#990000')
plt.legend(loc="best")
print("auc=%.4f" % auc9)
#auc=0.8675

#auc=0.8687


auc=0.8687


<Figure size 432x360 with 1 Axes>

In [403]:
rfecv.best_params_
#log_reg_with_grid = LogisticRegression(C=1, fit_intercept= True, max_iter= 100, solver= 'sag', tol=0.001)

AttributeError: 'RFECV' object has no attribute 'best_params_'

In [28]:
rfecv.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [29]:
rcParams['figure.figsize'] = 8, 5
plt.figure(figsize=(6,5))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (recall)")
fpr9, tpr9, _= roc_curve(y_test, rfecv.predict_proba(X_test)[:, 1] )
auc9 = roc_auc_score(y_test, rfecv.predict_proba(X_test)[:, 1])
plt.plot(fpr9, tpr9, label=("auc=%.4f" % auc9), linewidth=2,
color='#990000')
plt.legend(loc="best")

<matplotlib.legend.Legend at 0x7f416f2e6908>

<Figure size 432x360 with 1 Axes>

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
degrees = [1, 4, 15]
polynomial_features = PolynomialFeatures(degree=2, interaction_only=True,
                                             include_bias=False)
pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("rfecv", rfecv)])
train_poly = train_log_reg_data.copy()
#polynomial_features.fit(train_poly, y)
#train_poly = polynomial_features.transform(train_poly)
pipeline.fit(train_poly, y)

print('2')


Fitting estimator with 210 features.


Fitting estimator with 200 features.


Fitting estimator with 190 features.


Fitting estimator with 180 features.


Fitting estimator with 170 features.


Fitting estimator with 160 features.


Fitting estimator with 150 features.


Fitting estimator with 140 features.


Fitting estimator with 130 features.


Fitting estimator with 120 features.


Fitting estimator with 110 features.


Fitting estimator with 100 features.


Fitting estimator with 90 features.


Fitting estimator with 80 features.


Fitting estimator with 70 features.


Fitting estimator with 60 features.


Fitting estimator with 50 features.


Fitting estimator with 40 features.


Fitting estimator with 30 features.


Fitting estimator with 20 features.


Fitting estimator with 10 features.


Fitting estimator with 210 features.


Fitting estimator with 200 features.


Fitting estimator with 190 features.


Fitting estimator with 180 features.


Fitting estimator with 170 features.


Fitting estimator with 160 features.


Fitting estimator with 150 features.


Fitting estimator with 140 features.


Fitting estimator with 130 features.


Fitting estimator with 120 features.


Fitting estimator with 110 features.


Fitting estimator with 100 features.


Fitting estimator with 90 features.


Fitting estimator with 80 features.


Fitting estimator with 70 features.


Fitting estimator with 60 features.


Fitting estimator with 50 features.


Fitting estimator with 40 features.


Fitting estimator with 30 features.


Fitting estimator with 20 features.


Fitting estimator with 10 features.


Fitting estimator with 210 features.


Fitting estimator with 200 features.


Fitting estimator with 190 features.


Fitting estimator with 180 features.


Fitting estimator with 170 features.


Fitting estimator with 160 features.


Fitting estimator with 150 features.


Fitting estimator with 140 features.


Fitting estimator with 130 features.


Fitting estimator with 120 features.


Fitting estimator with 110 features.


Fitting estimator with 100 features.


Fitting estimator with 90 features.


Fitting estimator with 80 features.


Fitting estimator with 70 features.


Fitting estimator with 60 features.


Fitting estimator with 50 features.


Fitting estimator with 40 features.


Fitting estimator with 30 features.


Fitting estimator with 20 features.


Fitting estimator with 10 features.


Fitting estimator with 210 features.


Fitting estimator with 200 features.


Fitting estimator with 190 features.


Fitting estimator with 180 features.


Fitting estimator with 170 features.


Fitting estimator with 160 features.


Fitting estimator with 150 features.


Fitting estimator with 140 features.


Fitting estimator with 130 features.


Fitting estimator with 120 features.


Fitting estimator with 110 features.


Fitting estimator with 100 features.


Fitting estimator with 90 features.


Fitting estimator with 80 features.


Fitting estimator with 70 features.


Fitting estimator with 60 features.


Fitting estimator with 50 features.


Fitting estimator with 40 features.


Fitting estimator with 30 features.


Fitting estimator with 20 features.


Fitting estimator with 10 features.


2


In [106]:
rfecv.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 2,
       1, 1, 1, 1, 2, 3, 1, 3, 3, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 3, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [109]:
rcParams['figure.figsize'] = 8, 5
plt.figure(figsize=(6,5))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (recall)")
#fpr9, tpr9, _= roc_curve(y_train, pipeline.predict_proba(X_train)[:, 1] )
auc9 = roc_auc_score(y, pipeline.predict_proba(train_log_reg_data)[:, 1])
#plt.plot(fpr9, tpr9, label=("auc=%.4f" % auc9), linewidth=2, color='#990000')
#plt.legend(loc="best")
#auc=0.8653 // 2, interaction = True
print("auc=%.4f" % auc9)

auc=0.8653


<Figure size 432x360 with 1 Axes>

In [110]:
test_with_new_features =  test.copy()
test_log_reg_data = test_with_new_features.copy()
##большие значения у линий - выбросы, уберем их



test_log_reg_data['линии'][test_log_reg_data['линии'] >= 10] = 0.3
feature_1 = (test_log_reg_data['поведение_30-59_дней'] >= 1) #| (test_log_reg_data['поведение_60-89_дней'] >= 1) \
            # | (test_log_reg_data['поведение_90_дней'] >= 1)
test_log_reg_data['feature_1'] = feature_1
test_log_reg_data['поведение_90_дней'][test_log_reg_data['поведение_90_дней'] > 35] = 1
test_log_reg_data['поведение_60-89_дней'][test_log_reg_data['поведение_60-89_дней'] > 35] = 1
test_log_reg_data['поведение_30-59_дней'][test_log_reg_data['поведение_30-59_дней'] > 35] = 1




feature_2 = (test_log_reg_data['недвижимость'] >= 1) 
test_log_reg_data['feature_2'] = feature_2

feature_3 = (test_log_reg_data['число_кредитов'] >= 15)
test_log_reg_data['feature_3'] = feature_3

feature_4 = (test_log_reg_data['доход'] <= test_log_reg_data['доход'].quantile(q=0.35))
test_log_reg_data['feature_4'] = feature_4

feature_5 = (test_log_reg_data['Debt_Ratio'] <= test_log_reg_data['Debt_Ratio'].quantile(q=0.5))
test_log_reg_data['feature_5'] = feature_5

feature_6 = (test_log_reg_data['линии'] <= test_log_reg_data['линии'].quantile(q=0.05))
test_log_reg_data['feature_6'] = feature_6

feature_7 = (test_log_reg_data['поведение_60-89_дней'] >= 1)
test_log_reg_data['feature_7'] = feature_7

feature_8 = (test_log_reg_data['поведение_90_дней'] >= 1)
test_log_reg_data['feature_8'] = feature_8

feature_9 = (test_log_reg_data['возраст'] >= test_log_reg_data['возраст'].quantile(q=0.6))
test_log_reg_data['feature_9'] = feature_9

feature_10 = (3*test_log_reg_data['поведение_60-89_дней'] + 5 * test_log_reg_data['поведение_30-59_дней']) > test_log_reg_data['поведение_90_дней']
test_log_reg_data['feature_10'] = feature_10



# test_log_reg_data.pop('поведение_30-59_дней')
# test_log_reg_data.pop('поведение_60-89_дней')
# test_log_reg_data.pop('поведение_90_дней')
# test_log_reg_data['feature_1'][test_log_reg_data['feature_1'] == True] = 1
# test_log_reg_data['feature_1'][test_log_reg_data['feature_1'] == False] = 0
# test_log_reg_data['feature_2'][test_log_reg_data['feature_2'] == True] = 1
# test_log_reg_data['feature_2'][test_log_reg_data['feature_2'] == False] = 0
# test_log_reg_data['feature_3'][test_log_reg_data['feature_3'] == True] = 1
# test_log_reg_data['feature_3'][test_log_reg_data['feature_3'] == False] = 0
# test_log_reg_data['feature_4'][test_log_reg_data['feature_4'] == True] = 1
# test_log_reg_data['feature_4'][test_log_reg_data['feature_4'] == False] = 0
# test_log_reg_data['feature_5'][test_log_reg_data['feature_5'] == True] = 1
# test_log_reg_data['feature_5'][test_log_reg_data['feature_5'] == False] = 0
# 
test_log_reg_data['feature_1'] = test_log_reg_data['feature_1'].astype(bool)
test_log_reg_data['feature_2'] = test_log_reg_data['feature_2'].astype(bool)
# test_log_reg_data['feature_3'] = test_log_reg_data['feature_3'].astype(bool)
test_log_reg_data['семья'] = test_log_reg_data['семья'].astype(int)


col_names = ['линии', 'доход', 'Debt_Ratio']
features = test_log_reg_data[col_names]
features = StandardScaler().fit_transform(features)
test_log_reg_data[col_names] = features



In [112]:
model_logreg_pipeline_prediction = pipeline.predict_proba(test_log_reg_data)[:, 1]
pd.DataFrame({'id' : np.arange(37500), 'a' : model_logreg_pipeline_prediction}).to_csv('/home/paniquex/PycharmProjects/IML_homeworks/credit_scoring/solution_logreg_pipeline_polynomial_with_new_features.csv', index=False)

In [262]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112500 entries, 0 to 112499
Data columns (total 10 columns):
линии                   112500 non-null float64
возраст                 112500 non-null int64
поведение_30-59_дней    112500 non-null int64
Debt_Ratio              112500 non-null float64
доход                   112500 non-null float64
число_кредитов          112500 non-null int64
поведение_90_дней       112500 non-null int64
недвижимость            112500 non-null int64
поведение_60-89_дней    112500 non-null int64
семья                   112500 non-null float64
dtypes: float64(4), int64(6)
memory usage: 8.6 MB
