In [7]:
# package imports
import pandas as pd
import numpy as np 
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
InteractiveShell.ast_node_interactivity = "all"

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics

from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split

# 查看数据

In [6]:
# Read the labelled training data and the unlabelled competition data.
# The `ID` column is only a row identifier, so it is removed from both
# frames right after loading instead of being dropped in place later.
banking = pd.read_csv('train_set.csv').drop(columns=['ID'])
test = pd.read_csv('test_set.csv').drop(columns=['ID'])


In [8]:
# Quick look at the training frame: dimensions, first rows, dtypes/null
# counts and summary statistics of the numeric columns.
# Each result is rendered with an explicit display() call so the cell shows
# all of them even when IPython is configured to echo only the last expression.
display(banking.shape)
display(banking.head())
banking.info()  # prints its report directly
display(banking.describe())

(25317, 17)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown,0
1,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other,0
2,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown,0
3,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown,0
4,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25317 entries, 0 to 25316
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        25317 non-null  int64 
 1   job        25317 non-null  object
 2   marital    25317 non-null  object
 3   education  25317 non-null  object
 4   default    25317 non-null  object
 5   balance    25317 non-null  int64 
 6   housing    25317 non-null  object
 7   loan       25317 non-null  object
 8   contact    25317 non-null  object
 9   day        25317 non-null  int64 
 10  month      25317 non-null  object
 11  duration   25317 non-null  int64 
 12  campaign   25317 non-null  int64 
 13  pdays      25317 non-null  int64 
 14  previous   25317 non-null  int64 
 15  poutcome   25317 non-null  object
 16  y          25317 non-null  int64 
dtypes: int64(8), object(9)
memory usage: 3.3+ MB


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
count,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0
mean,40.935379,1357.555082,15.835289,257.732393,2.77205,40.248766,0.591737,0.116957
std,10.634289,2999.822811,8.31948,256.975151,3.136097,100.213541,2.568313,0.321375
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0,0.0
25%,33.0,73.0,8.0,103.0,1.0,-1.0,0.0,0.0
50%,39.0,448.0,16.0,181.0,2.0,-1.0,0.0,0.0
75%,48.0,1435.0,21.0,317.0,3.0,-1.0,0.0,0.0
max,95.0,102127.0,31.0,3881.0,55.0,854.0,275.0,1.0


## 查看类别数据

In [4]:
# Split the predictor columns into categorical and numeric groups; both lists
# are reused later when the preprocessing ColumnTransformer is built.
cate_features = ['job', 'marital','education','default','housing','loan','contact','month','poutcome']

num_features = ['age', 'balance','day','duration','campaign','pdays','previous']

# Show the level frequencies of every categorical column.
# display() is called explicitly: a bare expression inside a loop body is
# only echoed when ast_node_interactivity is set to "all".
for feature in cate_features:
    display(feature, banking[feature].value_counts())

'job'

blue-collar      5456
management       5296
technician       4241
admin.           2909
services         2342
retired          1273
self-employed     884
entrepreneur      856
unemployed        701
housemaid         663
student           533
unknown           163
Name: job, dtype: int64

'marital'

married     15245
single       7157
divorced     2915
Name: marital, dtype: int64

'education'

secondary    12957
tertiary      7447
primary       3848
unknown       1065
Name: education, dtype: int64

'default'

no     24869
yes      448
Name: default, dtype: int64

'housing'

yes    14020
no     11297
Name: housing, dtype: int64

'loan'

no     21258
yes     4059
Name: loan, dtype: int64

'contact'

cellular     16391
unknown       7281
telephone     1645
Name: contact, dtype: int64

'month'

may    7655
jul    3937
aug    3482
jun    2968
nov    2243
apr    1669
feb    1464
jan     777
oct     411
sep     339
mar     269
dec     103
Name: month, dtype: int64

'poutcome'

unknown    20677
failure     2735
other       1070
success      835
Name: poutcome, dtype: int64

## 查看相关系数

In [5]:
# Pearson correlation of each numeric column with the target `y`.
# The frame also holds object (string) columns; pandas >= 2.0 no longer
# drops them silently in DataFrame.corr() and raises instead, so the numeric
# columns are selected explicitly before computing the matrix.
corr_matrix = banking.select_dtypes(include='number').corr()
corr_matrix['y'].sort_values(ascending=False)

y           1.000000
duration    0.394746
pdays       0.107565
previous    0.088337
balance     0.057564
age         0.029916
day        -0.031886
campaign   -0.075173
Name: y, dtype: float64

## 划分训练集和测试集

In [6]:
# Separate the target column `y` from the predictor columns.
label = banking['y']
X_banking = banking.drop(columns=['y'])

# Hold out 20% of the rows for validation (80/20 train/test split);
# the seed is fixed so the partition is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_banking,
    label,
    test_size=0.2,
    random_state=1,
)


# 特征工程

## 删除缺省值过多的特征（待定）

## StandardScaler、OneHot

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column-wise preprocessing:
#   * numeric columns     -> standardised with StandardScaler
#   * categorical columns -> one-hot encoded
# handle_unknown='ignore' makes the encoder emit an all-zero group for a
# category that was not present when it was fitted; with the default setting
# transform() raises on such a value, which can happen because the encoder is
# fitted on the training split only and later applied to other data.
full_pipeline = ColumnTransformer([
        ('std_scaler', StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), cate_features)
    ])

# Learn the scaling statistics and category levels from the training split
# only, then transform that same split.
train_prepared = full_pipeline.fit_transform(X_train)

In [13]:
# Inspect the transformed training matrix produced by full_pipeline.fit_transform(X_train).
train_prepared

array([[ 0.0046424 , -0.35006117, -0.93815229, ...,  0.        ,
         0.        ,  1.        ],
       [-0.83987151, -0.07321703, -1.41708048, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.66148655, -0.33697161, -1.77627663, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-1.12137614, -0.43579777,  0.85782842, ...,  0.        ,
         0.        ,  1.        ],
       [-0.18302736, -0.31962795,  1.21702457, ...,  0.        ,
         0.        ,  0.        ],
       [-0.65220175,  0.19871852, -1.05788434, ...,  0.        ,
         0.        ,  1.        ]])

# 模型选择

In [29]:
# Apply the preprocessing to the held-out split X_test.
# Only transform() is called here: the pipeline was already fitted on X_train,
# so no statistics are learned from the validation rows.
test_prepared = full_pipeline.transform(X_test)

## GradientBoostingClassifier模型

In [30]:
# Candidate 1: gradient-boosted trees with library defaults and a fixed seed.
# NOTE(review): this cell, the RandomForestClassifier cell and the LinearSVC
# cell all bind the same name `model`; whichever of them ran last is the
# estimator that gets trained and used for the submission.
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(random_state=10)

## RandomForestClassifier

In [None]:
# Candidate 2: random forest of 500 trees, each limited to 16 leaf nodes,
# trained on all available cores (n_jobs=-1) with a fixed seed.
# NOTE(review): rebinds `model`, replacing any estimator chosen in an earlier cell.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)

## LinearSVC

In [19]:
# Candidate 3: linear support vector classifier with a fixed seed.
# NOTE(review): rebinds `model`, replacing any estimator chosen in an earlier cell.
# LinearSVC provides decision_function() but not predict_proba(), so code that
# scores this model has to work with decision values rather than probabilities.
from sklearn.svm import LinearSVC

model = LinearSVC(random_state=42)

## 训练并验证模型

In [31]:
# Fit the currently selected estimator on the preprocessed training split.
model.fit(train_prepared, Y_train) 

# Hard class predictions on the held-out split, used for accuracy.
Y_pred = model.predict(test_prepared)

# Continuous positive-class scores, used for ROC AUC.
# Not every candidate estimator implements predict_proba (LinearSVC does not),
# so fall back to decision_function; roc_auc_score only needs a ranking score.
if hasattr(model, "predict_proba"):
    Y_predprob = model.predict_proba(test_prepared)[:, 1]
else:
    Y_predprob = model.decision_function(test_prepared)

print("Accuracy : %.4g" % metrics.accuracy_score(Y_test, Y_pred))
# Both metrics are computed against Y_test, i.e. the held-out split, so the
# AUC is labelled as a test score rather than a training score.
print("AUC Score (Test): %f" % metrics.roc_auc_score(Y_test, Y_predprob))

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=10, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

Accuracy : 0.9111
AUC Score (Train): 0.925350


## 交叉验证拟合情况

## 网格搜索选出最佳超参数

# 预测test_set.csv并写入Result.csv

In [35]:
# Apply the already-fitted preprocessing to the unlabelled competition data.
# The result is bound to a new name instead of overwriting `test`, so this
# cell can be re-run without feeding an already-transformed array back into
# the pipeline.
test_features = full_pipeline.transform(test)

y_pred = model.predict(test_features)

# Positive-class score for each row; fall back to decision_function for
# estimators that do not implement predict_proba (e.g. LinearSVC).
if hasattr(model, "predict_proba"):
    y_predprob = model.predict_proba(test_features)[:, 1]
else:
    y_predprob = model.decision_function(test_features)

# Take the row identifiers from the source file rather than a hard-coded
# numeric range, so they always correspond to the rows that were scored.
ID = pd.read_csv('test_set.csv', usecols=['ID'])['ID'].tolist()

# Building the frame from both columns at once raises if their lengths
# disagree, instead of silently producing missing or misaligned predictions.
submission = pd.DataFrame({'ID': ID, 'pred': y_predprob})

submission.to_csv('Result.csv', index = False)