In [18]:
import datetime
import os

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
# Seed NumPy's global random generator.
np.random.seed(2020)

# Directory that holds train.csv and testA.csv.  Set the LOAN_DATA_DIR
# environment variable to point the notebook at another location; when it is
# not set, the original hard-coded location is used, so behaviour is unchanged.
DATA_DIR = os.environ.get(
    'LOAN_DATA_DIR',
    'G://classStudy/II 2/机器学习/大作业/贷款违约预测',
)

train_file_name = DATA_DIR + '/train.csv'
test_file_name = DATA_DIR + '/testA.csv'

# Raw training and test tables; later cells modify both frames in place.
df_train = pd.read_csv(train_file_name)
df_test = pd.read_csv(test_file_name)

In [19]:
from sklearn.preprocessing import LabelEncoder

# One shared encoder instance; later cells call fit_transform on it again for
# other columns, so it never keeps the classes of an earlier column.
labelEncoder = LabelEncoder()

# Replace the grade labels with integer codes.  This statement used to appear
# twice in the cell: the second pass re-fitted the encoder on the integers it
# had just produced, which left the column unchanged and only overwrote the
# encoder's learned classes, so it has been removed.
df_train['grade'] = labelEncoder.fit_transform(df_train['grade'])

In [20]:
# Inspect the grade column after label encoding.
df_train['grade']

0         4
1         3
2         3
3         0
4         2
         ..
799995    2
799996    0
799997    2
799998    0
799999    1
Name: grade, Length: 800000, dtype: int64

In [21]:
# Inspect the raw employmentLength column before it is converted to integer codes.
df_train['employmentLength']

0           2 years
1           5 years
2           8 years
3         10+ years
4               NaN
            ...    
799995      7 years
799996    10+ years
799997    10+ years
799998    10+ years
799999      5 years
Name: employmentLength, Length: 800000, dtype: object

In [22]:
# Ordered labels: a label's position in this list is the integer code it is
# replaced with.  Values that are not listed (e.g. NaN) are left untouched.
# NOTE: '8 years' is listed before '7 years', so they receive codes 7 and 8
# respectively; the order is reproduced exactly as the notebook had it.
employmentLength = [
    '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
    '6 years', '8 years', '7 years', '9 years', '10+ years',
]
for code, label in enumerate(employmentLength):
    df_train['employmentLength'] = df_train['employmentLength'].replace(label, code)

In [23]:
# Count loans per (subGrade, isDefault) pair, then derive `yp`: the share of
# rows with isDefault == 1 inside each subGrade.
# NOTE: `_` is also the name IPython uses for the last cell output; it is kept
# because later cells read this table through `_`.
_ = pd.crosstab(df_train['subGrade'], df_train['isDefault'])
_ = _.assign(yp=_[1] / (_[0] + _[1]))

# Turn subGrade back into an ordinary column and order rows by default rate.
_ = _.reset_index()
_ = _.sort_values(by="yp")

In [24]:
# Show the per-subGrade default-rate table built in the previous cell.
_

isDefault,subGrade,0,1,yp
0,A1,25082,827,0.031919
1,A2,21113,1011,0.045697
2,A3,21389,1266,0.055882
3,A4,28849,2079,0.067221
4,A5,34796,3249,0.085399
5,B1,38020,4362,0.102921
6,B2,39262,4965,0.112262
7,B3,42319,6281,0.129239
8,B4,42156,7360,0.148639
9,B5,40854,8111,0.165649


In [25]:
# Attach the per-subGrade default rate (`yp`) to every training row.
subgrade_rate = _[["subGrade", "yp"]]
df_train = df_train.merge(subgrade_rate, on="subGrade", how="left")

# The merge above needs the original subGrade labels, so encode them afterwards.
df_train['subGrade'] = labelEncoder.fit_transform(df_train['subGrade'])

# issueDateDT = whole days between the reference date and the issue date.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
df_train['issueDate'] = pd.to_datetime(df_train['issueDate'], format='%Y-%m-%d')
df_train['issueDateDT'] = (df_train['issueDate'] - startdate).dt.days

# Keep only the last four characters of earliesCreditLine, as an integer.
df_train['earliesCreditLine'] = df_train['earliesCreditLine'].str[-4:].astype('int64')

In [26]:
# Inspect earliesCreditLine after it was reduced to an integer.
df_train['earliesCreditLine']

0         2001
1         2002
2         2006
3         1999
4         1977
          ... 
799995    2011
799996    1989
799997    2002
799998    1994
799999    2002
Name: earliesCreditLine, Length: 800000, dtype: int64

In [27]:
# Feature columns handed to the scaler and the models, in model input order:
# the named loan/borrower columns, the n-columns n0..n14 with n3 left out,
# and finally the per-subGrade default rate `yp`.
tags = (
    [
        'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
        'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
        'annualIncome', 'verificationStatus', 'issueDateDT', 'earliesCreditLine',
        'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
        'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
        'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
        'initialListStatus', 'applicationType', 'title',
    ]
    + ['n%d' % k for k in range(15) if k != 3]
    + ['yp']
)

In [28]:
# Re-fit the shared encoder on the test grades and store the integer codes.
# NOTE(review): the encoder is fitted on the test column itself, so the codes
# line up with the training encoding only if both files contain the same set
# of grade labels -- TODO confirm.
encoded_test_grade = labelEncoder.fit(df_test['grade']).transform(df_test['grade'])
df_test['grade'] = encoded_test_grade

In [29]:
# Ordered labels: a label's position in this list is the integer code it is
# replaced with.  Values that are not listed (e.g. NaN) are left untouched.
#
# The order has to be identical to the list used to encode df_train, otherwise
# the same label means different numbers at training and prediction time.
# This cell previously listed '5 years' before '4 years', which swapped their
# codes relative to the training data; that is corrected here.
# NOTE: '8 years' precedes '7 years' deliberately -- that is how the training
# cell orders them, and the two encodings must agree.
employmentLength = ['< 1 year', '1 year', '2 years',
                    '3 years', '4 years', '5 years',
                    '6 years', '8 years', '7 years', '9 years', '10+ years']
for code, label in enumerate(employmentLength):
    df_test['employmentLength'] = df_test['employmentLength'].replace(label, code)

In [30]:
# Attach the per-subGrade default rate (`yp`, computed from the training
# data) to every test row.
subgrade_rate = _[["subGrade", "yp"]]
df_test = df_test.merge(subgrade_rate, on="subGrade", how="left")

# The merge above needs the original subGrade labels, so encode them afterwards.
# NOTE(review): the encoder is re-fitted on the test column here; the codes
# match the training ones only if both files hold the same subGrade labels.
df_test['subGrade'] = labelEncoder.fit_transform(df_test['subGrade'])

# issueDateDT = whole days between the reference date and the issue date.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
df_test['issueDate'] = pd.to_datetime(df_test['issueDate'], format='%Y-%m-%d')
df_test['issueDateDT'] = (df_test['issueDate'] - startdate).dt.days

# Keep only the last four characters of earliesCreditLine, as an integer.
df_test['earliesCreditLine'] = df_test['earliesCreditLine'].str[-4:].astype('int64')

In [31]:
# Standardise the feature columns.  The scaler is fitted on the training
# matrix only and then applied unchanged to the test matrix.
Standard_scaler = StandardScaler()
train_matrix = df_train[tags].values
test_matrix = df_test[tags].values

x = Standard_scaler.fit(train_matrix).transform(train_matrix)
x_ = Standard_scaler.transform(test_matrix)

# Training target.
y = df_train['isDefault'].values

In [32]:
# LightGBM regressor settings, collected in one place so they are easy to
# read and tweak; they are passed to the constructor unchanged.
lgb_params = dict(
    num_leaves=30,
    max_depth=10,
    learning_rate=0.01,
    n_estimators=13000,
    subsample_for_bin=5000,
    min_child_samples=200,
    colsample_bytree=.2,
    reg_alpha=.1,
    reg_lambda=.1,
    seed=2020,
)
lgbr = LGBMRegressor(**lgb_params)

In [33]:
# CatBoost regressor settings, passed to the constructor unchanged.
# NOTE(review): early_stopping_rounds is set, but the fit call further down
# passes no eval_set; confirm against the CatBoost docs whether early
# stopping can trigger without one.
catboost_params = dict(
    depth=9,
    l2_leaf_reg=1,
    learning_rate=0.01,
    eval_metric='AUC',
    border_count=128,
    bagging_temperature=0.9,
    n_estimators=16000,
    early_stopping_rounds=500,
    subsample=0.9,
    random_seed=1,
    verbose=0,
)
cat = CatBoostRegressor(**catboost_params)

In [34]:
from sklearn.ensemble import VotingRegressor

# Ensemble of the two boosted regressors defined above, each registered
# under a short name.
base_models = [
    ('lgb', lgbr),
    ('catboost', cat),
]
rg_model = VotingRegressor(estimators=base_models, n_jobs=12)

In [35]:
# Train the ensemble on the scaled training matrix, then score the scaled
# test matrix and keep the predictions in a one-column frame.
rg_model.fit(x, y)
test_scores = rg_model.predict(x_)
pre = pd.DataFrame(data=test_scores, columns=['isDefault'])

In [36]:
# Put the test ids next to their predicted scores and write the submission
# file without the index column.
submission_parts = [df_test['id'], pre]
results = pd.concat(submission_parts, axis='columns')
results.to_csv('submit.csv', index=False)