In [1]:
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv(r"D:\jupyter name\数据\train.csv")
X_test=pd.read_csv(r"D:\jupyter name\数据\test.csv")

## 数据预处理

因为是比赛数据，比较干净，没有缺失值

In [3]:
data.isnull().sum()

id                      0
annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [4]:
X_test.isnull().sum()

id                      0
annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
dtype: int64

文件无重复值

In [5]:
data.duplicated().sum(),X_test.duplicated().sum()

(np.int64(0), np.int64(0))

特征与标签选择

In [6]:
y=data['loan_paid_back']
X=data.drop(['loan_paid_back','id'],axis=1)

确定数据类型，对数据进行标准化或者进行编码，方便预测

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


对训练数据与测试字符串数据进行编码利用pandas

In [8]:
X=pd.get_dummies(X)
X_test=pd.get_dummies(X_test)

数据非常庞大

In [9]:
X.shape

(593994, 60)

将数字与字符型分开

In [10]:
object_cols=[col for col in X.columns if X[col].dtype=='object']
num_cols=[col for col in X.columns if X[col].dtype in['int64','float64'] and col !='id']

In [11]:
from sklearn.preprocessing import StandardScaler
stand=StandardScaler()
X[num_cols]=stand.fit_transform(X[num_cols])
X_test[num_cols]=stand.transform(X_test[num_cols])

In [12]:
print("标准化后 - 均值：\n", X_test[num_cols].mean())
print("标准化后 - 方差：\n", X_test[num_cols].var())

标准化后 - 均值：
 annual_income           0.000782
debt_to_income_ratio   -0.001646
credit_score            0.002195
loan_amount            -0.000512
interest_rate          -0.002002
dtype: float64
标准化后 - 方差：
 annual_income           1.000580
debt_to_income_ratio    1.000265
credit_score            1.007201
loan_amount             0.998742
interest_rate           1.008625
dtype: float64


将训练数据分成训练集与验证集

In [13]:
from sklearn.model_selection import train_test_split,cross_val_score
X_train,X_valid,y_train,y_valid=train_test_split(X,y)

首先用logistic回归进行预测

In [26]:
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [22]:
model1=LogisticRegression()
model1.fit(X_train,y_train)
pred_proba=model1.predict_proba(X_valid)[:,1]
auc=roc_auc_score(y_valid,pred_proba)
print("logisticAUV:",auc)
    

logisticAUV: 0.9116253267431591


区分能力比较出色

使用集成模型xgboost进行预测，梯度增强模型

In [25]:
xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict_proba(X_valid)[:, 1]
print("xgboostAUV:",roc_auc_score(y_valid, xgb_preds))

ERROR! Session/line number was not unique in database. History logging moved to new session 410
xgboostAUV: 0.9206696751399488


区分能力较logistic强

利用随机森林进行预测

In [17]:
rf = RandomForestClassifier(
    n_estimators=100,  # 树的数量
    max_depth=8,
    random_state=42
)
rf.fit(X_train, y_train)
rf_preds = rf.predict_proba(X_valid)[:, 1]
print("随机森林AUC:", roc_auc_score(y_valid, rf_preds))

随机森林AUC: 0.903479671351326


利用网格搜索进行预测，暴力遍历所有参数组合，可以寻找到更好的方案，但预测效率比较低

In [24]:
param_grid = {
    'max_depth': [5, 8, 10],        
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 3, 5]   
}

grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='roc_auc'  
)
grid.fit(X_train, y_train)
best_tree = grid.best_estimator_
from sklearn.metrics import roc_auc_score
tree_preds = best_tree.predict_proba(X_valid)[:, 1]
print("决策树最优AUC:", roc_auc_score(y_valid, tree_preds))


KeyboardInterrupt


KeyboardInterrupt



利用随机搜索提高效率

In [27]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

param_dist = {
    'max_depth': [5, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

# 随机搜索，指定迭代次数n_iter（比如选20次，远少于网格搜索的27次）
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,  # 随机选20组参数
    cv=5,
    scoring='roc_auc',
    n_jobs=-1  # 并行计算，利用所有CPU核心
)

random_search.fit(X_train, y_train)
best_rf = random_search.best_estimator_

# 评估最优模型
rf_preds = best_rf.predict_proba(X_valid)[:, 1]
print("随机搜索最优AUC:", roc_auc_score(y_valid, rf_preds))

ERROR! Session/line number was not unique in database. History logging moved to new session 411
随机搜索最优AUC: 0.9063088144338604


比较多个模型，发现xgboost模型的roc最大，所以用xgboost的model进行预测

In [31]:
X_test=X_test.drop('id',axis=1)

In [46]:

predict=xgb_model.predict_proba(X_test)[:,1]

In [48]:
df=[]
for i in range(len(X_test.iloc[:,0])):
    df.append((len(X)+i,predict[i]))
df

[(593994, np.float32(0.9180146)),
 (593995, np.float32(0.98118997)),
 (593996, np.float32(0.45914838)),
 (593997, np.float32(0.9361616)),
 (593998, np.float32(0.962252)),
 (593999, np.float32(0.9750649)),
 (594000, np.float32(0.98572356)),
 (594001, np.float32(0.97265244)),
 (594002, np.float32(0.9223796)),
 (594003, np.float32(0.0028775162)),
 (594004, np.float32(0.059382018)),
 (594005, np.float32(0.9905633)),
 (594006, np.float32(0.6321437)),
 (594007, np.float32(0.024052154)),
 (594008, np.float32(0.9709991)),
 (594009, np.float32(0.7641121)),
 (594010, np.float32(0.9934655)),
 (594011, np.float32(0.8469005)),
 (594012, np.float32(0.9804243)),
 (594013, np.float32(0.9270779)),
 (594014, np.float32(0.6806282)),
 (594015, np.float32(0.9861042)),
 (594016, np.float32(0.87404716)),
 (594017, np.float32(0.877229)),
 (594018, np.float32(0.96339184)),
 (594019, np.float32(0.019713474)),
 (594020, np.float32(0.7248544)),
 (594021, np.float32(0.5216683)),
 (594022, np.float32(0.9761055)),
 

In [49]:
df=pd.DataFrame(df)
df.columns=['id','loan_paid_back']

In [50]:
df.to_csv('loan_predict1.csv',index=False,sep=',',encoding='utf-8')


另一种方法，构建预处理管道

In [16]:
# 步骤1：确认列筛选正确
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
num_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64'] and col != 'id']

# 步骤2：构建预处理管道（添加handle_unknown='ignore'）
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), object_cols),
        ('num', StandardScaler(), num_cols)
    ]
)

# 步骤3：拟合并转换训练集
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4900445 stored elements and shape (445495, 60)>