In [40]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression


In [41]:
# Read the raw credit-risk table from the working directory, then preview
# its first ten rows as the cell output.
data = pd.read_csv(filepath_or_buffer='credit_risk_dataset.csv')
data.head(n=10)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
6,26,77100,RENT,8.0,EDUCATION,B,35000,12.42,1,0.45,N,3
7,24,78956,RENT,5.0,MEDICAL,B,35000,11.11,1,0.44,N,4
8,24,83000,RENT,8.0,PERSONAL,A,35000,8.9,1,0.42,N,2
9,21,10000,OWN,6.0,VENTURE,D,1600,14.74,1,0.16,N,3


## 特征工程与数据预处理

In [42]:
# Count the null entries in every column; this Series is what the cell shows.
missing_values = data.isna().sum()
# Summary statistics across all columns (numeric and non-numeric alike);
# stored under its own name but not displayed by this cell.
data_description = data.describe(include='all')
missing_values

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [43]:
# Split the columns by dtype into numeric and categorical feature lists.
numeric_features = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_features.remove('loan_status')  # the target is not a feature and must not be imputed
categorical_features = data.select_dtypes(include=['object']).columns.tolist()

# Fill missing numeric values with the median of each column.
numeric_imputer = SimpleImputer(strategy='median')
data[numeric_features] = numeric_imputer.fit_transform(data[numeric_features])

# Fill missing categorical values with the most frequent value of each column.
categorical_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_features] = categorical_imputer.fit_transform(data[categorical_features])

# Encode loan_grade as a single integer column instead of one-hot columns.
label_encoder = LabelEncoder()
data['loan_grade_encoded'] = label_encoder.fit_transform(data['loan_grade'])

# One-hot encode the remaining categorical features.
categorical_features.remove('loan_grade')
# The encoder has to be created here: no cell defined it, so a fresh kernel
# raised NameError on the next line. drop='first' reproduces the columns in
# the recorded output, where the first category of every feature is absent.
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(data[categorical_features])
encoded_df = pd.DataFrame(
    encoded_features.toarray(),  # fit_transform returns a sparse matrix by default
    columns=encoder.get_feature_names_out(categorical_features),
    index=data.index,  # keep rows aligned with `data` for the concat below
)

# Append the one-hot columns, then drop the original string columns that
# have now been replaced by encoded versions.
data_encoded_label = pd.concat([data, encoded_df], axis=1)
data_encoded_label = data_encoded_label.drop(columns=categorical_features + ['loan_grade'])

data_encoded_label.head(100)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,loan_grade_encoded,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,cb_person_default_on_file_Y
0,22.0,59000.0,123.0,35000.0,16.02,1,0.59,3.0,3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,21.0,9600.0,5.0,1000.0,11.14,0,0.10,2.0,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,25.0,9600.0,1.0,5500.0,12.87,1,0.57,3.0,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,23.0,65500.0,4.0,35000.0,15.23,1,0.53,2.0,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,24.0,54400.0,8.0,35000.0,14.27,1,0.55,4.0,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,23.0,120000.0,1.0,25600.0,12.69,0,0.21,3.0,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
96,24.0,12360.0,2.0,1600.0,13.57,0,0.13,3.0,2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
97,22.0,60000.0,0.0,25475.0,10.99,1,0.42,3.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
98,25.0,234000.0,3.0,20000.0,14.27,0,0.09,4.0,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


## 划分数据集并导入模型 （LightGBM）

In [44]:
# Target is loan_status; the features are every other column.
y_lgb = data_encoded_label['loan_status']
X_lgb = data_encoded_label.drop(columns='loan_status')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train_lgb, X_test_lgb, y_train_lgb, y_test_lgb = train_test_split(
    X_lgb,
    y_lgb,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Training options handed to LightGBM: binary objective evaluated with
# log-loss, gradient-boosted trees, and the tree/sampling settings below.
params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9
}

# Wrap the training split in LightGBM's dataset container and run
# 100 boosting rounds.
d_train = lgb.Dataset(data=X_train_lgb, label=y_train_lgb)
clf = lgb.train(params=params, train_set=d_train, num_boost_round=100)


[LightGBM] [Info] Number of positive: 5663, number of negative: 20401
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 926
[LightGBM] [Info] Number of data points in the train set: 26064, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217273 -> initscore=-1.281630
[LightGBM] [Info] Start training from score -1.281630


## 评估模型结果

In [46]:
# Model scores for the held-out rows, turned into hard 0/1 labels:
# a score of 0.5 or more counts as class 1.
y_pred_prob = clf.predict(X_test_lgb)
y_pred_lgb = [int(score >= 0.5) for score in y_pred_prob]

# Score the predictions; the text report is stored, the accuracy is displayed.
classification_rep_lgb = classification_report(y_test_lgb, y_pred_lgb)
accuracy_lgb = accuracy_score(y_test_lgb, y_pred_lgb)
accuracy_lgb

0.9344790547798066

In [47]:
# Per-class precision / recall / F-score / support for the LightGBM predictions.
precision, recall, fscore, support = precision_recall_fscore_support(y_test_lgb, y_pred_lgb)

# Lay the metrics out as a table with one row per class label.
class_report_df = pd.DataFrame(dict(zip(
    ['class', 'precision', 'recall', 'fscore', 'support'],
    [[0, 1], precision, recall, fscore, support],
)))

class_report_df


Unnamed: 0,class,precision,recall,fscore,support
0,0,0.926225,0.995071,0.959415,5072
1,1,0.976592,0.721799,0.830084,1445


## 划分数据集并导入逻辑回归模型

In [48]:
# Build features/target and split them 80/20 with a fixed seed.
y_lr = data_encoded_label['loan_status']
X_lr = data_encoded_label.drop(columns='loan_status')
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_lr,
    y_lr,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression with a raised iteration cap.
# NOTE(review): the features are passed in unscaled; consider standardising
# them before fitting — confirm whether that is intentional.
lr_label_encoded = LogisticRegression(max_iter=1000).fit(X_train_lr, y_train_lr)

# Hard class predictions for the held-out rows.
y_pred_lr = lr_label_encoded.predict(X_test_lr)

# Score the predictions; the text report is stored, the accuracy is displayed.
classification_rep_lr = classification_report(y_test_lr, y_pred_lr)
accuracy_lr = accuracy_score(y_test_lr, y_pred_lr)

accuracy_lr


0.7983734847322388

## 评估模型

In [49]:
# Per-class precision / recall / F-score / support for the logistic-regression predictions.
precision_lr, recall_lr, fscore_lr, support_lr = precision_recall_fscore_support(y_test_lr, y_pred_lr)

# Lay the metrics out as a table with one row per class label.
class_report_df_lr = pd.DataFrame(dict(zip(
    ['class', 'precision', 'recall', 'fscore', 'support'],
    [[0, 1], precision_lr, recall_lr, fscore_lr, support_lr],
)))

class_report_df_lr

Unnamed: 0,class,precision,recall,fscore,support
0,0,0.802674,0.982453,0.883511,5072
1,1,0.711974,0.152249,0.250855,1445
