In [23]:
import numpy as np
import pandas as pd
import xgboost
import sklearn

import matplotlib.pyplot as plt

In [24]:
# step1: Load the raw table and discard every row that has a missing value
df = pd.read_csv("2017.csv")
df = df.dropna()
# step2: One-hot encode the categorical features (sex, smoking status)
df = pd.get_dummies(df, columns=['sex', 'SMK_STAT_TYPE_RSPS_CD'])
# step3: Keep only samples whose creatinine lies in the closed range [0.0, 2.5]
df = df.loc[df["CREATININE "].between(0.0, 2.5)]
# step4: Split into training and test sets by row position
df_train = df[:900000]
df_test = df[900000:990112]

In [25]:
# step5: Down-sample the training and test sets per creatinine bin.
# 25 open intervals of width 0.1: roughly (0.05, 0.15), (0.15, 0.25), ..., (2.45, 2.55).
intervals = [(i * 0.1 + 0.05, i * 0.1 + 0.15) for i in range(25)]

# Fixed seed so that re-running the cell draws the same rows every time.
SAMPLE_SEED = 10

temp_train, temp_test = [], []

for index, (low, high) in enumerate(intervals):

    # Rows whose creatinine falls strictly inside the current bin.
    df_sample_train = df_train.loc[(df_train["CREATININE "] > low) & (df_train["CREATININE "] < high)]
    df_sample_test = df_test.loc[(df_test["CREATININE "] > low) & (df_test["CREATININE "] < high)]

    # Only bins 3..13 (train) and 3..15 (test) are capped; all other bins are kept whole.
    # min() keeps a bin that holds fewer rows than the cap instead of raising ValueError.
    if index in range(3, 14):
        df_sample_train = df_sample_train.sample(
            min(2500, len(df_sample_train)), random_state=SAMPLE_SEED
        )
    if index in range(3, 16):
        df_sample_test = df_sample_test.sample(
            min(100, len(df_sample_test)), random_state=SAMPLE_SEED
        )

    temp_train.append(df_sample_train)
    temp_test.append(df_sample_test)

df_train = pd.concat(temp_train, ignore_index=True)
df_test = pd.concat(temp_test, ignore_index=True)

# step6: Select the features x and the target y.
# Columns that are removed from the feature matrix (identifiers, the target itself,
# and the other columns the original analysis excluded).
NON_FEATURE_COLUMNS = ["id", "sido", "CREATININE ", "GFR", "stage", "height", "weight"]

y_train = df_train["CREATININE "].values.reshape(-1, 1)
x_train = df_train.drop(columns=NON_FEATURE_COLUMNS).values

y_test = df_test["CREATININE "].values.reshape(-1, 1)
x_test = df_test.drop(columns=NON_FEATURE_COLUMNS).values

In [32]:
# step7: Train the model with a custom loss function and a custom eval metric,
# then compute the R2 between predictions and ground truth.

# Mean creatinine over the filtered frame; used as the model's base_score.
mean_value = df["CREATININE "].mean()

# One loss weight per target bin, all equal to 1 at the start.
weights = [1 for _ in range(10)]

# 11 edges that cut the observed creatinine range [min, max] into 10 equal-width bins.
intervals = [
    0.1 * i * (df['CREATININE '].max() - df['CREATININE '].min()) + df['CREATININE '].min()
    for i in range(11)
]
# Nudge the top edge up so the maximum value falls inside the last half-open bin.
intervals[-1] += 0.001

def custom_loss(true, pred):
    """Bin-weighted residual objective for XGBoost.

    The residual ``pred - true`` of every sample is multiplied by the weight of
    the bin its true value falls into, normalised by the mean of all bin
    weights. Bins are the half-open ranges ``[intervals[j], intervals[j+1])``
    taken from the module-level ``intervals``; weights come from the
    module-level ``weights``. Samples whose true value lies outside every bin
    keep their unscaled residual.

    Args:
        true: array of target values.
        pred: array of current predictions, same shape as ``true``.

    Returns:
        A ``(grad, hess)`` tuple: the scaled residuals and an all-ones array
        shaped like ``pred``.
    """
    error = pred - true

    edges = np.asarray(intervals, dtype=float)
    bin_weights = np.asarray(weights, dtype=float)
    mean_weight = bin_weights.mean()

    # When every bin weight is zero the normalisation would divide by zero and
    # poison the gradients with inf/NaN; fall back to the plain residual instead.
    if mean_weight > 0:
        # Index j such that edges[j] <= true < edges[j+1]; -1 or len(edges)-1 means
        # "outside every bin" and those samples are left untouched.
        bin_index = np.searchsorted(edges, true, side="right") - 1
        in_range = (bin_index >= 0) & (bin_index < len(edges) - 1)
        scale = bin_weights[bin_index[in_range]] / mean_weight
        error[in_range] = error[in_range] * scale

    # NOTE(review): the Hessian is a constant 1 even though the gradient is scaled
    # per bin — confirm this is intended rather than hess = per-sample weight.
    return error, np.ones_like(pred)

def custom_eval(true, pred):
    """Mean-absolute-error metric that also refreshes the per-bin loss weights.

    Samples are grouped by the half-open bin ``[intervals[j], intervals[j+1])``
    their true value falls into (``intervals`` is read from module scope). The
    mean absolute error of each bin becomes that bin's new entry in the
    module-level ``weights`` list; a bin without samples gets weight 0. Samples
    outside every bin do not contribute to any weight.

    Side effect: rebinds the global ``weights`` on every call.

    NOTE(review): when ``fit()`` receives several eval sets this metric is
    presumably invoked once per set per round, so ``weights`` ends up derived
    from whichever set was evaluated last — confirm that this does not leak
    test-set information into the training loss.

    Args:
        true: array of target values.
        pred: array of predictions, same shape as ``true``.

    Returns:
        The mean absolute error over all samples.
    """
    global weights

    edges = np.asarray(intervals, dtype=float)
    # Derive the bin count from the edges instead of assuming exactly 10 bins.
    n_bins = len(edges) - 1

    y_true = np.ravel(true).astype(float)
    y_pred = np.ravel(pred).astype(float)
    abs_error = np.abs(y_true - y_pred)

    # Index j such that edges[j] <= y_true < edges[j+1]; anything else is out of range.
    bin_index = np.searchsorted(edges, y_true, side="right") - 1
    in_range = (bin_index >= 0) & (bin_index < n_bins)

    error_sum = np.bincount(bin_index[in_range], weights=abs_error[in_range], minlength=n_bins)
    sample_count = np.bincount(bin_index[in_range], minlength=n_bins)

    weights = [
        float(error_sum[j] / sample_count[j]) if sample_count[j] > 0 else 0
        for j in range(n_bins)
    ]

    return np.mean(abs(pred - true))

# Boosted-tree regressor that uses the custom objective and custom metric defined in this cell.
model = xgboost.XGBRegressor(
    # objective, metric and starting prediction
    objective=custom_loss,
    eval_metric=custom_eval,
    base_score=mean_value,
    # boosting schedule
    n_estimators=500,
    learning_rate=0.05,
    early_stopping_rounds=10,
    # tree shape
    max_depth=10,
    min_child_weight=100,
    gamma=0,
    # row / column sampling (both disabled)
    subsample=1,
    colsample_bytree=1,
    # regularisation (both disabled)
    reg_alpha=0,
    reg_lambda=0,
    # runtime
    nthread=4,
    seed=10,
)

# Both the training and the test set are passed as eval sets, in that order.
model.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_test, y_test)],
    verbose=1,
)

# Report the model's score on the test set (labelled as the R2 index).
print("R2指数：", model.score(x_test, y_test))

[0]	validation_0-rmse:0.45574	validation_0-custom_eval:0.36836	validation_1-rmse:0.53903	validation_1-custom_eval:0.43711
[1]	validation_0-rmse:0.44288	validation_0-custom_eval:0.35816	validation_1-rmse:0.52274	validation_1-custom_eval:0.42431
[2]	validation_0-rmse:0.43100	validation_0-custom_eval:0.34859	validation_1-rmse:0.50792	validation_1-custom_eval:0.41235
[3]	validation_0-rmse:0.41995	validation_0-custom_eval:0.33953	validation_1-rmse:0.49397	validation_1-custom_eval:0.40076
[4]	validation_0-rmse:0.40981	validation_0-custom_eval:0.33108	validation_1-rmse:0.48139	validation_1-custom_eval:0.39035
[5]	validation_0-rmse:0.40032	validation_0-custom_eval:0.32310	validation_1-rmse:0.46986	validation_1-custom_eval:0.38055
[6]	validation_0-rmse:0.39157	validation_0-custom_eval:0.31562	validation_1-rmse:0.45898	validation_1-custom_eval:0.37128
[7]	validation_0-rmse:0.38354	validation_0-custom_eval:0.30871	validation_1-rmse:0.44918	validation_1-custom_eval:0.36284
[8]	validation_0-rmse:0.