In [7]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import matplotlib
import matplotlib.pyplot as plt
import time 
import operator
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [8]:
#压缩数据内存#
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)


    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [9]:
#加载数据#
train = import_data('C:/input/train_data.csv')
target = import_data('C:/input/train_target.csv')
test = import_data('C:/input/test_data.csv')
print(train.shape)
print(target.shape)
print(test.shape)

Memory usage of dataframe is 1796.90 MB
Memory usage after optimization is: 488.64 MB
Decreased by 72.8%
Memory usage of dataframe is 1.42 MB
Memory usage after optimization is: 0.41 MB
Decreased by 70.8%
Memory usage of dataframe is 526.70 MB
Memory usage after optimization is: 143.23 MB
Decreased by 72.8%
(61866, 3807)
(61866, 3)
(18134, 3807)


In [10]:
data = pd.concat([train,test],axis = 0) # train，test索引上连接
print(train.shape)
print(test.shape)
print(data.shape)

(61866, 3807)
(18134, 3807)
(80000, 3807)


In [11]:
idx_test = test['idx']

In [14]:
data.isnull().sum(axis=0).sort_values(ascending = False) #查看各个特征缺失情况#
missing_percentage = data.isnull().mean() * 100
features_missing_more_than = missing_percentage[missing_percentage > 40]
sorted_missing_features = features_missing_more_than.sort_values(ascending=False)
print(sorted_missing_features)

X2255    100.00000
X867     100.00000
X823     100.00000
X824     100.00000
X825     100.00000
           ...    
X1937     40.89250
X1253     40.82875
X842      40.82875
X706      40.82875
X1946     40.18125
Length: 2419, dtype: float64


In [15]:
features_missing_more_than = sorted_missing_features.index.tolist()
data = data.drop(features_missing_more_than,axis=1)

In [16]:
#拆分训练集和测试集#
n_train = len(train)
n_test = len(test)
df_train = data.head(n_train)
df_test = data.tail(n_test)
df_train = df_train.drop('idx',axis = 1)
df_test = df_test.drop('idx',axis=1)

In [17]:
label = target['y']

In [59]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

# 假设 df_train 是训练数据集，label 是目标变量
# df_train = pd.DataFrame(...)  # 你的训练数据集
# label = pd.Series(...)  # 你的目标变量
# df_test = pd.DataFrame(...)  # 你的测试数据集
# idx_test = pd.Series(...)  # 测试集的索引

params = {
    'metric': 'auc',
    'num_leaves': 30,
    'max_depth': 10,
    'learning_rate': 0.01,
    'n_estimators':1500,
    'subsample': 0.8,
    'subsample_freq': 5,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'reg_alpha': 1.5,
    'reg_lambda': 1.5,
    'min_data_in_leaf':60,
    'bagging_freq': 1,
    'importance_type': 'gain'
}

# 初始化5折交叉验证生成器
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3407)

auc_list = []
pred_list = []

# 定义早停和日志记录的回调函数
callbacks = [
    lgb.early_stopping(stopping_rounds=400, first_metric_only=True, verbose=True),
    lgb.log_evaluation(period=100)
]

for train_index, test_index in skf.split(df_train, label):
    X_train, X_test = df_train.iloc[train_index], df_train.iloc[test_index]
    y_train, y_test = label.iloc[train_index], label.iloc[test_index]
    
    # 创建LightGBM数据集
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test)
    
    # 训练模型
    model = lgb.train(
        params,
        train_data,
        num_boost_round=4000,  # 使用000作为最大迭代次数
        valid_sets=[valid_data],
        callbacks=callbacks  # 使用callbacks参数替代early_stopping_rounds和verbose_eval
    )
    
    # 预测验证集
    pred = model.predict(X_test, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_test, pred)
    auc_list.append(auc)
    
    # 测试集进行预测，可以在这里添加代码
    if 'df_test' in locals() and 'idx_test' in locals():
        pred_test = model.predict(df_test, num_iteration=model.best_iteration)
        pred_list.append(pred_test)

# 计算平均AUC
mean_auc = np.mean(auc_list)
print(f'Mean AUC: {mean_auc:.4f}')

# 保存预测结果
if 'pred_list' in locals() and len(pred_list) > 0:
    res = np.array(pred_list)
    print("5折结果：", res.shape)
    r = res.mean(axis=0)
    print('result shape:', r.shape)
    result = pd.DataFrame()
    result['idx'] = idx_test  # 假设idx_test是测试集的索引
    result['y_pred'] = r
    result.to_csv('lightgbm.csv', index=False, sep=",")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.342805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106414
[LightGBM] [Info] Number of data points in the train set: 49492, number of used features: 1387
[LightGBM] [Info] Start training from score 0.183302
Training until validation scores don't improve for 400 rounds
[100]	valid_0's auc: 0.670399
[200]	valid_0's auc: 0.679759
[300]	valid_0's auc: 0.684507
[400]	valid_0's auc: 0.686781
[500]	valid_0's auc: 0.688364
[600]	valid_0's auc: 0.689535
[700]	valid_0's auc: 0.690475
[800]	valid_0's auc: 0.691207
[900]	valid_0's auc: 0.691808
[1000]	valid_0's auc: 0.691998
[1100]	valid_0's auc: 0.692606
[1200]	valid_0's auc: 0.692645
[1300]	valid_0's auc: 0.69279


KeyboardInterrupt: 

In [None]:
auc_list

In [None]:
mean_auc = np.mean(auc_list)
print("mean auc:", mean_auc)
filepath = 'result/lgb_' + str(mean_auc) + '.csv'  # 线下平均分数
# 转为array
res = np.array(pred_list)
print("5折结果：", res.shape)

# 最后结果，mean，max，min
r = res.mean(axis=0)
print('result shape:', r.shape)

result = pd.DataFrame()
result['idx'] = idx_test
result['y_pred'] = r
result.to_csv('nplgbm.csv', index=False, sep=",")