In [1]:
import pandas as pd

# Load the raw Bank Marketing CSV. The file is semicolon-delimited and is
# decoded as latin-1. The target column 'y' is renamed to 'label' in the same
# expression so that `df` is always bound to the already-renamed frame.
df = pd.read_csv(
    r"D:\Englishnamefiles\aaa\tree_enhanced_dl\data\dataset\bank_marketing\bank-additional\bank-additional\bank-additional-full.csv",
    encoding="latin-1",
    sep=";",  # fields are separated by ';', not ','
).rename(columns={'y': 'label'})
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,label
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [2]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [3]:
# Partition the column names by dtype.
# Numeric features: every int64 / float64 column.
numeric_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
# Categorical features: every object (string) column.
# NOTE: the target column 'label' is object-typed, so it is included in this list too.
categorical_cols = list(df.select_dtypes(include=['object']).columns)

print("数值型字段：", numeric_cols)
print("分类型字段：", categorical_cols)

数值型字段： ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
分类型字段： ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'label']


In [4]:
from sklearn.model_selection import train_test_split

# Hold-out split into train / validation / test at 60% / 20% / 20%.
X = df.drop(columns='label')  # feature columns
y = df['label']               # target column

# Step 1: carve off 20% of the rows as the test set.
# The target is imbalanced (roughly 89% "no" vs 11% "yes"), so the split is
# stratified on y to keep the class ratio the same in every partition.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: 25% of the remaining 80% becomes the validation set
# (0.25 * 0.8 = 0.2 of the full data), leaving 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.25,
    random_state=42,
    stratify=y_train_val,
)

# Sanity check: every row must land in exactly one partition.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

In [9]:
import os

from sklearn.model_selection import train_test_split


# Separate the feature columns from the target column.
X = df.drop(columns='label')
y = df['label']

# First split: 90% train+validation / 10% test.
# Stratifying on y keeps the class ratio of the test set equal to the
# ratio in the full data.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

# Second split: 2/9 of the remaining 90% is 20% of the full data for
# validation, leaving 70% for training. Stratified on the train+validation
# target for the same reason as above.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=2/9,
    random_state=42,
    stratify=y_train_val,
)

# Sanity check: every row must land in exactly one partition.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# Re-attach the target column to each feature partition.
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Write the three partitions as ';'-separated CSV files (same delimiter as
# the raw input). Change save_path to your own target directory if needed.
save_path = r"D:\Englishnamefiles\aaa\tree_enhanced_dl\data\dataset\processed"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(save_path, exist_ok=True)
train_df.to_csv(os.path.join(save_path, "train.csv"), index=False, sep=";")
val_df.to_csv(os.path.join(save_path, "val.csv"), index=False, sep=";")
test_df.to_csv(os.path.join(save_path, "test.csv"), index=False, sep=";")

# Verify the stratification by printing the class proportions of each partition.
print("原始数据类别占比：")
print(df['label'].value_counts(normalize=True).round(4))
print("\n训练集类别占比：")
print(train_df['label'].value_counts(normalize=True).round(4))
print("\n验证集类别占比：")
print(val_df['label'].value_counts(normalize=True).round(4))
print("\n测试集类别占比：")
print(test_df['label'].value_counts(normalize=True).round(4))
print(f"\n数据集已保存到：{save_path}")

原始数据类别占比：
label
no     0.8873
yes    0.1127
Name: proportion, dtype: float64

训练集类别占比：
label
no     0.8873
yes    0.1127
Name: proportion, dtype: float64

验证集类别占比：
label
no     0.8874
yes    0.1126
Name: proportion, dtype: float64

测试集类别占比：
label
no     0.8874
yes    0.1126
Name: proportion, dtype: float64

数据集已保存到：D:\Englishnamefiles\aaa\tree_enhanced_dl\data\dataset\processed


In [None]:
import pandas as pd

# Reload the training split for a quick look at its columns and first rows.
# NOTE(review): this rebinds `df`, which earlier cells use for the full raw
# dataset, to the training split only. Re-run the loading cell before
# re-running any of the split cells.
df = pd.read_csv(
    '/gpool/home/wanghongyang/WangHY/WYY/tree_enhanced_dl/data/processed/train.csv',
    # The split cell writes train/val/test with sep=";". Reading with the
    # default ',' would collapse every row into a single column.
    # Assumes this file is the train.csv produced by that cell - confirm if it
    # comes from a different pipeline.
    sep=";",
)
print("数据列名：")
print(df.columns.tolist())
print("\n前5行数据：")
print(df.head())