## **前置操作**

In [1]:
# 数据处理
import numpy as np
import pandas as pd
from scipy import stats
from scipy.sparse import hstack
import random
import itertools

# 数据可视化
import matplotlib.pyplot as plt
import seaborn as sns

# 特征工程
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# 模型相关
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 其他
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline
sns.set(palette='muted', style='whitegrid')
np.random.seed(13154)

In [2]:
train = pd.read_csv("train_cleanned.csv")
test = pd.read_csv("test_cleanned.csv")

In [3]:
train.shape , test.shape

((76020, 106), (75818, 105))

In [4]:
train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var41_comer_ult1,imp_op_var41_comer_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,...,saldo_medio_var5_hace3,saldo_medio_var5_ult1,saldo_medio_var5_ult3,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3,var38,TARGET,var15_0
0,1,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.576564,0,1
1,3,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,88.89,0.0,0.0,0.0,0.0,300.0,240.75,10.805234,0,1
2,4,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.18,3.0,2.07,0.0,0.0,0.0,0.0,11.117417,0,1
3,8,2,1,0.0,195.0,195.0,195.0,195.0,0.0,0.0,...,0.0,91.56,138.84,0.0,0.0,0.0,0.0,11.066763,0,1
4,10,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.3,40501.08,13501.47,85501.89,11.356294,0.0,0.0,11.672584,0,1


## **特征工程（Feature Engineering）**

### 1. 特征构造

#### **规范定类数据**

- 由于机器学习仅接受数值型数据，这里想找到所有潜在的定类数据，对定类数据进行编码处理，以便于后续的处理。

In [5]:
def encode_categories(train, test, target_column):
    # 遍历所有列，找到唯一值小于10个的列，不包括目标列
    columns_to_recode = [column for column in train.columns if column != target_column and train[column].nunique() < 10]
    
    for column in columns_to_recode:
        # 将列的数据类型修改为分类数据
        combined = pd.concat([train[[column]], test[[column]]], axis=0)
        combined[column] = combined[column].astype('category')
        
        # 为训练集和测试集重新编码
        train[column], test[column] = combined.iloc[:len(train)], combined.iloc[len(train):]
        train[column] = train[column].cat.codes + 1  # 从1开始编码
        test[column] = test[column].cat.codes + 1

    # 将修改后的列转换为category类型，确保一致性
    for column in columns_to_recode:
        train[column] = train[column].astype('category')
        test[column] = test[column].astype('category')

    # 打印信息确认
    print("训练集信息：")
    train.info()
    print("测试集信息：")
    test.info()

encode_categories(train, test, 'TARGET')


训练集信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 106 entries, ID to var15_0
dtypes: category(43), float64(29), int64(34)
memory usage: 39.7 MB
测试集信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75818 entries, 0 to 75817
Columns: 105 entries, ID to var15_0
dtypes: category(43), float64(29), int64(33)
memory usage: 39.0 MB


In [6]:
print(train['TARGET'].dtype)

int64


- 在这里我们已经对潜在的分类数据进行了编码处理，并保证了不改变TARGET列，以便于下面的独热编码。

#### **独热编码**

- 下面对刚才改变的category，对他们进行独热编码。

In [7]:
def process_data(train, test):
    # 独热编码：首先标识训练集和测试集中的分类列
    train_category_columns = train.select_dtypes(include=['category']).columns
    test_category_columns = test.select_dtypes(include=['category']).columns
    
    # 确保测试集和训练集有相同的分类列进行独热编码
    common_category_columns = train_category_columns.intersection(test_category_columns)
    
    # 对训练集和测试集应用独热编码
    train = pd.get_dummies(train, columns=common_category_columns, drop_first=True)
    test = pd.get_dummies(test, columns=common_category_columns, drop_first=True)
    
    # 处理布尔型数据：将布尔型转换为整型，再转为分类型
    for dataset in [train, test]:
        for column in dataset.columns:
            if dataset[column].nunique() == 2 and dataset[column].dtype == bool:
                dataset[column] = dataset[column].astype(int).astype('category')

    # 打印信息确认
    print("训练集类型和形状：")
    print(train.dtypes)
    print(train.shape)
    
    print("测试集类型和形状：")
    print(test.dtypes)
    print(test.shape)
    
    return train, test

In [8]:
train_processed, test_processed = process_data(train, test)

训练集类型和形状：
ID                                 int64
var3                               int64
imp_ent_var16_ult1               float64
imp_op_var39_comer_ult1          float64
imp_op_var39_comer_ult3          float64
                                  ...   
num_meses_var13_corto_ult3_4    category
num_meses_var39_vig_ult3_2      category
num_meses_var39_vig_ult3_3      category
num_meses_var39_vig_ult3_4      category
var15_0_2                       category
Length: 184, dtype: object
(76020, 184)
测试集类型和形状：
ID                                 int64
var3                               int64
imp_ent_var16_ult1               float64
imp_op_var39_comer_ult1          float64
imp_op_var39_comer_ult3          float64
                                  ...   
num_meses_var13_corto_ult3_4    category
num_meses_var39_vig_ult3_2      category
num_meses_var39_vig_ult3_3      category
num_meses_var39_vig_ult3_4      category
var15_0_2                       category
Length: 183, dtype: object
(75818, 183

In [9]:
train_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 184 entries, ID to var15_0_2
dtypes: category(121), float64(29), int64(34)
memory usage: 45.3 MB


#### **多项式特征构造**

- 这里我们对数值型数据进行多项式特征构造，来看各种特征之间的交互作用。

In [10]:
def generate_polynomial_features(train, test, target_column='TARGET', id_column='ID', degree=2):
    # 创建多项式特征生成器，设定最高次数为2
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    
    # 筛选出非ID、非TARGET且非category类型的列
    feature_cols = [col for col in train.columns if col not in [id_column, target_column] and train[col].dtype != 'category']
    
    # 应用多项式特征转换到所选列
    train_poly = poly.fit_transform(train[feature_cols])
    test_poly = poly.transform(test[feature_cols])
    
    # 将生成的多项式特征转换为DataFrame
    train_poly_df = pd.DataFrame(train_poly, columns=poly.get_feature_names_out(feature_cols), index=train.index)
    test_poly_df = pd.DataFrame(test_poly, columns=poly.get_feature_names_out(feature_cols), index=test.index)
    
    # 合并原始数据集中的ID、TARGET和category列与新生成的多项式特征
    train_final = pd.concat([train[[id_column, target_column] + list(train.select_dtypes('category').columns)], train_poly_df], axis=1)
    test_final = pd.concat([test[[id_column] + list(test.select_dtypes('category').columns)], test_poly_df], axis=1)
    
    return train_final, test_final

In [11]:
train_processed_1, test_processed_1 = generate_polynomial_features(train_processed, test_processed)

In [12]:
train_processed_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 2075 entries, ID to var38^2
dtypes: category(121), float64(1952), int64(2)
memory usage: 1.1 GB


In [13]:
test_processed_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75818 entries, 0 to 75817
Columns: 2074 entries, ID to var38^2
dtypes: category(121), float64(1952), int64(1)
memory usage: 1.1 GB


### 2. 特征选择

- 构造完特征后我们对特征之间，特征与目标变量之间相关性高的进行删除，防止多重共线性，保证特征是独立的。

In [14]:
def remove_corr_var(train, test, target_threshold=0.001, within_threshold=0.95):
    """
    删除与目标变量相关性低的特征，删除彼此之间相关性高的特征（保留一个）
    """
    # 计算训练集相关性矩阵
    corr = train.drop("ID", axis=1).corr().abs()
    # 筛选与目标变量相关性低的特征
    corr_target = corr['TARGET'].sort_values()
    low_corr_features = corr_target[corr_target <= target_threshold].index.tolist()
    print(f"有 {len(low_corr_features)} 个特征因为与目标变量TARGET的相关系数绝对值小于 {target_threshold} 而被删除")

    # 确保只删除在训练集和测试集都存在的特征
    low_corr_features = [feat for feat in low_corr_features if feat in train.columns and feat in test.columns]
    train.drop(low_corr_features, axis=1, inplace=True)
    test.drop(low_corr_features, axis=1, inplace=True)

    # 删除彼此之间相关性高的特征
    corr = train.drop(["ID", "TARGET"], axis=1).corr().abs()
    upper_tri = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > within_threshold)]
    print(f"有 {len(to_drop)} 个特征与另一个特征高度相关且相关系数为 {within_threshold} 及以上而被删除")

    # 确保只删除在训练集和测试集都存在的特征
    to_drop = [col for col in to_drop if col in train.columns and col in test.columns]
    train.drop(to_drop, axis=1, inplace=True)
    test.drop(to_drop, axis=1, inplace=True)

    print(f"特征数变成 {train.shape[1]} 个，其中特征已被删除")
    
    return train, test

In [15]:
train_processed_2, test_processed_2 = remove_corr_var(train_processed_1, test_processed_1)

有 143 个特征因为与目标变量TARGET的相关系数绝对值小于 0.001 而被删除
有 1014 个特征与另一个特征高度相关且相关系数为 0.95 及以上而被删除
特征数变成 934 个，其中特征已被删除


In [16]:
train_processed_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 934 entries, ID to saldo_medio_var5_ult3 saldo_medio_var12_ult3
dtypes: category(94), float64(838), int64(2)
memory usage: 494.0 MB


In [17]:
test_processed_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75818 entries, 0 to 75817
Columns: 933 entries, ID to saldo_medio_var5_ult3 saldo_medio_var12_ult3
dtypes: category(94), float64(838), int64(1)
memory usage: 492.1 MB


### 3. 特征增强

#### **数据的标准化、归一化**

- 这里我们完成模型构造之前的标准化、归一化的操作。

In [18]:
x_train = train_processed_2.copy()
x_test = test_processed_2.copy()

In [19]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 934 entries, ID to saldo_medio_var5_ult3 saldo_medio_var12_ult3
dtypes: category(94), float64(838), int64(2)
memory usage: 494.0 MB


In [20]:
columns_to_normalize = x_train.select_dtypes(include=['float64']).columns.tolist()

# 创建 StandardScaler 对象
scaler = StandardScaler()

# 对所选列进行标准化归一化
x_train[columns_to_normalize] = scaler.fit_transform(x_train[columns_to_normalize])

In [21]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 934 entries, ID to saldo_medio_var5_ult3 saldo_medio_var12_ult3
dtypes: category(94), float64(838), int64(2)
memory usage: 494.0 MB


In [22]:
columns_to_normalize = x_test.select_dtypes(include=['float64']).columns.tolist()

# 创建 StandardScaler 对象
scaler = StandardScaler()

# 对所选列进行标准化归一化
x_test[columns_to_normalize] = scaler.fit_transform(x_test[columns_to_normalize])

In [23]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75818 entries, 0 to 75817
Columns: 933 entries, ID to saldo_medio_var5_ult3 saldo_medio_var12_ult3
dtypes: category(94), float64(838), int64(1)
memory usage: 492.1 MB


In [24]:
# 找出 data1 和 data2 中不共有的列名
unique_columns_x_train = x_train.columns.difference(x_test.columns)
unique_columns_x_test = x_test.columns.difference(x_train.columns)

print("在x_train中不共有的列名:", unique_columns_x_train)
print("在x_test中不共有的列名:", unique_columns_x_test)

在x_train中不共有的列名: Index(['TARGET', 'num_var12_0_5', 'num_var12_0_7', 'num_var12_4',
       'num_var24_0_4', 'num_var30_10', 'num_var39_0_7', 'num_var41_0_6',
       'num_var41_0_7', 'num_var42_0_7', 'num_var42_0_9', 'num_var42_7',
       'num_var5_0_6', 'num_var5_6'],
      dtype='object')
在x_test中不共有的列名: Index(['num_var12_0_6', 'num_var13_0_8', 'num_var13_8', 'num_var30_9',
       'num_var39_0_11', 'num_var39_0_9', 'num_var41_0_11', 'num_var41_0_9',
       'num_var42_0_8', 'num_var4_10', 'num_var4_9', 'num_var5_0_5',
       'num_var5_5'],
      dtype='object')


In [25]:
x_train.to_csv('x_train.csv', index=False)

# 保存 x_test 到 CSV 文件
x_test.to_csv('x_test.csv', index=False)

### 4. 特征变换（PCA-LDA降维）

- 由于特征筛选后还有900多个特征，所以这里采用特征变换（降维的方法）。而这里在机器学习流水线中，我们一般采用的是PCA和LDA两种方法结合应用。

In [26]:
# 加载数据
x_train = pd.read_csv('x_train.csv')
x_test = pd.read_csv('x_test.csv')

In [27]:
columns_to_drop = ['num_var12_0_5', 'num_var12_0_7', 'num_var12_4', 'num_var24_0_4', 'num_var30_10',
                   'num_var39_0_7', 'num_var41_0_6', 'num_var41_0_7', 'num_var42_0_7', 'num_var42_0_9',
                   'num_var42_7', 'num_var5_0_6', 'num_var5_6', 'num_var12_0_6', 'num_var13_0_8',
                   'num_var13_8', 'num_var30_9', 'num_var39_0_11', 'num_var39_0_9', 'num_var41_0_11',
                   'num_var41_0_9', 'num_var42_0_8', 'num_var4_10', 'num_var4_9', 'num_var5_0_5',
                   'num_var5_5']

# 从 x_train 中删除指定列
for col in columns_to_drop:
    if col in x_train.columns:
        x_train.drop(columns=col, inplace=True)

# 从 x_test 中删除指定列
for col in columns_to_drop:
    if col in x_test.columns:
        x_test.drop(columns=col, inplace=True)

In [28]:
# 分离特征和标签
X_train = x_train.drop(['TARGET', 'ID'], axis=1)
y_train = x_train['TARGET']
X_test = x_test.drop('ID', axis=1)

# 计算PCA需要保留的组件数以解释至少98%的方差
pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.98) + 1  # 加1因为索引从0开始

print(f"Number of components to explain 98% variance: {d}")

# 使用计算出的组件数设置PCA
pca = PCA(n_components=d)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# 应用LDA
lda = LDA()
X_train_lda = lda.fit_transform(X_train_pca, y_train)
X_test_lda = lda.transform(X_test_pca)


Number of components to explain 98% variance: 305


## **保存数据**

In [29]:
# 将X_train_lda转换为DataFrame
train_final = pd.DataFrame(X_train_lda, columns=[f'component_{i}' for i in range(X_train_lda.shape[1])])

# 将y_train转换为DataFrame并与X_train_lda合并
train_final['TARGET'] = y_train

# 将数据保存为CSV文件
train_final.to_csv('train_final.csv', index=False)

# 将X_test_lda转换为DataFrame
test_final = pd.DataFrame(X_test_lda, columns=[f'component_{i}' for i in range(X_test_lda.shape[1])])

# 将数据保存为CSV文件
test_final.to_csv('test_final.csv', index=False)