# Load the fused features

In [1]:
import pandas as pd

# Load the fused feature table from disk.
features_path = 'fused_new.csv'
features_df = pd.read_csv(features_path, low_memory=False)

# Force every feature column to a numeric dtype. The last two columns
# ('image_id' and 'label') are deliberately left untouched.
# The converted columns are assigned back by label rather than through
# `.iloc[...] = ...`: positional assignment writes into the existing columns
# and, on recent pandas versions, can leave them with their original (object)
# dtype.
feature_columns = features_df.columns[:-2]
features_df[feature_columns] = features_df[feature_columns].apply(pd.to_numeric, errors='coerce')

# Drop incomplete rows only AFTER the coercion above: `errors='coerce'` turns
# unparsable values into NaN, so dropping first would let those NaNs leak into
# the training data.
features_df = features_df.dropna()

# Split the rows by class label.
label_0 = features_df[features_df['label'] == 0]
label_1 = features_df[features_df['label'] == 1]

# Number of rows drawn from each class to build a balanced data set.
samples_per_label = 5000

# Fail loudly when a class is too small. Every later cell needs `balanced_df`,
# so merely printing a message here would only postpone the failure to a
# confusing NameError further down the notebook.
if label_0.shape[0] < samples_per_label or label_1.shape[0] < samples_per_label:
    raise ValueError(f"样本数量不足，无法从每个标签中抽取{samples_per_label}个样本。")

# Draw the same number of rows from each class (seeded for reproducibility).
label_0_sample = label_0.sample(n=samples_per_label, random_state=42)
label_1_sample = label_1.sample(n=samples_per_label, random_state=42)

# Stack both samples into one frame with a fresh 0..N-1 index.
balanced_df = pd.concat([label_0_sample, label_1_sample], ignore_index=True)

# Report the size of the balanced frame.
print(f"总行数: {balanced_df.shape[0]}")
print(f"总列数: {balanced_df.shape[1]}")

# Sanity check: per-column count of missing values (all zeros expected).
print(balanced_df.isnull().sum())

总行数: 10000
总列数: 1026
feature_0       0
feature_1       0
feature_2       0
feature_3       0
feature_4       0
               ..
feature_1021    0
feature_1022    0
feature_1023    0
image_id        0
label           0
Length: 1026, dtype: int64


# Split training & test set

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Separate the target column from the feature matrix; 'image_id' is an
# identifier, not a feature, so it is excluded as well.
y = balanced_df['label']
X = balanced_df.drop(columns=['label', 'image_id'])

# Hold out 20% of the rows as the test set (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Keep the image ids of each split so predictions can be joined back later.
# The split frames retain the row index of `balanced_df`, so a label lookup
# is enough.
image_id_train = balanced_df['image_id'].loc[X_train.index]
image_id_test = balanced_df['image_id'].loc[X_test.index]

# Make sure once more that every feature column is numeric.
X_train = X_train.apply(pd.to_numeric)
X_test = X_test.apply(pd.to_numeric)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype(np.float32)
X_test = scaler.transform(X_test).astype(np.float32)

# Summary statistics of the standardized splits.
print("数据的基本统计信息:")
print(
    pd.DataFrame(X_train).describe(),
    pd.DataFrame(X_test).describe(),
    sep="\n",
)

# Shape checks for every object produced by this cell.
shape_report = [
    ("Training set shape:", X_train),
    ("Test set shape:", X_test),
    ("Training labels shape:", y_train),
    ("Test labels shape:", y_test),
    ("Training image_ids shape:", image_id_train),
    ("Test image_ids shape:", image_id_test),
]
for caption, split_part in shape_report:
    print(caption, split_part.shape)

数据的基本统计信息:
               0             1             2             3             4     \
count  8.000000e+03  8.000000e+03  8.000000e+03  8.000000e+03  8.000000e+03   
mean  -7.152557e-10 -2.622604e-09  1.192093e-09  3.099442e-09 -7.152557e-10   
std    1.000062e+00  1.000062e+00  1.000062e+00  1.000062e+00  1.000062e+00   
min   -2.451431e+00 -3.655570e+00 -3.939831e+00 -3.620988e+00 -1.264867e+00   
25%   -3.133890e-01  2.543819e-01 -1.395582e-01  5.805079e-02 -5.512592e-01   
50%   -1.340701e-01  3.921014e-01  2.639310e-01  3.156925e-01 -3.271695e-01   
75%    3.212019e-02  5.027586e-01  6.014655e-01  5.108766e-01 -3.935668e-02   
max    3.613493e+00  8.825111e-01  2.085891e+00  1.284997e+00  3.613818e+00   

               5             6             7             8             9     \
count  8.000000e+03  8.000000e+03  8.000000e+03  8.000000e+03  8.000000e+03   
mean  -1.430511e-09 -1.668930e-09 -1.192093e-09 -1.192093e-10  1.788139e-09   
std    1.000062e+00  1.000062e+00  1.000

# PCA - 1 

In [3]:
from sklearn.decomposition import PCA

# Reduce dimensionality with PCA, keeping as many leading components as are
# needed to retain 95% of the variance. The projection is fitted on the
# training split only; the test split is merely transformed.
pca1 = PCA(n_components=0.95)
X_train_pca1 = pca1.fit_transform(X_train)  # training split after PCA
X_test_pca1 = pca1.transform(X_test)        # test split after PCA

# Report the dimensionality after the projection.
print(f"New training dimensions: {X_train_pca1.shape[1]}")
print(f"New testing dimensions: {X_test_pca1.shape[1]}")

# Optional diagnostics (uncomment to inspect the retained variance):
# print("Total explained variance:", np.sum(pca1.explained_variance_ratio_))
# print("Explained variance ratio:", pca1.explained_variance_ratio_)


New training dimensions: 30
New testing dimensions: 30


# MLP - scikit-learn

## 2 hidden layers

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Base MLP classifier. Early stopping ends training once the internal
# validation score has not improved by `tol` for `n_iter_no_change` epochs.
mlp = MLPClassifier(random_state=42, shuffle=True, tol=1e-4, early_stopping=True, n_iter_no_change=10)

# Hyper-parameter values the randomized search samples from
# (two hidden layers).
param_distributions = {
    'hidden_layer_sizes': [(100, 50), (50, 25), (30, 15)],
    'activation': ['logistic', 'relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_iter': [500, 800, 1100],
    'batch_size': [64]
}

# Randomized search with 5-fold cross-validation over 10 sampled candidates.
# `random_state` pins WHICH candidates are sampled; without it every run of
# this cell evaluates a different set and the reported best score cannot be
# reproduced.
random_search = RandomizedSearchCV(
    mlp,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    verbose=1,
    random_state=42,
)
random_search.fit(X_train_pca1, y_train)  # PCA-reduced training data

# Best hyper-parameters and their mean cross-validated accuracy.
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# With the default `refit=True` the search has already retrained the best
# configuration on the complete training set, so `best_estimator_` is ready to
# use and no additional `.fit()` call is needed.
best_mlp = random_search.best_estimator_

# Evaluate the selected model on the held-out test set.
predictions = best_mlp.predict(X_test_pca1)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 10 candidates, totalling 50 fits


## PCA - 2 

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Reduce dimensionality with PCA, keeping a fixed number of 200 components.
# With an integer `n_components` well below the input dimensionality,
# scikit-learn's default `svd_solver='auto'` may select the randomized SVD
# solver, whose result depends on the random state. Seeding it keeps the
# projection, and every model trained on it, reproducible between runs.
pca2 = PCA(n_components=200, random_state=42)
X_train_pca2 = pca2.fit_transform(X_train)  # training split after PCA
X_test_pca2 = pca2.transform(X_test)        # test split after PCA

# Report the dimensionality after the projection.
print("New training dimensions:", X_train_pca2.shape[1])
print("New testing dimensions:", X_test_pca2.shape[1])
# print("Explained variance ratio:", pca2.explained_variance_ratio_)


## 3 hidden layers

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Base MLP classifier. Early stopping ends training once the internal
# validation score has not improved by `tol` for `n_iter_no_change` epochs.
mlp = MLPClassifier(random_state=42, shuffle=True, tol=1e-4, early_stopping=True, n_iter_no_change=10)

# Hyper-parameter values the randomized search samples from
# (three hidden layers).
param_distributions = {
    'hidden_layer_sizes': [(200, 100, 50), (300, 150, 75)],
    'activation': ['logistic', 'relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_iter': [500, 800, 1100],
    'batch_size': [64]
}

# Randomized search with 5-fold cross-validation over 10 sampled candidates.
# `random_state` pins WHICH candidates are sampled; without it every run of
# this cell evaluates a different set and the reported best score cannot be
# reproduced.
random_search = RandomizedSearchCV(
    mlp,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    verbose=1,
    random_state=42,
)
random_search.fit(X_train_pca2, y_train)  # PCA-reduced training data

# Best hyper-parameters and their mean cross-validated accuracy.
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# With the default `refit=True` the search has already retrained the best
# configuration on the complete training set, so `best_estimator_` is ready to
# use and no additional `.fit()` call is needed.
best_mlp = random_search.best_estimator_

# Evaluate the selected model on the held-out test set.
predictions = best_mlp.predict(X_test_pca2)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


## 4 hidden layers 

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Base MLP classifier. Early stopping ends training once the internal
# validation score has not improved by `tol` for `n_iter_no_change` epochs.
mlp = MLPClassifier(random_state=42, shuffle=True, tol=1e-4, early_stopping=True, n_iter_no_change=10)

# Hyper-parameter values the randomized search samples from
# (four hidden layers).
param_distributions = {
    'hidden_layer_sizes': [(200, 100, 50, 25), (300, 150, 75, 30)],
    'activation': ['logistic', 'relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_iter': [500, 800, 1100],
    'batch_size': [64]
}

# Randomized search with 5-fold cross-validation over 10 sampled candidates.
# `random_state` pins WHICH candidates are sampled; without it every run of
# this cell evaluates a different set and the reported best score cannot be
# reproduced.
random_search = RandomizedSearchCV(
    mlp,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    verbose=1,
    random_state=42,
)
random_search.fit(X_train_pca2, y_train)  # PCA-reduced training data

# Best hyper-parameters and their mean cross-validated accuracy.
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# With the default `refit=True` the search has already retrained the best
# configuration on the complete training set, so `best_estimator_` is ready to
# use and no additional `.fit()` call is needed.
best_mlp = random_search.best_estimator_

# Evaluate the selected model on the held-out test set.
predictions = best_mlp.predict(X_test_pca2)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))

## CNN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from skorch import NeuralNetClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Reshape the data into the layout a 1D CNN accepts. `nn.Conv1d` consumes
# tensors shaped (batch, channels, length), so the single input channel goes on
# axis 1 and the PCA components form the length axis. Putting the singleton
# axis last would present every component as a separate channel.
# `reshape(n_rows, 1, -1)` also keeps this cell idempotent: re-running it on an
# already reshaped array yields the same shape again.
X_train_pca2 = X_train_pca2.reshape(X_train_pca2.shape[0], 1, -1)
X_test_pca2 = X_test_pca2.reshape(X_test_pca2.shape[0], 1, -1)

# 1D CNN model definition
class CNN1D(nn.Module):
    """Two-block 1D convolutional classifier for flat feature vectors.

    Architecture: (Conv1d -> ReLU -> MaxPool1d) x 2, then two fully connected
    layers. The forward pass returns raw, unnormalized class scores (logits)
    of shape ``(batch, num_classes)``, which is the input ``nn.CrossEntropyLoss``
    expects; no sigmoid/softmax is applied inside the module.

    Args:
        num_features: Length of each input sequence (number of features).
        num_filters: Output channels of the first convolution; the second
            convolution uses twice as many.
        kernel_size: Kernel width shared by both convolutions.
        pool_size: Window (and stride) of the max-pooling layer.
        dense_units: Width of the hidden fully connected layer.
        num_classes: Number of output classes (logits per sample).

    Raises:
        ValueError: If the input is too short to survive both conv/pool
            stages with the given ``kernel_size`` and ``pool_size``.
    """

    def __init__(self, num_features, num_filters=32, kernel_size=3, pool_size=2, dense_units=128,
                 num_classes=2):
        super().__init__()
        self.conv1 = nn.Conv1d(1, num_filters, kernel_size)
        self.pool = nn.MaxPool1d(pool_size)
        self.conv2 = nn.Conv1d(num_filters, num_filters * 2, kernel_size)

        # Track the sequence length through both conv -> pool stages. An
        # unpadded, stride-1 convolution shortens the sequence by
        # (kernel_size - 1); max pooling then floor-divides it by pool_size.
        # The floor has to be applied after EACH stage: collapsing both stages
        # into a single closed-form expression gives the wrong size whenever an
        # intermediate length is not divisible by pool_size.
        seq_len = num_features
        for _ in range(2):
            seq_len = (seq_len - (kernel_size - 1)) // pool_size
        if seq_len <= 0:
            raise ValueError(
                f"num_features={num_features} is too small for kernel_size={kernel_size} "
                f"and pool_size={pool_size}"
            )

        self.fc1 = nn.Linear(num_filters * 2 * seq_len, dense_units)
        self.fc2 = nn.Linear(dense_units, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute class logits for ``x`` of shape ``(batch, 1, num_features)``."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        # Flatten (channels, length) into one feature axis per sample.
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        return self.fc2(x)

# Only `NeuralNetClassifier` is imported from skorch in this cell, so the name
# `skorch` itself is undefined; import the callback class explicitly.
from skorch.callbacks import EarlyStopping

# Seed torch so weight initialization and batch shuffling are reproducible.
torch.manual_seed(42)

# Build the arrays handed to skorch:
#   * inputs shaped (batch, 1, length) because `nn.Conv1d` expects the channel
#     axis second, stored as float32 to match the module's weights;
#   * targets as int64 class indices, as required by `nn.CrossEntropyLoss`.
# `reshape(n_rows, 1, -1)` gives the same result whether `X_train_pca2` is
# still 2-D or already carries a singleton axis.
X_train_cnn = X_train_pca2.reshape(X_train_pca2.shape[0], 1, -1).astype('float32')
X_test_cnn = X_test_pca2.reshape(X_test_pca2.shape[0], 1, -1).astype('float32')
y_train_cnn = y_train.to_numpy(dtype='int64')

# skorch wrapper that exposes the CNN through the scikit-learn estimator API.
net = NeuralNetClassifier(
    module=CNN1D,
    # Sequence length seen by the network = size of the last axis.
    module__num_features=X_train_cnn.shape[-1],
    max_epochs=10,
    lr=0.01,
    optimizer=optim.Adam,
    optimizer__weight_decay=0.0001,
    criterion=nn.CrossEntropyLoss,
    batch_size=64,
    iterator_train__shuffle=True,
    callbacks=[EarlyStopping(patience=5)],
    # device='cuda'
)

# Grid of hyper-parameters to search.
# NOTE: 3 * 3 * 3 * 2 * 3 = 162 candidates, i.e. 810 fits with 5-fold CV.
param_grid = {
    'optimizer': [optim.Adam, optim.SGD, optim.RMSprop],
    'module__num_filters': [32, 64, 128],
    'module__kernel_size': [3, 5, 7],
    'module__pool_size': [2, 3],
    'module__dense_units': [64, 128, 256]
}

# Stratified 5-fold cross-validation (seeded).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run the grid search.
grid = GridSearchCV(estimator=net, param_grid=param_grid, cv=cv, scoring='accuracy')
grid_result = grid.fit(X_train_cnn, y_train_cnn)

# Report the best configuration and its cross-validated accuracy.
print("Best parameters found:", grid_result.best_params_)
print("Best accuracy found:", grid_result.best_score_)

# Predict with the best model (already refit on the full training set by
# GridSearchCV) and evaluate on the held-out test set.
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test_cnn)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
