# Load the fused fusion

In [1]:
import pandas as pd

# 加载特征数据
features_path = 'fused_combined.csv'
features_df = pd.read_csv(features_path, header=None, low_memory=False)

# 获取总列数
num_cols = features_df.shape[1]

# 设置列名,倒数第二列是image_id,最后一列是label
col_names = [f'feature_{i}' for i in range(num_cols - 2)] + ['image_id', 'label']
features_df.columns = col_names

# 确保所有特征列都是数值类型
for col in col_names[:-2]:  # 排除 'image_id' 和 'label'
    features_df[col] = pd.to_numeric(features_df[col], errors='coerce')

# 按照label进行分组
label_0 = features_df[features_df['label'] == 0]
label_1 = features_df[features_df['label'] == 1]

# 选择5000个label为0的数据和5000个label为1的数据
label_0_sample = label_0.sample(n=5000, random_state=42)
label_1_sample = label_1.sample(n=5000, random_state=42)

# 合并两个样本
balanced_df = pd.concat([label_0_sample, label_1_sample], ignore_index=True)

# 打印总行数和总列数
print(f"总行数: {balanced_df.shape[0]}")
print(f"总列数: {balanced_df.shape[1]}")

# 打印前五行数据
print(balanced_df.head())

# 检查是否存在任何缺失值
print(balanced_df.isnull().sum())

# 如果存在缺失值,可以选择填充或删除缺失值
# 这里选择填充缺失值为0
balanced_df = balanced_df.fillna(0)

总行数: 10000
总列数: 1538
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.135596  -0.059194  -0.061392  -0.063409  -0.198597   0.316938   
1   0.150803  -0.027447   0.003386  -0.048853  -0.257534   0.257739   
2   0.175314  -0.031636  -0.016272  -0.027421  -0.212084   0.295495   
3   0.111710  -0.053619   0.015866  -0.040812  -0.238548   0.307927   
4   0.126477  -0.029501   0.012588  -0.072348  -0.213603   0.290125   

   feature_6  feature_7  feature_8  feature_9  ...  feature_1528  \
0  -0.179819   0.119753  -0.207725  -0.100630  ...      0.178181   
1  -0.183425   0.165298  -0.256108  -0.106024  ...      0.135101   
2  -0.196441   0.161346  -0.269195  -0.084032  ...      0.197769   
3  -0.155139   0.151456  -0.222932  -0.083256  ...      0.195484   
4  -0.204443   0.161073  -0.224686  -0.122573  ...      0.188609   

   feature_1529  feature_1530  feature_1531  feature_1532  feature_1533  \
0      0.146170      0.048220     -0.137705      0.023962      0.108

# Split training & test set

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# 从 DataFrame 中分离出特征数据和标签
X = balanced_df.drop(['label', 'image_id'], axis=1)  # 移除 'label' 和 'image_id' 列
y = balanced_df['label']  # 标签

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 存储 image_id 以便之后合并
image_id_train = balanced_df.loc[X_train.index, 'image_id']
image_id_test = balanced_df.loc[X_test.index, 'image_id']

# 再次确保所有数据都是数值类型
X_train = X_train.apply(pd.to_numeric)
X_test = X_test.apply(pd.to_numeric)

# 标准化特征数据
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 打印检查
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Test labels shape:", y_test.shape)
print("Training image_ids shape:", image_id_train.shape)
print("Test image_ids shape:", image_id_test.shape)

Training set shape: (8000, 1536)
Test set shape: (2000, 1536)
Training labels shape: (8000,)
Test labels shape: (2000,)
Training image_ids shape: (8000,)
Test image_ids shape: (2000,)


# PCA 

In [4]:
import numpy as np
from sklearn.decomposition import PCA

# 应用 PCA 降维，保留95%的方差
pca = PCA(n_components=0.95)  #
X_train_pca = pca.fit_transform(X_train) # pca后的训练集
X_test_pca = pca.transform(X_test) # pca后的测试集

# 检查新的维数和解释的方差比
print("New training dimensions:", X_train_pca.shape[1])
print("New testing dimensions:", X_test_pca.shape[1])
# print("Explained variance ratio:", pca.explained_variance_ratio_)


New training dimensions: 187
New testing dimensions: 187


# MLP - scikit-learn

## 1- 5 hidden layers

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from keras.callbacks import EarlyStopping

# 创建Early Stopping回调函数
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

# 创建并配置MLP分类器,添加Early Stopping回调函数
mlp = MLPClassifier(random_state=42, callbacks=[early_stopping])

# 定义需要优化的超参数和对应的值
param_distributions = {
    'hidden_layer_sizes': [(200,),(200,100,),(200,100,50),(200,100,50,25),(200, 100, 50, 25, 12)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.001, 0.01],
    'learning_rate_init': [0.0001,0.001, 0.01,0.1,1],
    'max_iter': [500, 800]
}

# 创建RandomizedSearchCV对象
random_search = RandomizedSearchCV(mlp, param_distributions=param_distributions, cv=5, scoring='accuracy', n_jobs=1, verbose=1, n_iter=10)
random_search.fit(X_train_pca, y_train)  # 使用降维后的训练数据

# 打印最佳超参数和对应的分数
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# 使用最佳超参数创建新的MLP模型
best_mlp = random_search.best_estimator_

# 在完整的训练集上训练最佳模型
best_mlp.fit(X_train_pca, y_train)

# 在测试集上评估最佳模型
predictions = best_mlp.predict(X_test_pca)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


ModuleNotFoundError: No module named 'sklearn.callbacks'

## CNN

In [None]:
!pip install skorch

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from skorch import NeuralNetClassifier
import torch
import torch.nn as nn
import torch.optim as optim

# 加载数据
data = pd.read_csv('merge_df.csv')
X = data.drop('label', axis=1).values
y = data['label'].values

# 重塑数据为1D CNN接受的形状
X = X.reshape(X.shape[0], X.shape[1], 1)

# 定义1D CNN模型
class CNN1D(nn.Module):
    def __init__(self, num_features, num_filters=32, kernel_size=3, pool_size=2, dense_units=128):
        super(CNN1D, self).__init__()
        self.conv1 = nn.Conv1d(1, num_filters, kernel_size)
        self.pool = nn.MaxPool1d(pool_size)
        self.conv2 = nn.Conv1d(num_filters, num_filters*2, kernel_size)
        self.conv3 = nn.Conv1d(num_filters*2, num_filters*4, kernel_size)
        self.conv4 = nn.Conv1d(num_filters*4, num_filters*8, kernel_size)
        self.fc1 = nn.Linear(num_filters*8 * ((num_features - 4 * (kernel_size - 1)) // pool_size**4), dense_units)
        self.fc2 = nn.Linear(dense_units, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        x = self.pool(self.relu(self.conv4(x)))
        x = x.view(-1, x.shape[1] * x.shape[2])
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

# 创建skorch的NeuralNetClassifier
net = NeuralNetClassifier(
    module=CNN1D,
    module__num_features=X.shape[1],
    max_epochs=10,
    lr=0.01,
    optimizer=optim.Adam,
    iterator_train__shuffle=True,
    verbose=0
)

# 设置网格搜索参数
param_grid = {
    'optimizer': [optim.Adam, optim.SGD, optim.RMSprop],
    'module__num_filters': [32, 64, 128],
    'module__kernel_size': [3, 5, 7],
    'module__pool_size': [2, 3],
    'module__dense_units': [64, 128, 256]
}

# 设置K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 运行网格搜索
grid = GridSearchCV(estimator=net, param_grid=param_grid, cv=cv, scoring='accuracy')
grid_result = grid.fit(X, y)

# 输出结果
print("Best parameters found:", grid_result.best_params_)
print("Best accuracy found:", grid_result.best_score_)


# SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 创建SVM模型，尝试使用RBF核
svc = SVC(kernel='rbf')

# 设置更广泛的参数网格
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 'scale']
}

# 使用网格搜索进行调参，应用StratifiedKFold
cv = StratifiedKFold(n_splits=5)
grid = GridSearchCV(svc, param_grid, refit=True, verbose=2, cv=cv)
grid.fit(X_train_pca, y_train)

# 预测
predictions = grid.predict(X_test_pca)

# 评估模型
print("Best parameters found:", grid.best_params_)
print("Accuracy:", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


# Random Forest 

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

# 加载数据
# 假设 X_train_pca 和 y_train_pca 已经预处理并准备好了

# 设置网格搜索参数
param_grid = {
    'n_estimators': [100, 200, 300],  # 树的数量
    'max_depth': [None, 10, 20, 30],  # 树的最大深度
    'min_samples_split': [2, 5, 10],  # 内部节点再划分所需最小样本数
    'min_samples_leaf': [1, 2, 4],    # 叶子节点最小样本数
    'bootstrap': [True, False]        # 是否有放回地抽样
}

# 初始化随机森林分类器
rf = RandomForestClassifier(random_state=42)

# 设置K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 运行网格搜索
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=2)
grid_result = grid_search.fit(X_train_pca, y_train_pca)

# 输出结果
print("Best parameters found:", grid_result.best_params_)
print("Best accuracy found:", grid_result.best_score_)

# 使用最佳参数训练模型并评估在测试集上的性能
best_rf = grid_result.best_estimator_
best_rf.fit(X_train_pca, y_train_pca)
y_pred = best_rf.predict(X_test_pca)
test_accuracy = accuracy_score(y_test_pca, y_pred)

print(f"Test accuracy: {test_accuracy}")
