# Load the fused fusion

In [1]:
import pandas as pd

# 加载特征数据
features_path = 'fused_combined.csv'
features_df = pd.read_csv(features_path, low_memory=False)
features_df = features_df.dropna()

# 查看前数据
print(features_df.head())
# 查看label分布
#label_distribution = features_df['label'].value_counts()
# print(label_distribution)


   0.13273102  -0.046325  0.04799986  -0.054121092  -0.26567036  0.28699276  \
0    0.166932  -0.040577    0.022252     -0.036686    -0.257072    0.275591   
1    0.141127  -0.032970    0.055993     -0.034640    -0.267474    0.290403   
2    0.172750  -0.065717   -0.025985     -0.075753    -0.179092    0.282487   
3    0.163937  -0.015764    0.043609     -0.071291    -0.271180    0.274031   
4    0.129085  -0.017540    0.014079     -0.054526    -0.234425    0.316496   

   -0.20155007  0.15216848  -0.21968047  -0.10451735  ...  0.1426529  \
0    -0.190400    0.157786    -0.211556    -0.094982  ...   0.141150   
1    -0.170492    0.175335    -0.225272    -0.110479  ...   0.135947   
2    -0.175625    0.167229    -0.230976    -0.117548  ...   0.156261   
3    -0.210409    0.149995    -0.210583    -0.079840  ...   0.133538   
4    -0.179008    0.167545    -0.199602    -0.086054  ...   0.193026   

   0.1861744  0.099532425  -0.15293124  0.056991853  0.14413288  0.077607326  \
0   0.185733

In [None]:
num_columns = 1538
columns = [f'feature_{i}' for i in range(1, num_columns - 1)] + ['image_id', 'label']

# 加载特征数据并添加列名
features_path = 'fused_combined.csv'
features_df = pd.read_csv(features_path, header=None, names=columns, low_memory=False)
features_df = features_df.dropna()

# 确保所有特征列都是数值类型（排除 'image_id' 和 'label' 列）
features_df.iloc[:, :-2] = features_df.iloc[:, :-2].apply(pd.to_numeric, errors='coerce')

# 按照label进行分组
label_0 = features_df[features_df['label'] == 0]
label_1 = features_df[features_df['label'] == 1]

# 打印每个标签的样本数量
# print(f"标签为0的样本数量: {label_0.shape[0]}")
# print(f"标签为1的样本数量: {label_1.shape[0]}")

# 检查样本数量是否足够
if label_0.shape[0] < 5000 or label_1.shape[0] < 5000:
    print("样本数量不足，无法从每个标签中抽取5000个样本。")
else:
    # 选择5000个label为0的数据和5000个label为1的数据
    label_0_sample = label_0.sample(n=5000, random_state=42)
    label_1_sample = label_1.sample(n=5000, random_state=42)

    # 合并两个样本
    balanced_df = pd.concat([label_0_sample, label_1_sample], ignore_index=True)

    # 确保列名的顺序是正确的
    cols = list(balanced_df.columns)
    cols.remove('image_id')
    cols.remove('label')
    balanced_df = balanced_df[cols + ['image_id', 'label']]

    # 打印总行数和总列数
    print(f"总行数: {balanced_df.shape[0]}")
    print(f"总列数: {balanced_df.shape[1]}")

    # 打印前五行数据
    # print(balanced_df.head())

    # 检查是否存在任何缺失值
    print(balanced_df.isnull().sum())

    # 如果需要将数据保存到新的CSV文件中
    # balanced_df.to_csv('balanced_features.csv', index=False)


# Split training & test set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# 从 DataFrame 中分离出特征数据和标签
X = balanced_df.drop(['label', 'image_id'], axis=1)  # 移除 'label' 和 'image_id' 列
y = balanced_df['label']  # 标签

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 存储 image_id 以便之后合并
image_id_train = balanced_df.loc[X_train.index, 'image_id']
image_id_test = balanced_df.loc[X_test.index, 'image_id']

# 再次确保所有数据都是数值类型
X_train = X_train.apply(pd.to_numeric)
X_test = X_test.apply(pd.to_numeric)

# 标准化特征数据
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype(np.float32)
X_test = scaler.transform(X_test).astype(np.float32)

# 统计数据的基本信息
print("数据的基本统计信息:")
print(pd.DataFrame(X_train).describe())
print(pd.DataFrame(X_test).describe())

# 打印检查
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Test labels shape:", y_test.shape)
print("Training image_ids shape:", image_id_train.shape)
print("Test image_ids shape:", image_id_test.shape)

# PCA 

In [None]:
from sklearn.decomposition import PCA

# 应用 PCA 降维，保留95%的方差
pca1 = PCA(n_components=0.95)  
X_train_pca1 = pca1.fit_transform(X_train) # pca后的训练集
X_test_pca1 = pca1.transform(X_test) # pca后的测试集

# 检查新的维数和解释的方差比
print("New training dimensions:", X_train_pca1.shape[1])
print("New testing dimensions:", X_test_pca1.shape[1])
# print("Total explained variance:", np.sum(pca1.explained_variance_ratio_))
# print("Explained variance ratio:", pca1.explained_variance_ratio_)


# MLP - scikit-learn

## 2 hidden layers

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# 创建并配置MLP分类器
mlp = MLPClassifier(random_state=42,shuffle=True,tol=1e-4,early_stopping=True, n_iter_no_change=10)

# 定义需要优化的超参数和对应的值
param_distributions = {
    'hidden_layer_sizes': [(100, 50), (50, 25), (30, 15)],  
    'activation': ['logistic','relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001,0.001, 0.01],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_iter': [500,800,1100],
    'batch_size': [64]
}

# 创建RandomizedSearchCV对象
random_search = RandomizedSearchCV(mlp, param_distributions=param_distributions, cv=5, scoring='accuracy', n_jobs=1, verbose=1, n_iter=10)
random_search.fit(X_train_pca1, y_train)  # 使用降维后的训练数据

# 打印最佳超参数和对应的分数
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# 使用最佳超参数创建新的MLP模型
best_mlp = random_search.best_estimator_

# 在完整的训练集上训练最佳模型
best_mlp.fit(X_train_pca1, y_train)

# 在测试集上评估最佳模型
predictions = best_mlp.predict(X_test_pca1)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


## 3 hidden layers

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# 创建并配置MLP分类器
mlp = MLPClassifier(random_state=42,shuffle=True,tol=1e-4,early_stopping=True, n_iter_no_change=10)

# 定义需要优化的超参数和对应的值
param_distributions = {
    'hidden_layer_sizes': [(200,100,50), (300,150,75)],  
    'activation': ['logistic','relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001,0.001, 0.01],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_iter': [500,800,1100],
    'batch_size': [64]
}

# 创建RandomizedSearchCV对象
random_search = RandomizedSearchCV(mlp, param_distributions=param_distributions, cv=5, scoring='accuracy', n_jobs=1, verbose=1, n_iter=10)
random_search.fit(X_train_pca1, y_train)  # 使用降维后的训练数据

# 打印最佳超参数和对应的分数
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# 使用最佳超参数创建新的MLP模型
best_mlp = random_search.best_estimator_

# 在完整的训练集上训练最佳模型
best_mlp.fit(X_train_pca1, y_train)

# 在测试集上评估最佳模型
predictions = best_mlp.predict(X_test_pca1)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


## 4 hidden layers 

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# 创建并配置MLP分类器
mlp = MLPClassifier(random_state=42,shuffle=True,tol=1e-4,early_stopping=True, n_iter_no_change=10)

# 定义需要优化的超参数和对应的值
param_distributions = {
    'hidden_layer_sizes': [(200,100,50,25), (300,150,75,30)],  
    'activation': ['logistic','relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001,0.001, 0.01],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_iter': [500,800,1100],
    'batch_size': [64]
}

# 创建RandomizedSearchCV对象
random_search = RandomizedSearchCV(mlp, param_distributions=param_distributions, cv=5, scoring='accuracy', n_jobs=1, verbose=1, n_iter=10)
random_search.fit(X_train_pca1, y_train)  # 使用降维后的训练数据

# 打印最佳超参数和对应的分数
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# 使用最佳超参数创建新的MLP模型
best_mlp = random_search.best_estimator_

# 在完整的训练集上训练最佳模型
best_mlp.fit(X_train_pca1, y_train)

# 在测试集上评估最佳模型
predictions = best_mlp.predict(X_test_pca1)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))

## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 创建SVM模型，尝试使用RBF核
svc = SVC(kernel='rbf')

# 设置更广泛的参数网格
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 'scale']
}

# 使用网格搜索进行调参，应用StratifiedKFold
cv = StratifiedKFold(n_splits=5)
random_search = RandomizedSearchCV(svc, param_grid, refit=True, verbose=2, cv=cv)
random_search.fit(X_train_pca1, y_train)

# 预测
predictions = random_search.predict(X_test_pca1)

# 评估模型
print("Best parameters found:",random_search.best_params_)
print("Accuracy:", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 设置网格搜索参数
# 设置参数分布
param_dist = {
    'n_estimators': [100, 200, 300],  # 树的数量
    'max_depth': [None, 10, 20, 30],  # 树的最大深度
    'min_samples_split': [2, 5, 10],  # 内部节点再划分所需最小样本数
    'min_samples_leaf': [1, 2, 4],    # 叶子节点最小样本数
    'bootstrap': [True, False]        # 是否有放回地抽样
}

# 初始化随机森林分类器
rf = RandomForestClassifier(random_state=42)

# 设置K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 运行随机搜索
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=40, cv=cv, scoring='accuracy', n_jobs=-1, random_state=42, verbose=1)
random_result = random_search.fit(X_train_pca1, y_train)

# 输出结果
print("Best parameters found:", random_result.best_params_)
print("Best score:", random_result.best_score_)

# 使用最佳参数训练模型并评估在测试集上的性能
best_rf = random_result.best_estimator_
best_rf.fit(X_train_pca1, y_train)
y_pred = best_rf.predict(X_test_pca1)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy on test set: {test_accuracy}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# MLP - design baseline 

## 4 hidden -BCEWithLogitsLoss - Xavier

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
from torch.utils.data import Dataset, DataLoader, TensorDataset

# 重塑数据为二维
X_train_pca_reshaped = X_train_pca.reshape(X_train_pca1.shape[0], X_train_pca.shape[2])
X_test_pca_reshaped = X_test_pca.reshape(X_test_pca1.shape[0], X_test_pca.shape[2])

# 准备数据
X_train_tensor = torch.from_numpy(X_train_pca_reshaped).float()
y_train_tensor = torch.from_numpy(y_train).float()
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# 初始化模型
class MLP(nn.Module):
    def __init__(self, input_size=193):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_size, 200)
        self.prelu1 = nn.ReLU()
        self.layer2 = nn.Linear(200, 100)
        self.prelu2 = nn.ReLU()
        self.layer3 = nn.Linear(100, 50)
        self.prelu3 = nn.ReLU()
        self.layer4 = nn.Linear(50, 25)
        self.prelu4 = nn.ReLU()
        self.output_layer = nn.Linear(25, 1)
        self.init_weights()

    def init_weights(self):
        for layer in self.children():
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)  # Xavier 初始化
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        x = self.prelu1(self.layer1(x))
        x = self.prelu2(self.layer2(x))
        x = self.prelu3(self.layer3(x))
        x = self.prelu4(self.layer4(x))
        x = self.output_layer(x)
        return x

model = MLP(input_size=193)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.001)  # 使用Adam优化器

def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum / y_test.shape[0]
    acc = torch.round(acc * 100)
    return acc.item()

# 开始训练
start_time = time.time()
best_acc = 0

for epoch in range(200):  # 训练200个epoch
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):  
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.view(-1, 1))
        loss.backward()
        optimizer.step()
    
    # 计算训练集上的准确率
    model.eval()
    with torch.no_grad():
        train_outputs = model(X_train_tensor)
        train_acc = binary_acc(train_outputs, y_train_tensor.view(-1, 1))
        if train_acc > best_acc:
            best_acc = train_acc

end_time = time.time()
print(f"Training completed in {end_time - start_time:.2f} seconds")
print(f"Best Training Accuracy = {best_acc}")

# 模型评估
model.eval()  # 将模型设置为评估模式
with torch.no_grad():  # 禁止跟踪计算图
    X_test_tensor = torch.from_numpy(X_test_pca_reshaped).float() 
    y_test_tensor = torch.from_numpy(y_test).float()  # 将 NumPy 数组转换为 PyTorch 张量，并转换数据类型为浮点型
    test_outputs = model(X_test_tensor)
    test_acc = binary_acc(test_outputs, y_test_tensor.view(-1, 1))
    print(f"Accuracy on test set: {test_accuracy}")
    print("\nClassification Report:\n", classification_report(y_test_tensor, y_pred))


## 5 hidden - without PCA 

In [None]:

mlp = MLPClassifier(random_state=42,shuffle=True,tol=1e-4,early_stopping=True, n_iter_no_change=10)

# 定义需要优化的超参数和对应的值
param_distributions = {
    'hidden_layer_sizes': [(1536,768,384,192,96)],  
    'activation': ['logistic','relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001,0.001, 0.01],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_iter': [800,1100,1400],
    'batch_size': [64]
}

# 创建RandomizedSearchCV对象
random_search = RandomizedSearchCV(mlp, param_distributions=param_distributions, cv=5, scoring='accuracy', n_jobs=-1, verbose=1, n_iter=10)
random_search.fit(X_train, y_train)  # 使用降维后的训练数据

# 打印最佳超参数和对应的分数
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# 使用最佳超参数创建新的MLP模型
best_mlp = random_search.best_estimator_

# 在完整的训练集上训练最佳模型
best_mlp.fit(X_train, y_train)

# 在测试集上评估最佳模型
predictions = best_mlp.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))
