# Load the fused fusion

In [1]:
# import pandas as pd

# 加载特征数据
# features_path = 'D:\\CAPSTONE5703_CNN\\final02.csv'
# features_df = pd.read_csv(features_path)

# 打印总行数和总列数
# print(f"总行数: {features_df.shape[0]}")
# print(f"总列数: {features_df.shape[1]}")

# 打印前五行数据
# print(features_df.head())

# Load the label 

In [2]:
# import pandas as pd
# import pickle

# def load_labels_from_pickle(pickle_file):
    # with open(pickle_file, 'rb') as f:
        # data = pickle.load(f)
        # labels = data['label']
    # return labels

# 初始化 PickleDataset
# pickle_path = 'D:\\CAPSTONE5703_CNN\\datasets_pickle\\train.pkl'
# labels = load_labels_from_pickle(pickle_path)

# 将标签转换为 DataFrame
# labels_df = pd.DataFrame({'label': labels})

# 打印标签总行数
# print(f"Total number of rows of labels: {len(labels_df)}")

# 打印前 5 行标签
# print("First 5 rows of labels:")
# print(labels_df.head())

# Data alignment - merge through 'image_id'

In [3]:
import pandas as pd
import pickle

def load_labels_from_pickle(pickle_file):
    with open(pickle_file, 'rb') as f:
        data = pickle.load(f)
        # 直接访问 data 字典中的 'image_id' 和 'label'
        image_ids = data['image_id']
        labels = data['label']
        labels_dict = dict(zip(image_ids, labels))
    return labels_dict

# 加载 pickle 文件
pickle_path = 'train.pkl'
labels_dict = load_labels_from_pickle(pickle_path)

# 加载 CSV 文件
features_path = 'fused.csv'
features_df = pd.read_csv(features_path)

# 确保 CSV 文件中最后一列为 image_id
if features_df.columns[-1] != 'image_id':
    print("Error: 'image_id' must be the last column in the CSV file.")
else:
    # 创建 DataFrame 用于包含从字典中提取的 image_id 和 label
    labels_df = pd.DataFrame(list(labels_dict.items()), columns=['image_id', 'label'])

    # 将 features_df 和 labels_df 中的 'image_id' 设置为索引
    features_df.set_index('image_id', inplace=True)
    labels_df.set_index('image_id', inplace=True)

    # 根据 image_id 合并 features_df 和 labels_df
    merged_df = pd.merge(features_df, labels_df, left_index=True, right_index=True, how='inner')

    # 重置索引以便导出或其他处理
    merged_df.reset_index(inplace=True)

    # 检查合并后的数据
    print(merged_df.head())

    # 可以选择保存合并后的 DataFrame
    # merged_df.to_csv('D:\\CAPSTONE5703_CNN\\merged_fused features.csv', index=False)


                           image_id         0         1         2         3  \
0  62b31d36gw1expsi2gfrdj20hm0loq8o -0.521330  0.065525 -0.028026  0.187444   
1  563a2b53jw1exl77nkup7j20c30f3q4j -0.482425  0.079567 -0.036039  0.194375   
2  005ldo0ygw1ex23rdfuqcj30xo0k6di0 -0.468386  0.084869 -0.049615  0.144983   
3  62b31d36gw1exfcmyz8agj20qq0hu77k -0.468805  0.067019 -0.031303  0.199792   
4  0060kjm0jw1exdjaeiqadj30xc0m8tdw -0.473862  0.089600 -0.023923  0.144352   

          4         5         6         7         8  ...      1527      1528  \
0  0.023185  0.102659 -0.126628 -0.045669  0.036374  ... -0.239570 -0.247390   
1  0.036827  0.083796 -0.112961  0.000671 -0.050403  ... -0.178980 -0.252651   
2  0.038398  0.096167 -0.096763 -0.018022  0.016321  ... -0.167772 -0.270531   
3  0.033397  0.084818 -0.119774 -0.028753 -0.030460  ... -0.165330 -0.279122   
4  0.033620  0.057612 -0.104852 -0.013622 -0.017427  ... -0.209840 -0.229515   

       1529      1530      1531      1532   

# Split training & test set

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 从DataFrame中分离出特征数据和标签
X = merged_df.drop(['label', 'image_id'], axis=1)  # 移除 'label' 和 'image_id' 列
y = merged_df['label']  # 标签

# 打印出每列的数据类型
# print(X.dtypes)

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train and X_test represent the training set and test set of feature data respectively.
# y_train and y_test represent the training set and test set of labeled data respectively.





# PCA 

In [5]:
import numpy as np
from sklearn.decomposition import PCA

# 应用 PCA 降维，只在训练数据上fit，然后transform训练数据和测试数据
pca = PCA(n_components=0.95)  # 保留95%的方差
X_train_pca = pca.fit_transform(X_train) # pca后的训练集
X_test_pca = pca.transform(X_test) # pca后的测试集

# 检查新的维数和解释的方差比
print("New training dimensions:", X_train_pca.shape[1])
# print("New testing dimensions:", X_test_pca.shape[1])
# print("Explained variance ratio:", pca.explained_variance_ratio_)


New training dimensions: 193


# MLP - scikit-learn

## 2-hidden

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score


# 创建并配置MLP分类器
mlp = MLPClassifier(random_state=42)

# 定义需要优化的超参数和对应的值
param_distributions = {
    'hidden_layer_sizes': [(50,25),(100, 50),(200, 100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam','sgd'],
    'alpha': [0.001,0.01,0.1],
    'learning_rate_init': [0.0001,0.001, 0.01, 0.1],
    'max_iter': [200,500]
}

# 创建RandomizedSearchCV对象
random_search = RandomizedSearchCV(mlp, param_distributions=param_distributions, cv=5, scoring='accuracy', n_jobs=-1, verbose=1, n_iter=40)
random_search.fit(X_train_pca, y_train)  # 使用降维后的训练数据

# 打印最佳超参数和对应的分数
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# 使用最佳超参数创建新的MLP模型
best_mlp = random_search.best_estimator_

# 在完整的训练集上训练最佳模型
best_mlp.fit(X_train_pca, y_train)

# 在测试集上评估最佳模型
predictions = best_mlp.predict(X_test_pca)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 40 candidates, totalling 200 fits




Best parameters:  {'solver': 'adam', 'max_iter': 500, 'learning_rate_init': 0.0001, 'hidden_layer_sizes': (200, 100), 'alpha': 0.001, 'activation': 'relu'}
Best score:  0.6013390657173071
Accuracy on test set:  0.6214219759926131

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.58      0.59       517
           1       0.63      0.66      0.65       566

    accuracy                           0.62      1083
   macro avg       0.62      0.62      0.62      1083
weighted avg       0.62      0.62      0.62      1083



## 3 hidden

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score


# 创建并配置MLP分类器
mlp = MLPClassifier(random_state=42)

# 定义需要优化的超参数和对应的值
param_distributions = {
    'hidden_layer_sizes': [(100,50,25),(200,100,50),(300,200,100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam','sgd'],
    'alpha': [0.001,0.01,0.1],
    'learning_rate_init': [0.0001,0.001, 0.01, 0.1],
    'max_iter': [200,500]
}

# 创建RandomizedSearchCV对象
random_search = RandomizedSearchCV(mlp, param_distributions=param_distributions, cv=5, scoring='accuracy', n_jobs=-1, verbose=1, n_iter=40)
random_search.fit(X_train_pca, y_train)  # 使用降维后的训练数据

# 打印最佳超参数和对应的分数
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# 使用最佳超参数创建新的MLP模型
best_mlp = random_search.best_estimator_

# 在完整的训练集上训练最佳模型
best_mlp.fit(X_train_pca, y_train)

# 在测试集上评估最佳模型
predictions = best_mlp.predict(X_test_pca)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))



Fitting 5 folds for each of 40 candidates, totalling 200 fits




Best parameters:  {'solver': 'sgd', 'max_iter': 500, 'learning_rate_init': 0.01, 'hidden_layer_sizes': (300, 200, 100), 'alpha': 0.1, 'activation': 'relu'}
Best score:  0.6038733548031358
Accuracy on test set:  0.628808864265928

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.58      0.60       517
           1       0.64      0.67      0.65       566

    accuracy                           0.63      1083
   macro avg       0.63      0.63      0.63      1083
weighted avg       0.63      0.63      0.63      1083





## 4 hidden

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score


# 创建并配置MLP分类器
mlp = MLPClassifier(random_state=42)

# 定义需要优化的超参数和对应的值
param_distributions = {
    'hidden_layer_sizes': [(100,50,25,4),(200,100,50,25),(300,200,100,50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam','sgd'],
    'alpha': [0.001,0.01,0.1],
    'learning_rate_init': [0.0001,0.001, 0.01, 0.1],
    'max_iter': [200,500]
}

# 创建RandomizedSearchCV对象
random_search = RandomizedSearchCV(mlp, param_distributions=param_distributions, cv=5, scoring='accuracy', n_jobs=-1, verbose=1, n_iter=40)
random_search.fit(X_train_pca, y_train)  # 使用降维后的训练数据

# 打印最佳超参数和对应的分数
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# 使用最佳超参数创建新的MLP模型
best_mlp = random_search.best_estimator_

# 在完整的训练集上训练最佳模型
best_mlp.fit(X_train_pca, y_train)

# 在测试集上评估最佳模型
predictions = best_mlp.predict(X_test_pca)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 40 candidates, totalling 200 fits




Best parameters:  {'solver': 'sgd', 'max_iter': 200, 'learning_rate_init': 0.01, 'hidden_layer_sizes': (200, 100, 50, 25), 'alpha': 0.001, 'activation': 'relu'}
Best score:  0.5925617523194578
Accuracy on test set:  0.6223453370267775

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.72      0.64       517
           1       0.67      0.54      0.60       566

    accuracy                           0.62      1083
   macro avg       0.63      0.63      0.62      1083
weighted avg       0.63      0.62      0.62      1083



# MLP - design baseline 

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

train_loader_pca = DataLoader(X_train_pca, batch_size=64, shuffle=True)
test_loader_pca = DataLoader(X_test_pca, batch_size=64, shuffle=False)

# print(train_loader_pca.dataset)

In [12]:
## 4 hidden -BCEWithLogitsLoss & Parametric ReLU - with PCA 

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import time

class MLP(nn.Module):
    def __init__(self, input_size=193):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_size, 200)
        self.prelu1 = nn.ReLU()
        self.layer2 = nn.Linear(200, 100)
        self.prelu2 = nn.ReLU()
        self.layer3 = nn.Linear(100, 50)
        self.prelu3 = nn.ReLU()
        self.layer4 = nn.Linear(50, 25)
        self.prelu4 = nn.ReLU()
        self.output_layer = nn.Linear(25, 1)

    def forward(self, x):
        x = self.prelu1(self.layer1(x))
        x = self.prelu2(self.layer2(x))
        x = self.prelu3(self.layer3(x))
        x = self.prelu4(self.layer4(x))
        x = self.output_layer(x)
        return x

# 初始化模型
model = MLP(input_size=193)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.001)

def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum / y_test.shape[0]
    acc = torch.round(acc * 100)
    return acc.item()

# 开始训练
start_time = time.time()
total_acc = 0
print_count = 0

for epoch in range(200):  # 训练200个epoch
    batch_size = train_loader_pca.batch_size
    for i, inputs in enumerate(train_loader_pca):  
        inputs = inputs.float()  # 确保数据类型正确
        labels = y_train[i*batch_size : (i+1)*batch_size]  # 从y_train中获取相应的标签
        labels = labels.to_numpy()  # 将 Pandas Series 转换为 NumPy 数组
        labels = torch.from_numpy(labels).float()   # 将 NumPy 数组转换为 PyTorch 张量，并转换数据类型为浮点型
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.view(-1, 1))
        loss.backward()
        optimizer.step()

        if (i + 1) % 50 == 0:
            acc = binary_acc(outputs, labels.view(-1, 1))
            print(f"Epoch {epoch+1}, Batch {i+1}: Training Accuracy = {acc}%")
            total_acc += acc
            print_count += 1

end_time = time.time()
print(f"Training completed in {end_time - start_time:.2f} seconds")

average_acc = total_acc / print_count
print(f"Average Training Accuracy over {print_count} intervals = {average_acc}%")

# 模型评估
model.eval()  # 将模型设置为评估模式
with torch.no_grad():  # 禁止跟踪计算图
    X_test_tensor = torch.from_numpy(X_test_pca).float() 
    y_test_numpy = y_test.to_numpy()  # 将 Pandas Series 转换为 NumPy 数组
    y_test_tensor = torch.from_numpy(y_test_numpy).float()  # 将 NumPy 数组转换为 PyTorch 张量，并转换数据类型为浮点型
    test_outputs = model(X_test_tensor)
    test_acc = binary_acc(test_outputs, y_test_tensor)
    print(f'Test accuracy: {test_acc}%')


Epoch 1, Batch 50: Training Accuracy = 42.0%
Epoch 2, Batch 50: Training Accuracy = 42.0%
Epoch 3, Batch 50: Training Accuracy = 58.0%
Epoch 4, Batch 50: Training Accuracy = 58.0%
Epoch 5, Batch 50: Training Accuracy = 58.0%
Epoch 6, Batch 50: Training Accuracy = 58.0%
Epoch 7, Batch 50: Training Accuracy = 58.0%
Epoch 8, Batch 50: Training Accuracy = 58.0%
Epoch 9, Batch 50: Training Accuracy = 58.0%
Epoch 10, Batch 50: Training Accuracy = 58.0%
Epoch 11, Batch 50: Training Accuracy = 58.0%
Epoch 12, Batch 50: Training Accuracy = 58.0%
Epoch 13, Batch 50: Training Accuracy = 58.0%
Epoch 14, Batch 50: Training Accuracy = 58.0%
Epoch 15, Batch 50: Training Accuracy = 58.0%
Epoch 16, Batch 50: Training Accuracy = 58.0%
Epoch 17, Batch 50: Training Accuracy = 58.0%
Epoch 18, Batch 50: Training Accuracy = 58.0%
Epoch 19, Batch 50: Training Accuracy = 58.0%
Epoch 20, Batch 50: Training Accuracy = 58.0%
Epoch 21, Batch 50: Training Accuracy = 58.0%
Epoch 22, Batch 50: Training Accuracy = 58.

## 7 hidden - without PCA 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score


# 创建并配置MLP分类器
mlp = MLPClassifier(random_state=42)

# 定义需要优化的超参数和对应的值
param_distributions = {
    'hidden_layer_sizes': [(1536,768,384,192,96)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam','sgd'],
    'alpha': [0.001,0.01,0.1],
    'learning_rate_init': [0.0001,0.001, 0.01, 0.1],
    'max_iter': [200,500]
}

# 创建RandomizedSearchCV对象
random_search = RandomizedSearchCV(mlp, param_distributions=param_distributions, cv=5, scoring='accuracy', n_jobs=-1, verbose=1, n_iter=10)
random_search.fit(X_train, y_train)  # 使用降维后的训练数据

# 打印最佳超参数和对应的分数
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

# 使用最佳超参数创建新的MLP模型
best_mlp = random_search.best_estimator_

# 在完整的训练集上训练最佳模型
best_mlp.fit(X_train, y_train)

# 在测试集上评估最佳模型
predictions = best_mlp.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


# othrer methods(PCA)

## CNN

In [None]:
!pip install skorch

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from skorch import NeuralNetClassifier
import torch
import torch.nn as nn
import torch.optim as optim

# 加载数据
data = pd.read_csv('merge_df.csv')
X = data.drop('label', axis=1).values
y = data['label'].values

# 重塑数据为1D CNN接受的形状
X = X.reshape(X.shape[0], X.shape[1], 1)

# 定义1D CNN模型
class CNN1D(nn.Module):
    def __init__(self, num_features, num_filters=32, kernel_size=3, pool_size=2, dense_units=128):
        super(CNN1D, self).__init__()
        self.conv1 = nn.Conv1d(1, num_filters, kernel_size)
        self.pool = nn.MaxPool1d(pool_size)
        self.conv2 = nn.Conv1d(num_filters, num_filters*2, kernel_size)
        self.conv3 = nn.Conv1d(num_filters*2, num_filters*4, kernel_size)
        self.conv4 = nn.Conv1d(num_filters*4, num_filters*8, kernel_size)
        self.fc1 = nn.Linear(num_filters*8 * ((num_features - 4 * (kernel_size - 1)) // pool_size**4), dense_units)
        self.fc2 = nn.Linear(dense_units, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        x = self.pool(self.relu(self.conv4(x)))
        x = x.view(-1, x.shape[1] * x.shape[2])
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

# 创建skorch的NeuralNetClassifier
net = NeuralNetClassifier(
    module=CNN1D,
    module__num_features=X.shape[1],
    max_epochs=10,
    lr=0.01,
    optimizer=optim.Adam,
    iterator_train__shuffle=True,
    verbose=0
)

# 设置网格搜索参数
param_grid = {
    'optimizer': [optim.Adam, optim.SGD, optim.RMSprop],
    'module__num_filters': [32, 64, 128],
    'module__kernel_size': [3, 5, 7],
    'module__pool_size': [2, 3],
    'module__dense_units': [64, 128, 256]
}

# 设置K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 运行网格搜索
grid = GridSearchCV(estimator=net, param_grid=param_grid, cv=cv, scoring='accuracy')
grid_result = grid.fit(X, y)

# 输出结果
print("Best parameters found:", grid_result.best_params_)
print("Best accuracy found:", grid_result.best_score_)


# SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 创建SVM模型，尝试使用RBF核
svc = SVC(kernel='rbf')

# 设置更广泛的参数网格
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 'scale']
}

# 使用网格搜索进行调参，应用StratifiedKFold
cv = StratifiedKFold(n_splits=5)
grid = GridSearchCV(svc, param_grid, refit=True, verbose=2, cv=cv)
grid.fit(X_train_pca, y_train)

# 预测
predictions = grid.predict(X_test_pca)

# 评估模型
print("Best parameters found:", grid.best_params_)
print("Accuracy:", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


# Random Forest 

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

# 加载数据
# 假设 X_train_pca 和 y_train_pca 已经预处理并准备好了

# 设置网格搜索参数
param_grid = {
    'n_estimators': [100, 200, 300],  # 树的数量
    'max_depth': [None, 10, 20, 30],  # 树的最大深度
    'min_samples_split': [2, 5, 10],  # 内部节点再划分所需最小样本数
    'min_samples_leaf': [1, 2, 4],    # 叶子节点最小样本数
    'bootstrap': [True, False]        # 是否有放回地抽样
}

# 初始化随机森林分类器
rf = RandomForestClassifier(random_state=42)

# 设置K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 运行网格搜索
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=2)
grid_result = grid_search.fit(X_train_pca, y_train_pca)

# 输出结果
print("Best parameters found:", grid_result.best_params_)
print("Best accuracy found:", grid_result.best_score_)

# 使用最佳参数训练模型并评估在测试集上的性能
best_rf = grid_result.best_estimator_
best_rf.fit(X_train_pca, y_train_pca)
y_pred = best_rf.predict(X_test_pca)
test_accuracy = accuracy_score(y_test_pca, y_pred)

print(f"Test accuracy: {test_accuracy}")
