# All Features Preprocessing

## Data alignment - merge through 'image_id'

In [None]:
import pandas as pd
import pickle

def load_labels_from_pickle(pickle_file):
    """Read a pickle file and return a mapping from image id to label.

    The unpickled object is indexed with the keys 'image_id' and 'label';
    the two sequences are paired element by element (pairing stops at the
    shorter one).

    Args:
        pickle_file: Path of the pickle file to open in binary mode.

    Returns:
        dict: image id -> label.

    Note:
        pickle.load can execute arbitrary code; only use it on trusted files.
    """
    with open(pickle_file, 'rb') as handle:
        payload = pickle.load(handle)
    return {
        image_id: label
        for image_id, label in zip(payload['image_id'], payload['label'])
    }

# Load the image_id -> label mapping from the pickle file.
# NOTE(review): labels_dict is reassigned from 'labels.pkl' further down in
# this cell, so the mapping loaded here is not the one used for the merge —
# confirm which file is the intended label source.
pickle_path = 'train.pkl'
labels_dict = load_labels_from_pickle(pickle_path)

# Load the two feature tables from CSV.
fused_features_path, emotional_features_path = 'fused_train.csv', 'SA.csv'
fused_features_df = pd.read_csv(fused_features_path)
emotional_features_df = pd.read_csv(emotional_features_path)

import pandas as pd

# fused_features_df and emotional_features_df are already loaded from their
# CSV files at the top of this cell, so they are not read from disk again.

# Make sure 'image_id' is the last column of both feature tables.
# The fused table is only validated; it is never reordered.
if fused_features_df.columns[-1] != 'image_id':
    raise ValueError("Error: 'image_id' must be the last column in the fused CSV file.")

# The emotional table is reordered so that 'image_id' ends up last.
if 'image_id' not in emotional_features_df.columns:
    raise ValueError("Error: 'image_id' column not found in the emotional CSV file.")
emotional_columns = [col for col in emotional_features_df.columns if col != 'image_id']
emotional_features_df = emotional_features_df[emotional_columns + ['image_id']]

# Show the column order after the adjustment.
print(fused_features_df.head())
print(emotional_features_df.head())

# Load the label mapping and turn it into a two-column table.
# The unpickled object must be a mapping, because .items() is called on it.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
labels_path = 'labels.pkl'
with open(labels_path, 'rb') as f:
    labels_dict = pickle.load(f)
labels_df = pd.DataFrame(list(labels_dict.items()), columns=['image_id', 'label'])

# Index all three tables by 'image_id' so they can be joined on the index.
fused_features_df = fused_features_df.set_index('image_id')
emotional_features_df = emotional_features_df.set_index('image_id')
labels_df = labels_df.set_index('image_id')

# Inner-join each feature table with the labels, then turn 'image_id' back
# into a regular column for export or further processing.
fused_merged_df = fused_features_df.merge(
    labels_df, left_index=True, right_index=True, how='inner'
).reset_index()
emotional_merged_df = emotional_features_df.merge(
    labels_df, left_index=True, right_index=True, how='inner'
).reset_index()

# Inspect the merged data.
for merged_df in (fused_merged_df, emotional_merged_df):
    print(merged_df.head())

# Report the number of rows that survived each merge.
print(f"The number of rows in the fused merged DataFrame is: {fused_merged_df.shape[0]}")
print(f"The number of rows in the emotional merged DataFrame is: {emotional_merged_df.shape[0]}")


## Split training & test set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features from labels for both feature tables.
X_fused = fused_merged_df.drop(['label', 'image_id'], axis=1)
y_fused = fused_merged_df['label']

X_emotional = emotional_merged_df.drop(['label', 'image_id'], axis=1)
y_emotional = emotional_merged_df['label']

# The predictions of the fused and emotional models are later stacked row by
# row and scored against y_fused_*, so both tables must describe the same
# samples in the same order. Fail loudly here instead of silently pairing
# predictions that belong to different images.
if fused_merged_df['image_id'].tolist() != emotional_merged_df['image_id'].tolist():
    raise ValueError(
        "The fused and emotional feature tables do not contain the same "
        "image_id values in the same order; their predictions cannot be "
        "combined row by row."
    )

# Split into training and test sets (X = features, y = labels).
# A single call splits every array with the same row indices, which keeps the
# two feature sets aligned with each other and with the labels.
(X_fused_train, X_fused_test,
 X_emotional_train, X_emotional_test,
 y_fused_train, y_fused_test,
 y_emotional_train, y_emotional_test) = train_test_split(
    X_fused, X_emotional, y_fused, y_emotional, test_size=0.2, random_state=42
)

# Standardize the features: each scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
scaler_fused = StandardScaler()
X_fused_train = scaler_fused.fit_transform(X_fused_train)
X_fused_test = scaler_fused.transform(X_fused_test)

scaler_emotional = StandardScaler()
X_emotional_train = scaler_emotional.fit_transform(X_emotional_train)
X_emotional_test = scaler_emotional.transform(X_emotional_test)


# Fused Tensor

## PCA 

In [5]:
import numpy as np
from sklearn.decomposition import PCA

# Apply PCA for dimensionality reduction: fit on the training data only, then transform both the training and the test data
pca = PCA(n_components=0.95)  # keep 95% of the variance
X_fused_train_pca = pca.fit_transform(X_fused_train) # training set after PCA
X_fused_test_pca = pca.transform(X_fused_test) # test set after PCA

# Optional diagnostics: new dimensionality and explained variance ratio
# print("New training dimensions:", X_fused_train_pca.shape[1])
# print("New testing dimensions:", X_fused_test_pca.shape[1])
# print("Explained variance ratio:", pca.explained_variance_ratio_)


## MLP - 4 hidden layers

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score


# Hyperparameters for the MLP on the fused features: four hidden layers.
# NOTE(review): presumably these come from an earlier hyperparameter search;
# the search itself is not part of this notebook.
mlp_params = dict(
    hidden_layer_sizes=(300, 200, 100, 50),
    activation='relu',
    solver='adam',
    alpha=0.1,
    learning_rate_init=0.01,
    max_iter=200,
    random_state=42,
)
mlp_classifier = MLPClassifier(**mlp_params)

# Train on the PCA-reduced fused training features.
mlp_classifier.fit(X_fused_train_pca, y_fused_train)

# Class predictions on both splits; these feed the meta-model later on.
mlp_train_preds = mlp_classifier.predict(X_fused_train_pca)
mlp_test_preds = mlp_classifier.predict(X_fused_test_pca)

# Optional evaluation of the base model on the test split:
# print("Accuracy on test set: ", accuracy_score(y_fused_test, mlp_test_preds))
# print("\nClassification Report:\n", classification_report(y_fused_test, mlp_test_preds))



# Emotional Tensor

## Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Random forest on the emotional features, using previously chosen hyperparameters.
rf = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100, random_state=42)

# Fit the model on the training split.
rf.fit(X_emotional_train, y_emotional_train)

# predict() takes the feature matrices, not the label vectors.
rf_train_preds = rf.predict(X_emotional_train)
# Predictions on the test split.
rf_test_preds = rf.predict(X_emotional_test)

# Optional evaluation of the base model on the test split:
# print("Accuracy on test set: ", accuracy_score(y_emotional_test, rf_test_preds))

# Optional classification report:
# print("\nClassification Report:\n", classification_report(y_emotional_test, rf_test_preds))


# Meta Model

In [None]:
# Build the meta-feature matrices: one column per base model
# (MLP predictions first, random forest predictions second).
# NOTE(review): the training meta-features are predictions on the same rows
# the base models were fitted on — consider out-of-fold predictions.
train_base_preds = (mlp_train_preds, rf_train_preds)
test_base_preds = (mlp_test_preds, rf_test_preds)
X_train_meta = np.column_stack(train_base_preds)
X_test_meta = np.column_stack(test_base_preds)

## vote

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score


# Simple vote: average the two models' predicted labels and round.
# NOTE(review): this assumes labels are 0/1. When the two models disagree the
# average is 0.5, which np.round (round-half-to-even) turns into 0.
final_predictions = np.round((mlp_test_preds + rf_test_preds) / 2.0)

# Score against the test labels produced by the train/test split.
final_accuracy = accuracy_score(y_fused_test, final_predictions)
print("Final accuracy with simple voting: ", final_accuracy)


## XGBoost

In [None]:
# pip install xgboost
import xgboost as xgb
from sklearn.metrics import accuracy_score


# Wrap the meta-features in DMatrix, the data structure XGBoost is optimized for.
dtrain = xgb.DMatrix(data=X_train_meta, label=y_fused_train)
dtest = xgb.DMatrix(data=X_test_meta, label=y_fused_test)

# Booster configuration and number of boosting rounds.
params = dict(
    max_depth=3,
    eta=0.1,
    objective='binary:logistic',
    eval_metric='logloss',
)
epochs = 50

# Train the meta-model.
bst = xgb.train(params, dtrain, num_boost_round=epochs)

# The booster outputs probabilities; threshold at 0.5 to get class labels.
preds = bst.predict(dtest)
final_predictions = preds > 0.5

# Accuracy of the stacked model on the test split.
final_accuracy = accuracy_score(y_fused_test, final_predictions)
print("Final accuracy with XGBoost as meta-model: ", final_accuracy)


Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/99.8 MB 3.6 MB/s eta 0:00:28
   ---------------------------------------- 0.3/99.8 MB 4.8 MB/s eta 0:00:21
   ---------------------------------------- 0.5/99.8 MB 4.9 MB/s eta 0:00:21
   ---------------------------------------- 0.7/99.8 MB 5.5 MB/s eta 0:00:19
   ---------------------------------------- 1.0/99.8 MB 5.5 MB/s eta 0:00:18
   ---------------------------------------- 1.0/99.8 MB 5.1 MB/s eta 0:00:20
   ---------------------------------------- 1.0/99.8 MB 5.1 MB/s eta 0:00:20
   ---------------------------------------- 1.0/99.8 MB 5.1 MB/s eta 0:00:20
   ---------------------------------------- 1.2/99.8 MB 3.7 MB/s eta 0:00:27
    --------------------

## MLP 

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Configuration of the MLP meta-model: two small hidden layers.
mlp_meta_params = dict(
    hidden_layer_sizes=(10, 5),
    activation='relu',
    solver='adam',
    random_state=42,
    max_iter=500,
)
mlp_meta = MLPClassifier(**mlp_meta_params)

# Fit the meta-model on the stacked base-model predictions.
mlp_meta.fit(X_train_meta, y_fused_train)

# Predict on the test meta-features.
meta_predictions = mlp_meta.predict(X_test_meta)

# Compute and print the accuracy.
meta_accuracy = accuracy_score(y_fused_test, meta_predictions)
print("Accuracy of MLP as meta-model: ", meta_accuracy)
