# Load the Sentiment analysis data

In [1]:
import pandas as pd

# 加载特征数据
features_path = 'emotional_merged_df.csv'
features_df = pd.read_csv(features_path)

# 确保所有特征列都是数值类型
feature_cols = [col for col in features_df.columns if col not in ['image_id', 'label']]
for col in feature_cols:
    features_df[col] = pd.to_numeric(features_df[col], errors='coerce')

# 按照label进行分组
label_0 = features_df[features_df['label'] == 0]
label_1 = features_df[features_df['label'] == 1]

# 选择5000个label为0的数据和5000个label为1的数据
label_0_sample = label_0.sample(n=5000, random_state=42)
label_1_sample = label_1.sample(n=5000, random_state=42)

# 合并两个样本
balanced_df = pd.concat([label_0_sample, label_1_sample], ignore_index=True)

# 打印总行数和总列数
print(f"总行数: {balanced_df.shape[0]}")
print(f"总列数: {balanced_df.shape[1]}")

# 打印前五行数据
print(balanced_df.head())

# 检查是否存在任何缺失值
print(balanced_df.isnull().sum())

# 如果存在缺失值,可以选择填充或删除缺失值
# 这里选择填充缺失值为0
balanced_df = balanced_df.fillna(0)

总行数: 10000
总列数: 6
  image_id  neg    neu    pos  compound  label
0    t29f4  0.0  0.526  0.474    0.4019      0
1   3zlag1  0.0  1.000  0.000    0.0000      0
2   1oqtde  0.0  1.000  0.000    0.0000      0
3    k3n1m  0.0  0.476  0.524    0.2960      0
4   3k05xw  0.0  1.000  0.000    0.0000      0
image_id    0
neg         0
neu         0
pos         0
compound    0
label       0
dtype: int64


# Split training & test set

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# 从 DataFrame 中分离出特征数据和标签
X = balanced_df.drop(['label', 'image_id'], axis=1)  # 移除 'label' 和 'image_id' 列
y = balanced_df['label']  # 标签

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 存储 image_id 以便之后合并
image_id_train = balanced_df.loc[X_train.index, 'image_id']
image_id_test = balanced_df.loc[X_test.index, 'image_id']

# 再次确保所有数据都是数值类型
X_train = X_train.apply(pd.to_numeric)
X_test = X_test.apply(pd.to_numeric)

# 标准化特征数据
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 打印检查
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Test labels shape:", y_test.shape)
print("Training image_ids shape:", image_id_train.shape)
print("Test image_ids shape:", image_id_test.shape)

Training set shape: (8000, 4)
Test set shape: (2000, 4)
Training labels shape: (8000,)
Test labels shape: (2000,)
Training image_ids shape: (8000,)
Test image_ids shape: (2000,)


# Logical Regression

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score



# 设置参数网格
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# 创建网格搜索对象
grid = GridSearchCV(LogisticRegression(max_iter=10000), param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train)

# 打印最佳参数和模型评估
print("Best parameters:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

predictions = grid.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters: {'C': 0.01, 'solver': 'liblinear'}
Best cross-validation score: 0.58
Accuracy on test set:  0.5685

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.46      0.52      1012
           1       0.55      0.68      0.61       988

    accuracy                           0.57      2000
   macro avg       0.57      0.57      0.56      2000
weighted avg       0.57      0.57      0.56      2000



# SVM

In [4]:
from sklearn.svm import SVC

# 参数网格
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# 网格搜索
grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train)

# 结果
print("Best parameters:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

predictions = grid.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Best cross-validation score: 0.59
Accuracy on test set:  0.581

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.46      0.53      1012
           1       0.56      0.70      0.62       988

    accuracy                           0.58      2000
   macro avg       0.59      0.58      0.58      2000
weighted avg       0.59      0.58      0.58      2000



# Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier

# 参数网格
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

# 网格搜索
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train)

# 结果
print("Best parameters:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

predictions = grid.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best cross-validation score: 0.60
Accuracy on test set:  0.593

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.43      0.52      1012
           1       0.57      0.76      0.65       988

    accuracy                           0.59      2000
   macro avg       0.61      0.59      0.58      2000
weighted avg       0.61      0.59      0.58      2000



# Gradient Boosting

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# 加载数据
# 假设 X_train 和 y_train 已经准备好了

# 设置参数网格
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# 创建网格搜索对象
grid = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train)

# 打印最佳参数和模型评估
print("Best parameters:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

predictions = grid.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200}
Best cross-validation score: 0.60
Accuracy on test set:  0.5985

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.46      0.54      1012
           1       0.57      0.74      0.65       988

    accuracy                           0.60      2000
   macro avg       0.61      0.60      0.59      2000
weighted avg       0.61      0.60      0.59      2000



# XGBoost

In [7]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# 加载数据
# 假设 X_train 和 y_train 已经准备好了

# 设置参数网格
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# 创建网格搜索对象
grid = GridSearchCV(xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train)

# 打印最佳参数和模型评估
print("Best parameters:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

predictions = grid.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}
Best cross-validation score: 0.60
Accuracy on test set:  0.59

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.47      0.54      1012
           1       0.57      0.71      0.63       988

    accuracy                           0.59      2000
   macro avg       0.60      0.59      0.58      2000
weighted avg       0.60      0.59      0.58      2000



# KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# 加载数据
# 假设 X_train 和 y_train 已经准备好了

# 设置参数网格
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# 创建网格搜索对象
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train)

# 打印最佳参数和模型评估
print("Best parameters:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

predictions = grid.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Best cross-validation score: 0.57
Accuracy on test set:  0.588

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.39      0.49      1012
           1       0.56      0.79      0.65       988

    accuracy                           0.59      2000
   macro avg       0.61      0.59      0.57      2000
weighted avg       0.61      0.59      0.57      2000

