# Load the Sentiment analysis data

In [8]:
import pandas as pd

# Load the sentiment-analysis feature table from the CSV file.
features_path = 'SA.csv'
features_df = pd.read_csv(features_path)

# Report the table dimensions: number of rows, then number of columns.
n_rows, n_cols = features_df.shape
print(f"总行数: {n_rows}")
print(f"总列数: {n_cols}")

# Preview the first five rows.
print(features_df.head())

# Summary statistics of the numeric columns.
print(features_df.describe())

# Column names and dtypes. DataFrame.info() writes its report to stdout
# itself and returns None, so wrapping it in print() would append a stray
# "None" line after the report.
features_df.info()

总行数: 5413
总列数: 5
                           image_id    neg    neu    pos  compound
0  62b31d36gw1expsi2gfrdj20hm0loq8o  0.105  0.895  0.000   -0.8442
1  563a2b53jw1exl77nkup7j20c30f3q4j  0.000  0.868  0.132    0.5719
2  005ldo0ygw1ex23rdfuqcj30xo0k6di0  0.000  0.911  0.089    0.5994
3  62b31d36gw1exfcmyz8agj20qq0hu77k  0.147  0.827  0.026   -0.8360
4  0060kjm0jw1exdjaeiqadj30xc0m8tdw  0.065  0.793  0.141    0.7550
               neg          neu          pos     compound
count  5413.000000  5413.000000  5413.000000  5413.000000
mean      0.082708     0.836126     0.081158    -0.043016
std       0.090372     0.112274     0.092355     0.593741
min       0.000000     0.038000     0.000000    -0.993600
25%       0.000000     0.772000     0.000000    -0.585900
50%       0.062000     0.848000     0.057000     0.000000
75%       0.131000     0.913000     0.128000     0.476700
max       0.552000     1.000000     0.962000     0.999700
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5413 entr

# Load the labels

In [9]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# It is an earlier variant of label loading that reads only data['label'];
# the following cell defines a load_labels_from_pickle that also reads
# data['image_id'].
# import pandas as pd
# import pickle

# def load_labels_from_pickle(pickle_file):
    # with open(pickle_file, 'rb') as f:
        # data = pickle.load(f)
        # labels = data['label']
    # return labels

# Initialize PickleDataset
# pickle_path = 'D:\\CAPSTONE5703_CNN\\datasets_pickle\\train.pkl'
# labels = load_labels_from_pickle(pickle_path)

# Convert the labels to a DataFrame
# labels_df = pd.DataFrame({'label': labels})

# Print the total number of label rows
# print(f"Total number of rows of labels: {len(labels_df)}")

# Print the first 5 label rows
# print("First 5 rows of labels:")
# print(labels_df.head())

# Data alignment - merge through 'image_id'

In [10]:
import pandas as pd
import pickle

def load_labels_from_pickle(pickle_file):
    """Load an ``image_id -> label`` mapping from a pickle file.

    The unpickled object is indexed with the keys ``'image_id'`` and
    ``'label'``; both entries must be sequences of the same length, paired
    position by position.

    Args:
        pickle_file: Path of the pickle file to read.

    Returns:
        dict: Maps each image id to its label. If an image id occurs more
        than once, the label of its last occurrence wins.

    Raises:
        KeyError: If the unpickled dict lacks ``'image_id'`` or ``'label'``.
        ValueError: If the two sequences differ in length. Without this
            check ``zip`` would silently drop the surplus entries.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only call this on pickle files from a trusted source.
    with open(pickle_file, 'rb') as f:
        data = pickle.load(f)

    # The file is no longer needed once the payload is in memory.
    image_ids = data['image_id']
    labels = data['label']

    if len(image_ids) != len(labels):
        raise ValueError(
            "'image_id' and 'label' must have the same length, got "
            f"{len(image_ids)} and {len(labels)}"
        )

    return dict(zip(image_ids, labels))

# Load the image_id -> label mapping from the pickle file.
pickle_path = 'train.pkl'
labels_dict = load_labels_from_pickle(pickle_path)

# Load the sentiment features from the CSV file.
features_path = 'SA.csv'
features_df = pd.read_csv(features_path)

# The join key must be present. Fail loudly instead of printing a message:
# a printed error would leave merged_df undefined (or stale from an earlier
# run) and the following cells would then break far from the real cause.
if 'image_id' not in features_df.columns:
    raise ValueError("Error: 'image_id' column not found in the CSV file.")

# Turn the mapping into a two-column frame so it can be joined with the features.
labels_df = pd.DataFrame(list(labels_dict.items()), columns=['image_id', 'label'])

# Index both frames by image_id. Reassigning (rather than inplace=True)
# keeps each statement a plain "new value from old value" step.
features_df = features_df.set_index('image_id')
labels_df = labels_df.set_index('image_id')

# Inner join on the image_id index: only images present in both sources are
# kept. reset_index() turns image_id back into an ordinary column for export
# or further processing.
merged_df = pd.merge(
    features_df, labels_df, left_index=True, right_index=True, how='inner'
).reset_index()

# Inspect the merged data.
print(merged_df.head())

# Optionally save the merged DataFrame.
# merged_df.to_csv('D:\\CAPSTONE5703_CNN\\merged_Sentiment features.csv', index=False)


                           image_id    neg    neu    pos  compound  label
0  62b31d36gw1expsi2gfrdj20hm0loq8o  0.105  0.895  0.000   -0.8442      0
1  563a2b53jw1exl77nkup7j20c30f3q4j  0.000  0.868  0.132    0.5719      0
2  005ldo0ygw1ex23rdfuqcj30xo0k6di0  0.000  0.911  0.089    0.5994      0
3  62b31d36gw1exfcmyz8agj20qq0hu77k  0.147  0.827  0.026   -0.8360      0
4  0060kjm0jw1exdjaeiqadj30xc0m8tdw  0.065  0.793  0.141    0.7550      0


# Split training & test set

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# The identifier column is not used as a model input, so drop it.
data = merged_df.drop(columns='image_id')

# Features are the four sentiment scores; the target is the 'label' column.
sentiment_columns = ['neg', 'neu', 'pos', 'compound']
X = data.loc[:, sentiment_columns]
y = data.loc[:, 'label']

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable. X_train / X_test hold the feature rows and y_train / y_test
# the matching labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)





# Logistic Regression

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score



# Hyper-parameter grid: inverse regularisation strength C and the solver.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# 5-fold grid search scored by accuracy. Both solvers in the grid shuffle the
# data, so random_state is pinned to make the search repeatable; 42 matches
# the seed already used for the train/test split.
grid = GridSearchCV(
    LogisticRegression(max_iter=10000, random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid.fit(X_train, y_train)

# Best hyper-parameters and their mean cross-validated accuracy.
print("Best parameters:", grid.best_params_)
print(f"Best cross-validation score: {grid.best_score_:.2f}")

# Evaluate the selected model on the held-out test set.
predictions = grid.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters: {'C': 10, 'solver': 'saga'}
Best cross-validation score: 0.55
Accuracy on test set:  0.5521698984302862

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.14      0.23       516
           1       0.54      0.93      0.68       567

    accuracy                           0.55      1083
   macro avg       0.59      0.53      0.46      1083
weighted avg       0.59      0.55      0.47      1083



# SVM

In [15]:
from sklearn.svm import SVC

# Hyper-parameter grid for the support vector classifier.
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# 5-fold grid search scored by accuracy.
svm_estimator = SVC()
grid = GridSearchCV(
    svm_estimator,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid.fit(X_train, y_train)

# Best hyper-parameters and their mean cross-validated accuracy.
print("Best parameters:", grid.best_params_)
print(f"Best cross-validation score: {grid.best_score_:.2f}")

# Evaluate the selected model on the held-out test set.
predictions = grid.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set: ", test_accuracy)
test_report = classification_report(y_test, predictions)
print("\nClassification Report:\n", test_report)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation score: 0.57
Accuracy on test set:  0.5761772853185596

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.32      0.42       516
           1       0.57      0.81      0.67       567

    accuracy                           0.58      1083
   macro avg       0.58      0.56      0.54      1083
weighted avg       0.58      0.58      0.55      1083



# Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid for the random forest.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5, 10],
)

# 5-fold grid search scored by accuracy; the forest itself is seeded with 42.
forest_estimator = RandomForestClassifier(random_state=42)
grid = GridSearchCV(
    forest_estimator,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid.fit(X_train, y_train)

# Best hyper-parameters and their mean cross-validated accuracy.
print("Best parameters:", grid.best_params_)
print(f"Best cross-validation score: {grid.best_score_:.2f}")

# Evaluate the selected model on the held-out test set.
predictions = grid.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set: ", test_accuracy)
test_report = classification_report(y_test, predictions)
print("\nClassification Report:\n", test_report)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 0.60
Accuracy on test set:  0.5761772853185596

Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.46      0.51       516
           1       0.58      0.68      0.63       567

    accuracy                           0.58      1083
   macro avg       0.57      0.57      0.57      1083
weighted avg       0.57      0.58      0.57      1083



# Gradient Boosting

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Data: X_train / y_train (and X_test / y_test) are assumed to have been
# prepared by the train/test split cell earlier in the notebook.

# Hyper-parameter grid for gradient boosting.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# 5-fold grid search scored by accuracy. The estimator is seeded so repeated
# runs select the same model; 42 matches the seed used by the Random Forest
# cell and the train/test split.
grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid.fit(X_train, y_train)

# Best hyper-parameters and their mean cross-validated accuracy.
print("Best parameters:", grid.best_params_)
print(f"Best cross-validation score: {grid.best_score_:.2f}")

# Evaluate the selected model on the held-out test set.
predictions = grid.predict(X_test)
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Fitting 5 folds for each of 27 candidates, totalling 135 fits


# XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Data: X_train / y_train (and X_test / y_test) are assumed to have been
# prepared by the train/test split cell earlier in the notebook.

# Hyper-parameter grid for XGBoost.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# 5-fold grid search scored by accuracy.
# NOTE(review): use_label_encoder is reportedly deprecated in recent XGBoost
# releases -- confirm against the installed version before relying on it.
xgb_estimator = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid = GridSearchCV(
    xgb_estimator,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid.fit(X_train, y_train)

# Best hyper-parameters and their mean cross-validated accuracy.
print("Best parameters:", grid.best_params_)
print(f"Best cross-validation score: {grid.best_score_:.2f}")

# Evaluate the selected model on the held-out test set.
predictions = grid.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set: ", test_accuracy)
test_report = classification_report(y_test, predictions)
print("\nClassification Report:\n", test_report)


# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Data: X_train / y_train (and X_test / y_test) are assumed to have been
# prepared by the train/test split cell earlier in the notebook.

# Hyper-parameter grid for k-nearest neighbours.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# 5-fold grid search scored by accuracy.
knn_estimator = KNeighborsClassifier()
grid = GridSearchCV(
    knn_estimator,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid.fit(X_train, y_train)

# Best hyper-parameters and their mean cross-validated accuracy.
print("Best parameters:", grid.best_params_)
print(f"Best cross-validation score: {grid.best_score_:.2f}")

# Evaluate the selected model on the held-out test set.
predictions = grid.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set: ", test_accuracy)
test_report = classification_report(y_test, predictions)
print("\nClassification Report:\n", test_report)
