# Sequential Feature Selection (SFS)

In [None]:
from sklearn.datasets import load_wine # Wine dataset
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbors classifier
from sklearn.model_selection import train_test_split # Train-test split
import seaborn as sns # Seaborn for data visualization
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Optional: choose a CJK-capable font so Chinese labels render, e.g.
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
# Render minus signs with an ASCII hyphen rather than the Unicode minus glyph
# (presumably to avoid missing-glyph boxes when a CJK font is active).
plt.rc('axes', unicode_minus=False)

## 載入資料

In [None]:
# Load the wine dataset as pandas objects: X receives the features and y the
# targets (return_X_y=True returns the pair, as_frame=True requests pandas types).
X, y = load_wine(return_X_y=True, as_frame=True)

In [5]:
# Show the dimensions of the features and the target on one line ...
print(*(part.shape for part in (X, y)))
# ... and then the container types of both, printed as a tuple.
print(tuple(type(part) for part in (X, y)))

(178, 13) (178,)
(<class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.series.Series'>)


In [6]:
# Distinct values of the target (duplicates removed), i.e. the wine class labels.
print(pd.unique(y))

[0 1 2]


## 資料分割

In [8]:
# Hold out half of the samples for testing (test_size=0.5); the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)
# Shapes of the training features/target, then of the test features/target.
print(*(part.shape for part in (X_train, y_train)))
print(*(part.shape for part in (X_test, y_test)))

(89, 13) (89,)
(89, 13) (89,)


## 特徵縮放

In [None]:
from sklearn.preprocessing import StandardScaler # 標準化特徵
# 標準化特徵使其均值為0，標準差為1

# Standardize every feature to zero mean and unit variance.
scaler = StandardScaler()

# Learn the mean/std on the training split only, then apply that same
# transformation to both splits so no test-set statistics leak into training.
X_train_std = scaler.fit(X_train).transform(X_train)
X_test_std = scaler.transform(X_test)

# Shapes of the standardized training and test matrices, then a separator line.
print(X_train_std.shape, X_test_std.shape)
print('-' * 80)

# First five standardized training rows, labelled with the original column names.
print(pd.DataFrame(X_train_std, columns=X_train.columns).head())

(89, 13) (89, 13)
--------------------------------------------------------------------------------
    alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
0 -0.297630    1.309714  0.117696           0.915475   0.099139   
1 -1.281675   -1.409577  0.823873          -0.469722  -0.428755   
2  1.441611    1.419215  0.549249          -1.799510   2.135303   
3 -0.412054   -0.998946 -1.843906          -0.331202  -0.881236   
4 -0.915519   -0.752568 -0.235392           0.860067  -0.730409   

   total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
0       0.778303    0.527109              0.423164         0.666764   
1      -0.153766    0.200057             -1.152321         1.377234   
2       1.029814    0.998449             -1.302367         0.897667   
3      -1.263371   -0.617573             -0.627159        -0.398941   
4      -0.775145   -0.300140              0.423164        -0.025944   

   color_intensity       hue  od280/od315_of_diluted_wines   proline  


## 選擇演算法

In [14]:
from sklearn.linear_model import LogisticRegression # Logistic Regression classifier
# Create a logistic-regression classifier with default hyperparameters.

clf = LogisticRegression()
print(clf) # show the estimator's repr

LogisticRegression()


## 模型訓練

In [15]:
# Train on the standardized training features (all 13 columns of X);
# y_train holds the class labels, not the features.
clf.fit(X_train_std, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


## 模型評估

In [None]:
# Accuracy of the all-features model on the standardized test set.
clf.score(X_test_std, y_test)

0.9887640449438202

## 使用序列特徵選取（SFS）以貪婪前向搜尋選出 3 個特徵（並非窮舉所有 3 個特徵的組合）

In [None]:
# 特徵選擇
from sklearn.neighbors import KNeighborsClassifier # KNN 分類器
from sklearn.metrics import accuracy_score # 計算準確率
from sklearn.feature_selection import SequentialFeatureSelector # 序列特徵選擇
# KNN classifier (5 neighbours) used as the scoring model inside the selector.
knn = KNeighborsClassifier(n_neighbors=5)

# Sequential selector that keeps 3 features. "forward" is the default
# direction; pass direction="backward" to start from all features instead.
sfs = SequentialFeatureSelector(
    estimator=knn,
    n_features_to_select=3,
    direction="forward",
)

# Fit on the standardized training data and display the boolean support mask:
# True marks a feature that was selected.
sfs.fit(X_train_std, y_train).get_support()



array([False, False, False,  True, False, False,  True, False, False,
        True, False, False, False])

In [None]:

# Look up the names of the selected columns via their integer positions.
selected_features = X_train.columns[sfs.get_support(indices=True)]
print("Selected features:", selected_features)

Selected features: Index(['alcalinity_of_ash', 'flavanoids', 'color_intensity'], dtype='object')


In [None]:
# All feature names as a NumPy string array, so a boolean mask can index it.
column_list = np.array(list(X.columns))
# Names of the selected features, picked with the selector's boolean mask.
column_list[sfs.get_support()]

array(['alcalinity_of_ash', 'flavanoids', 'color_intensity'], dtype='<U28')

In [None]:
# Selected feature names, obtained by passing the full name list to the selector.
sfs.get_feature_names_out(column_list)

array(['alcalinity_of_ash', 'flavanoids', 'color_intensity'], dtype=object)

In [None]:
# Shape of the training matrix after keeping only the selected columns.
sfs.transform(X_train_std).shape

(89, 3)

## 選擇演算法

In [None]:
from sklearn.linear_model import LogisticRegression # Logistic Regression classifier
# Create a fresh logistic-regression classifier for the reduced feature set.
# NOTE: this rebinds `clf`, replacing the earlier all-features model.
clf = LogisticRegression()

## 模型訓練

In [None]:
# Train on the training data restricted to the features chosen by the selector.
clf.fit(sfs.transform(X_train_std), y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


## 模型評估

In [27]:
# Test accuracy of the model trained on the selected features only.
accuracy_score(y_test, clf.predict(sfs.transform(X_test_std)))

0.8876404494382022

## 結論：特徵由 13 個減為 3 個，模型得以簡化，但測試準確率由約 0.989 降至約 0.888