# 特徵挑選

## 作業程式碼

本作業將請學員完成以下要求：
1. 請至 Kaggle 平台找尋欲探索的資料集，進行本次作業。
2. 實作 Exhaustive Search
3. 實作 Sequential Forward/Backward Feature Selection
4. 實作 Sequential Floating Forward/Backward Feature Selection
5. 實作 Recursive Feature Elimination
6. 實作 Recursive Feature Elimination with Cross-Validation

> 注意：由於目前尚未教學建立機器學習模型，資料集請以「預測類別特徵」為主，以利參考範例程式碼進行實作

# Import packages

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")

In [2]:
raw_data = pd.read_csv("C:/Users/Orianna/Desktop/marathon/dataset_2191_sleep.csv") # 此行要填入資料路徑
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   body_weight           62 non-null     float64
 1   brain_weight          62 non-null     float64
 2   max_life_span         62 non-null     object 
 3   gestation_time        62 non-null     object 
 4   predation_index       62 non-null     int64  
 5   sleep_exposure_index  62 non-null     int64  
 6   danger_index          62 non-null     int64  
 7   total_sleep           62 non-null     object 
dtypes: float64(2), int64(3), object(3)
memory usage: 4.0+ KB


In [3]:
raw_data.describe()

Unnamed: 0,body_weight,brain_weight,predation_index,sleep_exposure_index,danger_index
count,62.0,62.0,62.0,62.0,62.0
mean,198.789984,283.134194,2.870968,2.419355,2.612903
std,899.158011,930.278942,1.476414,1.604792,1.441252
min,0.005,0.14,1.0,1.0,1.0
25%,0.6,4.25,2.0,1.0,1.0
50%,3.3425,17.25,3.0,2.0,2.0
75%,48.2025,166.0,4.0,4.0,4.0
max,6654.0,5712.0,5.0,5.0,5.0


In [4]:
raw_data.head(20)

Unnamed: 0,body_weight,brain_weight,max_life_span,gestation_time,predation_index,sleep_exposure_index,danger_index,total_sleep
0,6654.0,5712.0,38.6,645,3,5,3,3.3
1,1.0,6.6,4.5,42,3,1,3,8.3
2,3.385,44.5,14,60,1,1,1,12.5
3,0.92,5.7,?,25,5,2,3,16.5
4,2547.0,4603.0,69,624,3,5,4,3.9
5,10.55,179.5,27,180,4,4,4,9.8
6,0.023,0.3,19,35,1,1,1,19.7
7,160.0,169.0,30.4,392,4,5,4,6.2
8,3.3,25.6,28,63,1,2,1,14.5
9,52.16,440.0,50,230,1,1,1,9.7


In [5]:
# 把遺失值過多的欄位排除掉
raw_data = raw_data.dropna(axis = 1, thresh = int(round(raw_data.shape[0]*0.5)) )


In [6]:
# Define Features and Target
raw_data["id"] = raw_data.index
PK = "id"
target = "total_sleep"

raw_data['max_life_span'] = raw_data['max_life_span'] \
    .replace('?', np.nan) \
    .pipe(pd.to_numeric, errors='coerce')
mean_life = raw_data['max_life_span'].mean()
raw_data['max_life_span'] = raw_data['max_life_span'].fillna(mean_life)

raw_data['gestation_time'] = raw_data['gestation_time'] \
    .replace('?', np.nan) \
    .pipe(pd.to_numeric, errors='coerce')
mean_life = raw_data['gestation_time'].mean()
raw_data['gestation_time'] = raw_data['gestation_time'].fillna(mean_life)

raw_data['total_sleep'] = raw_data['total_sleep'] \
    .replace('?', np.nan) \
    .pipe(pd.to_numeric, errors='coerce')
mean_life = raw_data['total_sleep'].mean()
raw_data['total_sleep'] = raw_data['total_sleep'].fillna(mean_life)

numerical_features = [i for i in raw_data.columns if i not in [PK, target]and raw_data[i].dtype != "object"]
classical_features = [i for i in raw_data.columns if i not in [PK, target]and raw_data[i].dtype == "object"]

In [7]:
raw_data.head(20)

Unnamed: 0,body_weight,brain_weight,max_life_span,gestation_time,predation_index,sleep_exposure_index,danger_index,total_sleep,id
0,6654.0,5712.0,38.6,645.0,3,5,3,3.3,0
1,1.0,6.6,4.5,42.0,3,1,3,8.3,1
2,3.385,44.5,14.0,60.0,1,1,1,12.5,2
3,0.92,5.7,19.877586,25.0,5,2,3,16.5,3
4,2547.0,4603.0,69.0,624.0,3,5,4,3.9,4
5,10.55,179.5,27.0,180.0,4,4,4,9.8,5
6,0.023,0.3,19.0,35.0,1,1,1,19.7,6
7,160.0,169.0,30.4,392.0,4,5,4,6.2,7
8,3.3,25.6,28.0,63.0,1,2,1,14.5,8
9,52.16,440.0,50.0,230.0,1,1,1,9.7,9


In [8]:
question_cols = [
    col for col in raw_data.columns
    if raw_data[col].astype(str).eq('?').any()
]

print("含有 '?' 的欄位：", question_cols)

含有 '?' 的欄位： []


In [9]:
# 切割成訓練、驗證與測試資料
xtrain, xtest, ytrain, ytest = train_test_split(raw_data[numerical_features+classical_features], raw_data[target].astype("float"), test_size = 0.2)

In [10]:
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

(49, 7) (13, 7) (49,) (13,)


In [11]:
def generate_one_encoding_features(one_train_series,one_test_series):
    
    onehotencoder=OneHotEncoder(handle_unknown="ignore") #如果測試集中出現「訓練集中沒有出現過」的類別，encoder 會自動把該類別的 one-hot 向量整列都設為 0，而不會拋錯。

    onehotencoder=OneHotEncoder.fit(one_train_series.values.reshape((-1,1)))

    one_train_series=pd.DataFrame(onehotencoder.transform(one_train_series.values.reshape((-1,1))).toarry(),columns=onehotencoder.categories_[0].tolist())
    one_test_series=pd.DataFrame(onehotencoder.transform(one_test_series.values.reshape((-1,1))).toarray(),columns=onehotencoder.categories_[0].tolist())

    return one_test_series,one_train_series

In [12]:
# 把類別資料轉成 One-Hot Encoding(雖沒有還是保留參考)
OneHotEncoding_data = [generate_one_encoding_features(one_train_Series = xtrain[one_column], one_test_Series = xtest[one_column]) for one_column in classical_features]

# 建立 One-Hot Encoding 後的訓練資料
preprocessed_xtrain = pd.concat([xtrain.reset_index(drop = True)] + [
    data[0] for data in OneHotEncoding_data
], axis = 1).drop(columns = classical_features)

# 建立 One-Hot Encoding 後的測試資料
preprocessed_xtest = pd.concat([xtest.reset_index(drop = True)]+[
    data[1] for data in OneHotEncoding_data
], axis = 1).drop(columns = classical_features)

In [13]:
print(preprocessed_xtrain.shape, preprocessed_xtest.shape)

(49, 7) (13, 7)


# Exhaustive Feature Selection
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [14]:
import sys

In [15]:
print("Executable:", sys.executable)
print("Version   :", sys.version)

Executable: C:\Users\Orianna\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe
Version   : 3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]


In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [17]:
### 建立模型物件 ###
iris=load_iris()
X=iris.data #樣本特徵 特徵矩陣
y=iris.target #樣本標籤 目標向量
knn=KNeighborsClassifier(n_neighbors=3)  
                                             

efs1=EFS(knn,
         min_features=1,
         max_features=4,
         scoring='accuracy',#以分類準確度作為選擇標準
         print_progress=True, #顯示每次子集評估的進度。  
         cv=5 ) #使用 5-fold 交叉驗證來評估每一個特徵子集的平均準確度

 # 要填入訓練資料與目標

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

Features: 15/15

Best accuracy score: 0.97
Best subset (indices): (0, 2, 3)
Best subset (corresponding names): ('0', '2', '3')


In [None]:
y, ytrain.values   #樣本對應的三個品種編號 0,1,2以及 y_train.values 是把「目標欄」拿出來，只保留裡面的數字，用來餵模型訓練用的！

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 array([13.        , 11.        ,  9.1       , 14.4       ,  5.4       ,
        15.8       ,  3.1       , 13.8       ,  8.4       ,  3.9       ,
         8.4       ,  2.6       , 18.1       , 14.5       , 12.5       ,
        13.2       , 19.4       , 12.5       ,  6.6       ,  6.2       ,
        10.53275862, 13.7       , 13.2       ,  9.8       , 10.6       ,
        10.3       ,  8.3       , 12.        , 19.7       , 10.7       ,
         3.9       , 17.4       ,  9.8       , 10.3   

In [None]:
from xgboost import XGBRegressor #適合迴歸問題（Regression）

In [20]:

model = XGBRegressor()

# 建立特徵挑選物件
      #是 XGBoost 提供的「梯度提升樹」分類器，帶有許多預設參數（learning_rate、n_estimators、max_depth…），可以用來進行多分類或二分類任務。
efs = EFS(model,
                 min_features = 1,
                 max_features = 3,
                 scoring = "r2",
                 print_progress = True,
                 cv = 5)



# 開始執行特徵挑選
efs.fit(preprocessed_xtrain, ytrain)   


#改用 XGBRegressor：它是 XGBoost 的回歸版本，能直接用來擬合連續目標值，不會去猜「類別」，而是最小化平方誤差等回歸損失函數
#選擇 r2：能直觀地看到模型「解釋了多少變異」，且越接近 1.0 越好，適合用來做特徵子集的比較。

Features: 63/63

In [21]:
# 輸出每一輪特徵挑選狀況
efs.subsets_ #會枚舉所有在 min_features～max_features 範圍內的特徵組合。

{0: {'feature_idx': (0,),
  'cv_scores': array([-0.65316509, -0.00359644, -1.51967345, -0.33894979, -0.79948979]),
  'avg_score': -0.662974910153551,
  'feature_names': ('body_weight',)},
 1: {'feature_idx': (1,),
  'cv_scores': array([-1.40646913, -0.54194186, -1.20822908, -0.41459568, -0.3646982 ]),
  'avg_score': -0.787186790142539,
  'feature_names': ('brain_weight',)},
 2: {'feature_idx': (2,),
  'cv_scores': array([-0.67990402, -0.39693707, -2.46774003, -0.30947402,  0.10364684]),
  'avg_score': -0.7500816613816121,
  'feature_names': ('max_life_span',)},
 3: {'feature_idx': (3,),
  'cv_scores': array([ 0.11731771,  0.57922471, -1.61653475, -0.44958089, -0.18323799]),
  'avg_score': -0.3105622417497131,
  'feature_names': ('gestation_time',)},
 4: {'feature_idx': (4,),
  'cv_scores': array([-0.05923341,  0.15621072, -0.07929644,  0.27350364, -0.13299866]),
  'avg_score': 0.031637169354719344,
  'feature_names': ('predation_index',)},
 5: {'feature_idx': (5,),
  'cv_scores': array

# Sequential Forward Selection

程式碼參考連結：http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#overview    
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [22]:
# from sklearn.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector

In [23]:
### 建立模型物件 ###
model = XGBRegressor()

# 建立特徵挑選物件
sfs = SequentialFeatureSelector(model, 
                 k_features = 4,
                 forward = True,
                 floating = False,
                 cv = 0)
# 開始執行特徵挑選
sfs.fit(preprocessed_xtrain, ytrain)

In [24]:
# 輸出特徵挑選過程
sfs.subsets_

{1: {'feature_idx': (1,),
  'cv_scores': array([0.99076227]),
  'avg_score': 0.9907622749197864,
  'feature_names': ('brain_weight',)},
 2: {'feature_idx': (1, 3),
  'cv_scores': array([0.99999988]),
  'avg_score': 0.9999998807341235,
  'feature_names': ('brain_weight', 'gestation_time')},
 3: {'feature_idx': (1, 2, 3),
  'cv_scores': array([0.99999995]),
  'avg_score': 0.9999999489615223,
  'feature_names': ('brain_weight', 'max_life_span', 'gestation_time')},
 4: {'feature_idx': (1, 2, 3, 6),
  'cv_scores': array([0.99999995]),
  'avg_score': 0.999999952185329,
  'feature_names': ('brain_weight',
   'max_life_span',
   'gestation_time',
   'danger_index')}}

In [25]:
# 輸出被選入的特徵
sfs.k_feature_names_

('brain_weight', 'max_life_span', 'gestation_time', 'danger_index')

# Sequential Backward Selection

In [26]:
# from sklearn.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector

In [27]:
model = XGBRegressor()

# 建立特徵挑選物件
sbs = SequentialFeatureSelector(model,
                 k_features = 5,
                 forward = False,
                 floating = False,
                 cv = 0)

# 開始執行特徵挑選
sbs.fit(preprocessed_xtrain,ytrain ) # 要填入訓練資料與目標

In [28]:
# 輸出特徵挑選過程
sbs.subsets_

{7: {'feature_idx': (0, 1, 2, 3, 4, 5, 6),
  'cv_scores': array([0.99999993]),
  'avg_score': 0.9999999330965873,
  'feature_names': ('body_weight',
   'brain_weight',
   'max_life_span',
   'gestation_time',
   'predation_index',
   'sleep_exposure_index',
   'danger_index')},
 6: {'feature_idx': (1, 2, 3, 4, 5, 6),
  'cv_scores': array([0.99999996]),
  'avg_score': 0.9999999631929926,
  'feature_names': ('brain_weight',
   'max_life_span',
   'gestation_time',
   'predation_index',
   'sleep_exposure_index',
   'danger_index')},
 5: {'feature_idx': (1, 2, 3, 5, 6),
  'cv_scores': array([0.99999996]),
  'avg_score': 0.9999999578772397,
  'feature_names': ('brain_weight',
   'max_life_span',
   'gestation_time',
   'sleep_exposure_index',
   'danger_index')}}

In [29]:
# 輸出被選入的特徵
sbs.feature_names

['body_weight',
 'brain_weight',
 'max_life_span',
 'gestation_time',
 'predation_index',
 'sleep_exposure_index',
 'danger_index']

# Sequential Floating Forward Selection

In [30]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [31]:
model = XGBRegressor()


# 建立特徵挑選物件
sffs = SequentialFeatureSelector(estimator = model,
                  k_features = 5,
                  scoring = "neg_mean_squared_error",
                  cv = 0,
                  floating = True,
                  forward = True)

# 開始執行特徵挑選
sffs.fit(preprocessed_xtrain, ytrain)

In [32]:
# 輸出特徵挑選過程
sffs.subsets_

{1: {'feature_idx': (1,),
  'cv_scores': array([-0.18645501]),
  'avg_score': -0.1864550132682593,
  'feature_names': ('brain_weight',)},
 2: {'feature_idx': (1, 3),
  'cv_scores': array([-2.4072724e-06]),
  'avg_score': -2.4072723958469132e-06,
  'feature_names': ('brain_weight', 'gestation_time')},
 3: {'feature_idx': (1, 2, 3),
  'cv_scores': array([-1.03016489e-06]),
  'avg_score': -1.0301648886319404e-06,
  'feature_names': ('brain_weight', 'max_life_span', 'gestation_time')},
 4: {'feature_idx': (1, 2, 3, 6),
  'cv_scores': array([-9.65095306e-07]),
  'avg_score': -9.650953059105089e-07,
  'feature_names': ('brain_weight',
   'max_life_span',
   'gestation_time',
   'danger_index')},
 5: {'feature_idx': (1, 2, 3, 5, 6),
  'cv_scores': array([-8.50209306e-07]),
  'avg_score': -8.502093064890428e-07,
  'feature_names': ('brain_weight',
   'max_life_span',
   'gestation_time',
   'sleep_exposure_index',
   'danger_index')}}

In [33]:
# 輸出被選入的特徵
sffs.k_feature_names_

('brain_weight',
 'max_life_span',
 'gestation_time',
 'sleep_exposure_index',
 'danger_index')

# Sequential Floating Backward Selection

In [34]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [35]:
model = XGBRegressor()

# 建立特徵挑選物件
sfbs = SequentialFeatureSelector(estimator = model,
                 k_features = 5,
                 floating = True,
                 cv = 0,
                 forward = False)

# 開始執行特徵挑選
sfbs.fit(preprocessed_xtrain, ytrain)

In [36]:
# 輸出被挑選到的特徵
sfbs.k_feature_names_

('brain_weight',
 'max_life_span',
 'gestation_time',
 'sleep_exposure_index',
 'danger_index')

# Recursive Feature Elimination

In [37]:
from sklearn.feature_selection import RFE

In [38]:
model = XGBRegressor()

# 建立 RFE 物件
rfe = RFE(estimator = model, 
      n_features_to_select = 200,
      step = 5)

# 執行 RFE
rfe.fit(preprocessed_xtrain, ytrain)# 要填入訓練資料與目標

In [None]:
# 輸出被選入的特徵
mask = rfe.get_support()  # array([False, True, False, True, ...])

# 直接用這個遮罩去擷取欄位名稱
selected_columns = preprocessed_xtrain.columns[mask]
print("Selected features:", selected_columns.tolist())



Selected features: ['body_weight', 'brain_weight', 'max_life_span', 'gestation_time', 'predation_index', 'sleep_exposure_index', 'danger_index']


# Recursive Feature Elimination with Cross-Validation

In [40]:
from sklearn.feature_selection import RFECV

In [41]:
model = XGBRegressor()

# 建立 RFECV 物件
rfecv = RFECV(estimator = model,
        min_features_to_select = 200,
        step = 5,
        cv = 5,
        scoring = "neg_mean_squared_error",
        verbose = 1)

# 執行 RFECV
rfecv.fit(preprocessed_xtrain, ytrain)# 要填入訓練資料與目標

In [42]:
rfecv.get_feature_names_out()

array(['body_weight', 'brain_weight', 'max_life_span', 'gestation_time',
       'predation_index', 'sleep_exposure_index', 'danger_index'],
      dtype=object)

In [None]:
rfecv.cv_results_  #每次 cross-validation 的分數評估結果。

{'mean_test_score': array([-14.42398488]),
 'std_test_score': array([4.19591926]),
 'split0_test_score': array([-8.28022782]),
 'split1_test_score': array([-11.71607035]),
 'split2_test_score': array([-20.1472914]),
 'split3_test_score': array([-17.61613685]),
 'split4_test_score': array([-14.36019797]),
 'n_features': array([7])}