# Permutation Feature Importance(PFI)

In [1]:
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# 設定中文字型
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  
# 矯正負號
plt.rcParams['axes.unicode_minus'] = False

## 載入資料

In [2]:
X, y = load_wine(return_X_y=True, as_frame=True)

## 資料分割

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y, 
                                                    test_size=0.5, random_state=42)

## 特徵縮放

In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

## 選擇演算法

In [5]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

## 模型訓練

In [6]:
clf.fit(X_train_std, y_train)

## 模型評估

In [7]:
clf.score(X_test_std, y_test)

0.9887640449438202

## 測試選取3個特徵的所有組合

In [7]:
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance

clf = KNeighborsClassifier()
clf.fit(X_train_std, y_train)
model = permutation_importance(clf, X_test_std, y_test, n_repeats=10)
model.importances_mean, model.importances_std

(array([ 0.04719101,  0.02808989,  0.02247191,  0.00449438,  0.00561798,
        -0.01123596,  0.01235955, -0.00786517,  0.00561798,  0.04719101,
         0.01235955,  0.00786517,  0.05842697]),
 array([0.01997347, 0.01896848, 0.01329456, 0.01145847, 0.01443285,
        0.00870333, 0.01059998, 0.01334196, 0.01528255, 0.01310326,
        0.00605075, 0.01815224, 0.01573034]))

In [8]:
# 特徵選取名稱
column_list = np.array(X.columns.to_list())

column_selected = []
for i in model.importances_mean.argsort()[::-1]:#由大到小排列 
    if model.importances_mean[i] - 2 * model.importances_std[i] > 0:
        print(f"{column_list[i]:20s} "
              f"{model.importances_mean[i]:.3f}"
              f" +/- {model.importances_std[i]:.3f}")
        column_selected.append(column_list[i])

proline              0.058 +/- 0.016
color_intensity      0.047 +/- 0.013
alcohol              0.047 +/- 0.020
hue                  0.012 +/- 0.006


In [9]:
# 特徵選取後的 X
X[column_selected].shape

(178, 4)

In [10]:
X = X[column_selected]

## 資料分割

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y, 
                                                    test_size=0.5, random_state=42)

## 特徵縮放

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

## 選擇演算法

In [13]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

## 模型訓練

In [14]:
clf.fit(X_train_std, y_train)

## 模型評估

In [15]:
clf.score(X_test_std, y_test)

0.9438202247191011

## 模型簡化，準確率降低