# Scikit-learn PCA 實作

## 載入相關套件

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. 載入資料

In [None]:
# Load the wine dataset and wrap its feature matrix in a DataFrame whose
# columns are named after the dataset's feature names.
ds = datasets.load_wine()
df = pd.DataFrame(ds.data, columns=ds.feature_names)
# Preview the first rows of the table.
df.head()

In [None]:
# Print a summary of the DataFrame's columns.
df.info()

## 2. 資料清理、資料探索與分析

In [None]:
# Print the description text shipped with the dataset.
print(ds.DESCR)

## 3. 資料分割

In [None]:
from sklearn.model_selection import train_test_split # 資料分割
from sklearn.metrics import accuracy_score # 計算準確率

# Features (2-D matrix) and labels (1-D vector).
X, y = df.values, ds.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

# Display the shapes of the four resulting arrays.
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## 4. 特徵縮放

將資料標準化:平移到座標中心點(平均數為 0),並縮放為單位標準差

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize: learn the scaling statistics from the training data only,
# then apply that same fitted scaler to both the training and the test data.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

## 特徵萃取(PCA)

In [None]:
from sklearn.decomposition import PCA
# Reduce dimensionality with PCA.
pca1 = PCA(n_components=2) # keep 2 principal components
X_train_pca = pca1.fit_transform(X_train_std) # fit on the training data and project it (original note: 13 dimensions down to 2)
X_test_pca = pca1.transform(X_test_std) # project the test data with the components fitted on the training data
# Display both projected shapes and each component's explained-variance ratio.
# Original note: the two ratios sum to about 0.57, i.e. the 2-D projection keeps
# roughly 57% of the variance (figure taken from the original comment, not re-verified).
X_train_pca.shape, X_test_pca.shape, pca1.explained_variance_ratio_

## 5. 選擇演算法

In [None]:
from sklearn.linear_model import LogisticRegression # 邏輯斯迴歸
# Create the classifier (logistic regression with its default settings).

clf = LogisticRegression()

## 6. 模型訓練

In [None]:
# Train the classifier on the PCA-projected training data.
clf.fit(X_train_pca, y_train)

## 7. 模型計分

In [None]:
# Predict on the PCA-projected test data and print the accuracy as a
# percentage with two decimals.
y_pred = clf.predict(X_test_pca)
print(f'{accuracy_score(y_test, y_pred)*100:.2f}%') 

## 繪製決策邊界(Decision regions)

In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, resolution=0.02):
    """Shade a classifier's decision regions and overlay the samples.

    Draws on the current pyplot figure and returns nothing.

    Parameters
    ----------
    X : 2-D array
        Sample matrix; only columns 0 and 1 are read and used as plot axes.
    y : 1-D array
        Class label per row of ``X`` (compared element-wise against each
        unique label to select rows).
    classifier : object
        Anything with a ``predict`` method that accepts an ``(n_points, 2)``
        array and returns one label per point.
    resolution : float
        Step between neighbouring grid points along both axes.

    Notes
    -----
    Only five markers and five colours are defined, and the marker is looked
    up by class index, so more than five distinct labels are not supported.
    """
    # One marker/colour per class, taken in order of the sorted unique labels.
    marker_cycle = ('s', 'x', 'o', '^', 'v')
    palette = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(palette[:len(classes)])

    # Grid covering the data range, padded by one unit on every side.
    first, second = X[:, 0], X[:, 1]
    x1_min, x1_max = first.min() - 1, first.max() + 1
    x2_min, x2_max = second.min() - 1, second.max() + 1
    xx1, xx2 = np.meshgrid(
        np.arange(x1_min, x1_max, resolution),
        np.arange(x2_min, x2_max, resolution),
    )

    # Classify every grid point, then fold the labels back into grid shape.
    grid_points = np.array([xx1.ravel(), xx2.ravel()]).T
    Z = classifier.predict(grid_points)
    Z = Z.reshape(xx1.shape)

    # Filled contours show the predicted class of each region.
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Scatter the samples class by class so each gets its own legend entry.
    for idx, cl in enumerate(classes):
        in_class = y == cl
        plt.scatter(
            x=X[in_class, 0],
            y=X[in_class, 1],
            alpha=0.6,
            color=cmap(idx),
            marker=marker_cycle[idx],
            label=cl,
        )

In [None]:
# Draw the classifier's decision regions in PCA space with the test samples on top.
plot_decision_regions(X_test_pca, y_test, classifier=clf)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# Uncomment to also write the figure to disk:
# plt.savefig('decision_regions.png', dpi=300)
plt.show()

## 使用全部特徵

In [None]:
# Baseline: repeat the workflow on all original features, without PCA.
# This cell rebinds X, y, the train/test arrays, scaler, clf and y_pred.
# The dataset could also be loaded through a DataFrame:
# ds = datasets.load_wine()
# df = pd.DataFrame(ds.data, columns=ds.feature_names)
# df.head()

# The line below is a shorter alternative to the commented-out loading code above.

X, y = datasets.load_wine(return_X_y=True) # get X and y directly

# Train/test split

from sklearn.model_selection import train_test_split # data splitting
from sklearn.metrics import accuracy_score # accuracy computation

# Assigning X and y when going through the DataFrame route instead:
# X = df.values # 2-D
# y = ds.target # 1-D

# 20% test data, 80% training data.
# NOTE(review): no random_state is passed here (the earlier split in this
# notebook uses random_state=100), so the split and the accuracy printed
# below can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Show the array shapes
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Feature scaling
from sklearn.preprocessing import StandardScaler # standardization
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Model training
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train_std, y_train)

# Model scoring: print the test accuracy as a percentage with two decimals.
y_pred = clf.predict(X_test_std)
print(f'{accuracy_score(y_test, y_pred)*100:.2f}%') 

(142, 13) (36, 13) (142,) (36,)
94.44%


## 測試 Scikit-learn PCA 類別的其他用法

In [None]:
# Fit PCA without specifying the number of components.
# This rebinds pca1 and X_train_pca, replacing the 2-component results above.
pca1 = PCA()
# Fit on the standardized training data, then display the per-component
# explained-variance ratios.
X_train_pca = pca1.fit_transform(X_train_std)
pca1.explained_variance_ratio_

In [None]:
# Sum the explained-variance ratios over all fitted components.
np.sum(pca1.explained_variance_ratio_)

In [None]:
# Pareto chart of the explained variance: bars show each component's ratio,
# the step line shows the cumulative ratio.
ratios = pca1.explained_variance_ratio_
# Derive the x positions from the number of fitted components instead of
# hard-coding 13, so the chart still works if the component count changes.
component_ids = range(1, len(ratios) + 1)
plt.bar(component_ids, ratios, alpha=0.5, align='center')
plt.step(component_ids, np.cumsum(ratios), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
# Reference line at 80% cumulative explained variance.
plt.axhline(0.8, color='r', linestyle='--')

In [None]:
# Pass a fraction as n_components to set a lower bound (0.8) on the explained
# variance instead of a fixed component count; then display the resulting shape.
pca2 = PCA(n_components=0.8)
X_train_pca = pca2.fit_transform(X_train_std)
X_train_pca.shape