# Метод главных компонент (Principal Component Analysis, PCA)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fixed seed handed to train_test_split so the train/test partition is reproducible.
RANDOM_STATE = 42

Загружаем данные

In [None]:
# Load the dataset bundle; the bundle itself is kept because later cells
# read `diabetes.data` and `diabetes.target` directly.
diabetes = load_diabetes()

# Wrap the feature matrix in a DataFrame labelled with the dataset's feature names.
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# Preview the first rows.
df.head()

Применяем PCA

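PCA из scikit-learn сам центрирует данные перед разложением (вычитает среднее по каждому признаку), но не масштабирует их, поэтому отдельно центрировать данные не нужно.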

In [None]:
# Keep one component per input column so the full variance spectrum can be
# inspected in the following cells.
pca = PCA(n_components=df.shape[1])

# Fit the model and project the data. The scores are stored in a labelled
# DataFrame instead of being dumped as the raw cell output.
pca_scores = pca.fit_transform(df)
pca_columns = [f"PC{i}" for i in range(1, pca.n_components_ + 1)]
df_pca = pd.DataFrame(pca_scores, columns=pca_columns, index=df.index)

# Preview only the first rows of the projection.
df_pca.head()

Вычисляем долю объясненной дисперсии

In [None]:
# Variance captured by each component (the eigenvalues of the covariance matrix).
eigenvalues = pca.explained_variance_

# The same quantities expressed as a share of the total variance.
prop_var = pca.explained_variance_ratio_

In [None]:
# Display the per-component explained-variance ratios.
prop_var

In [None]:
# Number the components 1..k from `prop_var` itself rather than from `pca`:
# `pca` is rebound to a smaller model later in the notebook, and reading
# `pca.n_components_` here would then disagree with the length of `prop_var`.
PC_numbers = np.arange(len(prop_var)) + 1

# Scree plot: share of variance explained by each principal component.
fig, ax = plt.subplots()
ax.plot(PC_numbers,
        prop_var,
        'ro-')
ax.set_xticks(PC_numbers)  # one integer tick per component
ax.set_xlabel('Principal Component', fontsize=8)
ax.set_ylabel('Proportion of Variance', fontsize=8)
ax.set_title('Scree Plot', fontsize=10)
plt.show()

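PCA как шаг предобработки для дальнейших задач: обучаем PCA только на обучающей выборке и применяем то же преобразование к тестовой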

In [None]:
# Feature matrix and target taken straight from the dataset bundle.
X = diabetes.data
y = diabetes.target

# Hold out a quarter of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

# NOTE: this rebinds `pca`, which an earlier cell fitted on the full DataFrame;
# run the notebook top to bottom so earlier cells see the model they expect.
# The projection is learned from the training split only, so no information
# from the test split leaks into the transformation.
pca = PCA(n_components=5)
pca.fit(X_train)

# Apply the same learned projection to both splits.
X_train_reduced = pca.transform(X_train)
X_test_reduced = pca.transform(X_test)

In [None]:
# Compare the training matrix shape before and after the PCA projection.
X_train.shape, X_train_reduced.shape