In [1]:
import numpy as np
import pandas as pd
import pickle

## 2. Data preparation

In [2]:
from sklearn.datasets import load_breast_cancer, load_iris

# Load both toy datasets and split each one into a feature matrix (X_*)
# and a target vector (y_*).
data_breast_cancer = load_breast_cancer()
data_iris = load_iris()

X_cancer, y_cancer = data_breast_cancer.data, data_breast_cancer.target
X_iris, y_iris = data_iris.data, data_iris.target

## 3.1-3 Principal Component Analysis — PCA

In [3]:
from sklearn.preprocessing import StandardScaler

# Standardise the features of both datasets before running PCA on them.
# NOTE: one scaler object is re-fitted per dataset (cancer first, iris
# second), so after this cell `scaler` holds the iris statistics only.
scaler = StandardScaler()
X_cancer_scaled, X_iris_scaled = (
    scaler.fit_transform(raw_features) for raw_features in (X_cancer, X_iris)
)

In [4]:
from sklearn.decomposition import PCA

# Baseline: PCA on the raw (unscaled) features. A float n_components asks
# PCA to keep the fewest components whose cumulative explained variance
# exceeds that fraction (here 90%).
pca_cancer = PCA(n_components=0.9)
pca_iris = PCA(n_components=0.9)

X_cancer_reduced = pca_cancer.fit_transform(X_cancer)
X_iris_reduced = pca_iris.fit_transform(X_iris)

# Compare dimensionality before and after the reduction.
print("Breast cancer shape:", X_cancer.shape)
print("Reduced breast cancer shape:", X_cancer_reduced.shape)
print("Iris shape:", X_iris.shape)
print("Reduced iris shape:", X_iris_reduced.shape)

Breast cancer shape: (569, 30)
Reduced breast cancer shape: (569, 1)
Iris shape: (150, 4)
Reduced iris shape: (150, 1)


In [5]:
# PCA on the standardised features, again keeping the fewest components
# whose cumulative explained variance exceeds 90%.
pca_cancer_scaled = PCA(n_components=0.9)
X_red_cancer_scaled = pca_cancer_scaled.fit_transform(X_cancer_scaled)
print("Scaled breast cancer shape:", X_cancer_scaled.shape)
print("Reduced breast cancer shape after scaling:", X_red_cancer_scaled.shape)

# Only the scaled iris data is fitted here; the unscaled `pca_iris` /
# `X_iris_reduced` pair belongs to the baseline cell and is left untouched.
pca_iris_scaled = PCA(n_components=0.9)
X_red_iris_scaled = pca_iris_scaled.fit_transform(X_iris_scaled)
print("Scaled iris shape:", X_iris_scaled.shape)
print("Reduced iris shape after scaling:", X_red_iris_scaled.shape)

Scaled breast cancer shape: (569, 30)
Reduced breast cancer shape after scaling: (569, 7)
Scaled iris shape: (150, 4)
Reduced iris shape after scaling: (150, 2)


In [6]:
# Persist the explained-variance ratios of the PCA models fitted on the
# scaled data, one pickle file per dataset.
for pkl_path, fitted_pca in (
    ("pca_bc.pkl", pca_cancer_scaled),
    ("pca_ir.pkl", pca_iris_scaled),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_pca.explained_variance_ratio_, f)

## 3.4 Most important features

In [7]:
# For every principal component, pick the index of the original feature with
# the largest absolute loading. `list(...)` keeps the elements as NumPy
# integer scalars, one entry per component.
most_important_features_cancer = list(
    np.abs(pca_cancer_scaled.components_).argmax(axis=1)
)
most_important_features_iris = list(
    np.abs(pca_iris_scaled.components_).argmax(axis=1)
)

# Store the index lists, then show them for a quick visual check.
pd.to_pickle(most_important_features_cancer, "idx_bc.pkl")
pd.to_pickle(most_important_features_iris, "idx_ir.pkl")

print(most_important_features_cancer)
print(most_important_features_iris)

[7, 9, 11, 21, 4, 28, 29]
[2, 1]
