# Biol 359  |  Principal Component Analysis
### Spring 2021, Week 5

<hr style="border:2px solid gray">

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Import breast cancer data 

In [None]:
from sklearn.datasets import load_breast_cancer
# NOTE: `load_breast_cancer()` returns a container whose fields used below are:
# `breast_raw.data`: Stores the raw data (breast feature data)
# `breast_raw.feature_names`: Stores the raw data feature labels
# `breast_raw.target`: Stores the tumor type as an integer code (0 = 'malignant', 1 = 'benign')
# `breast_raw.target_names`: Stores the tumor type labels, indexed by that code (['malignant', 'benign'])
# `breast_raw.DESCR`: Description of the data
breast_raw = load_breast_cancer()

# Print a description of the data (comment out the following line to hide it)
print(breast_raw.DESCR)

#### Convert data into Pandas data frame. Review the features and sample labels.
Optional reference: https://pandas.pydata.org/docs/index.html

In [None]:
# Feature table: one row per sample, one named column per measurement
features = pd.DataFrame(data=breast_raw.data, columns=breast_raw.feature_names)
# Preview the first five rows (the notebook displays the cell's last expression)
features.head(n=5)

In [None]:
# Tumor label table: a single 'tumor' column holding each sample's integer class code.
# The codes index into `breast_raw.target_names` (0 = 'malignant', 1 = 'benign').
# To show names instead of codes you could run the line below, but later cells
# compare against the integer codes, so it is left commented out:
#   tumor.replace({'tumor': {0: 'malignant', 1: 'benign'}}, inplace=True)
tumor = pd.DataFrame({'tumor': breast_raw.target})
tumor.head(n=5)

In [None]:
# Concatenate features and labels side by side into one data frame
# (rows are aligned on the shared default integer index)
breast = pd.concat([features, tumor], axis="columns")
# To preview only the feature columns or only the label column:
#   breast.loc[:, breast.columns != 'tumor'].head()
#   breast.loc[:, breast.columns == 'tumor'].head()

#### Assess feature data statistics

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the raw features;
# note how widely the scales differ between columns, which motivates standardizing before PCA
features.describe()

#### Plot the data

In [None]:
# Plot each column against the row index, one subplot per column -- maybe not the most useful.
# The trailing semicolon keeps the notebook from echoing the returned axes objects.
breast.plot(figsize=(20, 20), subplots=True);

In [None]:
# Scatter plots of selected feature pairs, one pair per panel of a 2x2 grid.
# Each entry: (grid position, x feature, y feature, marker color).
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
scatter_panels = [
    ((0, 0), "mean radius", "mean area", 'red'),
    ((0, 1), "mean radius", "mean texture", 'green'),
    ((1, 0), "mean concave points", "mean concavity", 'blue'),
    ((1, 1), "mean concave points", "mean fractal dimension", 'orange'),
]
for panel_pos, x_name, y_name, point_color in scatter_panels:
    features.plot.scatter(ax=axs[panel_pos], x=x_name, y=y_name, alpha=0.5, color=point_color)

#### Standardize feature data (mean centered, unit standard deviation)

In [None]:
# Standardize every feature column: subtract its mean, then divide by its
# sample standard deviation, so each column ends up with mean 0 and std 1.
# (Mean-centering alone would be: features_centered = features.subtract(features.mean()))
column_means = features.mean()
column_stds = features.std()
features_standarized = features.sub(column_means).div(column_stds)

In [None]:
# Re-plot the same feature pairs as the raw-data scatter grid, now using the
# standardized data. Every standardized column has mean 0, so the mean of each
# panel sits at the origin and is marked with a black "x".
# Each entry: (grid position, x feature, y feature, marker color).
fig, axs = plt.subplots(2, 2, figsize=(15, 15))
standardized_panels = [
    # "mean area" (not "mean perimeter") so this panel matches the raw-data plot
    ((0, 0), "mean radius", "mean area", 'red'),
    ((0, 1), "mean radius", "mean texture", 'green'),
    ((1, 0), "mean concave points", "mean concavity", 'blue'),
    ((1, 1), "mean concave points", "mean fractal dimension", 'orange'),
]
for panel_pos, x_name, y_name, point_color in standardized_panels:
    features_standarized.plot.scatter(ax=axs[panel_pos], x=x_name, y=y_name, alpha=0.5, color=point_color)
    axs[panel_pos].plot(0, 0, "x", color="black")

#### Solve for the covariance matrix of the standardized feature data

In [None]:
# Pairwise covariance between all standardized feature columns (a square, symmetric table).
# Because each column was scaled to unit standard deviation, the diagonal entries are 1.
features_standarized.cov()

#### Solve for the eigen decomposition of the covariance matrix

In [None]:
# The covariance matrix is symmetric, so use `eigh`: it guarantees real
# eigenvalues/eigenvectors, whereas the general-purpose `eig` returns them in
# no particular order and may yield complex values.
cov_matrix = features_standarized.cov().values
eig_vals, eig_vecs = np.linalg.eigh(cov_matrix)

# `eigh` returns eigenvalues in ascending order; reverse both arrays so that
# index 0 is the direction of largest variance (PC1), index 1 is PC2, and so on.
# Eigenvectors are stored as columns, so the columns are what gets reordered.
eig_vals = eig_vals[::-1]
eig_vecs = eig_vecs[:, ::-1]

inds = range(0, len(eig_vals))
print(eig_vals)

In [None]:
# Shape of the eigenvector matrix; numpy stores each eigenvector as a column,
# so column i pairs with eig_vals[i]
eig_vecs.shape

In [None]:
# Orthogonality check: eigenvectors of a symmetric matrix that belong to
# different eigenvalues are perpendicular, so this should print a value that
# is zero up to floating-point error.
fourth_vec = eig_vecs[:, 3]
third_vec = eig_vecs[:, 2]
dot_product = np.inner(fourth_vec, third_vec)
print(dot_product)

#### Create the Scree Plot

In [None]:
# Scree plot: each eigenvalue expressed as a percentage of the total variance
# (the sum of all eigenvalues), drawn as one bar per eigenvalue index.
total_variance = np.sum(eig_vals)
eig_vals_normalized = 100 * eig_vals / total_variance
plt.bar(inds, eig_vals_normalized)
plt.xlabel("Eigenvalue Indices")
plt.ylabel("Percent Variance Explained");

#### Calculate the information stored in the first two principal components of the data

In [None]:
# Share of the total variance captured by the first two entries of the
# normalized eigenvalue array (these are PC1 and PC2 when the eigenvalues are
# ordered from largest to smallest).
first_two_percentages = eig_vals_normalized[0:2]
pc_first_two_info = np.sum(first_two_percentages)
print(f"Percentage of variance explained: {pc_first_two_info:0.2f}%")

#### Identify the feature that contributes most to the first two principal components 

In [None]:
def pc_stem_plot(ax, pc, columns):
    """Stem-plot the entries of one principal component and flag the largest.

    Prints the name and index of the feature whose entry has the largest
    absolute value, draws every entry as a blue stem, then redraws that one
    entry in red on top.

    Args:
        ax: Axes-like object providing `stem`, `set_xlabel` and `set_ylabel`.
        pc: 1-D array of per-feature weights for a single principal component.
        columns: Feature names, indexable by integer position.
    """
    # Largest weight by magnitude: the sign only encodes direction.
    max_ind = np.argmax(np.abs(pc))
    print(f"Feature that contributes most to this principal component: {columns[max_ind]} (index {max_ind})")

    # All weights in blue first, then the dominant one over-plotted in red.
    feature_positions = range(len(pc))
    ax.stem(feature_positions, pc, 'b', markerfmt='bo')
    ax.stem([max_ind], [pc[max_ind]], 'r', markerfmt='ro')

    ax.set_xlabel("Feature index")
    ax.set_ylabel("Feature contribution to principal component")

# Side-by-side stem plots for the first two eigenvector columns, taken here as
# PC1 and PC2 (this assumes the eigenpairs are ordered by decreasing eigenvalue).
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
pc1, pc2 = eig_vecs[:, 0], eig_vecs[:, 1]
for panel, component in zip(axs, (pc1, pc2)):
    pc_stem_plot(panel, component, features.columns)

#### Transform data and reduce it to a 2D space

In [None]:
# Project every standardized sample onto pc1 and pc2: the dot product of a
# sample's row with a component vector is its coordinate along that component.
standardized_matrix = features_standarized.values
features_pc1 = standardized_matrix @ pc1
features_pc2 = standardized_matrix @ pc2
# Collect the two coordinates per sample into a 2-column table
features_transformed = pd.DataFrame({"pc1": features_pc1, "pc2": features_pc2})
features_transformed.head(n=5)

In [None]:
# Summary statistics of the two projected coordinates (pc1, pc2)
features_transformed.describe()

#### Visualize reduced data in 2D

In [None]:
# Unlabeled view of the samples in the 2-D (pc1, pc2) space;
# the trailing semicolon suppresses the notebook's echo of the returned axes
features_transformed.plot.scatter(x="pc1", y="pc2");

In [None]:
# The integer codes in `tumor` index into `breast_raw.target_names`, which for
# this dataset is ['malignant', 'benign'] (0 = malignant, 1 = benign). Look the
# codes up by name rather than hard-coding them so the masks cannot be swapped.
label_names = list(breast_raw.target_names)
benign_code = label_names.index("benign")
malignant_code = label_names.index("malignant")

# 1-D boolean masks, one entry per sample
select_benign = (tumor["tumor"] == benign_code).values
select_malignant = (tumor["tumor"] == malignant_code).values

# Split the projected samples by class and overlay them on one set of axes
benign = features_transformed.loc[select_benign, :]
malignant = features_transformed.loc[select_malignant, :]
ax = benign.plot.scatter(x="pc1", y="pc2", s=20, color="green", alpha=.4, label="benign")
malignant.plot.scatter(ax=ax, x="pc1", y="pc2", s=20, color="blue", alpha=.1, label="malignant")
plt.legend();

#### Revisit the feature scatter plots (now labeled)

In [None]:
# Same class-colored overlay as the PC plot, but on two of the original
# (standardized) features, to compare against the separation seen in PC space.
x_feature = "mean concave points"
y_feature = "mean texture"
benign = features_standarized.loc[select_benign, :]
malignant = features_standarized.loc[select_malignant, :]
ax = benign.plot.scatter(x=x_feature, y=y_feature, s=20, color="green", alpha=.4, label="benign")
malignant.plot.scatter(ax=ax, x=x_feature, y=y_feature, s=20, color="blue", alpha=.1, label="malignant")
plt.legend();