Prepare Data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Load scikit-learn's bundled breast-cancer dataset; the returned object
# exposes .data, .target, .feature_names and .target_names.
breast = load_breast_cancer()

# Dimensions of the feature matrix.
np.shape(breast.data)

In [None]:
# Dimensions of the target vector (one class code per sample).
np.shape(breast.target)

In [None]:
# Turn the 1-D target vector into a single column so it can be glued onto
# the right-hand side of the feature matrix.
labels = breast.target.reshape(-1, 1)
data = np.hstack([breast.data, labels])

# Features plus one extra column for the class code.
np.shape(data)

In [None]:
# Feature names as shipped with the dataset (used later as column headers).
features = breast.feature_names
# Wrap the combined features+label array in a DataFrame (columns are still
# positional integers at this point).
breast_dataset = pd.DataFrame(data=data)

features

In [None]:
# Column headers: the dataset's feature names followed by the target column.
features_labels = np.append(features, 'label')
breast_dataset.columns = features_labels

# Decode the numeric class codes into readable names.
# scikit-learn orders target_names as ['malignant', 'benign'], i.e.
# 0 = malignant and 1 = benign, so the mapping is derived from the dataset
# itself rather than hard-coded (the previous hard-coded mapping was inverted).
label_names = {
    code: str(name).capitalize()
    for code, name in enumerate(breast.target_names)
}
# Assign the result back instead of chained `inplace=True` on a column
# selection, which is deprecated and does not update the frame under pandas
# Copy-on-Write. `replace` leaves already-decoded strings untouched, so the
# cell stays safe to re-run.
breast_dataset['label'] = breast_dataset['label'].replace(label_names)

breast_dataset.tail()

Data Visualization using PCA

In [None]:
# Pull out only the feature columns (no label) and standardize each one to
# zero mean / unit variance, which PCA needs so that large-scale features do
# not dominate the components.
x = StandardScaler().fit_transform(breast_dataset.loc[:, features].to_numpy())

np.shape(x)

In [None]:
# Sanity check: after standardization the overall mean should be ~0 and the
# overall standard deviation ~1.
x.mean(), x.std()

In [None]:
# Generic column names feature0, feature1, ... — one per standardized column.
feat_cols = [f'feature{i}' for i in range(x.shape[1])]
# Tabular view of the standardized features for inspection.
normalised_breast = pd.DataFrame(data=x, columns=feat_cols)

normalised_breast.tail()

In [None]:
# Project the standardized features onto their first two principal components.
pca_breast = PCA(n_components=2)
# Fit the PCA on x and return x expressed in the 2-component basis.
principalComponents_breast = pca_breast.fit_transform(x)

# One row per sample, two columns (one per retained component).
principalComponents_breast.shape

In [None]:
# Wrap the projected coordinates in a DataFrame with descriptive column names
# so they can be selected by name when plotting.
principal_breast_Df = pd.DataFrame(
    principalComponents_breast,
    columns=['principal component 1', 'principal component 2'],
)

principal_breast_Df.tail()

In [None]:
# Fraction of the total variance captured by each retained component.
print(f'Explained variation per principal component: {pca_breast.explained_variance_ratio_}')

In [None]:
# Scatter plot of the samples in the plane of the first two principal
# components, coloured by class label.
# A single figure is created here; the previous extra bare `plt.figure()` call
# left a stray empty figure in the output.
fig, ax = plt.subplots(figsize=(10, 10))
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=14)
ax.set_xlabel('Principal Component - 1', fontsize=20)
ax.set_ylabel('Principal Component - 2', fontsize=20)
ax.set_title('Principal Component Analysis of Breast Cancer Dataset', fontsize=20)

targets = ['Benign', 'Malignant']
colors = ['r', 'g']

for target, color in zip(targets, colors):
    # Boolean mask selecting the rows of this class; principal_breast_Df
    # shares breast_dataset's default integer index, so the mask aligns.
    indicesToKeep = breast_dataset['label'] == target
    ax.scatter(
        principal_breast_Df.loc[indicesToKeep, 'principal component 1'],
        principal_breast_Df.loc[indicesToKeep, 'principal component 2'],
        c=color,
        s=50,
        label=target,  # bind the legend entry to this series explicitly
    )

ax.legend(prop={'size': 15})
plt.show()