In [16]:
from datasets import dutch_data
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.offline import init_notebook_mode
# Set up Plotly's offline notebook mode for the figures rendered below;
# connected=True loads plotly.js from the CDN instead of embedding it in the notebook.
init_notebook_mode(connected=True)
# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [11]:
# Load the dataset. Later cells rely on a 'target' column being present and
# use the frame's index as the per-row name shown on hover.
df = dutch_data()

In [12]:
# Separate the label column from the feature matrix.
label_column = 'target'
train_y = df[label_column]
train_X = df.drop(columns=label_column)

## Choosing number of components

In [13]:
# Fit a full-rank PCA to inspect how variance is spread over all components.
# The component cap is derived from train_X (the matrix actually being fitted):
# df still carries the 'target' column, so min(df.shape) can exceed the number
# of features in train_X and make PCA reject n_components.
pca = PCA(n_components=min(train_X.shape))
pca = pca.fit(train_X)

In [14]:
# Show the share of total variance explained by each fitted component.
pca.explained_variance_ratio_

array([0.24617676, 0.18437186, 0.13130546, 0.07771117, 0.0516459 ,
       0.02767381, 0.02412882, 0.01766811, 0.01311262, 0.01222623,
       0.01139273, 0.00875081, 0.00848846, 0.00766746, 0.00659192,
       0.00612297, 0.00572201, 0.00551054, 0.00529668, 0.00482377,
       0.00449477, 0.0042377 , 0.00415656, 0.00400076, 0.00373338,
       0.00368645, 0.00354271, 0.0033852 , 0.0032196 , 0.00309181,
       0.00301281, 0.00289096, 0.00278474, 0.00272756, 0.00267131,
       0.00262174, 0.00255365, 0.00243842, 0.00237685, 0.00230784,
       0.00223386, 0.00216498, 0.00212939, 0.0020366 , 0.00197666,
       0.00196883, 0.00182631, 0.00175908, 0.0017369 , 0.00170797,
       0.00164846, 0.00158426, 0.00157012, 0.0015564 , 0.00150123,
       0.00145   , 0.0014241 , 0.00138364, 0.00133758, 0.00129481,
       0.00124423, 0.00123528, 0.00122974, 0.00120365, 0.0011629 ,
       0.00114102, 0.00111959, 0.00104521, 0.00101524, 0.00100097,
       0.00098696, 0.0009523 , 0.00093202, 0.00089303, 0.00088

In [17]:
# Scree plot: explained-variance ratio against the zero-based component index.
variance_ratios = pca.explained_variance_ratio_
result = pd.DataFrame({
    'component': range(variance_ratios.shape[0]),
    'explained variance ratio': variance_ratios,
})
fig = px.line(result, x='component', y='explained variance ratio')
fig.show()

## PCA with 3 components

In [18]:
# Refit keeping only the first three principal components (for the 3-D view).
pca = PCA(n_components=3).fit(train_X)

In [19]:
# Share of total variance explained by each of the three retained components.
pca.explained_variance_ratio_

array([0.24617676, 0.18437186, 0.13130546])

In [20]:
# Project the training features onto the fitted principal components.
transformed_train_X = pca.transform(train_X)

In [21]:
# Assemble the 3-D projection together with the label, the row name (for hover)
# and a constant marker size, then draw the scatter plot.
transformed_df = pd.DataFrame(
    transformed_train_X, columns=['x', 'y', 'z']
).assign(
    target=train_y.values,
    name=df.index,
    size=[1] * len(transformed_train_X),
)
fig = px.scatter_3d(
    transformed_df,
    x='x',
    y='y',
    z='z',
    color='target',
    hover_data=['name'],
    size='size',
    size_max=15,
)
fig.show()

## PCA with 2 components

In [22]:
# Refit keeping only the first two principal components (for the 2-D view).
pca = PCA(n_components=2).fit(train_X)

In [23]:
# Share of total variance explained by each of the two retained components.
pca.explained_variance_ratio_

array([0.24617676, 0.18437186])

In [24]:
# Project the training features onto the fitted principal components.
transformed_train_X = pca.transform(train_X)

In [25]:
# Assemble the 2-D projection together with the label, the row name (for hover)
# and a constant marker size, then draw the scatter plot.
transformed_df = pd.DataFrame(
    transformed_train_X, columns=['x', 'y']
).assign(
    target=train_y.values,
    name=df.index,
    size=[1] * len(transformed_train_X),
)
fig = px.scatter(
    transformed_df,
    x='x',
    y='y',
    color='target',
    hover_data=['name'],
    size='size',
    size_max=5,
)
fig.show()