<a href="https://colab.research.google.com/github/paulinawdowiak1/ML-Bootcamp/blob/main/unsupervised/ML_Bootcamp_08_pca_wine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### <a name='0'></a> Import bibliotek

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

# Notebook-wide NumPy display settings for the array previews below.
np.set_printoptions(
    precision=4,    # digits shown after the decimal point
    suppress=True,  # fixed-point output instead of scientific notation
    edgeitems=5,    # items kept at each edge when an array is summarized
    linewidth=200,  # characters per line before wrapping
)

### <a name='1'></a> Załadowanie danych

In [2]:
# The file is read with header=None, so pandas assigns integer column labels.
WINE_DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'

df_raw = pd.read_csv(WINE_DATA_URL, header=None)
# Work on a copy so the raw download stays untouched.
df = df_raw.copy()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# The first column is used as the target; all remaining columns are features.
label_col, *feature_cols = df.columns
target = df[label_col]
data = df[feature_cols]
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
# Number of samples per class label.
class_counts = target.value_counts()
class_counts

2    71
1    59
3    48
Name: 0, dtype: int64

### <a name='2'></a> Podział na zbiór treningowy i testowy

In [9]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and every number derived from it further down,
# is reproducible under Restart & Run All. Stratifying on `target` keeps the
# class proportions the same in the training and the test subset.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    data, target, stratify=target, random_state=RANDOM_STATE
)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')

X_train shape: (133, 13)
X_test shape: (45, 13)


### <a name='3'></a> Standaryzacja

In [11]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only, then apply the same learned
# transformation to both the training and the test set.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
X_train_std[:5]

array([[ 0.5185,  1.2857, -0.8983, -0.1509, -0.6955,  0.2804,  0.694 , -0.684 , -0.1671, -0.3455, -0.1282,  0.5363,  0.8979],
       [-0.2498,  0.8789, -0.2014,  0.0578, -0.8329, -1.2667, -1.3848,  0.3408, -1.088 ,  2.327 , -0.9974, -1.1824, -0.2372],
       [ 1.0654, -0.6789,  1.0374,  0.207 ,  1.0213,  1.0778,  1.435 ,  0.3408,  0.2662,  0.6913,  0.8784, -0.0549,  1.206 ],
       [ 2.3676, -0.6183, -0.7047, -1.5823, -0.2148,  0.8386,  1.0027, -0.5263,  0.7357,  0.0657,  0.6496,  0.3301,  0.9304],
       [-0.6534, -0.9645, -0.395 , -0.5386, -1.0389, -0.4373, -1.4672,  1.9174, -0.5824,  0.1774, -0.8602, -1.5124, -0.3507]])

### <a name='4'></a> PCA

In [13]:
from sklearn.decomposition import PCA

# PCA is also fitted on the training set only; the fitted projection is then
# applied to both the training and the test set.
n_components = 3
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
X_train_pca.shape

(133, 3)

Wyjaśniona wariancja

In [14]:
# Per-component explained variance ratio, its running total, and a 1-based
# component number for plotting.
results = (
    pd.DataFrame({'explained_variance_ratio': pca.explained_variance_ratio_})
    .assign(
        cumulative=lambda frame: frame['explained_variance_ratio'].cumsum(),
        component=lambda frame: frame.index + 1,
    )
)
results

Unnamed: 0,explained_variance_ratio,cumulative,component
0,0.378508,0.378508,1
1,0.188772,0.567281,2
2,0.105235,0.672515,3


In [15]:
# Bars show the variance explained by each component; the line shows the
# cumulative total across components.
variance_bars = go.Bar(
    x=results['component'],
    y=results['explained_variance_ratio'],
    name='explained variance ratio',
)
cumulative_line = go.Scatter(
    x=results['component'],
    y=results['cumulative'],
    name='cumulative explained variance',
)
figure_layout = go.Layout(
    title=f'PCA - {pca.n_components_} components',
    width=950,
    template='plotly_dark',
)

fig = go.Figure(data=[variance_bars, cumulative_line], layout=figure_layout)
fig.show()

In [16]:
# Derive the component column names from the projected array's width, so this
# cell keeps working if the number of PCA components is changed.
pca_columns = [f'pca{i}' for i in range(1, X_train_pca.shape[1] + 1)]
X_train_pca_df = pd.DataFrame(data=X_train_pca, columns=pca_columns)

# Attach the labels positionally via a plain array: y_train still carries the
# shuffled index from the split, which must not be aligned against the fresh
# 0..n-1 index of this frame. Adding the column separately (instead of
# np.c_) also keeps the labels' original dtype rather than upcasting to float.
X_train_pca_df['target'] = np.asarray(y_train)
X_train_pca_df.head()

Unnamed: 0,pca1,pca2,pca3,target
0,0.705295,0.042775,0.461469,1.0
1,-2.854897,1.406282,1.167645,3.0
2,2.094273,1.571594,-0.556745,1.0
3,2.622832,0.644624,1.598936,1.0
4,-2.426884,-0.393334,1.059707,3.0


In [17]:
# Class labels are categorical. Plotly Express maps a numeric color column to
# a continuous color scale, so cast the labels to strings to get one discrete
# color (and legend entry) per class.
scatter_df = X_train_pca_df.assign(target=X_train_pca_df['target'].astype(int).astype(str))
px.scatter_3d(scatter_df, x='pca1', y='pca2', z='pca3', color='target', template='plotly_dark', width=950)

In [18]:
# First five training samples after projection onto the principal components.
X_train_pca[:5, :]

array([[ 0.7053,  0.0428,  0.4615],
       [-2.8549,  1.4063,  1.1676],
       [ 2.0943,  1.5716, -0.5567],
       [ 2.6228,  0.6446,  1.5989],
       [-2.4269, -0.3933,  1.0597]])

In [19]:
# First five test samples after projection with the PCA fitted on the training set.
X_test_pca[:5, :]

array([[-1.9871, -1.4744, -0.1932],
       [ 0.219 , -2.421 ,  2.0986],
       [ 3.1804,  0.518 ,  0.4699],
       [ 2.2224,  0.476 ,  0.7302],
       [ 3.7933,  2.2749,  0.7631]])