In [188]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn import datasets
from sklearn.preprocessing import scale
from sklearn import decomposition

## Operazioni sul dataset

In [189]:
# Load the scikit-learn wine dataset
wine = datasets.load_wine()

In [None]:
# Display the loaded dataset object
wine

In [211]:
# Names of the dataset's input features
print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [212]:
# Names of the target classes
print(wine.target_names)

['class_0' 'class_1' 'class_2']


In [193]:
# Split the dataset into the feature matrix and the target vector
X, Y = wine.data, wine.target

# Report the dimensions of both arrays
print(f"X: {X.shape} | Y: {Y.shape}")

X: (178, 13) | Y: (178,)


## PCA

In [194]:
# Standardize the features with sklearn's `scale` before running the PCA.
# Note: this rebinds X, so the raw feature values are no longer reachable under this name.
X = scale(X)

In [195]:
# Set the number of principal components (PCs) to keep
pca = decomposition.PCA(n_components=10)

# Fit the PCA model on X to compute the principal components
pca.fit(X)

Calcoliamo gli score values

In [196]:
# Project the data from the original feature space onto the principal-component space found by the PCA
scores = pca.transform(X)

In [197]:
# Label one column per principal component (PC1, PC2, ...). The labels are
# derived from the width of `scores` so they cannot drift out of sync with
# the number of components the PCA was configured with.
pc_names = [f'PC{i}' for i in range(1, scores.shape[1] + 1)]
scores_df = pd.DataFrame(scores, columns=pc_names)
scores_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,3.316751,-1.443463,-0.165739,-0.215631,0.693043,-0.22388,0.596427,0.065139,0.641443,1.020956
1,2.209465,0.333393,-2.026457,-0.291358,-0.257655,-0.92712,0.053776,1.024416,-0.308847,0.159701
2,2.51674,-1.031151,0.982819,0.724902,-0.251033,0.549276,0.424205,-0.344216,-1.177834,0.113361
3,3.757066,-2.756372,-0.176192,0.567983,-0.311842,0.114431,-0.383337,0.643593,0.052544,0.239413
4,1.008908,-0.869831,2.026688,-0.409766,0.298458,-0.40652,0.444074,0.4167,0.326819,-0.078366


In [198]:
# Human-readable label for each target value; anything other than 0 or 1 maps to 'Brut'
class_names = {0: 'Extra Dry', 1: 'Dry'}
Y_label = [class_names.get(y, 'Brut') for y in Y]

# Single-column table holding the label of every sample
Species = pd.DataFrame({'Tipology': Y_label})

In [199]:
# Put the label column next to the score columns, side by side
df_scores = pd.concat(objs=[scores_df, Species], axis='columns')
df_scores.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,Tipology
0,3.316751,-1.443463,-0.165739,-0.215631,0.693043,-0.22388,0.596427,0.065139,0.641443,1.020956,Extra Dry
1,2.209465,0.333393,-2.026457,-0.291358,-0.257655,-0.92712,0.053776,1.024416,-0.308847,0.159701,Extra Dry
2,2.51674,-1.031151,0.982819,0.724902,-0.251033,0.549276,0.424205,-0.344216,-1.177834,0.113361,Extra Dry
3,3.757066,-2.756372,-0.176192,0.567983,-0.311842,0.114431,-0.383337,0.643593,0.052544,0.239413,Extra Dry
4,1.008908,-0.869831,2.026688,-0.409766,0.298458,-0.40652,0.444074,0.4167,0.326819,-0.078366,Extra Dry


In [200]:
# Loadings of the original variables on the principal components:
# transposed so that rows are features and columns are components.
loadings = pca.components_.T

# Derive the PC labels from the number of loading columns instead of
# hard-coding ten names, so the table keeps working if the component count changes.
loading_columns = [f'PC{i}' for i in range(1, loadings.shape[1] + 1)]
df_loadings = pd.DataFrame(loadings, columns=loading_columns, index=wine.feature_names)

# Show the loadings table
df_loadings

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
alcohol,0.144329,-0.483652,-0.207383,-0.017856,-0.265664,-0.213539,-0.056396,-0.396139,0.508619,0.211605
malic_acid,-0.245188,-0.224931,0.089013,0.53689,0.035214,-0.536814,0.420524,-0.065827,-0.075283,-0.30908
ash,-0.002051,-0.316069,0.626224,-0.214176,-0.143025,-0.154475,-0.149171,0.17026,-0.307694,-0.027125
alcalinity_of_ash,-0.23932,0.010591,0.61208,0.060859,0.066103,0.100825,-0.286969,-0.42797,0.200449,0.052799
magnesium,0.141992,-0.299634,0.130757,-0.351797,0.727049,-0.038144,0.322883,0.156361,0.271403,0.06787
total_phenols,0.394661,-0.06504,0.146179,0.198068,-0.149318,0.084122,-0.027925,0.405934,0.286035,-0.320131
flavanoids,0.422934,0.00336,0.150682,0.152295,-0.109026,0.01892,-0.060685,0.187245,0.049578,-0.163151
nonflavanoid_phenols,-0.298533,-0.028779,0.170368,-0.203301,-0.500703,0.258594,0.595447,0.233285,0.195501,0.215535
proanthocyanins,0.313429,-0.039302,0.149454,0.399057,0.13686,0.533795,0.372139,-0.368227,-0.209145,0.134184
color_intensity,-0.088617,-0.529996,-0.137306,0.065926,-0.076437,0.418644,-0.227712,0.033797,0.056218,-0.290775


Vediamo i valori di varianza spiegata e cumulativa

In [201]:
# Fraction of the dataset's variance contributed by each PC
explained_variance = pca.explained_variance_ratio_

# Show the per-component contribution
explained_variance

array([0.36198848, 0.1920749 , 0.11123631, 0.0706903 , 0.06563294,
       0.04935823, 0.04238679, 0.02680749, 0.02222153, 0.01930019])

In [202]:
# Prepend a 0 so the series start from the origin (no components -> no variance).
# The ratios are read straight from the fitted PCA rather than from
# `explained_variance` itself, so re-running this cell neither stacks up extra
# leading zeros nor depends on what the name is currently bound to.
explained_variance = np.insert(pca.explained_variance_ratio_, 0, 0)

# Accumulate first and round last: rounding every term before summing lets the
# individual rounding errors add up and shifts the later cumulative values.
cumulative_variance = np.round(np.cumsum(explained_variance), decimals=3)

In [203]:
# One single-column frame per series; the empty first label belongs to the leading zero entry
pc_labels = [''] + [f'PC{i}' for i in range(1, 11)]
pc_df = pd.DataFrame({'PC': pc_labels})
explained_variance_df = pd.DataFrame({'Explained Variance': explained_variance})
cumulative_variance_df = pd.DataFrame({'Cumulative Variance': cumulative_variance})

In [204]:
# Combine the PC labels and both variance series into one table.
# Note: this rebinds `explained_variance` from a NumPy array to a DataFrame.
explained_variance = pd.concat([pc_df, explained_variance_df, cumulative_variance_df], axis=1)
explained_variance

Unnamed: 0,PC,Explained Variance,Cumulative Variance
0,,0.0,0.0
1,PC1,0.361988,0.362
2,PC2,0.192075,0.554
3,PC3,0.111236,0.665
4,PC4,0.07069,0.736
5,PC5,0.065633,0.802
6,PC6,0.049358,0.851
7,PC7,0.042387,0.893
8,PC8,0.026807,0.92
9,PC9,0.022222,0.942


I due grafici rappresentano la varianza spiegata dalle componenti principali (PC) ottenute da un'analisi PCA (Principal Component Analysis). Il primo grafico mostra la varianza cumulativa spiegata dalle prime PC rispetto al numero totale di PC considerate, mentre il secondo grafico mostra la varianza spiegata da ciascuna PC individuale. Insieme, forniscono una panoramica della variazione nei dati e dell'efficacia della PCA nel catturarla attraverso le componenti principali.

In [205]:
# Two side-by-side panels: cumulative variance (left) and per-component variance (right).
# Subplot titles, axis labels and trace names are set so the figure can be read
# on its own and the legend does not fall back to "trace 0" / "trace 1".
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Cumulative explained variance', 'Explained variance per PC'),
)

fig.add_trace(
    go.Scatter(
        x=explained_variance['PC'],
        y=explained_variance['Cumulative Variance'],
        marker=dict(size=15, color="LightSeaGreen"),
        name='Cumulative Variance',
    ), row=1, col=1
)

fig.add_trace(
    go.Bar(
        x=explained_variance['PC'],
        y=explained_variance['Explained Variance'],
        marker=dict(color="RoyalBlue"),
        name='Explained Variance',
    ), row=1, col=2
)

# Axis labels: the x axis is shared in meaning, the y axes differ per panel
fig.update_xaxes(title_text='Principal component')
fig.update_yaxes(title_text='Cumulative variance ratio', row=1, col=1)
fig.update_yaxes(title_text='Variance ratio', row=1, col=2)

fig.show()


Il grafico 3D rappresenta la distribuzione dei dati nello spazio tridimensionale delle prime tre componenti principali (PC1, PC2 e PC3) ottenute da un'analisi PCA, con punti colorati in base alla variabile "Tipology", consentendo di visualizzare eventuali pattern o strutture nei dati rispetto a questa variabile.

In [206]:
# 3D scatter of the samples on the first three PCs, colored by the 'Tipology' column
fig = px.scatter_3d(df_scores, x='PC1', y='PC2', z='PC3', color='Tipology')

fig.show()

Questo grafico permette di comprendere come le variabili originali del dataset contribuiscano alle prime tre componenti principali ottenute dall'analisi PCA.

In [207]:
# Use the index of the loadings table (the feature names) as point labels
loadings_label = df_loadings.index

# 3D scatter of each original variable's loadings on the first three PCs
fig = px.scatter_3d(df_loadings, x='PC1', y='PC2', z='PC3', text = loadings_label)
fig.show()

In [208]:
# Keep the first 8 dimensions, judged to have an adequate cumulative variance
# (about 0.92 for PC8 in the cumulative-variance table)
reduced = scores[:, :8]
pd.DataFrame(reduced)

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.316751,-1.443463,-0.165739,-0.215631,0.693043,-0.223880,0.596427,0.065139
1,2.209465,0.333393,-2.026457,-0.291358,-0.257655,-0.927120,0.053776,1.024416
2,2.516740,-1.031151,0.982819,0.724902,-0.251033,0.549276,0.424205,-0.344216
3,3.757066,-2.756372,-0.176192,0.567983,-0.311842,0.114431,-0.383337,0.643593
4,1.008908,-0.869831,2.026688,-0.409766,0.298458,-0.406520,0.444074,0.416700
...,...,...,...,...,...,...,...,...
173,-3.370524,-2.216289,-0.342570,1.058527,-0.574164,-1.108788,0.958416,-0.146097
174,-2.601956,-1.757229,0.207581,0.349496,0.255063,-0.026465,0.146894,-0.552427
175,-2.677839,-2.760899,-0.940942,0.312035,1.271355,0.273068,0.679235,0.047024
176,-2.387017,-2.297347,-0.550696,-0.688285,0.813955,1.178783,0.633975,0.390829
