### Sample code for Principal Component Analysis (PCA)  

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

#### Parameters  

In [None]:
# Path of the CSV file analysed by this notebook
csv_in = 'wine-modified.csv'

# Raise pandas' display limits so result tables are shown without truncation
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 999)

#### Read CSV file  

In [None]:
# Load the dataset; the first line of the file supplies the column names.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()
display(df.head())

#### Set data  

In [None]:
# Feature matrix: every column from 'Alcohol' through the last column
feature_columns = slice('Alcohol', None)
dfX = df.loc[:, feature_columns]
print(dfX.shape)
display(dfX.head())

#### Standardization  

In [None]:
# Centre each feature and scale it to unit variance (the scaler's defaults,
# spelled out here) so that no feature dominates the PCA through its units.
sc = StandardScaler(with_mean=True, with_std=True)
X_std = sc.fit_transform(dfX)

#### PCA  

In [None]:
# Number of principal components to keep
n_pca = 7

# Fit the PCA on the standardized features and project the samples onto
# the retained components in a single step.
pca = PCA(n_components=n_pca)
X_pca = pca.fit_transform(X_std)

#### PC coordinates  

In [None]:
# Shape of the score matrix, followed by the scores of the first five samples
print(X_pca.shape, X_pca[:5], sep='\n')

#### Check contribution ratio  

In [None]:
# Per-component explained-variance ratio, then its running total
evr = pca.explained_variance_ratio_
for values in (evr, np.cumsum(evr)):
    print(values)

#### Draw graph of contribution  

In [None]:
# Component numbers 1..n_pca on the x axis
xx = range(1, n_pca + 1)
var_ratio = pca.explained_variance_ratio_
cum_ratio = np.cumsum(var_ratio)

plt.bar(xx, var_ratio)                 # share explained by each component
plt.step(xx, cum_ratio, where='mid')   # cumulative share
plt.xlabel('Principal components')
plt.ylabel('Explained variance ratio')
plt.show()

#### 2D plot 

In [None]:
# Class label of every sample, and how many samples each class has
ser_class = df.loc[:, 'Class']
class_counts = ser_class.value_counts()
print(class_counts)

In [None]:
# Marker colors; the class scatter plots pick colors[i] for the i-th class
colors = ['blue', 'red', 'green']

# Distinct class labels found in the data
classes = ser_class.unique()
print(classes)

In [None]:
# Scores on PC_1 (x) and PC_2 (y), drawn as one colored series per class
pca_x = X_pca[:, 0]
pca_y = X_pca[:, 1]
for i, cls in enumerate(classes):
    # Boolean mask of the samples in this class, computed once and reused
    in_class = ser_class == cls
    plt.scatter(pca_x[in_class], pca_y[in_class],
                c=colors[i], label=cls)
plt.xlabel('PC_1')
plt.ylabel('PC_2')
plt.legend()
plt.show()

In [None]:
# Scores on PC_1 (x) and PC_3 (y), drawn as one colored series per class
pca_x = X_pca[:, 0]
pca_y = X_pca[:, 2]
for i, cls in enumerate(classes):
    # Boolean mask of the samples in this class, computed once and reused
    in_class = ser_class == cls
    plt.scatter(pca_x[in_class], pca_y[in_class],
                c=colors[i], label=cls)
plt.xlabel('PC_1')
plt.ylabel('PC_3')
plt.legend()
plt.show()

In [None]:
# Scores on PC_2 (x) and PC_3 (y), drawn as one colored series per class
pca_x = X_pca[:, 1]
pca_y = X_pca[:, 2]
for i, cls in enumerate(classes):
    # Boolean mask of the samples in this class, computed once and reused
    in_class = ser_class == cls
    plt.scatter(pca_x[in_class], pca_y[in_class],
                c=colors[i], label=cls)
plt.xlabel('PC_2')
plt.ylabel('PC_3')
plt.legend()
plt.show()

#### Draw biplot  

In [None]:
def biplot(X_2d, coef_2d, coef_labels=None, scale=6, label_offset=1.01,
           axis_labels=('PC_1', 'PC_2')):
    """Draw a biplot: sample scores plus one labelled arrow per feature.

    Parameters
    ----------
    X_2d : 2-D array, indexed as ``X_2d[:, 0]`` and ``X_2d[:, 1]``
        Sample coordinates; column 0 goes on the x axis, column 1 on y.
    coef_2d : array exposing ``.T``
        Row 0 holds the x-component of every feature and row 1 the
        y-component, i.e. one column per feature.
    coef_labels : sequence, optional
        Text written next to each arrow tip, indexed by feature position.
        Defaults to the positional index of the feature.
    scale : float, default 6
        Factor applied to every arrow so it is visible next to the scores.
    label_offset : float, default 1.01
        Additional factor that pushes each label just past its arrow tip.
    axis_labels : pair of str, default ('PC_1', 'PC_2')
        Labels for the x and y axes.

    Returns
    -------
    None
        Everything is drawn through ``plt``; calling ``plt.show()`` is left
        to the caller.
    """
    coef_by_feature = coef_2d.T
    if coef_labels is None:
        coef_labels = range(len(coef_by_feature))
    for i, coef in enumerate(coef_by_feature):
        tip_x = coef[0] * scale
        tip_y = coef[1] * scale
        plt.arrow(0, 0, tip_x, tip_y, color='r')
        plt.text(tip_x * label_offset, tip_y * label_offset, coef_labels[i],
                 color='b', fontsize=11)
    plt.scatter(X_2d[:, 0], X_2d[:, 1])
    x_label, y_label = axis_labels
    plt.xlabel(x_label)
    plt.ylabel(y_label)

# Biplot of the first two PCs, using the raw PCA coefficients as arrows
biplot(X_pca[:, :2], pca.components_[:2], coef_labels=dfX.columns)
# Flush the figure as every other plotting cell does; without this, outside
# an inline notebook the next scatter plot is drawn onto these same axes.
plt.show()

##### X axis is similar to feature "Flavanoids"  
##### Y axis is similar to the negative of feature "Color_intensity"  

#### X $\sim$ Flavanoids, Y $\sim$ $-$Color_intensity    

In [None]:
# Raw-feature view of the two directions suggested by the biplot
flavanoids = dfX['Flavanoids']
neg_color_intensity = -dfX['Color_intensity']
# NOTE(review): c=ser_class assumes the Class values are usable as colors
# (e.g. numeric) -- confirm against the CSV.
plt.scatter(flavanoids, neg_color_intensity, c=ser_class)
plt.xlabel('Flavanoids')
plt.ylabel('-Color_intensity')
plt.show()

#### Use loadings instead of PC coefficients  

In [None]:
# Loadings: each component's coefficients scaled by the square root of that
# component's explained variance. Rows are features, columns are components.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
# biplot expects one row per axis, so take the first two components and
# transpose back.
biplot(X_pca[:, :2], loadings[:, :2].T, coef_labels=dfX.columns)
# Flush the figure as the other plotting cells do; without this the plot is
# never displayed when the notebook is run as a plain script.
plt.show()