In [None]:
!git clone https://github.com/mciprian/ml_class_content.git
!mv ml_class_content/notebooks/img img
!mv ml_class_content/notebooks/data data
!rm data/creditcard.zip
!rm -fr ml_class_content/


# Methods. Unsupervised. PCA

The aim of PCA (principal component analysis) is to linearly reduce the N dimensions of a dataset. Basically, this algorithm follows these steps:

$$1. \text{ Scale data}\\ 2. \text{ Calculate covariance matrix} \\ 3. \text{ Calculate eigenvalues and eigenvectors}$$

This algorithm is very useful to extract and represent the structure of the data.

In [None]:
import pandas as pd
import numpy as np
from IPython.display import YouTubeVideo

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression, make_moons

Let's create a simple dataset with 2 features (the second one is a noisy sine of the first)

In [None]:
samples = 400
X, _ = make_regression(n_samples=samples, random_state=12345, n_features=2, noise=0.9)

# Second feature is the sine of the first feature plus Gaussian noise.
# A seeded generator keeps the noise (and every figure/number derived from X)
# identical across "Restart & Run All".
rng = np.random.default_rng(12345)
X[:, 1] = np.sin(X[:, 0]) + rng.normal(0, 0.2, samples)

plt.scatter(X[:, 0], X[:, 1])
plt.xlabel("X0")
plt.ylabel("X1")
plt.title("X0 and X1")
plt.show()

### 1. Scale

We only center the columns of the matrix (i.e. subtract the mean of each column); we do not rescale them to unit variance.

In [None]:
# Centre every column on zero (subtract its mean); with_std=False leaves
# the spread of each feature untouched.
centering = StandardScaler(with_std=False)
X_scaled = centering.fit_transform(X)
# Show the centred cloud.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], color="blue")
plt.show()

### 2. Covariance matrix

Covariance indicates how much two random variables vary together:

$$\sigma(x, y) = \frac{1}{n-1} \sum^{n}_{i=1}{(x_i-\bar{x})(y_i-\bar{y})}$$


(as opposed to variance, which indicates how much a single variable varies by itself: $\sigma^2_x = \frac{1}{n-1} \sum^{n}_{i=1}(x_i - \bar{x})^2$).


For two features, the covariance matrix is defined as follows:

$$C = \left( \begin{array}{cc}  \sigma(x, x) & \sigma(x, y) \\  \sigma(y, x) & \sigma(y, y) \end{array} \right)$$

In [None]:
YouTubeVideo('WBlnwvjfMtQ',width=640, height=480)

In [None]:
# Sample covariance matrix of the centered data.
# rowvar=False: each column is a variable (feature) and each row an observation.
cov_matrix = np.cov(X_scaled, rowvar=False)
cov_matrix

### 3. Eigenvalues

An eigenvector can be seen as a vector whose direction remains unchanged when a linear transformation is applied to it. The eigenvectors of the covariance matrix are unit vectors representing the directions of largest variance of the data, while the eigenvalues represent the magnitude of this variance in the corresponding directions.

Like an arrow (direction = eigenvector, length of the arrow = eigenvalue).

In [None]:
YouTubeVideo('ue3yoeZvt8E',width=640, height=480)

In [None]:
# The covariance matrix is symmetric, so use eigh: it guarantees real
# eigenvalues and orthonormal eigenvectors, whereas np.linalg.eig gives no
# ordering guarantee. eigh returns the eigenvalues in ascending order; flip
# to descending so that index 0 is the direction of largest variance,
# matching the usual PCA convention.
# Eigenvectors are the COLUMNS of `vectors`.
values, vectors = np.linalg.eigh(cov_matrix)
order = np.argsort(values)[::-1]
values, vectors = values[order], vectors[:, order]

The directions !

In [None]:
vectors

The lengths ! (or the explained variance)

In [None]:
values

Notice that we can explain most of the variance with only 1 principal component (the one with the largest eigenvalue); with 2 features, 2 components always explain all of it!

#### Plot reduced dimension versus the original one

Get the projections

In [None]:
# Coordinates of every centered sample along each eigenvector
# (eigenvectors are the columns of `vectors`).
projected_1 = np.dot(X_scaled, vectors[:, 0])
projected_2 = np.dot(X_scaled, vectors[:, 1])

In [None]:
# Overlay both clouds on one set of axes: PCA coordinates first, then the raw features.
fig, ax = plt.subplots()
ax.scatter(projected_1, projected_2, s=50, label="projected (PCA)")
ax.scatter(X[:, 0], X[:, 1], s=50, label="original")
ax.legend()
plt.show()

 <span style="color:blue">Discuss the differences between the original and the projected space</span>


#### Let's use a library implementation and compare it

In [None]:
from sklearn.decomposition import PCA
# Keep both components; whiten=False leaves the projected data unscaled
# (the components are not rescaled to unit variance).
pca = PCA(n_components=2, whiten=False)
# PCA centers the data internally, so the raw X (not X_scaled) can be passed.
pca_X = pca.fit_transform(X)

In [None]:
print(pca.components_)

In [None]:
print(pca.explained_variance_)

Cumulative explained variance (as a fraction of the total)

In [None]:
# Cumulative fraction of the total variance explained by the first k components.
# explained_variance_ratio_ is normalised by the variance of ALL features, so
# it stays correct when n_components is smaller than the number of features
# (dividing by the sum of the kept components would always end at 1.0).
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Empirical mean i.e: X.mean(axis=0)
pca.mean_

In [None]:
def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    Parameters
    ----------
    v0 : sequence of two floats
        Tail of the arrow, in data coordinates.
    v1 : sequence of two floats
        Tip of the arrow (where the arrow head is drawn), in data coordinates.
    ax : matplotlib Axes, optional
        Axes to draw on; defaults to the current axes (``plt.gca()``).
    """
    # Explicit None check: `ax or plt.gca()` would silently discard any
    # axes-like object that happens to evaluate as falsy.
    if ax is None:
        ax = plt.gca()
    # annotate() draws its arrow from `xytext` towards `xy`; the "<-" style puts
    # the head at the `xytext` end, so the arrow visually points from v0 to v1.
    arrowprops = dict(arrowstyle="<-", linewidth=2, shrinkA=0, shrinkB=0)
    ax.annotate("", xy=v0, xytext=v1, arrowprops=arrowprops)

# Scores: the samples expressed in the principal-component basis.
plt.scatter(pca_X[:, 0], pca_X[:, 1], color="blue")

# Loadings: row k of components_ holds the weight of each original feature on PC k+1.
xs, ys = pca.components_[0], pca.components_[1]

# One labelled arrow per original feature, from the origin to its (PC1, PC2) loading.
for varnames, load_x, load_y in zip(['X0', 'X1'], xs, ys):
    draw_vector([0, 0], [load_x, load_y])
    plt.text(load_x, load_y, varnames)

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title("Bi-plot")
plt.show()

#### Draw the eigenvectors and eigenvalues

In [None]:


# Faded scatter of the original data as background.
plt.scatter(X[:, 0], X[:, 1], alpha=0.2)
# One arrow per principal component, anchored at the data mean and reaching
# 3 standard deviations (sqrt of the explained variance) along its direction.
for variance, direction in zip(pca.explained_variance_, pca.components_):
    tip = pca.mean_ + direction * 3 * np.sqrt(variance)
    draw_vector(pca.mean_, tip)
plt.axis("equal")
plt.show()

 <span style="color:blue">Exercise: From these vectors, infer the data structure </span>

In [None]:
from IPython.display import Image
Image(filename="img/pca_moon.png")

PCA visually explained

In [None]:
YouTubeVideo('FgakZw6K1QQ',width=640, height=480)

## Practice with financial dataset. Propensity to purchase

https://www.kaggle.com/benpowis/customer-propensity-to-purchase-data

In [None]:
from IPython.display import Image
Image(filename="img/purchase_kaggle.png")

In [None]:
from sklearn.model_selection import train_test_split

df_purchase = pd.read_csv(
    "data/training_sample.csv", delimiter=",", quotechar='"'
)

# Fixed random_state: without it every run shuffles differently, so the
# train/test rows (and every result computed from them) change between runs.
df_purchase_train, df_purchase_test = train_test_split(
    df_purchase, test_size=0.2, random_state=12345
)


In [None]:
df_purchase_train.head(3)

In [None]:
df_purchase_train.groupby('ordered')[["ordered"]].count()

In [None]:
df_purchase_test.head(3)

In [None]:
df_purchase_test.groupby('ordered')[["ordered"]].count()