# Dimensionality Reduction

Dimensionality Reduction အများစုက တွေ့ဖူးခဲ့ပြီးသားပါ။ 

* SVD (not to be confused with SVM)
* PCA
* Fast Fourier Transform နဲ့ 
* KMean centroid distane method တို့ ဖြစ်ကြတယ်။

## SVD

In [None]:
from scipy import linalg

from sklearn import datasets

df_X, ds_y = datasets.load_digits(as_frame=True, return_X_y=True)

U, s, Vt = linalg.svd(df_X.values.T)

k = 32 # or any number less than large_2d_array.shape[1]
s_ = s[:k]
Vt_ = Vt[:k,:]

Sigma = linalg.diagsvd(s_, U.shape[1], Vt_.shape[0])
smaller_version = df_X.values @ Sigma
df_X.shape, smaller_version.shape

## PCA

In [18]:
import logging

from numpy.random import RandomState
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces
from sklearn import cluster
from sklearn import decomposition

rng = RandomState(0)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

faces, fy = fetch_olivetti_faces(return_X_y=True, shuffle=True, random_state=rng)
n_samples, n_features = faces.shape

# Global centering (focus on one feature, centering all samples)
faces_centered = faces - faces.mean(axis=0)

# Local centering (focus on one sample, centering all features)
faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)

print("Dataset consists of %d faces" % n_samples)
print(faces.shape)

Dataset consists of 400 faces
(400, 4096)


In [None]:
n_row, n_col = 2, 3
n_components = n_row * n_col
image_shape = (64, 64)


def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):
    fig, axs = plt.subplots(
        nrows=n_row,
        ncols=n_col,
        figsize=(2.0 * n_col, 2.3 * n_row),
        facecolor="white",
        constrained_layout=True,
    )
    fig.set_constrained_layout_pads(w_pad=0.01, h_pad=0.02, hspace=0, wspace=0)
    fig.set_edgecolor("black")
    fig.suptitle(title, size=16)
    for ax, vec in zip(axs.flat, images):
        vmax = max(vec.max(), -vec.min())
        im = ax.imshow(
            vec.reshape(image_shape),
            cmap=cmap,
            interpolation="nearest",
            vmin=-vmax,
            vmax=vmax,
        )
        ax.axis("off")

    fig.colorbar(im, ax=axs, orientation="horizontal", shrink=0.99, aspect=40, pad=0.01)
    plt.show()

plot_gallery("Faces from dataset", faces_centered[:6])

In [None]:
pca_estimator = decomposition.PCA(
    n_components=n_components, svd_solver="randomized", whiten=True
)
pca_estimator.fit(faces_centered)
plot_gallery(
    "Eigenfaces - PCA using randomized SVD", pca_estimator.components_[:n_components]
)

In [None]:
reduced_faces = pca_estimator.transform(faces_centered)
reduced_faces.shape

### Combined with `SVM`

In [22]:
from sklearn import model_selection as ms
from sklearn import preprocessing as pre
from sklearn import svm
from sklearn import metrics

In [25]:
n_components = 150

X_train, X_test, y_train, y_test = ms.train_test_split(
    faces, fy, test_size=0.25, random_state=42
)

scaler = pre.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

pca = decomposition.PCA(n_components=n_components, svd_solver="randomized", whiten=True)
pca.fit(X_train)

PCA(n_components=150, svd_solver='randomized', whiten=True)

In [26]:
pca.components_.shape

(150, 4096)

In [27]:
eigenfaces = pca.components_.reshape((n_components, 64, 64))

print("Projecting the input data on the eigenfaces orthonormal basis")
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

Projecting the input data on the eigenfaces orthonormal basis


In [28]:
model = svm.SVC(kernel="rbf", C=76823.03433306453, class_weight='balanced', gamma=0.003418945823095797)
model.fit(X_train_pca, y_train)
pred = model.predict(X_test_pca)
print (metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       1.00      1.00      1.00         5
           2       1.00      0.25      0.40         4
           3       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         5
           6       1.00      0.80      0.89         5
           7       1.00      1.00      1.00         5
           9       0.00      0.00      0.00         0
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         2
          15       1.00      0.67      0.80         3
          16       0.67      1.00      0.80         2
          17       0.50      1.00      0.67         1
          18       1.00      1.00      1.00         2
          19       1.00      1.00      1.00         6
          20       0.57    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## FFT

In [None]:
%pip install yfinance

In [None]:
import yfinance as yf

msft = yf.Ticker("MSFT")

# get historical market data
hist = msft.history(period="max")
print (type(hist))
hist.head()

In [None]:
import numpy as np
from scipy import fft

NUM_DAYS = 1500

close = hist.Close[-NUM_DAYS:].values
frequencies = fft.fft(close)
plt.plot(np.log(frequencies))
plt.show()

In [None]:
frequencies_ = frequencies.copy()
frequencies_[50:1450] = 0
print (np.count_nonzero(frequencies_))
print (len(frequencies_))
plt.plot(np.log(frequencies_))
plt.show()

In [None]:
close_ = fft.ifft(frequencies_)
plt.plot(hist.index[-NUM_DAYS:], close_)
plt.show()

In [None]:
print (frequencies_.shape)
frequencies_dict = {k: v for k, v in enumerate(frequencies_) if v > 0}
print (len(frequencies_dict))

## KMeans Centroid Distance

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets, tree, cluster, model_selection, metrics

# 1. load the data
df_X, ds_y = datasets.load_digits(n_class=2, return_X_y=True, as_frame=True)

# 2. split into train and test sets
tr_X, ts_X, tr_y, ts_y = model_selection.train_test_split(df_X, ds_y, random_state=42)

# 3. some feature engineering
kmean = cluster.KMeans()
feat_tr_X = kmean.fit_transform(tr_X)
feat_ts_X = kmean.transform(ts_X)

# 4. build tree model
tree_model = tree.DecisionTreeClassifier(criterion="gini", splitter="best", max_depth=3, max_features=3, max_leaf_nodes=3, random_state=24)
tree_model.fit(feat_tr_X, tr_y)

# 5. test the model
pred_y = tree_model.predict(feat_ts_X)

print (metrics.classification_report(y_true=ts_y, y_pred=pred_y))