<a href="https://colab.research.google.com/github/matsunori39/Small-Data-Analysis-and-Machine-Learning/blob/main/Small_Data_Analysis_and_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference book (Ohmsha): https://www.ohmsha.co.jp/book/9784274227783/

# Chapter 2 Correlation and Principal Component Analysis

## 2.1 Data Preprocessing

### Data Standardization

In [15]:
import numpy as np

def autoscale(X):
  """
  Standardize a data matrix column by column

  Parameters
  ------------
  X: Data Matrix (statistics are taken along axis 0, i.e. per column)

  Return Value
  ------------
  Xscale: Data matrix after standardization
  meanX: mean vector (column means of X)
  stdX: standard deviation vector (sample standard deviation, ddof = 1)

  Notes
  ------------
  A constant column has a standard deviation of 0. Dividing by it would
  fill that column with NaN, so such a column is only mean-centered
  (it becomes all zeros). stdX still reports the true value 0.
  """

  meanX = np.mean(X, axis = 0)
  stdX = np.std(X, axis = 0, ddof = 1)
  # Use 1 instead of 0 for the division only; the returned stdX is untouched
  divisor = np.where(stdX == 0, 1.0, stdX)
  Xscale = (X - meanX) / divisor
  return Xscale, meanX, stdX

def scaling(x, meanX, stdX):
  """
  Standardizes samples with the mean and standard deviation of a data matrix

  Parameters
  ------------
  x: Samples to be standardized
  meanX: mean vector
  stdX: standard deviation vector

  Return Value
  ------------
  xscale: Sample after standardization

  Notes
  ------------
  Where stdX is 0 (a constant variable) the sample is only mean-centered,
  because dividing by 0 would produce inf or NaN.
  """

  # Use 1 instead of 0 for the division only
  divisor = np.where(np.asarray(stdX) == 0, 1.0, stdX)
  xscale = (x - meanX) / divisor
  return xscale

def rescaling(xscale, meanX, stdX):
  """
  Map standardized samples back onto the original scale

  Parameters
  ------------
  xscale: Sample after standardization
  meanX: mean vector
  stdX: standard deviation vector

  Return Value
  ------------
  x: Sample on the original scale
  """

  # Undo the division by stdX first, then undo the mean shift
  spread = np.multiply(stdX, xscale)
  return spread + meanX

### Standardize data

In [16]:
import numpy as np

# Existing data set and one unknown sample, both defined as ndarrays
X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
x = np.array([[10, 11, 12]])

# Standardize X and keep its column statistics for the following cells
Xscale, meanX, stdX = autoscale(X)
print(Xscale)

[[-1. -1. -1.]
 [ 0.  0.  0.]
 [ 1.  1.  1.]]


In [17]:
# Mean vector returned by autoscale(X)
print(meanX)

[4. 5. 6.]


In [18]:
# Standard deviation vector returned by autoscale(X)
print(stdX)

[3. 3. 3.]


In [19]:
# Standardize the unknown sample with the mean and standard deviation of X
xscale = scaling(x, meanX, stdX)
print(xscale)

[[2. 2. 2.]]


In [20]:
# Restore the standardized sample to its original scale
# (the printed result should match the unknown sample x)
xrescale = rescaling(xscale, meanX, stdX)
print(xrescale)

[[10. 11. 12.]]


## 2.8 Derivation of the rth principal component

### PCA program with eigenvalue problem

In [21]:
import numpy as np

def pca(X):
  """
  Performs principal component analysis by solving the eigenvalue problem
  of the covariance matrix

  Parameter
  ------------
  X: Data Matrix (rows are samples, columns are variables)

  Return Values
  ------------
  P: Loading Matrix (columns are loading vectors, ordered from the
     largest eigenvalue to the smallest)
  t: Principal Component Score Matrix (X @ P)

  Notes
  ------------
  The sign of each loading vector is arbitrary.
  X is projected as given; no centering is applied to X here.
  """

  # Computes the covariance matrix of the variables;
  # atleast_2d keeps the single-variable case a 1 x 1 matrix
  V = np.atleast_2d(np.cov(X.T))
  # The covariance matrix is symmetric, so use eigh: it returns real
  # eigenvectors with the eigenvalues in ascending order
  _, P = np.linalg.eigh(V)
  # Reverse the columns so that the first component has the largest variance
  P = P[:, ::-1]

  # Calculate principal component scores
  t = X @ P
  return P, t

## 2.9 Examples of PCA values

In [22]:
import numpy as np

# Four two-dimensional samples; `data` is reused by a later cell
data = [[2, 2], [1, -1], [-1, 1], [-2, -2]]
x = np.array(data)
# P holds the loadings, T the scores (only P is printed here)
P, T = pca(x)

print(P)

[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]


## 2.12 PCA and Singular Value Decomposition

### PCA using SVD

In [23]:
import numpy as np

def pca_svd(X):
  """
  Perform principal component analysis using SVD

  Parameter
  ---------
  X: Data Matrix (rows are samples, columns are variables)

  Return Values
  ---------
  P: Loading Matrix (columns are loading vectors, in the order of
     decreasing singular value)
  t: Principal Component Score Matrix (X @ P)

  Notes
  ---------
  The sign of each loading vector is arbitrary.
  X is decomposed as given; no centering is applied to X here.
  """

  # Decompose a matrix into singular values.
  # svd returns V transposed: each ROW of Vh is a right singular vector
  _, _, Vh = np.linalg.svd(X)
  # Transpose so that the loading vectors are the columns of P, as in pca()
  P = Vh.T

  # Calculate principal component scores
  t = X @ P
  return P, t

In [24]:
# Run the SVD-based PCA on the same samples as the eigenvalue-based example
P, t = pca_svd(np.array(data))
print(P)

[[-0.70710678 -0.70710678]
 [-0.70710678  0.70710678]]


# Chapter 3 Regression Analysis and Least Squares Method

## 3.2 Least Squares Method

In [25]:
import numpy as np

def least_squares(X, y):
  """
  Calculate regression coefficients using the least squares method

  Parameters
  ----------
  X: input data (rows are samples, columns are variables);
     an ndarray or a nested list
  y: output data (one value per sample); any shape that can be
     reshaped into a column vector

  Return Value
  ----------
  beta: regression coefficient (column vector)

  Raises
  ----------
  numpy.linalg.LinAlgError: if X.T @ X is singular
  """

  X = np.asarray(X)
  # Vectorize y
  y = np.asarray(y).reshape(-1, 1)

  # normal equation (X.T @ X) beta = X.T @ y, solved directly
  # instead of forming the explicit inverse of X.T @ X
  beta = np.linalg.solve(X.T @ X, X.T @ y)
  return beta

def ls_est(x, beta):
  """
  Evaluate a fitted linear regression model on an unknown sample

  Parameters
  ----------
  x: unknown sample
  beta: regression coefficient

  Return Value
  ----------
  y_hat: Predicted value
  """

  # Lay the coefficients out as a row, then multiply with the sample
  coef_row = beta.T
  return coef_row @ x

### Numerical example of the least-squares method

In [26]:
import numpy as np

# Input data: one row per sample, one column per variable
X = np.array([
    [0.01, 0.50, -0.12],
    [0.97, -0.63, 0.02],
    [0.41, 1.15, -1.17],
    [-1.38, -1.02, 1.27],
])

# Output data as a column vector, and the unknown sample predicted in the next cell
y = np.array([0.25, 0.08, 1.03, -1.37]).reshape(-1, 1)
x = np.array([1, 0.7, -0.2])

# Fit the regression coefficients
beta = least_squares(X, y)
print(beta)

[[ 0.36347065]
 [ 0.41624871]
 [-0.34677593]]


In [27]:
# Predict the output for the unknown sample x with the fitted coefficients
y_hat = ls_est(x, beta)
print(y_hat)

[0.72419993]


## 3.7 Multicollinearity issues

In [28]:
# Two matrices whose entries differ only slightly;
# in each one the second row is almost twice the first row
A = np.array([[2.01,   1 ],
              [  4 , 2.01]])
B = np.array([[1.99,   1 ],
              [  4 , 2.02]])

In [29]:
# Inverse of A, for comparison with the inverse of B in the next cell
np.linalg.inv(A)

array([[ 50.12468828, -24.93765586],
       [-99.75062344,  50.12468828]])

In [30]:
# Inverse of B: roughly twice the inverse of A although B differs from A only slightly
np.linalg.inv(B)

array([[ 102.02020202,  -50.50505051],
       [-202.02020202,  100.50505051]])

In [31]:
# The second row is exactly twice the first row, so C is singular
C = np.array([[ 2 , 1 ],
              [ 4 , 2 ]])

In [32]:
# Condition number of A
np.linalg.cond(A)

625.4397976337879

In [33]:
# Condition number of B
np.linalg.cond(B)

1264.670926452252

In [34]:
# Condition number of the singular matrix C (many orders of magnitude larger than for A and B)
np.linalg.cond(C)

2.517588727560788e+16

### For data without multicollinearity

In [None]:
import numpy as np

# 