## Machine Learning Code

---

### Dataset

Many of the code examples below use scikit-learn's built-in breast cancer dataset, loaded here into pandas DataFrames.

In [29]:
from sklearn import datasets
import pandas as pd

# Load scikit-learn's bundled breast cancer dataset; the returned object
# exposes the .data, .target and .feature_names attributes used below.
CancerDataset = datasets.load_breast_cancer()

# Feature matrix as a DataFrame whose columns carry the dataset's feature names.
Features = pd.DataFrame(data = CancerDataset.data, columns = CancerDataset.feature_names)

# Targets as a single-column DataFrame; later cells select it as Labels['label'].
Labels = pd.DataFrame({'label': CancerDataset.target})

---

### Decision Tree

In [30]:
from sklearn import tree

# A fixed random_state makes the fitted tree (and the score below) reproducible:
# the classifier breaks ties between equally good splits at random.
DecTreeModel = tree.DecisionTreeClassifier(max_depth = 5, random_state = 0)

# Fit on the 1-D label column, as the other sections do, rather than on the
# single-column DataFrame.
DecTree = DecTreeModel.fit(Features, Labels['label'])

Predictions = DecTree.predict(Features)

# Fraction of correct predictions on the *training* data itself (no held-out
# set), so this is an optimistic estimate of accuracy.
(Predictions == Labels['label']).mean()

0.9947275922671354

---

### K-fold Cross Validation

In [31]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Seed both the model and the fold shuffling; without these the scores below
# change every time the cell is run.
DecTreeModel = tree.DecisionTreeClassifier(max_depth = 5, random_state = 0)
KfoldDecTree = KFold(n_splits = 10, shuffle = True, random_state = 0)

# Returns one score per fold (10 values), each computed on the held-out fold.
cross_val_score(DecTreeModel, Features, Labels['label'], cv = KfoldDecTree)

array([0.94736842, 0.94736842, 0.89473684, 0.94736842, 0.9122807 ,
       0.94736842, 0.94736842, 0.89473684, 0.96491228, 0.96428571])

---

### Random Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest (bootstrap samples / feature sub-sampling) and the fold
# shuffling so that the cross-validation scores are reproducible.
RandomForestModel = RandomForestClassifier(n_estimators = 10, max_depth = 5, random_state = 0)
KfoldRandomForest = KFold(n_splits = 10, shuffle = True, random_state = 0)

# Returns one score per fold (10 values), each computed on the held-out fold.
cross_val_score(RandomForestModel, Features, Labels['label'], cv = KfoldRandomForest)

array([1.        , 0.85964912, 0.96491228, 0.9122807 , 0.98245614,
       0.92982456, 0.94736842, 0.96491228, 0.96491228, 0.98214286])

---

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): the features are not rescaled in this cell; if the solver warns
# that it stopped at max_iter before converging, scale the inputs or raise the
# limit - TODO confirm on a fresh run.
RegressionModel = LogisticRegression(max_iter = 100)

# Fit on the full dataset; the value returned by fit() is kept as Regression.
Regression = RegressionModel.fit(X = Features, y = Labels['label'])

# Coefficient array of the fitted model.
Coefficients = Regression.coef_

---

### Adaboost

In [33]:
from sklearn.ensemble import AdaBoostClassifier

# Seed the base tree, the boosting procedure and the fold shuffling so that the
# cross-validation scores are reproducible from run to run.
DecTreeModel = tree.DecisionTreeClassifier(max_depth = 5, random_state = 0)
AdaBoostModel = AdaBoostClassifier(estimator = DecTreeModel, n_estimators = 10, learning_rate = 1, random_state = 0)
KfoldAdaBoost = KFold(n_splits = 10, shuffle = True, random_state = 0)

# Returns one score per fold (10 values), each computed on the held-out fold.
cross_val_score(AdaBoostModel, Features, Labels['label'], cv = KfoldAdaBoost)

array([0.94736842, 0.92982456, 0.96491228, 0.98245614, 0.94736842,
       0.94736842, 0.96491228, 0.92982456, 0.92982456, 0.89285714])

---

### Gradient Descent
* Given a squared loss function for a single data point $(x_i, y_i)$:
$$f(w') = \frac{1}{2}(\sigma (w' \cdot x_i) - y_i)^2$$

* Where $\sigma$ is the sigmoid activation function:
$$\sigma(z) = \frac{1}{1 + e^{-z}}$$

* And thus the gradient of the loss function with respect to the weights is:

$$ 
\frac{\partial f}{\partial w'} = (\sigma(w' \cdot x_i) - y_i) \, \sigma(w' \cdot x_i) \, (1 - \sigma(w' \cdot x_i)) \, x_i
$$


In [None]:
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)); applied element-wise when x is an array."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# NumPy is used below (and by sigmoid); import it here so this cell does not
# depend on a later cell having been run first.
import numpy as np

# Learning rate.
eta = 0.01

# Generate true weights (w), observations (X), and labels (Y).
# NOTE(review): GenerateData, m (number of data points) and n_iters (number of
# passes) are not defined in this cell and must already exist in the kernel -
# confirm where they come from.
w, X, Y = GenerateData(m)

# Initialize weights.
w_prime = np.zeros(10)

gradient = np.zeros_like(w_prime)

for iteration in range(n_iters):

    # Start every pass from a zero gradient. Without this reset the averaged
    # gradient of the previous pass would be carried into the new sum.
    gradient = np.zeros_like(w_prime)

    # Iterate through each data point in X, Y.
    for i in range(m):

        x_i = X[i]
        y_i = Y[i]

        # Evaluate the sigmoid once per data point and reuse it in all three factors.
        activation = sigmoid(w_prime @ x_i)

        gradient += (activation - y_i) * activation * (1 - activation) * x_i

    # Calculate the gradient for the whole dataset as the average of the summed
    # gradients from each data point.
    gradient = gradient / m

    # Step against the gradient, scaled by the learning rate.
    w_prime = w_prime - eta * gradient


---

### Singular Value Decomposition (SVD)

In [35]:
import numpy as np
from numpy.linalg import svd

# Reduced SVD of the feature matrix (full_matrices=False), called through the
# `svd` name imported from numpy.linalg above.
U, S, Vt = svd(Features, full_matrices = False)

---

### Principal Component Analysis (PCA)

In [36]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# NOTE: this cell rebinds the notebook-level names Features and Labels to NumPy
# arrays. Cells above that index Labels['label'] will fail if re-run after this
# one unless the Dataset cell is re-run first.
cancer = datasets.load_breast_cancer()

# Labels as an (n, 1) column array copied from the dataset's target vector.
Labels = np.array(cancer.target).reshape(-1, 1)

# Rescale features to be mean 0 and standard deviation 1.
Features = StandardScaler().fit_transform(np.array(cancer.data))

# PCA model keeping the first two principal components.
PC2 = PCA(n_components = 2)

# Fit the model and project the standardized features onto those two components.
Features2 = PC2.fit_transform(Features)

---

### K-means Clustering

In [None]:
from scipy.spatial import distance

def kmeans(X, k = 4, max_iter = 500, random_state=0):

  """
  Cluster the rows of X into k groups by alternating nearest-center assignment
  and center (mean) updates.

  Inputs:
      X: input data matrix, array-like with shape (n * d), n: number of data points, d: feature dimension
      k: number of clusters (must be smaller than n)
      max_iter: maximum number of assignment/update iterations
      random_state: seed used to pick the k initial centers among the data points
  Output:
      clustering label for each data point: numpy array of length n with values in 0..k-1
  """

  # Work on a float array so that integer input does not truncate the centroid
  # means and array-likes (lists, DataFrames) are accepted.
  X = np.asarray(X, dtype=float)

  assert len(X) > k, 'illegal inputs'

  # A private generator draws the same initial centers as seeding the global
  # one would, without resetting np.random for the rest of the notebook.
  rng = np.random.RandomState(random_state)

  # Randomly select k data points as centers (fancy indexing copies, so X is
  # never modified by the center updates below).
  idx = rng.choice(len(X), k, replace=False)
  centers = X[idx]
  print('Initial center observations: ', idx)

  labels = None
  for _ in range(max_iter):

    # Compute distance from each data point to centers.
    H = distance.cdist(X, centers, 'euclidean')

    # Create vector assigning each data point to the closest centroid.
    assignment = np.argmin(H, axis=1)

    # Unchanged assignments mean unchanged centers: further iterations would
    # reproduce exactly the same labels, so stop early.
    if labels is not None and np.array_equal(assignment, labels):
      break
    labels = assignment

    # Update the centroids as the average of all data points belonging to a given centroid.
    for j in range(k):
      members = X[labels == j]
      # An empty cluster keeps its previous center; averaging zero points would
      # give NaN, and argmin would then send every point to that NaN center.
      if len(members) > 0:
        centers[j] = members.mean(axis=0)

  # With max_iter <= 0 the loop never runs: label against the initial centers.
  if labels is None:
    labels = np.argmin(distance.cdist(X, centers, 'euclidean'), axis=1)

  return labels

---

### Kernel Method

In [None]:

from scipy.spatial.distance import cdist
from scipy.spatial import distance

# Radial basis function kernel.
def rbf_kernel(X, Xp, h):
    """Return the n*m matrix K with K[i, j] = exp(-||X[i] - Xp[j]||^2 / (2 * h^2)).

    X: n*1 matrix
    Xp: m*1 matrix
    h: scalar value
    """
    # Squared pairwise Euclidean distances between the rows of X and Xp.
    squared_dist = cdist(X, Xp) ** 2
    scale = 2 * h ** 2

    return np.exp(-squared_dist / scale)  # n*m

# Calculate h-parameter for the RBF kernel as the median of the pairwise distance of X
# after removing diagonal terms which are all 0 (the distance of a point to itself is 0).
def median_distance(X):
    """Return the median Euclidean distance between distinct rows of X.

    X: n*1 matrix (a 1-D array of n scalars is treated as n*1)
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # pdist returns each unordered pair (i < j) once. Counting every pair twice,
    # as a loop over all i != j does, leaves the median unchanged, so the
    # condensed form gives the same value without n^2 Python-level calls.
    h = np.median(distance.pdist(X, 'euclidean'))
    return h

# Calculate kernel regression weights.
def kernel_regression_fitting(xTrain, yTrain, h, beta=1):
    """Return the weights W that solve (K + beta * I) W = yTrain.

    K is the RBF kernel matrix of xTrain with itself and I the n*n identity.

    xTrain: input data, numpy array, n*1
    yTrain: input labels, numpy array, n*1
    h: scalar bandwidth passed to rbf_kernel
    beta: scalar multiplying the identity added to the kernel matrix
    """
    regularized_kernel = rbf_kernel(xTrain, xTrain, h) + beta * np.identity(len(xTrain))

    # Solve the linear system directly instead of forming the explicit inverse;
    # this is cheaper and numerically more accurate.
    W = np.linalg.solve(regularized_kernel, yTrain)
    return W

# Train kernel weights and predict labels for test data.
def kernel_regression_fit_and_predict(xTrain, yTrain, xTest, h, beta):
    """Fit kernel regression weights on the training data and predict for xTest.

    xTrain, yTrain: training inputs and labels, handed to kernel_regression_fitting
    xTest: inputs to predict for
    h, beta: forwarded unchanged to kernel_regression_fitting (h also to rbf_kernel)

    Returns the product of the transposed train/test kernel matrix with the weights.
    """
    # Weights learned from the training data only.
    weights = kernel_regression_fitting(xTrain, yTrain, h, beta)

    # Kernel matrix with one row per training point and one column per test point.
    train_test_kernel = rbf_kernel(xTrain, xTest, h)

    # Transpose so each row corresponds to a test point, then apply the weights.
    return train_test_kernel.T.dot(weights)

---