In [None]:

import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree, ensemble
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold
from collections import defaultdict
import pprint

Many of the following code examples utilize the following cancer recognition dataset.

In [1]:
from sklearn import datasets
import pandas as pd

CancerDataset = datasets.load_breast_cancer()

Features = pd.DataFrame(CancerDataset.data, columns = CancerDataset.feature_names)
Labels = pd.DataFrame(CancerDataset.target, columns = ['label'])

### Decision Tree

In [15]:
from sklearn import tree

DecTreeModel = tree.DecisionTreeClassifier(max_depth = 5)
DecTree = DecTreeModel.fit(Features, Labels)

Predictions = DecTree.predict(Features)

sum(Predictions == Labels['label']) / len(Labels)

0.9947275922671354

### K-fold Cross Validation

In [14]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

DecTreeModel = tree.DecisionTreeClassifier(max_depth = 5)
KfoldDecTree = KFold(n_splits = 10, shuffle = True)

cross_val_score(DecTreeModel, Features, Labels['label'], cv = KfoldDecTree)

array([0.92982456, 0.9122807 , 0.94736842, 0.94736842, 0.92982456,
       0.94736842, 0.96491228, 0.9122807 , 0.96491228, 0.89285714])

### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

RandomForestModel = RandomForestClassifier(n_estimators = 10, max_depth = 5)
KfoldRandomForest = KFold(n_splits=10, shuffle=True)

cross_val_score(RandomForestModel, Features, Labels['label'], cv = KfoldRandomForest)

array([0.96491228, 0.98245614, 0.87719298, 0.96491228, 0.94736842,
       0.94736842, 0.98245614, 1.        , 0.94736842, 0.94642857])

### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

RegressionModel = LogisticRegression(max_iter = 100)
Regression = RegressionModel.fit(Features, Labels['label'])

Coefficients = Regression.coef_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Adaboost

In [16]:
from sklearn.ensemble import AdaBoostClassifier

DecTreeModel = tree.DecisionTreeClassifier(max_depth = 5)
AdaBoostModel = AdaBoostClassifier(estimator = DecTreeModel, n_estimators = 10, learning_rate = 1)
KfoldAdaBoost = KFold(n_splits = 10, shuffle = True)

cross_val_score(AdaBoostModel, Features, Labels['label'], cv = KfoldAdaBoost)

array([0.96491228, 0.89473684, 0.94736842, 0.94736842, 1.        ,
       1.        , 0.94736842, 0.9122807 , 0.92982456, 0.96428571])

### Gradient Descent
* Given a square loss function: 
$$\frac{1}{2}(\sigma (w' \cdot x) - y)^2$$

* Where $\sigma$ is the sigmoid activation function: 
$$\frac{1}{(1 + e^{-x})}$$

* And thus the gradient of the loss function with respect to the weights is:

$$ 
\frac{\partial f}{\partial w} = (\sigma(w' \cdot x_i) - y_i) * \sigma(w' \cdot x_i) * (1 - \sigma(w' \cdot x_i)) * x_i
$$


In [None]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Learning rate.
eta = 0.01

# Generate true weights (w), observations (X), and labels (Y).
w, X, Y = GenerateData(m)

# Initialize weights.
w_prime = np.zeros(10)

gradient = np.zeros(10)

for o in range(n_iters):

    # Iterate through each data point in X, Y.
    for i in range(m):

        x_i = X[i]
        y_i = Y[i]

        gradient += (sigmoid(w_prime @ x_i) - y_i) * sigmoid(w_prime @ x_i) * (1 - sigmoid(w_prime @ x_i)) * x_i

    # Calculate the gradient for the whole dataset as the average of the summed
    # gradients from each data point.
    gradient = gradient / m

    w_prime = w_prime - eta * gradient


### Stochastic Gradient Descent