# Implementing your own Logistic Regression step by step

You've seen the theory behind logistic regression; now let's see how to implement it.

In [1]:
# import necessary modules
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.special import expit as sigmoid

from evaluation import test
from utils import load_data, predict_image, scatter_plot, contour_plot
from utils import plot_boundary, load_cat_dataset, load_iris_2D
from utils import costs_plot

from sklearn import datasets
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder

from lr import LogisticRegression

In [2]:
# Toy binary problem: two samples per class.
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([0, 0, 1, 1])
# Reference fit with scikit-learn's LogisticRegression (accessed through
# `linear_model`, not the from-scratch class imported from `lr`).
clf = linear_model.LogisticRegression(solver='lbfgs')
# The import cell sets ast_node_interactivity = "all", so both of the
# following expressions are displayed.
clf.fit(X, y)
clf.predict(X)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

array([0, 0, 1, 1])

Given `X`, we need to learn the weights (`coef_` and `intercept_`) so that our model can predict results using

```python
sigmoid(np.dot(X, coef_.T) + intercept_)
```

In [3]:
# Start from all-zero parameters: one weight per feature (kept as a single
# row) plus a single bias term.
m, n_features = X.shape
coef_ = np.zeros((1, n_features))
intercept_ = np.zeros(1)

At every iteration we update the weights using the vectorized gradient-descent equation given by the theory.

In [4]:
# Turn the labels into an (m, 1) column so that `preds - y` is element-wise
# instead of broadcasting to an (m, m) matrix. `reshape` replaces the in-place
# `y.shape = ...` assignment, which NumPy discourages.
y = y.reshape(m, 1)
max_iter = 100
learning_rate = 1e-2
for step in range(max_iter):
    preds = sigmoid(np.dot(X, coef_.T) + intercept_)  # (m, 1) predicted probabilities
    error = preds - y                                  # (m, 1) residuals
    gradient = np.dot(X.T, error)                      # (n_features, 1)
    # Batch gradient descent: average over the m samples and update the
    # parameters once per iteration.
    coef_ -= learning_rate * gradient.T / m
    intercept_ -= learning_rate * error.sum() / m
coef_, intercept_

(array([[0.51392593, 0.35277722]]), array([-4.99600361e-18]))

After learning has finished, we can use the weights and the sigmoid function to predict new data. If the output value is greater than 0.5, we output 1; otherwise we output 0.

In [5]:
# Predicted probability of class 1 for every sample, shape (m, 1).
probs = sigmoid(np.dot(X, coef_.T) + intercept_)
probs
# Threshold at 0.5 by rounding, then flatten to a 1-D label vector.
# The builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
probs.round().astype(int).ravel()

array([[0.29594077],
       [0.20090799],
       [0.70405923],
       [0.79909201]])

array([0, 0, 1, 1])

You may notice that, for now, our algorithm only supports binary classification with label values `[0,1]`. What if we have other label values such as `[1,2]`, or even text labels? [`LabelEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) comes to the rescue.

## Predict Multiclass Dataset

`LabelEncoder` removes the restriction on our label values, but our model is still a binary classifier. We need to make some changes to our algorithm so that it can predict more than two classes.

In [6]:
# Three-class toy set: two samples for each of the labels 1, 2 and 3.
X = np.array([
    [-1, -1], [-2, -1],  # label 1
    [1, 1], [2, 1],      # label 2
    [5, 6], [7, 8],      # label 3
])
y = np.array([1, 1, 2, 2, 3, 3])

To predict on the above dataset, the first thing we need to do is change the shape of `coef_` and `intercept_` so that when we use

```python
np.dot(X, coef_.T) + intercept_
```

to predict the label, it outputs, for each item, a row vector with one score per class (three here); applying the sigmoid turns each score into the probability that the item belongs to that class.

In [7]:
# One row of weights and one bias per class, all starting at zero.
m, n_features = X.shape
n_classes = np.unique(y).size
coef_ = np.zeros((n_classes, n_features))
intercept_ = np.zeros(n_classes)

There are two methods for predicting multiclass labels; we look at one-versus-rest (`ovr`) first. In the `ovr` setting, we reformulate our multiclass task as `k` binary classification problems, where `k` is the number of classes in the training set. In the above example, we have `k = 3`. For the `i`-th binary classification problem, we need to change `y` into

```python
y_i = np.apply_along_axis(lambda x: np.where(x == i, 1, 0), axis=0, arr=y)
```

Then we train each binary classifier and save the learned weights into `coef_` and `intercept_`.

In [8]:
# Map the raw labels onto 0..k-1. The encoded labels get their own name so that
# re-running this cell does not re-fit the encoder on already-encoded values
# (which would turn `classes_` into 0..k-1 and break decoding of predictions).
le = LabelEncoder()
y_encoded = le.fit_transform(y)
classes_ = le.classes_
k = len(classes_)
max_iter = 1000
# NOTE: `learning_rate` is reused from the binary example above; `m`, `coef_`
# and `intercept_` come from the preceding initialisation cell.
for i in range(k):
    # One-vs-rest target: 1 for samples of class i, 0 for every other class.
    y_i = np.where(y_encoded == i, 1, 0)
    for step in range(max_iter):
        # Row i of coef_ / entry i of intercept_ are the parameters of the
        # i-th binary classifier; both are updated in place.
        preds = sigmoid(np.dot(X, coef_[i]) + intercept_[i])
        error = preds - y_i
        gradient = np.dot(X.T, error)
        coef_[i] -= learning_rate * gradient / m
        intercept_[i] -= learning_rate * error.sum() / m
coef_
intercept_

array([[-1.32043175, -1.03902171],
       [ 0.95449385, -1.02779511],
       [ 0.03911706,  0.67693788]])

array([-0.01246303, -0.28576712, -1.72829777])

As before, we use the sigmoid function to predict new data. But instead of outputting a single value to threshold at 0.5, it outputs the probability that an example belongs to each class, and we pick the class with the highest value.

In [9]:
# Per-class sigmoid scores: one row per sample, one column per class.
logits = X @ coef_.T + intercept_
scores = sigmoid(logits)
scores
# Take the best-scoring column for each sample and translate the column index
# back to the original label.
classes_[scores.argmax(axis=1)]

array([[9.12694714e-01, 4.47082450e-01, 7.98525066e-02],
       [9.75093928e-01, 2.37404969e-01, 7.70251656e-02],
       [8.53394251e-02, 4.11185102e-01, 2.66541155e-01],
       [2.43078387e-02, 6.44609029e-01, 2.74257909e-01],
       [2.62940044e-06, 1.57086393e-01, 9.26144159e-01],
       [2.34672728e-08, 1.38635322e-01, 9.81312291e-01]])

array([1, 1, 2, 2, 3, 3])

Wrapping all of the above steps up in a `LogisticRegression` class, we can train and predict as follows.

In [11]:
# Same three-class toy data as before, re-created here with the raw labels 1, 2, 3.
X = np.array([[-1, -1], 
              [-2, -1], 
              [1, 1], 
              [2, 1],
              [5, 6],
              [7, 8]])
y = np.array([1, 1, 2, 2, 3, 3])
# `LogisticRegression` here is the class imported from the local `lr` module,
# not scikit-learn's (that one is reached through `linear_model`).
clf = LogisticRegression()
clf.fit(X, y)
preds = clf.predict(X)
preds

array([1, 1, 2, 2, 3, 3])

This finishes our first implementation of logistic regression. Note that our model learns its weights in an iterative manner called gradient descent. In each epoch, we average the gradient over all training examples and update the weights once. An alternative gradient descent method you might have heard of is stochastic gradient descent, which updates the weights on every training example. Let's see how to implement it next.

---

What can you do after implementing logistic regression? Of course we can do binary classification! It means we can linearly separate a dataset if it has two classes. For example, the iris dataset, if we combine label 1 and 2 as one class.

In [None]:
# dir(linear_model.SGDClassifier)
# Print the docstring of scikit-learn's SGDClassifier.decision_function.
print(linear_model.SGDClassifier.decision_function.__doc__)

In [None]:
X = np.array([[-1, -1],
              [-2, -1],
              [1, 1],
              [2, 1],
              [5, 6],
              [7, 8]])
# Binary relabelling of the toy data. The three-class variant used to be
# assigned first and was immediately overwritten (a dead store), so it is kept
# only as a commented-out alternative.
# Y = np.array([1, 1, 2, 2, 3, 3])
Y = np.array([1, 1, 2, 2, 2, 2])
# Y = np.array(['a', 'a', 'b', 'b', 'b', 'b'])
# from sklearn.utils import check_X_y
# print(check_X_y(X, Y, 'csr', dtype=np.float64, order="C",
#                          accept_large_sparse=False))
# NOTE(review): loss='log' is the spelling accepted by older scikit-learn
# releases; newer releases name it 'log_loss' — confirm against the installed
# version. No random_state is set, so the fitted values vary between runs.
clf = linear_model.SGDClassifier(max_iter=1000, tol=None, loss='log')
clf.fit(X, Y)
# Inspect the fitted parameters and a few scikit-learn internals/docstrings.
clf.coef_.shape, clf.coef_
clf.intercept_.shape, clf.intercept_
clf.predict(X)
# clf.predict_proba(X)
print(clf.predict.__doc__)
clf.average
clf.loss_function_.__doc__
print(clf._fit_multiclass.__doc__)

In [None]:
# Fit the from-scratch LogisticRegression (imported from `lr`) on the toy X and
# the binary labels Y from the cell above, then inspect the parameter shapes.

clf = LogisticRegression(print_cost = False)
clf.fit(X, Y)
clf.coef_.shape, clf.intercept_.shape
clf.weights.shape

In [None]:
def _prepare_fit_binary(est, y, i):
    # also prepares when est.classes_ == 2
    y_i = np.ones(y.shape, dtype=np.float64, order="C")
    y_i[y != est.classes_[i]] = -1.0
    average_intercept = 0
    average_coef = None
    coef = est.coef_[i]
    intercept = est.intercept_[i]
    return y_i, coef, intercept, average_coef, average_intercept

def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,
               pos_weight, neg_weight, sample_weight, validation_mask=None,
               random_state=None):
    """Sketch of fitting the binary classifier for class index ``i``.

    Builds the +1/-1 targets and the parameter slices for class ``i`` with
    ``_prepare_fit_binary``, checks that ``y_i``, ``y`` and ``sample_weight``
    have the same length, and hands everything to ``plain_sgd``.

    NOTE(review): this is not runnable as written. ``plain_sgd``,
    ``penalty_type``, ``dataset``, ``validation_score_cb``, ``tol``, ``seed``,
    ``learning_rate_type`` and ``intercept_decay`` are neither defined nor
    imported anywhere in the visible notebook, so a call raises ``NameError``.
    ``result`` is assigned but never returned, and the ``X``,
    ``learning_rate``, ``sample_weight`` (beyond the shape check) and
    ``random_state`` arguments are not used in the body. Presumably an excerpt
    adapted from scikit-learn's SGD internals — confirm before relying on it.
    """
    y_i, coef, intercept, average_coef, average_intercept = \
        _prepare_fit_binary(est, y, i)
    # Sanity check: targets, labels and per-sample weights must line up.
    assert y_i.shape[0] == y.shape[0] == sample_weight.shape[0]
    result = plain_sgd(coef, intercept, est.loss_function_,
                           penalty_type, alpha, C, est.l1_ratio,
                           dataset, validation_mask, est.early_stopping,
                           validation_score_cb, int(est.n_iter_no_change),
                           max_iter, tol, int(est.fit_intercept),
                           int(est.verbose), int(est.shuffle), seed,
                           pos_weight, neg_weight,
                           learning_rate_type, est.eta0,
                           est.power_t, est.t_, intercept_decay)
    
    
def _make_validation_split(self, y):
        """Split the dataset between training set and validation set.
        Parameters
        ----------
        y : array, shape (n_samples, )
            Target values.
        Returns
        -------
        validation_mask : array, shape (n_samples, )
            Equal to 1 on the validation set, 0 on the training set.
        """
        n_samples = y.shape[0]
        validation_mask = np.zeros(n_samples, dtype=np.uint8)
        if not self.early_stopping:
            # use the full set for training, with an empty validation set
            return validation_mask

        if is_classifier(self):
            splitter_type = StratifiedShuffleSplit
        else:
            splitter_type = ShuffleSplit
        cv = splitter_type(test_size=self.validation_fraction,
                           random_state=self.random_state)
        idx_train, idx_val = next(cv.split(np.zeros(shape=(y.shape[0], 1)), y))
        if idx_train.shape[0] == 0 or idx_val.shape[0] == 0:
            raise ValueError(
                "Splitting %d samples into a train set and a validation set "
                "with validation_fraction=%r led to an empty set (%d and %d "
                "samples). Please either change validation_fraction, increase "
                "number of samples, or disable early_stopping."
                % (n_samples, self.validation_fraction, idx_train.shape[0],
                   idx_val.shape[0]))

        validation_mask[idx_val] = 1
        return validation_mask

In [None]:
# `load_iris_2D` is a helper from the local `utils` module.
# NOTE(review): presumably returns a two-feature version of iris — confirm in utils.
X, y = load_iris_2D()
# pd.DataFrame(X).describe()
# pd.DataFrame(X).info()

In [None]:
# Scatter plot the data in two dimensions (`scatter_plot` is a `utils` helper);
# the trailing ';' keeps the call's return value out of the cell output.
scatter_plot(X, y);

In [None]:
# Reference fit with scikit-learn on the current X, y.
skclf = linear_model.LogisticRegression(fit_intercept=True, solver='lbfgs')
skclf.fit(X,y)
# NOTE(review): `contour_plot` is a `utils` helper; presumably it draws the
# fitted classifier's decision regions over the data — confirm in utils.
contour_plot(X, y, skclf);

What do we do if our data is not linearly separable?

In [None]:
# `load_data` is a `utils` helper; the result is indexed with `.iloc`, so it is
# used here as a pandas object.
microchip = load_data('microchip_tests.txt')
# First two columns are the features, the third column is the label.
X = microchip.iloc[:,:2].values
y = microchip.iloc[:,2].values
scatter_plot(X, y);

We can use a feature-engineering trick to increase the number of features of our original data so that it becomes linearly separable in a higher-dimensional space. We then visualize the resulting curve, which corresponds to the separating line (hyperplane) in that high-dimensional space.

In [None]:
# Expand the two input columns into polynomial terms up to degree 7.
poly = PolynomialFeatures(degree=7)
X_poly = poly.fit_transform(X)
# Show how many features the expansion produced.
X_poly.shape

In [None]:
# Fit on the expanded features, but plot in the original 2-D input space.
skclf = linear_model.LogisticRegression(solver='newton-cg')
skclf.fit(X_poly, y)
scatter_plot(X, y)
# NOTE(review): `plot_boundary` is a `utils` helper; it receives `poly`,
# presumably to map grid points through the same expansion — confirm in utils.
plot_boundary(skclf, X, y, grid_step=.01, poly_featurizer=poly);

A more interesting example would be to use logistic regression to build a classifier for cat images.

In [None]:
# `load_cat_dataset` is a `utils` helper returning a train/test split plus
# `num_px` and `classes`, which are passed to `predict_image` below.
X_train, X_test, y_train, y_test, num_px, classes = load_cat_dataset()

In [None]:
# Unregularised scikit-learn fit on the cat training split.
# NOTE(review): the string 'none' for `penalty` is the spelling used by older
# scikit-learn releases; newer ones expect `penalty=None` — confirm against
# the installed version.
skclf = linear_model.LogisticRegression(penalty='none', solver='lbfgs', max_iter=1000)
skclf.fit(X_train, y_train);

In [None]:
# Classify a single image file with the fitted scikit-learn model
# (`predict_image` is a `utils` helper).
cat_image = "my_image3.jpg"   # change this to the name of your image file 
predict_image(clf= skclf, fname=cat_image, num_px=num_px, classes=classes, plot_image=True)

In [None]:
# Same call as the previous cell, on a second image file.
not_cat_image = "my_image.jpg"
predict_image(clf= skclf, fname=not_cat_image, num_px=num_px, classes=classes, plot_image=True)

Enough motivating examples! As a next step, let's see how we can build our own logistic regression from scratch.

TODO

In [None]:
# Compare two multiclass strategies on the full iris data; `score` is
# evaluated on the same data the model was fitted on.
X, y = datasets.load_iris(return_X_y=True)
# Multinomial formulation.
clf = linear_model.LogisticRegression(random_state=0, solver='lbfgs',
                                      max_iter=200,
                                      multi_class='multinomial').fit(X, y)
clf.score(X, y)
# One-vs-rest formulation; note that `clf` is rebound here.
clf = linear_model.LogisticRegression(random_state=0, solver='lbfgs',
                                      max_iter=200,
                                      multi_class='ovr').fit(X, y)
clf.score(X, y)

References:

https://github.com/beckernick/logistic_regression_from_scratch

https://github.com/martinpella/logistic-reg/blob/master/logistic_reg.ipynb

https://www.kaggle.com/kashnitsky/topic-4-linear-models-part-3-regularization

https://github.com/Benlau93/Machine-Learning-by-Andrew-Ng-in-Python/blob/master/LogisticRegression/ML_RegularizedLogisticRegression.ipynb

In [None]:
# Train the from-scratch LogisticRegression (local `lr` module).
# NOTE(review): X_train / y_train are presumably the cat-image split from the
# load_cat_dataset() cell — confirm the intended execution order.
clf = LogisticRegression(num_iterations = 2000, 
                         steps = 20,
                         learning_rate = 0.005,
                         print_cost = False)
clf.fit(X_train, y_train)

In [None]:
# `costs_plot` is a `utils` helper; it is given the classifier fitted in the
# previous cell. NOTE(review): presumably plots the recorded training cost — confirm.
costs_plot(clf)