In [None]:
# import necessary modules
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.special import expit as sigmoid

from evaluation import test
from utils import load_data, predict_image, scatter_plot, contour_plot
from utils import plot_boundary, load_cat_dataset, load_iris_2D
from utils import costs_plot

from sklearn import datasets
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder

from lr import LogisticRegression

What can you do after implementing logistic regression? Binary classification, of course! That is, we can separate a dataset with a linear decision boundary when it has two (linearly separable) classes. One example is the iris dataset, if we combine labels 1 and 2 into a single class.

In [None]:
# Print the docstring of SGDClassifier.decision_function for reference.
# Uncomment the next line to list every attribute of the class instead.
# dir(linear_model.SGDClassifier)
print(linear_model.SGDClassifier.decision_function.__doc__)

In [None]:
# Toy 2-D dataset (six points) used to inspect what a fitted SGDClassifier exposes.
X = np.array([[-1, -1], 
              [-2, -1], 
              [1, 1], 
              [2, 1],
              [5, 6],
              [7, 8]])
# Two alternative labelings. The second assignment overwrites the first, so the
# classifier below is fit on the two-class labels; comment out the second line
# to experiment with three classes instead.
Y = np.array([1, 1, 2, 2, 3, 3])
Y = np.array([1, 1, 2, 2, 2, 2])
# Y = np.array(['a', 'a', 'b', 'b', 'b', 'b'])
# from sklearn.utils import check_X_y
# print(check_X_y(X, Y, 'csr', dtype=np.float64, order="C",
#                          accept_large_sparse=False))
# NOTE(review): loss='log' is the spelling used by older scikit-learn releases;
# newer releases name this loss 'log_loss' -- confirm against the installed version.
clf = linear_model.SGDClassifier(max_iter=1000, tol=None, loss='log')
clf.fit(X, Y)
# Every bare expression below is rendered as output because the first cell sets
# InteractiveShell.ast_node_interactivity = "all".
clf.coef_.shape, clf.coef_
clf.intercept_.shape, clf.intercept_
clf.predict(X)
# clf.predict_proba(X)
print(clf.predict.__doc__)
clf.average
# NOTE(review): loss_function_ and _fit_multiclass are implementation details of
# SGDClassifier and may not exist in every scikit-learn version -- verify.
clf.loss_function_.__doc__
print(clf._fit_multiclass.__doc__)

In [None]:
# Fit the project-local LogisticRegression (imported from lr) on the toy X, Y
# arrays and inspect the shapes of its learned parameters.

clf = LogisticRegression(print_cost = False)
clf.fit(X, Y)
clf.coef_.shape, clf.intercept_.shape
clf.weights.shape

In [None]:
def _prepare_fit_binary(est, y, i):
    # also prepares when est.classes_ == 2
    y_i = np.ones(y.shape, dtype=np.float64, order="C")
    y_i[y != est.classes_[i]] = -1.0
    average_intercept = 0
    average_coef = None
    coef = est.coef_[i]
    intercept = est.intercept_[i]
    return y_i, coef, intercept, average_coef, average_intercept

def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,
               pos_weight, neg_weight, sample_weight, validation_mask=None,
               random_state=None):
    """Sketch of fitting one binary (class ``i`` versus rest) problem with SGD.

    NOTE(review): this looks like an abridged excerpt kept for study rather
    than runnable code. ``plain_sgd``, ``penalty_type``, ``dataset``,
    ``validation_score_cb``, ``tol``, ``seed``, ``learning_rate_type`` and
    ``intercept_decay`` are not defined or imported anywhere in the visible
    notebook, so calling this function raises ``NameError`` unless they are
    supplied elsewhere.

    ``X``, ``learning_rate`` and ``random_state`` are accepted but not used in
    the visible body, and ``result`` is never returned, so the function
    returns ``None``.
    """
    # Build the +1/-1 target for class i and fetch that class's parameters.
    y_i, coef, intercept, average_coef, average_intercept = \
        _prepare_fit_binary(est, y, i)
    # Targets, labels and per-sample weights must all have one entry per sample.
    assert y_i.shape[0] == y.shape[0] == sample_weight.shape[0]
    # Delegate the optimisation to plain_sgd (see the NOTE above about the
    # undefined names passed here).
    result = plain_sgd(coef, intercept, est.loss_function_,
                           penalty_type, alpha, C, est.l1_ratio,
                           dataset, validation_mask, est.early_stopping,
                           validation_score_cb, int(est.n_iter_no_change),
                           max_iter, tol, int(est.fit_intercept),
                           int(est.verbose), int(est.shuffle), seed,
                           pos_weight, neg_weight,
                           learning_rate_type, est.eta0,
                           est.power_t, est.t_, intercept_decay)
    
    
def _make_validation_split(self, y):
        """Split the dataset between training set and validation set.
        Parameters
        ----------
        y : array, shape (n_samples, )
            Target values.
        Returns
        -------
        validation_mask : array, shape (n_samples, )
            Equal to 1 on the validation set, 0 on the training set.
        """
        n_samples = y.shape[0]
        validation_mask = np.zeros(n_samples, dtype=np.uint8)
        if not self.early_stopping:
            # use the full set for training, with an empty validation set
            return validation_mask

        if is_classifier(self):
            splitter_type = StratifiedShuffleSplit
        else:
            splitter_type = ShuffleSplit
        cv = splitter_type(test_size=self.validation_fraction,
                           random_state=self.random_state)
        idx_train, idx_val = next(cv.split(np.zeros(shape=(y.shape[0], 1)), y))
        if idx_train.shape[0] == 0 or idx_val.shape[0] == 0:
            raise ValueError(
                "Splitting %d samples into a train set and a validation set "
                "with validation_fraction=%r led to an empty set (%d and %d "
                "samples). Please either change validation_fraction, increase "
                "number of samples, or disable early_stopping."
                % (n_samples, self.validation_fraction, idx_train.shape[0],
                   idx_val.shape[0]))

        validation_mask[idx_val] = 1
        return validation_mask

In [None]:
# Load features and labels through the project helper load_iris_2D
# (presumably the iris data reduced to two features -- confirm in utils).
# NOTE: this rebinds X, replacing the toy array defined earlier.
X, y = load_iris_2D()
# Uncomment for a quick look at the feature statistics / dtypes:
# pd.DataFrame(X).describe()
# pd.DataFrame(X).info()

In [None]:
# Scatter plot of the data in two dimensions (project helper from utils).
scatter_plot(X, y);

In [None]:
# Fit scikit-learn's LogisticRegression (with an intercept, lbfgs solver) on the
# loaded data, then visualise the fitted model with the project's contour_plot helper.
skclf = linear_model.LogisticRegression(fit_intercept=True, solver='lbfgs')
skclf.fit(X,y)
contour_plot(X, y, skclf);

What do we do if our data is not linearly separable?

In [None]:
# Load the microchip test data; the first two columns are used as features and
# the third column as the label.
microchip = load_data('microchip_tests.txt')
X = microchip.iloc[:,:2].values
y = microchip.iloc[:,2].values
scatter_plot(X, y);

We can use a feature engineering trick to increase the number of features of our original data so that it becomes linearly separable in a higher-dimensional space. We then visualize the resulting curve, which corresponds to the linear boundary that separates our dataset in that high-dimensional space.

In [None]:
# Expand the two original features into polynomial terms up to degree 7;
# the shape shows how many columns the expanded matrix has.
poly = PolynomialFeatures(degree=7)
X_poly = poly.fit_transform(X)
X_poly.shape

In [None]:
# Fit on the polynomial features, then plot the original 2-D points together
# with the model's boundary. poly is handed to plot_boundary, presumably so it
# can apply the same feature expansion to its plotting grid -- confirm in utils.
skclf = linear_model.LogisticRegression(solver='newton-cg')
skclf.fit(X_poly, y)
scatter_plot(X, y)
plot_boundary(skclf, X, y, grid_step=.01, poly_featurizer=poly);

A more interesting example would be to use logistic regression to build a classifier for cat images.

In [None]:
# Load the cat-image train/test splits plus num_px and classes, which are
# passed to predict_image in the cells below.
X_train, X_test, y_train, y_test, num_px, classes = load_cat_dataset()

In [None]:
# Unregularised logistic regression on the cat training set.
# NOTE(review): the string 'none' for penalty is the spelling accepted by older
# scikit-learn releases; newer releases expect penalty=None -- confirm against
# the installed version.
skclf = linear_model.LogisticRegression(penalty='none', solver='lbfgs', max_iter=1000)
skclf.fit(X_train, y_train);

In [None]:
# Classify a single image file with the fitted scikit-learn model and show it.
cat_image = "my_image3.jpg"   # change this to the name of your image file 
predict_image(clf= skclf, fname=cat_image, num_px=num_px, classes=classes, plot_image=True)

In [None]:
# Same prediction on a second image file (named as a non-cat example).
not_cat_image = "my_image.jpg"
predict_image(clf= skclf, fname=not_cat_image, num_px=num_px, classes=classes, plot_image=True)

Enough motivating examples! Next, let's see how we can build our own logistic regression from scratch.

TODO

In [None]:
# Compare the two multi-class strategies on the full iris data: fit once with
# multi_class='multinomial' and once with multi_class='ovr', displaying the
# training accuracy after each fit.
# NOTE: this rebinds X, y and clf; clf ends up holding the 'ovr' model.
X, y = datasets.load_iris(return_X_y=True)
clf = linear_model.LogisticRegression(random_state=0, solver='lbfgs',
                                      max_iter=200,
                                      multi_class='multinomial').fit(X, y)
clf.score(X, y)
clf = linear_model.LogisticRegression(random_state=0, solver='lbfgs',
                                      max_iter=200,
                                      multi_class='ovr').fit(X, y)
clf.score(X, y)

References:

https://github.com/beckernick/logistic_regression_from_scratch

https://github.com/martinpella/logistic-reg/blob/master/logistic_reg.ipynb

https://www.kaggle.com/kashnitsky/topic-4-linear-models-part-3-regularization

https://github.com/Benlau93/Machine-Learning-by-Andrew-Ng-in-Python/blob/master/LogisticRegression/ML_RegularizedLogisticRegression.ipynb

In [None]:
# Train the project-local LogisticRegression (from lr) on the cat training set
# loaded earlier; clf is rebound to this model.
clf = LogisticRegression(num_iterations = 2000, 
                         steps = 20,
                         learning_rate = 0.005,
                         print_cost = False)
clf.fit(X_train, y_train)

In [None]:
# Plot the training costs of the model fitted in the previous cell
# (project helper from utils).
costs_plot(clf)