In [2]:
# IMPORTS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

%matplotlib inline

In [3]:
# OPTIONS
# Number of leading rows of the training CSV that are used.
IMAGES_TO_PROCESS = 5000
# Fraction of the selected rows held out as the test set.
TEST_RATIO = 0.15
# Seed shared by the train/test split and the classifier so runs are repeatable.
RANDOM_STATE = 42
# Verbosity level handed to the classifier.
VERBOSE = 3
# Number of principal components kept by the PCA step.
PCA_N_COMPONENTS = 100
# Backward-compatible alias: the original (misspelled) name is still referenced
# by other cells, so it must keep resolving to the same value.
PCA_N_COMPENENTS = PCA_N_COMPONENTS

In [4]:
# Read the labelled digits CSV, then separate the label column from the pixel columns.
labeled_images = pd.read_csv("./data/digits-recognizer/train.csv")

# Only the first IMAGES_TO_PROCESS rows are kept. Column 0 is taken as the label
# and every remaining column as image data; `labels` stays a one-column DataFrame.
labels = labeled_images.iloc[:IMAGES_TO_PROCESS].iloc[:, :1]
images = labeled_images.iloc[:IMAGES_TO_PROCESS].iloc[:, 1:]

In [5]:
# Hold out TEST_RATIO of the selected samples for evaluation.
# The split is stratified on the label column (column 0 of the loaded frame,
# restricted to the same leading rows) to preserve the initial class balance.
images_train, images_test, labels_train, labels_test = train_test_split(
    images,
    labels,
    test_size=TEST_RATIO,
    random_state=RANDOM_STATE,
    stratify=labeled_images.iloc[:IMAGES_TO_PROCESS, 0].values
)


In [8]:
% time

# Pipeline of PCA & LinearRegression

pca = PCA(n_components=PCA_N_COMPENENTS)

# Logistic Regression Parameters :
# 
# n_jobs - cores to use, -1 -> all 
# 
# C - inverse regularization, a scalar represent an
#  inverse factor to the loss faction, the smaller C is the stronger the regularization is. 
# 
# fit_intercept - centralize + set X = 0 predictors to Y's mean, in our model we have predictors with 0 as valid and 
# meaningful value (pixel color), so we don't want to centralize our data 
# 
# class_weight - the weight of each class (
# feature type, e.g: boolean etc..), none for uniform weight, balanced for using the ratio to of the class 
# occurrences in compare to data size and dict is using the class as key and the value is the corresponding class 
# weight. 
# 
# random_state - the input random factor for how the data will be randomly being split, if none - the random
#                state is a np.random() result, if the same random state been defined, the data will be being split
#                over and over by the same way.
# 
# solver - optimization algorithm, options: {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, explained well
#          in the library.
# 
# multi_class - 'ovr' for binary question for each label (label[i] will be defined as 1/0), 
#               'multinomial' for multiple result tags for labels.

# NOTE: lbfgs is not recommended with linear regression, explained - 
# https://stats.stackexchange.com/questions/191466/bfgs-lbfgs-for-linear-regression-overkill-or-compatibility-issue 
clf = LogisticRegression(n_jobs=-1, C=1.0, fit_intercept=False, class_weight="balanced", random_state=RANDOM_STATE,
                         verbose=VERBOSE, multi_class='multinomial', solver="sag")

pipe = Pipeline([('pca', pca), ('logistic', clf)])
pipe.fit(images_train, labels_train.values.ravel())

score = accuracy_score(pipe.predict(images_test), labels_test)
print "accuracy: {0}".format(score)


CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


Process PoolWorker-2:


Process PoolWorker-5:


Process PoolWorker-4:


Process PoolWorker-3:


Traceback (most recent call last):


Traceback (most recent call last):


Traceback (most recent call last):


Traceback (most recent call last):


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap


    self.run()


    self.run()


    self.run()


    self.run()


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 114, in run


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 114, in run


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 114, in run


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 114, in run


    self._target(*self._args, **self._kwargs)


    self._target(*self._args, **self._kwargs)


    self._target(*self._args, **self._kwargs)


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/pool.py", line 102, in worker


    self._target(*self._args, **self._kwargs)


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/pool.py", line 102, in worker


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/pool.py", line 102, in worker


    task = get()


  File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/pool.py", line 102, in worker


    task = get()


    task = get()


    task = get()


  File "/usr/local/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get


  File "/usr/local/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get


    racquire()


  File "/usr/local/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get


  File "/usr/local/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 362, in get


    racquire()


    return recv()


KeyboardInterrupt


    racquire()


KeyboardInterrupt


KeyboardInterrupt


KeyboardInterrupt
