# Binary Classifiers for testing #

### Configurations ###

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import time
import itertools
import matplotlib.pyplot as plt
from sklearn.cross_validation import StratifiedKFold
from sklearn import cross_validation, metrics, grid_search, linear_model, svm
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

n_fold = 5

# Parameters
lr_param = {'C':[0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.5, 1, 5, 10],
           'tol' : [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10]}
# lr_param = {'C':[0.01],
#            'tol' : [1e-6]}
lr_results = pd.DataFrame(np.empty((0,4), float), columns=['c', 'tol', 'AUROC', 'f1'])


# Evaluation metrics
def gini(list_of_values):
  sorted_list = sorted(list(list_of_values))
  height, area = 0, 0
  for value in sorted_list:
    height += value
    area += height - value / 2.
  fair_area = height * len(list_of_values) / 2
  return (fair_area - area) / fair_area
  
def normalized_gini(y_pred, y):
    normalized_gini = gini(y_pred)/gini(y)
    return normalized_gini



### Data Loading ###


In [3]:
# UCI Datasets 
iris_data   = load_iris()
digits_data = load_digits()
print(iris_data.data.shape)
print(digits_data.data.shape)

iris_X = iris_data.data
iris_Y = iris_data.target

digits_X = digits_data.data
digits_Y = digits_data.target


(150, 4)
(1797, 64)


### Execution Function after change target values into binary(1-vs-all) values ###

In [6]:
t1 = time.time()

# Dataset selection
data_X = iris_X
data_Y = iris_Y

data_X = digits_X
data_Y = digits_Y

# Find distinct values in the target
for tgt_y in list(set(data_Y)):
    # Init the new target values
    print('For : ', tgt_y)
    #print(data_Y)
    data_Y_bin = pd.Series(np.zeros(len(data_Y)))
    data_Y_bin[data_Y == tgt_y] = 1
    data_Y_bin[data_Y != tgt_y] = 0
    print(np.vstack((data_Y, data_Y_bin)).T)
    
    kf = StratifiedKFold(data_Y_bin, n_folds = n_fold)
    clf_LogisticLR(kf, data_X, data_Y_bin, lr_param, lr_results)
    
print('Duration %.2f sec' %(time.time() - t1))

For :  0
[[ 0.  1.]
 [ 1.  0.]
 [ 2.  0.]
 ..., 
 [ 8.  0.]
 [ 9.  0.]
 [ 8.  0.]]
c        0.005000
tol      0.100000
AUROC    0.991199
f1       0.988484
Name: 16, dtype: float64
For :  1
[[ 0.  0.]
 [ 1.  1.]
 [ 2.  0.]
 ..., 
 [ 8.  0.]
 [ 9.  0.]
 [ 8.  0.]]


  'precision', 'predicted', average, warn_for)


c        0.005000
tol      0.100000
AUROC    0.991199
f1       0.988484
Name: 16, dtype: float64
For :  2
[[ 0.  0.]
 [ 1.  0.]
 [ 2.  1.]
 ..., 
 [ 8.  0.]
 [ 9.  0.]
 [ 8.  0.]]
c        5.000000e+00
tol      1.000000e-07
AUROC    9.939771e-01
f1       9.913779e-01
Name: 370, dtype: float64
For :  3
[[ 0.  0.]
 [ 1.  0.]
 [ 2.  0.]
 ..., 
 [ 8.  0.]
 [ 9.  0.]
 [ 8.  0.]]
c        5.000000e+00
tol      1.000000e-07
AUROC    9.939771e-01
f1       9.913779e-01
Name: 370, dtype: float64
For :  4
[[ 0.  0.]
 [ 1.  0.]
 [ 2.  0.]
 ..., 
 [ 8.  0.]
 [ 9.  0.]
 [ 8.  0.]]
c        5.000000e+00
tol      1.000000e-07
AUROC    9.939771e-01
f1       9.913779e-01
Name: 370, dtype: float64
For :  5
[[ 0.  0.]
 [ 1.  0.]
 [ 2.  0.]
 ..., 
 [ 8.  0.]
 [ 9.  0.]
 [ 8.  0.]]
c        5.000000e+00
tol      1.000000e-07
AUROC    9.939771e-01
f1       9.913779e-01
Name: 370, dtype: float64
For :  6
[[ 0.  0.]
 [ 1.  0.]
 [ 2.  0.]
 ..., 
 [ 8.  0.]
 [ 9.  0.]
 [ 8.  0.]]
c        5.000000e+00
tol      1

### [Iris] Change target values into binary(one-vs-all) values ###

In [5]:
def clf_LogisticLR(kf, data_X, data_Y_bin, lr_param, lr_results):
    # Logistic linear regression Model
    for c, tol in itertools.product(lr_param['C'], lr_param['tol']):
        clf_LR = linear_model.LogisticRegression(C=c, penalty='l2', tol=tol)
        cv_results = pd.DataFrame(np.empty((0,2), float), columns=['AUROC', 'f1'])
        
        for train, test in kf:
            clf_LR.fit(data_X[train, :], data_Y_bin[train])
            
            trn_predY = clf_LR.predict(data_X[test, :])
            #print(np.vstack((data_Y_bin[test],trn_predY)).T)
            f1_score = metrics.f1_score(data_Y_bin[test], trn_predY)
            auroc_score = metrics.roc_auc_score(data_Y_bin[test], trn_predY)
            
            cv_results.loc[len(cv_results)] = [auroc_score, f1_score] 
    #         scores_roc  = cross_validation.cross_val_score(clf_LR, data_X, data_Y_bin, cv=n_fold, scoring='roc_auc')
    #         scores_prec = cross_validation.cross_val_score(clf_LR, data_X, data_Y_bin, cv=n_fold, scoring='f1')
    #         scores_rec  = cross_validation.cross_val_score(clf_LR, data_X, data_Y_bin, cv=n_fold, scoring='recall')
    #         scores_acc  = cross_validation.cross_val_score(clf_LR, data_X, data_Y_bin, cv=n_fold, scoring='accuracy')
            #print(c, tol, f1_score)
        
        avg_cv_results = cv_results.mean(axis=0)
        #print(cv_results)
        #print(avg_cv_results['AUROC'])
        lr_results.loc[len(lr_results)] = [c, tol, avg_cv_results['AUROC'], avg_cv_results['f1']]
        del cv_results, avg_cv_results
        #print(lr_results)
    # Select the best parameter    
    best_param_idx = lr_results.idxmax()['f1']
    print(lr_results.loc[best_param_idx, :])

### Logistic Regression ###

In [48]:

#print(lr_results.iloc[[lr_results.idxmax()['AUROC']]])
import sklearn
print(sklearn.__version__)

aa = []
aa.ins

metrics.roc_auc_score

0.16.1
