### 0) Data Importing and Helper Function Definition:

In [80]:
import numpy as np

# Folder that holds the .npy arrays, given relative to the notebook's working directory
DATA_folder  = '../../Data/'
# Training set: images and labels (as the file names indicate)
data = np.load(DATA_folder+'train_imgs.npy')
lbls = np.load(DATA_folder+'train_lbls.npy')
# Test set: images and labels (as the file names indicate)
test_data = np.load(DATA_folder+'test_imgs.npy')
test_lbls = np.load(DATA_folder+'test_lbls.npy')

### 1) Definition of RBF Transformer Object:

RBF Transformer is a scikit-learn compatible transformer object that implements:

    - fit method       - clusters the digit-separated data and computes the cluster centers
    - transform method - based on the obtained centers it computes the fi values (RBF layer outputs) as
                         simple radial functions with one shared deviation for all dimensions
                         -> Fi_k(x) = exp{ -sqrt( mean_i[ (x_i - c_k_i)^2 ] ) }
                         
## Normalizing each Fi row with row sum -> STABILIZED the Regression a lot!!!

## For PCA fit_transform() method needs to be separated to fit() and then transform()!!!

#### Definition of Clustering Function:

In [81]:
### Clustering:

# from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.cluster import KMeans, SpectralClustering
import random

# clust = AgglomerativeClustering(n_clusters=500, linkage='complete')
# cluster_labels = clust.fit_predict(dig_data)

# Function for data clustering and computation of the cluster center vectors
def form_clusters(data, n_kmeans, n_agglo, n_random, dig):
    """Build the list of RBF center vectors for one digit's samples.

    Centers are appended in this order: k-means cluster means, spectral
    clustering cluster means, then randomly drawn rows of `data`.

    Parameters
    ----------
    data : 2-D numpy array
        One sample per row.
    n_kmeans : int
        Requested number of k-means clusters (values <= 0 disable k-means).
    n_agglo : int
        Requested number of spectral clusters (values <= 0 disable it).
    n_random : int
        Requested number of randomly drawn rows (values <= 0 disable it).
    dig : int
        Multiplied by the random count to seed the random draw, so repeated
        calls with the same arguments pick the same rows.

    Every count is capped at the number of rows in `data`.

    Returns
    -------
    list of 1-D numpy arrays
        The center vectors. The list can be shorter than requested when a
        clustering leaves some label unassigned.
    """
    n_rows = data.shape[0]
    n_km = min(n_kmeans, n_rows)
    n_ag = min(n_agglo, n_rows)
    n_rn = min(n_random, n_rows)

    lbls_set = []

    if n_km > 0:
        kmeans = KMeans(n_clusters=n_km, random_state=0, init='k-means++', algorithm='elkan')
        lbls_set.append(kmeans.fit_predict(data))

    if n_ag > 0:
        agglo = SpectralClustering(n_clusters=n_ag, affinity='nearest_neighbors', eigen_solver='arpack',
                                   random_state=456, assign_labels='discretize')
        lbls_set.append(agglo.fit_predict(data))

    centers = []

    # Cluster centers are the per-label means of the member rows
    for lbls in lbls_set:
        # Visit only labels that were actually assigned: the mean of an empty
        # selection is NaN, and a NaN center would poison every fi value.
        for k in np.unique(lbls):
            centers.append(data[lbls == k, :].mean(axis=0))

    # Add randomly drawn rows as extra centers
    if n_rn > 0:
        # Dedicated, seeded generator (repeatable results for hyperparam tuning)
        prng = np.random.RandomState(dig * n_rn)
        for s in prng.choice(n_rows, n_rn, replace=False):
            centers.append(data[s, :])

    return centers


#### Definition of RBF (Fi) Function:

In [82]:
import gc

# chunks of 100, (10k smpls, 500 centers) ->  110.8s
# chunks of 200, (10k smpls, 500 centers) ->  102.3s
# chunks of 250, (10k smpls, 500 centers) ->  115.6s
# chunks of 500, (10k smpls, 500 centers) ->  145.1s

# Function to compute whole Fi output for given dataset (for all RB centers)
def fi_transform(data, all_centers):
    """Compute the RBF layer output of every sample against every center.

    Parameters
    ----------
    data : 2-D numpy array
        Samples to transform, one per row.
    all_centers : list of 1-D numpy arrays
        Center vectors; column k of the result belongs to all_centers[k].

    Returns
    -------
    2-D numpy array of shape (number of samples, number of centers) holding
    exp(-sqrt(mean over dimensions of the squared difference)).
    """
    n_rows = data.shape[0]
    chunk = 200  # rows handled per pass (fastest setting in the timings above)
    sq_dist = np.empty((n_rows, len(all_centers)))

    for lo in range(0, n_rows, chunk):
        hi = min(lo + chunk, n_rows)
        block = data[lo:hi]
        for col, center in enumerate(all_centers):
            # Same deviation for all dimensions: plain mean of squared differences.
            # The center row is broadcast against every row of the block.
            sq_dist[lo:hi, col] = np.square(block - center[np.newaxis, :]).mean(axis=1)
        gc.collect()

    return np.exp(-np.sqrt(sq_dist))

### Definition of Kernel PCA Layer

In [83]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
import time, gc

# SKLEARN Compatible Transformer - supports fit method (finding kPCA) and transform method (applying kPCA)
class myKernelPCA(BaseEstimator, TransformerMixin):
    """Kernel-PCA layer that fits on a per-digit subsample and transforms in chunks.

    Parameters
    ----------
    n_kPCA : int, default 250
        Number of kernel PCA components requested from KernelPCA.
    kernel : str, default 'poly'
        Kernel name forwarded to KernelPCA (rbf, poly, cosine).
    degree : int, default 4
        Degree forwarded to KernelPCA (used by the poly kernel).
    whiten : bool, default True
        When true, a StandardScaler fitted on the projected fit data is
        applied to every transform() output.
    debug : bool, default False
        Print progress and timing messages.
    max_smpls : int, default 250
        Cap on the rows drawn per digit label when the fit set is subsampled;
        subsampling is used once X has at least 10 * max_smpls rows.
    """

    # Transformer initialization (default to 250 dimensions)
    def __init__(self, n_kPCA=250, kernel = 'poly', degree=4, whiten=True, debug=False, max_smpls=250):
        self.n_kPCA    = n_kPCA    # number of kernel PCA dimensions to be retained
        self.kernel    = kernel    # kernel type (rbf, poly, cosine)
        self.degree    = degree    # poly kernel degree
        self.whiten    = whiten    # whiten data at output layer (zero mean, unit var)
        self.debug     = debug     # debug flag
        self.max_smpls = max_smpls # max number of samples per digit used for fitting

    def _subsample(self, X, y):
        """Return up to max_smpls randomly chosen rows of X for each label 0..9."""
        # Fixed seed (for repeatable results)
        prng = np.random.RandomState(654)
        y = np.asarray(y)
        picked = []
        for dig in range(10):
            rows = np.flatnonzero(y == dig)
            n_take = min(self.max_smpls, rows.shape[0])
            # A digit without samples contributes nothing
            if n_take == 0:
                continue
            picked.append(rows[prng.choice(rows.shape[0], n_take, replace=False)])
        if not picked:
            raise ValueError('y holds none of the labels 0..9, nothing to fit on')
        # Fancy indexing copies, so the fit data is independent of X
        return np.asarray(X[np.concatenate(picked)], dtype=np.float64)

    # Fits the kernel PCA model (and the optional whitening scaler)
    def fit(self, X, y):
        """Fit KernelPCA on X, or on a per-digit subsample of X when X is large.

        X is a 2-D numpy array (one sample per row); y holds the label of
        every row and is only read when subsampling is needed.
        Returns self.
        """
        # degree must be forwarded explicitly, otherwise KernelPCA silently
        # uses its own default and the `degree` hyper-parameter has no effect.
        self.kPCA = KernelPCA(n_components=self.n_kPCA, kernel=self.kernel, degree=self.degree,
                              copy_X=False, remove_zero_eig=True, random_state=1389)

        if self.debug:
            print('Applying Kernel PCA !!!')
            t = time.time()

        # With fewer than the max number of smpls fit on everything,
        # else fit on randomly picked smpls of each digit
        if X.shape[0]<(self.max_smpls*10):
            fit_data = X
        else:
            fit_data = self._subsample(X, y)

        self.kPCA.fit(fit_data)
        # If required - whiten the data (for clustering)
        if self.whiten:
            self.scaler = StandardScaler().fit(self.kPCA.transform(fit_data))

        del fit_data
        gc.collect()

        if self.debug:
            print('Kernel PCA fitted in: ',(time.time()-t))

        return self

    # Calculate Kernel Components based on obtained model
    def transform(self, X, y=None):
        """Project X with the fitted model, 200 rows at a time; y is ignored.

        Returns a 2-D array with one row per row of X. The number of columns
        is whatever the fitted KernelPCA yields, which can be below n_kPCA
        because zero-eigenvalue components are removed.
        """
        if self.debug:
            print('Applying Kernel PCA transformation!!! (data', X.shape,')')
            t = time.time()

        ch = 200
        ns = X.shape[0]
        new_data = None
        # Transform data in chunks of 200 smpls (memory hungry transformation)
        for st in range(0, ns, ch):
            part = self.kPCA.transform(X[st:st+ch,:])
            if new_data is None:
                # Size the output from the first projected chunk instead of
                # trusting n_kPCA (see docstring)
                new_data = np.empty((ns, part.shape[1]))
            new_data[st:st+ch,:] = part

        if new_data is None:
            # No rows at all: nothing to project or scale
            new_data = np.empty((0, self.n_kPCA))
        elif self.whiten:
            new_data = self.scaler.transform(new_data)

        if self.debug:
            print('Kernel PCA transformed in: ',time.time()-t)

        return new_data

    def fit_transform(self, X, y):
        """Fit on (X, y) and return the projection of X."""
        self.fit(X,y)
        return self.transform(X)

#### Definition of RBF Transformer (sklearn compatible object):

In [84]:
from sklearn.base import BaseEstimator, TransformerMixin
import time, gc

# SKLEARN Compatible Transformer - supports fit method (clustering data) and transform method (calculating fi values)
class myRBFtransformer(BaseEstimator, TransformerMixin):
    """RBF feature layer: per-digit centers in fit(), fi values in transform().

    Parameters
    ----------
    n_centers : int, default 250
        Number of centers requested per digit.
    rn_ratio : float, default 0.1
        Share of the centers that are randomly drawn samples; the rest come
        from clustering. Clamped to [0, 1] when fitting.
    cl_ratio : float, default 0.5
        Share of the clustered centers produced by k-means; the rest come
        from spectral clustering. Clamped to [0, 1] when fitting.
    debug : bool, default False
        Print progress and timing messages.
    """

    # Transformer initialization (default to 250 centers per digit)
    def __init__(self, n_centers=250, rn_ratio=0.1, cl_ratio=0.5, debug=False):
        # Parameters are stored untouched: set_params() (used by grid search)
        # bypasses __init__, so the clamping has to live in fit().
        self.n_centers = n_centers # number of data centers to be formed per digit
        self.rn_ratio  = rn_ratio  # random  ratio (between Random Selection and Clustering)
        self.cl_ratio  = cl_ratio  # cluster ratio (between kMeans and Spectral clustering)
        self.debug     = debug     # debug flag
        if self.debug:
            print(self.n_centers,self.cl_ratio)

    # Clusters each digit and finds cluster centers:
    def fit(self, X, y):
        """Collect the centers of every digit 0..9 from the rows of X.

        X is a 2-D numpy array (one sample per row), y the label of each row.
        Returns self.
        """
        self.centers     = [] # list of cluster center vectors
        self.num_centers = [] # list of number of centers per digit

        # Keep both ratios inside [0, 1]
        rn_ratio = min(1, max(0, self.rn_ratio))
        cl_ratio = min(1, max(0, self.cl_ratio))

        # Split the per-digit center budget: random picks vs. kMeans vs. spectral
        n_random   = round(rn_ratio*self.n_centers)
        n_clusters = self.n_centers - n_random
        n_kmeans   = round(cl_ratio*n_clusters)
        n_agglo    = n_clusters - n_kmeans

        if self.debug:
            print('Clustering data ',(n_kmeans,n_agglo))
            t = time.time()

        y = np.asarray(y)
        # Cluster the data over each digit
        for dig in range(10):
            digit_centers = form_clusters(X[y==dig,:], n_kmeans, n_agglo, n_random, dig)
            self.centers.extend(digit_centers)
            self.num_centers.append(len(digit_centers))

        if self.debug:
            print('Clustering time: ',(time.time()-t))

        gc.collect()

        return self

    # Computes fi values based on obtained data centers
    def transform(self, X, y=None):
        """Return the fi values of X for all fitted centers; y is ignored."""
        # Compute all Fi values:
        if self.debug:
            print('Calculating all Fi outputs !!! (data', X.shape,')')
            t = time.time()
        all_fis = fi_transform(X, self.centers)

        if self.debug:
            print('Fi calculation time: ',time.time()-t)

        return all_fis

    def fit_transform(self, X, y):
        """Fit on (X, y) and return the fi values of X."""
        self.fit(X,y)
        return self.transform(X)

In [85]:
# # Test:
# rbf = myRBFtransformer(n_kmeans = 10)
# a   = rbf.fit_transform(X=data[:500,:],y=lbls[:500])
# a.max(axis=1)

### 3) Pipelining the Model:

In [86]:
import types
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model  import LogisticRegression

# n_pca     = 545 # First dimension to retain 99 % variance (stanford edu recommendation for images)!!!
# Hyper-parameters used to build the pipeline stages below
n_kPCA    = 250     # kernel PCA components requested (myKernelPCA n_kPCA)
degree    = 6       # degree handed to myKernelPCA
kernel    = 'poly'  # kernel type handed to myKernelPCA
n_centers = 50      # centers per digit (myRBFtransformer n_centers)
rn_ratio  = 0.3     # random ratio (myRBFtransformer rn_ratio)
cl_ratio  = 0.3     # cluster ratio (myRBFtransformer cl_ratio)
C         = 1e3     # LogisticRegression C
tol       = 1e-1    # LogisticRegression tol
debug     = False   # debug flag shared by both custom transformers


# One estimator object per pipeline stage
scaler = StandardScaler()
pca    = myKernelPCA(kernel=kernel,degree=degree,n_kPCA=n_kPCA,debug=debug)
rbf    = myRBFtransformer(n_centers=n_centers,rn_ratio=rn_ratio,cl_ratio=cl_ratio, debug=debug)
logreg = LogisticRegression(tol=tol,C=C,random_state=12,solver='liblinear')


In [87]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Chain the stages in order: scaler -> kernel PCA -> RBF features -> logistic regression.
# The step names ('scal', 'pca', 'rbf', 'logreg') are the prefixes used in the parameter grid.
pipe = Pipeline(steps=[('scal', scaler), ('pca', pca), ('rbf',rbf), ('logreg',logreg)])

In [88]:
# from sklearn.model_selection import GridSearchCV
from dask_searchcv import GridSearchCV

# Candidate values per hyper-parameter; every list has one entry, so the grid holds one combination
Ks  = [ 50] # Centers per digit
RNs = [0.3] # Random Ratio
CLs = [0.3] # Cluster Ratio
Cs  = [1e3] # LogisticRegression C

# Keys follow the '<pipeline step>__<parameter>' naming of the pipeline steps
param_grid = [
    {
        'rbf__n_centers': Ks,
        'rbf__rn_ratio':  RNs,
        'rbf__cl_ratio':  CLs,
        'logreg__C':      Cs
    }
]

In [89]:
from sklearn.model_selection import PredefinedSplit
import gc

# PredefinedSplit convention: -1 keeps a sample out of every test fold, 0 puts it into test fold 0.
# Train rows are therefore only ever used for fitting, test rows only for scoring.
test_fold = [-1]*data.shape[0]+[0]*test_data.shape[0]
ps = PredefinedSplit(test_fold)

# Stack train and test in the same order as test_fold
tot_data = np.vstack((data,test_data))
tot_lbls = np.hstack((lbls,test_lbls))

# Drop the separate arrays to free memory.
# NOTE: after this the cell cannot be re-run without re-running the data loading cell first.
del data,test_data,lbls,test_lbls
gc.collect()

389

In [90]:
# Grid search over the single predefined split; refit=False skips the final refit,
# return_train_score=False skips scoring on the training part.
grid = GridSearchCV(pipe, cv=ps, n_jobs=-1, param_grid=param_grid, refit=False, return_train_score=False)

In [91]:
from dask.diagnostics import ProgressBar

In [92]:
 # Fit on train data and check with test data:
# n_centers = 50; rn_ratio = 0.3; cl_ratio = 1.0; C = 7500; - 91.91%
# # Fit on train data and check with test data (spectral):
# n_centers = 50; rn_ratio = 0.3; cl_ratio = 0.3; C = 7500; - 92.05%

# # With KERNEL PCA (poly, degree=9, n_kPCA = 250):
# Ks  = [ 50]; RNs = [0.3]; CLs = [0.3]; Cs  = [1e4]; - 94.82% (1hr 15min 34.7s)
# # With KERNEL PCA (rbf, n_kPCA = 250):
# Ks  = [ 50]; RNs = [0.3]; CLs = [0.3]; Cs  = [1e3]; - 94.82% (1hr 15min 34.7s)

# # With KERNEL PCA (poly, degree=6, n_kPCA = 500):
# Ks  = [ 50]; RNs = [0.3]; CLs = [0.3]; Cs  = [1e3]; - 94.88% (1hr 11min 48.4s)

# # With KERNEL PCA (poly, degree=6, n_kPCA = 500):
# Ks  = [50, 250]; RNs = [0.3, 0.7]; CLs = [1.0]; Cs  = [1e3]; - 96.47% (6hr 32min 51.8s) (best: K=250; RN=0.7;)
# {'logreg__C': 1000.0, 'rbf__cl_ratio': 1.0, 'rbf__n_centers': 250, 'rbf__rn_ratio': 0.7} - 96.47%


In [None]:
# ### KERNEL PCA (poly, degree=6, n_kPCA = 500):

# ### Parameter Grid:
# {'logreg__C': 1000.0, 'rbf__cl_ratio': 1.0, 'rbf__n_centers': 50, 'rbf__rn_ratio': 0.3}
# {'logreg__C': 1000.0, 'rbf__cl_ratio': 1.0, 'rbf__n_centers': 50, 'rbf__rn_ratio': 0.7}
# {'logreg__C': 1000.0, 'rbf__cl_ratio': 1.0, 'rbf__n_centers': 250, 'rbf__rn_ratio': 0.3}
# {'logreg__C': 1000.0, 'rbf__cl_ratio': 1.0, 'rbf__n_centers': 250, 'rbf__rn_ratio': 0.7}
# ### Mean Fit Times:
# [  2963.   3157.  10559.  11954.]
# ### Mean Score Times
# [  92.   88.  195.  139.]
# ### Mean Train Score
# [ 95.58  95.24  98.06  97.99]
# ### Mean Test Score
# [ 94.84  94.69  96.41  96.47]

# ### Parameter Grid: (it was actually around 7 hours)
# {'logreg__C': 1000.0, 'rbf__cl_ratio': 1.0, 'rbf__n_centers': 300, 'rbf__rn_ratio': 1.0}
# ### Mean Fit Times:
# [ 11092.]
# ### Mean Score Times
# [ 319.]
# ### Mean Train Score
# [ 97.99]
# ### Mean Test Score
# [ 96.55]

In [None]:
# Run the search inside the dask ProgressBar context
with ProgressBar():
    #grid.fit(data, lbls)
    grid.fit(tot_data, tot_lbls)

# Highest mean test score over all parameter combinations, scaled by 100
print(max(grid.cv_results_['mean_test_score'])*100)

[############                            ] | 31% Completed |  3min  8.2s

In [None]:
# Report every evaluated parameter combination, then its timings and test score
cv_results = grid.cv_results_

print('### Parameter Grid:')
for k, combo in enumerate(cv_results['params']):
    print(combo)

fit_times = np.round(cv_results['mean_fit_time'])
print('\n### Mean Fit Times:')
print(fit_times)

score_times = np.round(cv_results['mean_score_time'])
print('\n### Mean Score Times')
print(score_times)

# Train scores are not collected by this search:
# print('\n### Mean Train Score')
# print(np.round(grid.cv_results_['mean_train_score']*100,2))

test_scores = np.round(cv_results['mean_test_score']*100,2)
print('\n### Mean Test Score')
print(test_scores)

In [None]:
# Second experiment: 300 centers per digit with random ratio 1.0
Ks  = [300] # Centers per digit
RNs = [1.0] # Random Ratio
CLs = [1.0] # Cluster Ratio
Cs  = [1e3] # LogisticRegression C

# Keys follow the '<pipeline step>__<parameter>' naming of the pipeline steps
param_grid = [
    {
        'rbf__n_centers': Ks,
        'rbf__rn_ratio':  RNs,
        'rbf__cl_ratio':  CLs,
        'logreg__C':      Cs
    }
]

In [None]:
# refit=False: the search is only used for its cv_results_, and a refit would retrain the
# whole pipeline on train + test combined.
# return_train_score=True: the report cell reads 'mean_train_score', so request it
# explicitly instead of relying on the library default.
grid = GridSearchCV(pipe, cv=ps, n_jobs=-1, param_grid=param_grid, refit=False, return_train_score=True)

In [None]:
# Run the search inside the dask ProgressBar context
with ProgressBar():
    #grid.fit(data, lbls)
    grid.fit(tot_data, tot_lbls)
    
# Highest mean test score over all parameter combinations, scaled by 100
print(max(grid.cv_results_['mean_test_score'])*100)

In [None]:
# print(grid.best_params_)
# print(grid.cv_results_['mean_test_score']*100)

In [None]:
# Report every evaluated parameter combination, then its timings and train/test scores
cv_results = grid.cv_results_

print('### Parameter Grid:')
for k, combo in enumerate(cv_results['params']):
    print(combo)

fit_times = np.round(cv_results['mean_fit_time'])
print('\n### Mean Fit Times:')
print(fit_times)

score_times = np.round(cv_results['mean_score_time'])
print('\n### Mean Score Times')
print(score_times)

train_scores = np.round(cv_results['mean_train_score']*100,2)
print('\n### Mean Train Score')
print(train_scores)

test_scores = np.round(cv_results['mean_test_score']*100,2)
print('\n### Mean Test Score')
print(test_scores)