### 0) Data Importing and Helper Function Definition:

In [1]:
import numpy as np

# Folder holding the .npy arrays, relative to the notebook location
DATA_folder = '../../Data/'

# Training split (images and labels, going by the file names)
data, lbls = np.load(DATA_folder + 'train_imgs.npy'), np.load(DATA_folder + 'train_lbls.npy')

# Test split (images and labels, going by the file names)
test_data, test_lbls = np.load(DATA_folder + 'test_imgs.npy'), np.load(DATA_folder + 'test_lbls.npy')

In [2]:
# Scratch check: shape of the mean vector over all samples labelled 0
# (evaluated but not displayed, since it is not the last expression of the cell).
data[lbls == 0, :].mean(axis=0).shape

# Prototype of the per-center computation used later in fi_transform:
# mean squared difference between rows st..en-1 of `data` and one reference row.
st, en = 50, 100
k = 0
all_centers = [data[51, :]]

# Broadcasting the (1, n_features) center against the slice gives the same
# values as explicitly repeating the center once per row.
new_data = np.square(data[st:en] - all_centers[k][np.newaxis, :]).mean(axis=1)
new_data.shape

(50,)

### 1) Definition of RBF Transformer Object:

RBF Transformer is a scikit-learn compatible transformer object that implements:

    - fit method       - clusters the data of each digit separately and computes the cluster centers
                         (per-dimension inv. sq. deviations were tried as well, but the current code does not use them)
    - transform method - based on the obtained centers it computes the fi values (RBF layer outputs) as
                         simple radial functions -> Fi_k(x) = exp{ -sqrt( mean_i[ (x_i - c_k_i)^2 ] ) }
                         
## Normalizing each Fi row with row sum -> STABILIZED the Regression a lot!!!

## For PCA, the fit_transform() method needs to be split into separate fit() and transform() calls!!!

#### Definition of Clustering Function:

In [3]:
### Clustering:

from sklearn.cluster import KMeans, AgglomerativeClustering

# clust = AgglomerativeClustering(n_clusters=500, linkage='complete')
# cluster_labels = clust.fit_predict(dig_data)

# Function for data clustering and computation of the cluster center vectors
def form_clusters(data, n_kmeans, n_agglo):
    """Cluster ``data`` and return the mean vector of every cluster found.

    Up to two clusterings are run on the same samples: KMeans (k-means++
    init, ``random_state=0``, Elkan algorithm) and complete-linkage
    agglomerative clustering. The centers of both are concatenated,
    KMeans centers first.

    Parameters
    ----------
    data : 2-D numpy array, one sample per row.
    n_kmeans : int
        Requested number of KMeans clusters; values <= 0 skip KMeans.
    n_agglo : int
        Requested number of agglomerative clusters; values <= 0 skip it.

    Returns
    -------
    list of 1-D numpy arrays
        One center (mean of the member rows) per non-empty cluster. The
        list is empty when both clusterings are skipped or ``data`` has
        no rows.
    """
    n_samples = data.shape[0]

    # Never request more clusters than there are samples.
    n_km = min(n_kmeans, n_samples)
    n_ag = min(n_agglo, n_samples)

    lbls_set = []

    if n_km > 0:
        kmeans = KMeans(n_clusters=n_km, random_state=0, init='k-means++', algorithm='elkan')
        lbls_set.append(kmeans.fit_predict(data))

    if n_ag > 0:
        agglo = AgglomerativeClustering(n_clusters=n_ag, linkage='complete')
        lbls_set.append(agglo.fit_predict(data))

    centers = []

    # Cluster centers = mean of the samples assigned to each label.
    for lbls in lbls_set:
        # Iterate over the labels that actually occur (sorted ascending, so
        # the order matches 0..max when every label is populated). A label id
        # without members would otherwise yield an all-NaN center, which then
        # propagates NaNs into every feature computed from it.
        for k in np.unique(lbls):
            centers.append(data[lbls == k, :].mean(axis=0))

    return centers


#### Definition of RBF (Fi) Function:

In [4]:
import gc

# chunks of 100, (10k smpls, 500 centers) ->  110.8s
# chunks of 200, (10k smpls, 500 centers) ->  102.3s
# chunks of 250, (10k smpls, 500 centers) ->  115.6s
# chunks of 500, (10k smpls, 500 centers) ->  145.1s

# Function to compute whole Fi output for given dataset (for all RB centers)
def fi_transform(data, all_centers, chunk_size=200):
    """Compute the RBF ("Fi") outputs of every sample for every center.

    For sample ``x`` and center ``c`` the output is
    ``exp(-sqrt(mean((x - c) ** 2)))``, i.e. the same (unit) deviation is
    used for all dimensions.

    Parameters
    ----------
    data : 2-D numpy array, shape (n_samples, n_features)
        Samples for which to compute the fi values.
    all_centers : sequence of 1-D numpy arrays, each of length n_features
        Center vectors; one output column is produced per center.
    chunk_size : int, default 200
        Number of samples processed per step. Only affects speed and peak
        memory, not the result.

    Returns
    -------
    numpy array, shape (n_samples, len(all_centers))

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    n_samples = data.shape[0]
    new_data = np.empty((n_samples, len(all_centers)))

    # Process the samples in chunks to bound the size of the temporaries.
    for st in range(0, n_samples, chunk_size):
        en = min(st + chunk_size, n_samples)
        chunk = data[st:en]
        for k, center in enumerate(all_centers):
            # Broadcasting the (1, n_features) center against the chunk gives
            # the same values as repeating the center once per row, without
            # materialising the repeated copy.
            new_data[st:en, k] = np.square(chunk - center[np.newaxis, :]).mean(axis=1)
        # No explicit gc.collect() here: the temporaries hold no reference
        # cycles and are released as soon as they go out of use.

    return np.exp(-np.sqrt(new_data))

#### Definition of RBF Transformer (sklearn compatible object):

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

# SKLEARN Compatible Transformer - supports fit method (clustering data) and transform method (calculating fi values)
class myRBFtransformer(BaseEstimator, TransformerMixin):
    """scikit-learn compatible transformer producing RBF ("Fi") features.

    ``fit`` clusters the samples of each digit (labels 0..9) separately via
    ``form_clusters`` and stores all resulting cluster centers. ``transform``
    maps samples to one feature per stored center via ``fi_transform``.

    Parameters
    ----------
    n_clusters : int, default 250
        Number of clusters requested per digit.
    cl_ratio : float, default 0.5
        Share of ``n_clusters`` assigned to KMeans; the rest goes to
        agglomerative clustering (1 -> KMeans only, 0 -> agglomerative only).
        Values outside [0, 1] are clamped.
    debug : bool, default False
        Print progress information.

    Attributes (set by ``fit``)
    ---------------------------
    centers : list of center vectors, concatenated over the digits 0..9.
    num_centers : list with the number of centers obtained for each digit.
    """

    # Transformer initialization (defaults to 250 clusters per digit)
    def __init__(self, n_clusters=250, cl_ratio=0.5, debug=False):
        self.n_clusters  = n_clusters             # number of clusters to be formed per digit
        self.cl_ratio    = min(1,max(0,cl_ratio)) # cluster ratio (between kMeans and Agglomerative)
        self.debug       = debug                  # debug flag
        if self.debug:
            print(self.n_clusters,self.cl_ratio)

    # Clusters each digit and finds cluster centers:
    def fit(self, X, y):
        """Cluster the samples of every digit and store the cluster centers.

        X : 2-D array of samples (one per row); y : digit label of each row.
        Returns ``self``.
        """
        # Boolean masking below needs arrays (a plain list of labels would
        # compare as a single scalar).
        X = np.asarray(X)
        y = np.asarray(y)

        self.centers     = []
        self.num_centers = []

        # The clamp in __init__ is bypassed when cl_ratio is assigned through
        # set_params() (as hyper-parameter searches do), so apply it here too.
        ratio = min(1, max(0, self.cl_ratio))

        # Split the per-digit cluster budget between kMeans and Agglomerative
        n_kmeans = round(ratio*self.n_clusters)
        n_agglo  = self.n_clusters - n_kmeans

        if self.debug:
            print('Clustering data ',(n_kmeans,n_agglo))

        # Cluster the data over each digit
        for dig in range(10):
            data = X[y==dig,:]
            centers = form_clusters(data, n_kmeans, n_agglo)
            self.centers.extend(centers)
            self.num_centers.append(len(centers))

        return self

    # Computes fi values based on obtained cluster centers
    def transform(self, X, y=None):
        """Return the fi values of ``X`` for all centers found by ``fit``.

        ``y`` is accepted for API compatibility and ignored.
        """
        if self.debug:
            print('Calculating all Fi outputs !!!')

        return fi_transform(np.asarray(X), self.centers)

In [6]:
# # Test:
# rbf = myRBFtransformer(n_kmeans = 10)
# a   = rbf.fit_transform(X=data[:500,:],y=lbls[:500])
# a.max(axis=1)

### 3) Pipelining the Model:

In [7]:
import types
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model  import LogisticRegression

# Default hyper-parameters of the individual pipeline steps.
# rbf__n_clusters, rbf__cl_ratio and logreg__C are overridden later by the grid search.
n_pca      = 545 # First dimension to retain 99 % variance (Stanford edu recommendation for images)!!!
n_clusters = 500
cl_ratio   = 0.1

# Pipeline steps: standardization -> whitened PCA -> RBF features -> logistic regression
scaler = StandardScaler()
pca    = PCA(n_components=n_pca, whiten=True, random_state=1234)
rbf    = myRBFtransformer(n_clusters=n_clusters,cl_ratio=cl_ratio, debug=False)
# C is the inverse regularization strength in scikit-learn, so C=1e+12 is effectively unregularized
logreg = LogisticRegression(tol=1e-12,C=1e+12,random_state=12,solver='liblinear')

############################################################################
# For PCA -> fit_transform() method needs to be modified so it calls fit() #
#            and transform() methods separately (not at the same time) !!! #
#                                                                          #
#         -> otherwise it will produce slightly different outputs when     #
#            transform() is called next time (comp. to fit_transform())    #
#                                                                          #
#            WHICH massively messes up with the OUTPUT of FI LAYER !!!!    #
############################################################################

def new_fit_transform(self, X, y=None, **fit_params):
    """Fit ``self`` and then transform ``X`` as two separate calls.

    ``y`` is forwarded to ``fit`` positionally only when it is not None;
    ``fit_params`` are always forwarded as keyword arguments. The return
    value of ``fit`` is ignored; the result of ``self.transform(X)`` is
    returned.
    """
    fit_args = (X,) if y is None else (X, y)
    self.fit(*fit_args, **fit_params)
    return self.transform(X)
    
# Bind new_fit_transform to this PCA instance only, shadowing its own fit_transform
# so that fitting and transforming happen as two separate calls.
pca.fit_transform = types.MethodType(new_fit_transform, pca)

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Full model: scaling -> PCA -> RBF features -> logistic regression.
# The step names ('scal', 'pca', 'rbf', 'logreg') are the prefixes used in the grid-search parameter names.
pipe = Pipeline(steps=[('scal', scaler), ('pca', pca), ('rbf',rbf), ('logreg',logreg)])

In [9]:
# pipe1 = Pipeline(steps=[('scal', scaler), ('pca', pca)])
# new_data = pipe1.fit_transform(test_data)
# rbf.fit(new_data, test_lbls)
# import time
# t = time.time()
# new_data_2 = rbf.transform(new_data)
# print(time.time()-t)

In [10]:
# my_data = test_data[:10000]
# my_lbls = test_lbls[:10000]
# pipe.fit(data,lbls)

# print(pipe.score(my_data,my_lbls)*100)

In [11]:
# from sklearn.model_selection import GridSearchCV
from dask_searchcv import GridSearchCV

# Candidate values explored by the grid search
Ks = [25, 50, 100]   # clusters per digit            -> rbf__n_clusters
Rs = [0.9, 1]        # 1 kMeans, 0 - Agglo           -> rbf__cl_ratio
Cs = [5e2, 7.5e3]    # LogisticRegression C          -> logreg__C

# Number of CV folds and number of samples used for the search
fld, dt = 5, 10000

# Keys follow the '<pipeline step>__<parameter>' naming of the pipeline steps
param_grid = [dict(rbf__n_clusters=Ks, rbf__cl_ratio=Rs, logreg__C=Cs)]

In [12]:
# Grid search (dask_searchcv implementation) over param_grid with `fld`-fold cross-validation
grid = GridSearchCV(pipe, cv=fld, n_jobs=-1, param_grid=param_grid)

In [13]:
from dask.diagnostics import ProgressBar

In [14]:
# 5th Fold on test_data 10k (with    deviations)             - around 74%
# 5th Fold on test_data 10k (without deviations)             - around 83.87%
# 5th Fold on test_data 10k (without deviations, no sorting) - around 86.14% (takes 2x more time)

# # 3rd Fold on test_data 5k:
# Ks = [50]; Rs = [0.9]; Ns = [ 5]; Cs = [1e12]; - 77.20 %
# Ks = [50]; Rs = [0.5]; Ns = [10]; Cs = [1e08]; - 76.26 %
# Ks = [50]; Rs = [0.5]; Ns = [10]; Cs = [1e05]; - 76.50 %
# Ks = [50]; Rs = [0.5]; Ns = [10]; Cs = [1e05]; - 76.50 %
# Ks = [50]; Rs = [0.5]; Ns = [10]; Cs = [1e04]; - 77.80 %

# 5th Fold on test_data 5k:
# Ks = [50]; Rs = [0.5]; Ns = [10]; Cs = [7500]; - 86.12 %

# # 5th Fold on test_data 10k:
# Ks = [ 50]; Rs = [1.0]; Ns = [10]; Cs = [7500]; - 86.85 %
# Ks = [250]; Rs = [1.0]; Ns = [10]; Cs = [7500]; - 85.20 %


# # 5th Fold on Test Data 10k:
# Ks = [25, 50, 100]; Rs = [0.9, 1]; Cs = [5e2,7.5e3]                - 1hr 13min 33.0s
# {'logreg__C': 7500.0, 'rbf__cl_ratio': 0.9, 'rbf__n_clusters': 50} - 87.03%


In [15]:
# Run the grid search with a dask progress bar.
# NOTE(review): the search is fitted on the first `dt` samples of the *test* split
# (the training-split call is commented out), so the scores below are
# cross-validation scores on test data - confirm this is intended.
with ProgressBar():
    #grid.fit(data, lbls)
    grid.fit(test_data[:dt], test_lbls[:dt])
    
# Best mean cross-validation score over all parameter combinations, in percent
print(max(grid.cv_results_['mean_test_score'])*100)

[########################################] | 100% Completed |  1hr 13min 33.0s
87.03


In [16]:
# Parameter combination selected by the grid search
print(grid.best_params_)

{'logreg__C': 7500.0, 'rbf__cl_ratio': 0.9, 'rbf__n_clusters': 50}


In [17]:
# Best mean cross-validation score in percent (same value as printed right after the fit)
print(max(grid.cv_results_['mean_test_score'])*100)

87.03


In [18]:
# Mean cross-validation score of every parameter combination tried
print(grid.cv_results_['mean_test_score'])

[ 0.8552  0.8664  0.8681  0.8573  0.8668  0.8689  0.8664  0.8703  0.858
  0.8667  0.8685  0.86  ]


In [19]:
# Show the searched grid for reference next to the scores above
param_grid

[{'logreg__C': [500.0, 7500.0],
  'rbf__cl_ratio': [0.9, 1],
  'rbf__n_clusters': [25, 50, 100]}]