### 0) Data Importing and Helper Function Definition:

In [1]:
import numpy as np

# Directory that holds the pre-extracted .npy arrays (relative to this notebook).
DATA_folder = '../../Data/'

# Train arrays first, then the test arrays; each pair is (images, labels).
data, lbls = (
    np.load(DATA_folder + fname) for fname in ('train_imgs.npy', 'train_lbls.npy')
)
test_data, test_lbls = (
    np.load(DATA_folder + fname) for fname in ('test_imgs.npy', 'test_lbls.npy')
)

In [2]:
# Shape (row count, feature count) of the training rows whose label equals 0.
data[lbls==0,:].shape

(5923, 784)

In [3]:
from sklearn.preprocessing import StandardScaler
# Feature scaler instance; note that `scaler` is re-bound to a fresh StandardScaler
# in the pipeline cell further down.
scaler = StandardScaler()


### 1) Definition of RBF Transformer Object:

RBF Transformer is a scikit-learn compatible transformer object that implements:

    - fit method       - clusters the digit-separated data, and computes cluster centers and inv. sq. deviations
    - transform method - based on obtained centers and deviations it computes the fi values (RBF layer outputs) as
                         simple Gaussian functions -> Fi_k(x) = exp{ -sqrt( sum_i[ (x_i-c_k_i)^2/(dev_k_i^2) ] ) }
                         
## Normalizing each Fi row with row sum -> STABILIZED the Regression a lot!!!

## For PCA, fit_transform() needs to be split into fit() followed by transform()!!!

#### Definition of Clustering Function:

In [4]:
### Clustering:

from sklearn.cluster import KMeans, AgglomerativeClustering

# clust = AgglomerativeClustering(n_clusters=500, linkage='complete')
# cluster_labels = clust.fit_predict(dig_data)

# Function for data clustering, and computation of cluster center vectors and inv. sq. deviation vectors
def form_clusters(data, n_kmeans, n_agglo):
    """Cluster ``data`` and describe each cluster by its center and inverse squared deviation.

    The samples are clustered up to two times - once with KMeans and once with
    complete-linkage AgglomerativeClustering - and the clusters of both runs are
    concatenated (KMeans clusters first).

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
        Samples to cluster.
    n_kmeans : int
        Requested number of KMeans clusters (0 skips KMeans).
    n_agglo : int
        Requested number of agglomerative clusters (0 skips that step).
        Both counts are capped at ``n_samples``.

    Returns
    -------
    centers : list of ndarray, each of shape (n_features,)
        Per-cluster mean vectors.
    inv_sq_devs : list of ndarray, each of shape (n_features,)
        Per-cluster ``1 / std**2`` vectors, aligned with ``centers``.
    """
    n_samples = data.shape[0]

    # A clustering cannot contain more clusters than there are samples.
    n_km = min(n_kmeans, n_samples)
    n_ag = min(n_agglo, n_samples)

    lbls_set = []

    if n_km > 0:
        kmeans = KMeans(n_clusters=n_km, random_state=0, init='k-means++', algorithm='elkan')
        lbls_set.append(kmeans.fit_predict(data))

    if n_ag > 0:
        agglo = AgglomerativeClustering(n_clusters=n_ag, linkage='complete')
        lbls_set.append(agglo.fit_predict(data))

    centers = []
    inv_sq_devs = []

    for lbls in lbls_set:
        # Visit only the label ids that actually occur. Walking range(max+1) would
        # produce an all-NaN center/deviation for any id that received no sample,
        # and such a NaN would propagate into every downstream Fi value.
        for k in np.unique(lbls):
            cluster = data[lbls == k, :]
            centers.append(cluster.mean(axis=0))

            dev = np.std(cluster, axis=0)
            dev[dev == 0] += 1e-3  # to avoid Infs and NaNs in the reciprocal
            inv_sq_devs.append(np.reciprocal(np.square(dev)))

    return centers, inv_sq_devs


#### Definition of RBF (Fi) Function:

In [5]:
import gc

# Function to compute whole Fi output for given dataset (for all RB centers)
def fi_transform(data, all_centers, all_inv_sq_devs, kPCAs):
    """Compute the RBF (Fi) outputs of ``data`` for every stored center.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
        Dataset for which the Fi values are computed.
    all_centers : list of lists of ndarray
        ``all_centers[d][k]`` is the k-th center vector of group ``d``.
    all_inv_sq_devs : list of lists of ndarray
        ``all_inv_sq_devs[d][k]`` is the reciprocal squared deviation vector
        that belongs to ``all_centers[d][k]``.
    kPCAs : sequence
        ``kPCAs[d].transform(X)`` projects samples into the space in which the
        centers of group ``d`` live.

    Returns
    -------
    ndarray of shape (n_samples, total number of centers)
        ``exp(-sqrt(sum_i (x_i - c_i)^2 * inv_sq_dev_i))`` for every sample/center
        pair; columns are ordered group by group, center by center.
    """
    n_groups = len(all_centers)
    n_outputs = sum(len(group) for group in all_centers)
    n_samples = data.shape[0]

    new_data = np.empty((n_samples, n_outputs))
    chunk = 200  # process data in chunks of 200 smpls (optimal speed)
    trns_data = None

    for st in range(0, n_samples, chunk):
        en = min(st + chunk, n_samples)
        q = 0  # running output-column index, restarted for every chunk
        for d in range(n_groups):
            trns_data = kPCAs[d].transform(data[st:en])
            for center, inv_sq_dev in zip(all_centers[d], all_inv_sq_devs[d]):
                # Broadcasting subtracts the center from every row of the chunk.
                new_data[st:en, q] = np.dot(np.square(trns_data - center), inv_sq_dev)
                q += 1

    new_data = np.exp(-np.sqrt(new_data))

    # Drop the last projected chunk before collecting. Rebinding (instead of `del`)
    # stays valid when the loops above never ran, e.g. for an empty dataset.
    trns_data = None
    gc.collect()
    return new_data

#### Definition of RBF Transformer (sklearn compatible object):

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import KernelPCA
import gc

# SKLEARN Compatible Transformer - supports fit method (clustering data) and transform method (calculating fi values)
class myRBFtransformer(BaseEstimator, TransformerMixin):
    """RBF-layer feature extractor for digit data (labels 0..9).

    ``fit`` handles every digit separately: it fits a kernel PCA on the samples of
    that digit, clusters the projected samples (KMeans + agglomerative) and stores
    the cluster centers together with their inverse squared deviations.

    ``transform`` evaluates all RBF units on the input, keeps the largest Fi values
    of every digit (sorted ascending) and normalizes each row by its sum.

    Parameters
    ----------
    n_clusters : int, default=250
        Number of clusters to be formed per digit.
    cl_ratio : float, default=0.5
        Share of ``n_clusters`` assigned to KMeans; the rest goes to agglomerative
        clustering. Values outside [0, 1] are clamped when ``fit`` is called.
    n_fi_max : int, default=10
        Number of largest Fi values to keep per digit.
    n_kPCA : int, default=50
        Number of kernel PCA components.
    debug : bool, default=False
        Print progress information.

    Attributes set by ``fit``
    -------------------------
    kPCAs, centers, inv_sq_devs, num_centers : lists with one entry per digit.
    n_fi_used_ : int
        ``n_fi_max`` limited to the smallest number of centers found for any digit.
    """

    # Transformer initialization (default to 250 clusters per digit)
    def __init__(self, n_clusters=250, cl_ratio=0.5, n_fi_max=10, n_kPCA=50, debug=False):
        # Constructor arguments are stored untouched: scikit-learn's clone() and
        # get_params()/set_params() expect __init__ not to modify them, and
        # set_params() would bypass any validation placed here anyway.
        self.n_clusters  = n_clusters   # number of clusters to be formed per digit
        self.cl_ratio    = cl_ratio     # cluster ratio (between kMeans and Agglomerative)
        self.n_fi_max    = n_fi_max     # num of max fi vals to pick per digit
        self.n_kPCA      = n_kPCA       # number of kernel PCA components
        self.debug       = debug        # debug flag

        if self.debug:
            print(self.n_clusters,min(1,max(0,self.cl_ratio)),self.n_fi_max)

    # Clusters each digit and finds cluster centers and deviations:
    def fit(self, X, y):
        """Fit one kernel PCA and one set of RBF centers per digit.

        X is the (n_samples, n_features) sample matrix, y the matching digit labels
        (0..9). Returns ``self``.
        """
        self.kPCAs       = []
        self.num_centers = []
        self.centers     = []
        self.inv_sq_devs = []

        # Split the per-digit cluster budget between the two clustering algorithms.
        # The ratio is clamped here (not in __init__) so that values injected through
        # set_params(), e.g. by a grid search, are validated as well.
        cl_ratio = min(1, max(0, self.cl_ratio))
        n_kmeans = round(cl_ratio*self.n_clusters)
        n_agglo  = self.n_clusters - n_kmeans

        if self.debug:
            print('Clustering data ',(n_kmeans,n_agglo))

        # Cluster the data over each digit
        for dig in range(10):
            # Select corresponding digit:
            data = X[y==dig,:]
            # Fit the kernel PCA and transform data:
            kPCA = KernelPCA(n_components=self.n_kPCA,kernel='rbf',copy_X=False,remove_zero_eig=True,random_state=111).fit(data)
            trns_data = kPCA.transform(data)
            # Cluster the data and find centers / devs:
            centers, inv_sq_devs = form_clusters(trns_data, n_kmeans, n_agglo)
            # Update self:
            self.kPCAs.append(kPCA)
            self.centers.append(centers)
            self.inv_sq_devs.append(inv_sq_devs)
            self.num_centers.append(len(centers))

        del centers, inv_sq_devs, data, trns_data
        gc.collect()

        # Limit the number of kept Fi values to the min. number of centers per digit.
        # The limit lives in a fitted attribute so the n_fi_max hyper-parameter is not
        # silently overwritten (which would leak into clones and later re-fits).
        self.n_fi_used_ = min(self.n_fi_max, min(self.num_centers))

        return self

    # Computes fi values (with Gaussian function) based on obtained cluster centers and deviations
    def transform(self, X, y=None):
        """Return the row-normalized top Fi values of X, shape (n_samples, 10 * n_fi_used_)."""
        # Compute all Fi values:
        if self.debug:
            print('Calculating all Fi outputs !!!')
        all_fis = fi_transform(X, self.centers, self.inv_sq_devs, self.kPCAs)

        # Find n max fi vals over each digit:
        n_max  = self.n_fi_used_
        result = np.empty((X.shape[0],n_max*10))
        start  = 0
        for k in range(10):
            end = start + self.num_centers[k]
            # Ascending sort along the row, then keep the last (largest) n_max columns.
            result[:,(k*n_max):((k+1)*n_max)] = np.sort(all_fis[:,start:end],axis=1)[:,-n_max:]
            start = end
        del all_fis

        # Normalize each row with the sum of that row's elements; an all-zero row
        # gets a tiny divisor instead of producing NaNs.
        row_sum = np.sum(result,axis=1).reshape((result.shape[0],1))
        row_sum[row_sum==0]=1e-12
        result = np.divide(result,row_sum)

        if self.debug:
            print('Transform finished !!!')

        return result

In [7]:
# # Test:
# rbf = myRBFtransformer(n_kmeans = 10)
# a   = rbf.fit_transform(X=data[:500,:],y=lbls[:500])
# a.max(axis=1)

### 3) Pipelining the Model:

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LogisticRegression

# Hyper-parameters handed to the RBF transformer of the full pipeline.
n_clusters, cl_ratio, n_fi_max, n_kPCA = 500, 0.5, 10, 200

scaler = StandardScaler()
rbf = myRBFtransformer(
    n_clusters=n_clusters,
    cl_ratio=cl_ratio,
    n_fi_max=n_fi_max,
    n_kPCA=n_kPCA,
)
logreg = LogisticRegression(tol=1e-12, C=1e+12, random_state=12, solver='liblinear')

# Scaling -> RBF features -> logistic regression, chained into one estimator.
pipe = Pipeline(
    steps=[
        ('scal', scaler),
        ('rbf', rbf),
        ('logreg', logreg),
    ]
)

In [9]:
# import time
# t = time.time()
# pipe.fit(test_data[:],test_lbls[:])
# print(time.time()-t)

In [10]:
# t = time.time()
# y = pipe.predict(test_data)
# print(time.time()-t)

In [11]:
# print(sum(test_lbls==y)/len(y)*100)

# # for k in range(20,50):
# #     print(test_lbls[k],y[k])


In [12]:
# from sklearn.model_selection import GridSearchCV
from dask_searchcv import GridSearchCV

# Candidate values for the grid search:
#   Ps - number of Kernel PCA components per digit
#   Ks - number of Clusters per digit
#   Rs - ratio between kMeans and Agglomerative
#   Ns - number of Max Fi vals to be analyzed
#   Cs - Logreg Regularization constant (LogisticRegression `C`)
#
# Only the ACTIVE grid below is assigned. The other grids are kept as commented
# reference; previously the first one was assigned and then immediately
# overwritten, which made it look as if it took part in the search.

# Wider grid (inactive):
# Ps = [ 50, 100, 200]
# Ks = [100, 300, 500]
# Rs = [0.1, 0.5, 0.9]
# Ns = [10]
# Cs = [1e5, 1e12, 1e15]

# # BEST SOL = 77.65% at 3 fold -> P = 200, K= 200 (linear kPCA kernel)
# ACTIVE grid:
Ps = [100,200]
Ks = [ 50,400]
Rs = [0.0,1.0]
Ns = [15]
Cs = [1e12]

# Single-point grid (inactive):
# Ps = [150]
# Ks = [500]
# Rs = [0.5]
# Ns = [10]
# Cs = [1e12]

# Keys follow the `<pipeline step>__<parameter>` convention of the `pipe` steps.
param_grid = [
    {
        'rbf__n_kPCA':     Ps,
        'rbf__n_clusters': Ks,
        'rbf__cl_ratio':   Rs,
        'rbf__n_fi_max':   Ns,
        'logreg__C':       Cs
    }
]


In [13]:
# Grid search over `param_grid` for the full pipeline, using 4 cross-validation folds.
# NOTE(review): the recorded run of the fit cell below stopped with a MemoryError;
# n_jobs=-1 may contribute by running many fits concurrently - TODO confirm and
# consider a smaller n_jobs.
grid = GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=param_grid)

In [14]:
from dask.diagnostics import ProgressBar

In [15]:
with ProgressBar():
    # grid.fit(data, lbls)
    # Run the search on the training arrays (the [:] slices select every row).
    grid.fit(data[:], lbls[:])
    
# Highest mean cross-validated test score, expressed in percent.
print(max(grid.cv_results_['mean_test_score'])*100)

[#########                               ] | 23% Completed |  4hr  1min 43.8s


MemoryError: 

In [None]:
# Parameter combination selected by the grid search.
print(grid.best_params_)

In [None]:
# Highest mean cross-validated test score, expressed in percent.
print(max(grid.cv_results_['mean_test_score'])*100)

In [None]:
# Mean cross-validated test score of every evaluated parameter combination.
print(grid.cv_results_['mean_test_score'])

In [None]:
# Hold-out split carved out of the TEST arrays: the first 7500 rows form the
# fitting part (tr_*), the remaining rows the evaluation part (ts_*).
tr_data, ts_data = test_data[:7500], test_data[7500:]
tr_lbls, ts_lbls = test_lbls[:7500], test_lbls[7500:]


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LogisticRegression

# Hyper-parameters handed to the RBF transformer of the two-stage setup.
n_clusters, cl_ratio, n_fi_max, n_kPCA = 500, 0.5, 10, 150

scaler = StandardScaler()
rbf = myRBFtransformer(
    n_clusters=n_clusters,
    cl_ratio=cl_ratio,
    n_fi_max=n_fi_max,
    n_kPCA=n_kPCA,
)
logreg = LogisticRegression(tol=1e-12, C=1e+12, random_state=12, solver='liblinear')

# Feature extraction (pipe1) and classification (pipe2) live in separate pipelines,
# presumably so the RBF features can be computed once and reused by later cells.
pipe1 = Pipeline(steps=[('scal', scaler), ('rbf', rbf)])
pipe2 = Pipeline(steps=[('logreg', logreg)])

In [None]:
# Fit the scaler + RBF feature extractor on the fitting part of the split only.
# The extractor is label-supervised (per-digit kernel PCA and clustering), so
# fitting it on all of test_data would leak the held-out rows (ts_data / ts_lbls)
# into the features they are later scored with.
pipe1.fit(tr_data, tr_lbls)

In [None]:
# Map both parts of the hold-out split through the fitted feature pipeline
# (fitting part first, then the evaluation part).
tr_data_1, ts_data_1 = (pipe1.transform(part) for part in (tr_data, ts_data))

In [None]:
# Train the logistic-regression pipeline on the transformed fitting part.
pipe2.fit(tr_data_1,tr_lbls)

In [None]:
# Score on the fitting part first, then on the held-out evaluation part.
print(pipe2.score(tr_data_1,tr_lbls),pipe2.score(ts_data_1,ts_lbls))

In [None]:
# RBF features of the complete test array; used as input of the grid search below.
new_data = pipe1.transform(test_data)

In [None]:
from dask_searchcv import GridSearchCV
from dask.diagnostics import ProgressBar

n_clusters, cl_ratio, n_fi_max, n_kPCA = 500, 0.5, 10, 150

# 4th Fold on Test Set -> 98.05 with C = 1e13
Cs = [1e9, 1e10, 1e12, 1e13]

# Only the classifier's C is searched here; the features are already computed.
param_grid = [{'logreg__C': Cs}]

grid = GridSearchCV(pipe2, cv=4, n_jobs=-1, param_grid=param_grid)

with ProgressBar():
    # grid.fit(data, lbls)
    grid.fit(new_data[:], test_lbls[:])
# Highest mean cross-validated test score, expressed in percent.
print(max(grid.cv_results_['mean_test_score'])*100)

In [None]:
# Parameter combination selected by the grid search.
print(grid.best_params_)

In [None]:
# Mean cross-validated test score of every evaluated parameter combination.
print(grid.cv_results_['mean_test_score'])

In [None]:
# Scratch check: the written-out integer equals 1e13, so this evaluates to 1.0.
10000000000000/1e13