### 0) Data Importing and Helper Function Definition:

In [15]:
import numpy as np

DATA_folder  = '../../Data/'
data = np.load(DATA_folder+'train_imgs.npy')
lbls = np.load(DATA_folder+'train_lbls.npy')
test_data = np.load(DATA_folder+'test_imgs.npy')
test_lbls = np.load(DATA_folder+'test_lbls.npy')

In [16]:
data[lbls==0,:].shape

(5923, 784)

### 1) Definition of RBF Transformer Object:

RBF Transformer is a scikit-learn compatible transformer object that implements:

    - fit method       - clusters the digit-separated data, and computes cluster centers and inv. sq. deviations
    - transform method - based on obtained centers and deviations it computes the fi values (RBF layer outputs) as 
                         simple Gaussian functions -> Fi_k(x) = exp{ -sqrt( sum[ (x_i-c_k_i)/(dev_i^2) ] ) } 

#### Definition of Clustering Function:

In [17]:
### Clustering:

from sklearn.cluster import KMeans

# Function for data clustering, and computation of cluster center vectors and inv. sq. deviation vectors
def form_clusters(data, n_kmeans):
    
    kmeans = KMeans(n_clusters=n_kmeans, random_state=0, init='k-means++', algorithm='elkan')
    lbls_kmeans = kmeans.fit_predict(data)
    
    lbls_set = [lbls_kmeans] # just as an option to include more clustering methods
    
    centers   = []
    inv_sq_devs  = []
    
    # Find cluster centers and covar matrix:
    for lbls in lbls_set:
        for k in range(max(lbls)+1):
#             # Ignore single element clusters (we cannot determine the spread)
#             if(sum(lbls==k)>=2):
#                 # print(k, sum(lbls==k),'elems')
#                 cluster = data[lbls==k,:]
#                 centers.append(cluster.mean(axis=0))
#                 dev = np.std(data,axis=0)
#                 dev[dev==0]+=1e-6 # to avoid Infs and NaNs
#                 inv_sq_devs.append(np.reciprocal(np.square(dev)))
#             else:
#                 print('ignored')
            
            cluster = data[lbls==k,:]
            centers.append(cluster.mean(axis=0))
            dev = np.std(cluster,axis=0)
            dev[dev==0]+=1e-3 # to avoid Infs and NaNs
            inv_sq_devs.append(np.reciprocal(np.square(dev)))
            
        del cluster, dev
    return centers, inv_sq_devs;


#### Definition of RBF (Fi) Function:

In [18]:
# Function to calculate one Fi column for given dataset (for one RB center)
def calc_fi_column(data, center, inv_sq_devs):
    # data        - given dataset matrix for which to compute fi values
    # center      - given center vector on which to compute fi vals
    # inv_sq_devs - given reciprocal sq. deviation vector on which to compute fi vals 
    
    fi_col = np.empty((data.shape[0],1))
    # tmp  = data - np.dot(np.ones((data.shape[0],1)),center.reshape((1,center.shape[0])))
    # tmp2 = np.square(tmp)
    # tmp3 = np.dot(tmp2,inv_sq_devs)
    # fi_col = np.exp(-np.sqrt(tmp3))
    fi_col = np.exp(-np.sqrt(np.dot(np.square(data - np.dot(np.ones((data.shape[0],1)),center.reshape((1,center.shape[0])))),inv_sq_devs)))

    return fi_col;

# Function to compute whole Fi output for given dataset (for all RB centers)
def fi_transform(data, all_centers, all_inv_sq_devs):
    # data        - given dataset matrix for which to compute fi values
    # center      - list of all center vectors on which to compute fi vals
    # inv_sq_devs - list of all reciprocal sq. deviation vectors on which to compute fi vals     
    new_data = np.empty((data.shape[0],len(all_centers)))
    for k in range(len(all_centers)):
        new_data[:,k] = calc_fi_column(data, all_centers[k], all_inv_sq_devs[k])
    
    return new_data

#### Definition of RBF Transformer (sklearn compatible object):

In [19]:
# http://scikit-learn.org/stable/modules/classes.html#module-sklearn.base

from sklearn.base import TransformerMixin

# SKLEARN Compatible Transformer - supports fit method (custering data) and transform method (calculating fi values)
class myRBFtransformer(TransformerMixin):
    
    # Transformer initialization (default to 50 kMeans clusters)
    def __init__(self, n_kmeans=50):
        self.centers     = []
        self.inv_sq_devs = []
        self.n_kmeans    = n_kmeans
        print(self.n_kmeans)
    
    # Clusters each digit and finds cluster centers and deviations:
    def fit(self, X, y):
        self.centers  = []
        self.inv_sq_devs = []
        for dig in range(10):
            print('Current dig: ',dig)
            data = X[y==dig,:]
            centers, inv_sq_devs = form_clusters(data, self.n_kmeans)
            self.centers.extend(centers)
            self.inv_sq_devs.extend(inv_sq_devs)
        return self
    
    # Computes fi values (with Gaussian function) based on obtained cluster centers and deviations:
    def transform(self, X, y=None):
        result = fi_transform(X, self.centers, self.inv_sq_devs)
        return result

In [20]:
# # Test:
# rbf = myRBFtransformer(n_kmeans = 10)
# a   = rbf.fit_transform(X=data[:500,:],y=lbls[:500])
# a.max(axis=1)

### 3) Pipelining the Model:

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model  import LogisticRegression

n_pca    = 545 # First dimension to retain 99 % variance (stanford edu recomendation for images)!!!
n_kmeans = 500

scaler1 = StandardScaler()
pca     = PCA(n_components=n_pca, whiten=True)
rbf     = myRBFtransformer(n_kmeans=n_kmeans)
scaler2 = StandardScaler()

500


In [22]:
from sklearn.pipeline import Pipeline
# my_pipe = Pipeline(steps=[('scal1', scaler1), ('pca', pca), ('rbf',rbf)])
my_pipe = Pipeline(steps=[('scal1', scaler1), ('pca', pca), ('rbf',rbf), ('scal2',scaler2)])
# my_pipe = Pipeline(steps=[('scal1', scaler1), ('pca', pca), ('rbf',rbf), ('scal2',scaler2), ('',)])

In [23]:
output = my_pipe.fit_transform(X=test_data,y=test_lbls)



Current dig:  0
Current dig:  1
Current dig:  2
Current dig:  3
Current dig:  4
Current dig:  5
Current dig:  6
Current dig:  7
Current dig:  8
Current dig:  9


In [30]:
# logreg = LogisticRegression(tol=1e-32,C=1e32,random_state=12,solver='newton-cg',multi_class ='multinomial')
# 81.6%

# logreg = LogisticRegression(tol=1e-32,C=1e32,random_state=12,solver='newton-cg',multi_class ='multinomial')
# 97.25% with kmeans = 500
# logreg = LogisticRegression(tol=1e-8,C=1e8,random_state=12,solver='newton-cg',multi_class ='multinomial')
# 97.53% with kmeans = 500


# logreg = LogisticRegression(tol=1e-32,C=1e32,random_state=12,solver='liblinear',penalty ='l2')
logreg = LogisticRegression(tol=1e-8,C=1e8,random_state=12,solver='newton-cg',multi_class ='multinomial')
logreg.fit(output, test_lbls)



LogisticRegression(C=100000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=1, penalty='l2',
          random_state=12, solver='newton-cg', tol=1e-08, verbose=0,
          warm_start=False)

In [31]:
print(logreg.score(output, test_lbls)*100)

97.53


In [26]:
a = rbf.inv_sq_devs

In [29]:
for k in range(50):
    print(sum(a[k]),a[k].max())



2419.77261729 37.3678331685
14875134.7086 11550921.0157
545000000.0 1000000.0
2735.99161462 43.5901557541
545000000.0 1000000.0
25825599.7362 24882701.3224
6618.407122 414.530832976
6426.46170319 863.872686617
1935.88580736 14.1264500513
545000000.0 1000000.0
5571233.06201 3676471.16608
2872.27812089 30.9720514845
545000000.0 1000000.0
3668.63504196 220.608893547
2769.41449403 47.0061073948
545000000.0 1000000.0
39718.1761875 14578.63452
2877.27653898 59.6199323023
545000000.0 1000000.0
545000000.0 1000000.0
5766.66589241 570.600887492
545000000.0 1000000.0
545000000.0 1000000.0
545000000.0 1000000.0
545000000.0 1000000.0
4657.97854652 146.115343634
3673.01439666 52.1818080363
2576945777.12 2574180424.7
2913277.13086 1267544.07536
2695037.57043 1104292.66238
545000000.0 1000000.0
4017.6193854 266.99069881
545000000.0 1000000.0
545000000.0 1000000.0
545000000.0 1000000.0
2067.83048787 39.513300241
545000000.0 1000000.0
3391.30250061 198.229867493
545000000.0 1000000.0
545000000.0 100000

In [28]:
b=np.arange(5)
np.std(b)

1.4142135623730951