In [1]:
import numpy as np

# Define folder paths:
# (relative paths, so they resolve against the notebook's working directory)
DATA_folder  = '../../Data/'
# Folder from which every .npy array in this notebook is loaded:
CLUSTER_folder = DATA_folder+'Clusters/'

### Data Transformation:

In [2]:
from math import ceil

def split_seqs(N, max_len):
    """Partition the indices 0..N-1 into consecutive chunks.

    Returns a list of `range` objects. Each range spans `max_len` indices,
    except the last one, which stops at N and may therefore be shorter.
    """
    n_chunks = ceil(N/max_len)
    return [range(idx*max_len, min(idx*max_len+max_len, N))
            for idx in range(n_chunks)]

def split_seqs_by_list(lens):
    """Lay out consecutive index ranges whose lengths are given by `lens`.

    The first range starts at 0 and each following range starts where the
    previous one stopped. Returns a list of `range` objects, one per entry
    of `lens`.
    """
    pieces = []
    offset = 0
    for length in lens:
        stop = offset+length
        pieces.append(range(offset, stop))
        offset = stop
    return pieces

In [3]:
# Function Definition:

from sklearn.decomposition import KernelPCA, PCA, NMF

def find_transform(data, n_pca=50, n_kpca=50, n_nmf=50):
    """Fit the three dimensionality reductions used as feature extractors.

    Parameters
    ----------
    data : array-like
        Training samples, passed unchanged to each model's `fit`.
        NOTE(review): NMF presumably needs non-negative input -- confirm
        against the arrays this is called with.
    n_pca, n_kpca, n_nmf : int, optional
        Number of components kept by PCA, KernelPCA and NMF respectively.
        Each defaults to 50, the value that used to be hard-coded.

    Returns
    -------
    list
        `[pca, kpca, nmf]`: the fitted models, in that order.
    """
    # Models:
    pca  = PCA(n_components=n_pca)
    # Polynomial kernel of degree 9; copy_X=False is forwarded to KernelPCA.
    kpca = KernelPCA(n_components=n_kpca, kernel='poly', degree=9, copy_X=False)
    nmf  = NMF(n_components=n_nmf, init='nndsvd')

    # Fitting (same order as the returned list):
    models = [pca, kpca, nmf]
    for model in models:
        model.fit(data)

    return models

def apply_transform(data, transforms):
    """Apply every transform to the whole dataset at once.

    Parameters
    ----------
    data : numpy array with one sample per row
        Passed unchanged to each transform.
    transforms : sequence of fitted objects exposing `.transform(data)`

    Returns
    -------
    numpy array
        The transformed outputs stacked side by side (column-wise), in the
        order of `transforms`. An empty `transforms` yields an array with
        `data.shape[0]` rows and zero columns.
    """
    # Nothing to stack: return an (N, 0) array instead of failing on an
    # unbound result variable.
    if len(transforms) == 0:
        return np.zeros((data.shape[0], 0))

    # Stack once at the end rather than re-copying the accumulated result
    # after every transform.
    return np.hstack([transform.transform(data) for transform in transforms])

def apply_transform_chunky(data, transforms, max_chunk_size=2500):
    """Transform the dataset chunk by chunk and stack the results column-wise.

    Produces the same layout as `apply_transform`, but each transform is
    called on at most `max_chunk_size` rows at a time.

    Parameters
    ----------
    data : numpy array with one sample per row
        Must contain at least one row when `transforms` is non-empty,
        because the first row is used to probe each transform's output width.
    transforms : sequence of fitted objects exposing `.transform(rows)`
    max_chunk_size : int, optional
        Largest number of rows handed to a transform in one call
        (default 2500, the value that used to be hard-coded).

    Returns
    -------
    numpy array of shape (data.shape[0], total number of output columns)
    """
    # Count the number of new dimensions of each transform by probing it
    # with the first sample (the transforms may reduce to different sizes):
    new_dims = [transform.transform(data[0].reshape(1, -1)).shape[1]
                for transform in transforms]
    dim_seqs = split_seqs_by_list(new_dims)

    # Split data into sequences:
    dat_seqs = split_seqs(data.shape[0], max_chunk_size)

    # Preallocate memory:
    new_data = np.zeros((data.shape[0], sum(new_dims)))

    # Iterate over each transformation and each data chunk:
    for transform, dim_seq in zip(transforms, dim_seqs):
        for dat_seq in dat_seqs:
            # Use the range bounds directly: no linear min()/max() scan, no
            # failure on an empty range, and plain slicing avoids the copy
            # that indexing with a range object would make.
            rows = slice(dat_seq.start, dat_seq.stop)
            new_data[rows, dim_seq.start:dim_seq.stop] = transform.transform(data[rows])

    return new_data

In [4]:
# # Transformation:

# k = 3
# data = np.load(CLUSTER_folder+'train_'+str(k)+'.npy')

# tr  = find_transform(data)
# # tst = apply_transform_chunky(data, tr)
# tst = apply_transform(data, tr)

# tst.shape

### Clustering:

In [5]:
# Function Definition:

from sklearn.cluster import KMeans

def form_clusters(data, n_means=50):
    """Cluster the samples with k-means and describe each cluster.

    Parameters
    ----------
    data : numpy array with one sample per row
    n_means : int, optional
        Number of k-means clusters (default 50, the value that used to be
        hard-coded).

    Returns
    -------
    centers : list of 1-D arrays
        Mean of the samples assigned to each cluster.
    covars : list of 2-D arrays
        Covariance matrix of each cluster's samples.
    inv_covars : list of 2-D arrays
        Moore-Penrose pseudo-inverse of each covariance matrix.

    All three lists are ordered by cluster label.
    """
    kmeans = KMeans(n_clusters=n_means, random_state=0, init='k-means++', algorithm='elkan')
    lbls_kmeans = kmeans.fit_predict(data)

    centers    = []
    covars     = []
    inv_covars = []

    # Find cluster centers and covar matrix:
    for k in range(lbls_kmeans.max()+1):
        cluster = data[lbls_kmeans==k,:]
        centers.append(cluster.mean(axis=0))
        # NOTE(review): a cluster holding a single sample has no defined
        # sample covariance (np.cov then yields NaNs) -- confirm this cannot
        # happen for the data used here.
        covar = np.cov(cluster.T)
        covars.append(covar)
        # Invert with Moore-Penrose (to avoid singularity problems)
        inv_covars.append(np.linalg.pinv(covar))

    return centers, covars, inv_covars



In [6]:
# # Test:
# centers, covars, inv_covars = form_clusters(new)
# # Check for 0 variance elements (main diag):
# for b in range(len(covars)):
#     for k in range(covars[0].shape[0]):
#         if covars[b][k,k]==0:
#             print(b,k)

In [7]:
from scipy.spatial.distance import mahalanobis

# # Improvised Mahalanobis (to deal with singular covar matrices)
# # (not needed -> using Moore-Penrose inverse for covar matrix)
# def improv_mahalanobis(center, point, covar):
    
#     # Pick all relevant variables:
#     rel_vars = []
#     for b in range(covar.shape[0]):
#         if covar[b,b]!=0:
#             rel_vars.append(b)
    
#     # If all are relevant just calc regular Mahalanobis
#     if len(rel_vars)==covar.shape[0]:
#         mah_dist   = mahalanobis(center,point,covar)
    
#     # Else remove the irrelevant variables and calc regular Mahalanobis
#     else:
#         new_center = center[rel_vars]
#         new_point  = point[rel_vars]
#         new_covar  = covar[np.ix_(rel_vars, rel_vars)]
#         mah_dist   = mahalanobis(new_center,new_point,new_covar)
        
#     return mah_dist;

def fi_func(point, center, inv_covar):
    """Response of one cluster to one point: exp(-d).

    `d` is the Mahalanobis distance between `center` and `point`, with
    `inv_covar` supplied as the inverse covariance matrix.
    """
    dist = mahalanobis(center, point, inv_covar)
    return np.exp(-dist)


In [8]:
# # Test:
# point = centers[0]*0.0
# point = centers[15]*1.1
# for b in range(len(centers)):
# # for b in [15, 33]:
#     center     = centers[b]
#     inv_covar  = inv_covars[b]
#     print(fi_func(point, center, inv_covar))
    

### Full Loop:

In [9]:
# Per-digit results; entry `dig` of each list belongs to digit `dig`:
all_transforms = []
all_centers    = []
all_inv_covars = []

# Find centers and covar matrices:
for dig in range(10):
    print(dig)  # progress indicator
    # Loads 'train_<dig>.npy' (presumably the training samples of digit `dig`
    # -- verify against how the files were produced):
    data = np.load(CLUSTER_folder+'train_'+str(dig)+'.npy')
    # Fit the dimensionality reductions on this file's samples only:
    all_transforms.append(find_transform(data))
    new_data = apply_transform(data, all_transforms[dig])
    # Cluster in the reduced space; `covars` is returned but not stored:
    centers, covars, inv_covars = form_clusters(new_data)
    all_centers.append(centers)
    all_inv_covars.append(inv_covars)


0
1
2
3
4
5
6
7
8
9


In [21]:
def apply_fi(data_points, centers, inv_covars):
    """Evaluate every cluster's response for every data point.

    Entry [k, j] of the result is exp(-d), where d is the Mahalanobis
    distance between data_points[k] and centers[j] using inv_covars[j] as
    the inverse covariance. This is the quantity `fi_func` computes for one
    point/center pair; here it is evaluated for all points of a cluster at
    once, which avoids one Python-level call per pair.

    Parameters
    ----------
    data_points : 2-D array with one point per row
    centers : sequence of 1-D arrays (one per cluster)
    inv_covars : sequence of 2-D arrays, aligned with `centers`

    Returns
    -------
    numpy array of shape (number of points, number of centers)
    """
    data_points = np.asarray(data_points)
    output_data = np.zeros((data_points.shape[0], len(centers)))
    for j in range(len(centers)):
        delta = data_points - np.asarray(centers[j])
        # Row-wise quadratic form delta_k . VI . delta_k (squared distance):
        sq_dist = np.einsum('ij,ij->i', np.dot(delta, inv_covars[j]), delta)
        output_data[:, j] = np.exp(-np.sqrt(sq_dist))
    return output_data

def find_fi_vals(data, all_transforms, all_centers, all_inv_covars):
    """Build the cluster-response feature matrix for a dataset.

    For each per-digit model, the data is mapped through that digit's
    transforms and the response of each of that digit's clusters is
    computed. The responses of all digits are laid out side by side, digit
    by digit.

    Parameters
    ----------
    data : numpy array with one sample per row
    all_transforms, all_centers, all_inv_covars : sequences indexed by digit
        The fitted transforms, cluster centers and inverse covariances of
        each digit. The number of digits is taken from `len(all_centers)`
        rather than being fixed at 10.

    Returns
    -------
    numpy array of shape (data.shape[0], total number of centers)
    """
    # Count the num of centers for each digit:
    cnts = [len(digit_centers) for digit_centers in all_centers]
    # Form sequences (for positioning in final matrix):
    cnt_seqs = split_seqs_by_list(cnts)

    # Initialize the fis matrix:
    fi_vals = np.zeros((data.shape[0], sum(cnts)))

    # Iterate over each digit:
    for dig in range(len(all_centers)):

        print(dig)

        new_data = apply_transform_chunky(data, all_transforms[dig])

        # Column span reserved for this digit's clusters:
        cols = cnt_seqs[dig]
        fi_vals[:, cols.start:cols.stop] = apply_fi(new_data, all_centers[dig], all_inv_covars[dig])

    return fi_vals
        

In [22]:
# Load the full training arrays.
# NOTE: this rebinds `data`, which the per-digit fitting loop also assigns.
data = np.load(CLUSTER_folder+'train_imgs.npy')
lbls = np.load(CLUSTER_folder+'train_lbls.npy')
# Feature matrix: one column per cluster center, across all per-digit models.
fis = find_fi_vals(data, all_transforms, all_centers, all_inv_covars)

### Logistic Regression:

In [35]:
from sklearn.linear_model import LogisticRegression

# logistic = LogisticRegression(multi_class='multinomial',solver='saga')
# C is scikit-learn's inverse regularization strength, so C=1e5 means very
# weak regularization:
logistic = LogisticRegression(C=1e5)

# Train on the cluster-response features of the training set:
logistic.fit(fis, lbls)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [36]:
# Load the test arrays:
test_data = np.load(CLUSTER_folder+'test_imgs.npy')
test_lbls = np.load(CLUSTER_folder+'test_lbls.npy')

# Map the test samples through the same fitted transforms and clusters as the
# training set. This has to run before predicting: with the line commented
# out, `test_fis` only exists as leftover kernel state and the cell fails
# with a NameError after a kernel restart.
test_fis = find_fi_vals(test_data, all_transforms, all_centers, all_inv_covars)

predicted_lbls = logistic.predict(test_fis)

In [58]:
# Misclassification rate on the test set, in percent:
# NOTE(review): the recorded output below (88.65) is roughly what guessing
# among 10 classes would give, so the features or the classifier setup
# deserve a second look before the number is trusted.
err = 0;
for k in range(len(test_lbls)):
    if test_lbls[k]!=predicted_lbls[k]:
        err+=1
err = err/len(test_lbls)*100
print(err)
# Dimensions of the test feature matrix (samples x cluster responses):
test_fis.shape

88.64999999999999


(10000, 500)

In [68]:
# Inspect the test sample at index 5: print its label, then the largest
# response inside each of ten consecutive 50-column blocks of its feature row
# (presumably one block per digit model -- this holds when every digit
# contributes 50 centers).
print(test_lbls[5])
for k in range(10):
    print(max(test_fis[5][(k*50):(k*50+50)]))
    
# Largest response in each of the first five training feature rows:
fx = fis[0:5]

fx.max(axis = 1)

1
2.14421486174e-24
3.63477136309e-06
3.82947430071e-22
1.2640511999e-14
1.45996268538e-25
3.48319278549e-13
1.6680075877e-20
4.84123457764e-15
2.08212600581e-10
2.27856183142e-19


array([  5.20547784e-06,   5.38906749e-06,   7.99136780e-05,
         1.04999050e-05,   5.20547784e-06])

In [None]:
def reduce_fis(fis, n_groups=10):
    """Collapse each group of consecutive feature columns to its maximum.

    The columns of `fis` are split into `n_groups` equally wide, consecutive
    groups, and each group is replaced by its row-wise maximum.

    Parameters
    ----------
    fis : 2-D numpy array
        Feature matrix whose column count is a positive multiple of `n_groups`.
    n_groups : int, optional
        Number of column groups (default 10).

    Returns
    -------
    numpy array of shape (fis.shape[0], n_groups)

    Raises
    ------
    ValueError
        If the columns cannot be split into `n_groups` equal, non-empty groups.
    """
    n_cols = fis.shape[1]
    if n_groups <= 0 or n_cols == 0 or n_cols % n_groups != 0:
        raise ValueError(
            "cannot split %d columns into %d equal, non-empty groups" % (n_cols, n_groups))

    # Exact group width; integer division keeps it usable as a slice bound.
    d = n_cols // n_groups

    red_fis = np.zeros((fis.shape[0], n_groups))
    for k in range(n_groups):
        red_fis[:, k] = fis[:, (k*d):(k*d+d)].max(axis=1)

    return red_fis