## Score to rank columns for Clustering

In [1]:
import numpy as np
import random
from scipy.linalg import orth
import sklearn
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

### Finding other features which do not contribute significantly to the data

To create vectors that are not orthogonal to the existing vectors of a matrix, we can take a scaled version of an existing vector (which is parallel to it) or a linear combination of existing vectors. Here I have defined two functions: one that scales a vector and one that forms a linear combination of two vectors.

In [2]:
def scalingVector(A, P, S):
    """Return a randomly rescaled copy of one of the first ``S`` columns of ``A``.

    Parameters
    ----------
    A : 2-D array, indexed as ``A[:, j]``
        Matrix whose columns are the candidate vectors.
    P : unused
        Accepted for signature compatibility; not read by this function.
    S : int
        Exclusive upper bound for the randomly drawn column index.

    Returns
    -------
    The selected column multiplied by a factor drawn from ``np.random.rand()``.
    """
    # Draw order (column index first, then factor) is kept so seeded runs
    # produce the same values as before.
    column = A[:, np.random.randint(0, S)]
    factor = np.random.rand()
    return factor * column

In [3]:
def linearCombination(A, P, S):
    """Return a random weighted sum of two of the first ``S`` columns of ``A``.

    Both column indices are drawn independently from ``[0, S)``, so they may
    coincide; in that case the result is a rescaled single column.

    Parameters
    ----------
    A : 2-D array, indexed as ``A[:, j]``
        Matrix whose columns are the candidate vectors.
    P : unused
        Accepted for signature compatibility; not read by this function.
    S : int
        Exclusive upper bound for the randomly drawn column indices.

    Returns
    -------
    ``w1 * A[:, i1] + w2 * A[:, i2]`` with weights from ``np.random.rand()``.
    """
    # Tuples evaluate left to right, so the draw order stays
    # index, index, weight, weight.
    first, second = np.random.randint(0, S), np.random.randint(0, S)
    weightFirst, weightSecond = np.random.rand(), np.random.rand()
    return weightFirst * A[:, first] + weightSecond * A[:, second]

### Generating the data that is used to test the function

First we define a matrix of size (N, S) with randomly initialised values. The next step is to transform the matrix so that all S features are orthogonal to each other.
To obtain the remaining (P-S) features, we randomly choose, for each new feature, between scaling an existing vector and forming a linear combination of two existing vectors. The choice is made using a randomly generated key.
Once this is done, the function returns a matrix of size (N, P) which has S principal components (where S<P).

In [4]:
def getOrthogonalComponents(N, P, S, K):
    """Generate an (N, P) matrix whose columns are built from S orthogonal columns.

    The first ``S`` columns are an orthonormal basis (``scipy.linalg.orth``) of
    a random ``(N, S)`` matrix. Each of the remaining ``P - S`` columns is
    derived from those first ``S`` columns, either by rescaling one of them or
    by a weighted sum of two of them; the choice is made at random per column.

    Parameters
    ----------
    N : int
        Number of rows (samples).
    P : int
        Total number of columns in the returned matrix.
    S : int
        Number of orthogonal columns to start from.
    K : unused
        Accepted for signature compatibility; not read by this function.

    Returns
    -------
    ndarray of shape (N, P) when ``P >= S``. If ``P < S`` no columns are
    derived and the ``(N, S)`` basis is returned.

    Raises
    ------
    ValueError
        If ``orth`` yields a number of columns other than ``S`` (for example
        when ``N < S``), since the helpers index columns ``0 .. S-1``.
    """
    basis = orth(np.random.rand(N, S))
    if basis.shape[1] != S:
        raise ValueError(
            "expected %d orthogonal columns but orth() returned %d; "
            "check that S <= N" % (S, basis.shape[1])
        )

    # The helpers only ever read columns 0..S-1, so they can work on the
    # fixed basis. Collecting the new columns and stacking once avoids
    # re-copying the whole matrix on every iteration.
    derived = []
    for _ in range(P - S):
        # key 0 -> rescale one basis column, key 1 -> mix two basis columns
        if np.random.randint(0, 2) == 0:
            column = scalingVector(basis, P, S)
        else:
            column = linearCombination(basis, P, S)
        derived.append(column.reshape(N, 1))

    return np.hstack([basis] + derived)

### Function to get the scores
To get the scores from the data we apply Principal Component Analysis (PCA). PCA finds the directions (principal components) that contribute the most variance to the data. The eigenvalues of the covariance matrix obtained in PCA measure how much each component contributes to the dataset. We use these eigenvalues, sorted in descending order, as the scores.

In [5]:
def getScores(x):
    """Compute PCA scores: eigenvalues and eigenvectors of the covariance of ``x``.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Rows are observations and columns are variables (``rowvar=False``).
        The input is not modified.

    Returns
    -------
    evals : ndarray
        Eigenvalues of the covariance matrix, sorted in descending order.
    evecs : ndarray
        Eigenvectors as columns, reordered to match ``evals``.
    """
    # Centre into a new array: an in-place ``x -= mean`` would silently change
    # the caller's data and fails outright for integer arrays.
    centered = x - np.mean(x, axis=0)
    # np.cov squeezes a single-variable result to 0-d; eigh needs a 2-D matrix.
    cov = np.atleast_2d(np.cov(centered, rowvar=False))
    evals, evecs = np.linalg.eigh(cov)
    # eigh returns ascending eigenvalues; reverse to rank largest first.
    order = np.argsort(evals)[::-1]
    return evals[order], evecs[:, order]

### Trying out the functions on a smaller dataset

In [6]:
# Small sanity check: 160 samples, 3 columns derived from 1 orthogonal column.
N = 160
P = 3
S = 1
K = 3

data = getOrthogonalComponents(N, P, S, K)

# Fit K-means with a fixed random_state; the fitted model is kept in `kmeans`.
kmeans = KMeans(n_clusters=K, random_state=0).fit(data)

# eigh (used inside getScores) already returns real eigenvalues.
scores, eigVectors = getScores(data)
print("The scores for the features are")
print(scores)

The scores for the features are
[ 3.05791890e-03  3.55332336e-20 -4.88913348e-19]


### Implementation of the functions on the parameters given in the problem

In [7]:
# Input parameters given in the problem statement
N = 1000
P = 10
S = 5
K = 3

# Generate the data with the generator defined above
data = getOrthogonalComponents(N, P, S, K)

# Fit K-means with a fixed random_state; the fitted model is kept in `kmeans`
kmeans = KMeans(n_clusters=K, random_state=0).fit(data)

# Get the scores using PCA (Principal Component Analysis)
scores, eigVectors = getScores(data)
print("The scores for the features are")
print(scores)

The scores for the features are
[ 2.36387825e-03  2.13617456e-03  1.57970356e-03  1.13805666e-03
  1.10553318e-04  2.06271057e-19  1.19696898e-19  9.76121549e-20
 -2.64106480e-20 -1.29436426e-19]


### End of File