In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import pairwise_distances as pair_dist
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import math



In [2]:
def check_numeric(X):
  """Report whether ``X`` is free of string entries.

  ``X`` is converted to a NumPy array and flattened. The result is ``True``
  when no flattened element is a ``str`` instance and ``False`` as soon as
  one is found.
  """
  for entry in np.array(X).ravel():
    if isinstance(entry, str):
      return False
  return True

# X=np.array([[6,7,8],[3,0,5]])
# print(check_numeric(X))
# print(X)
# print(check_numeric([1,2,3]))

In [3]:
def error_return(p):
  """Build the placeholder result returned when feature selection cannot run.

  Parameters
  ----------
  p : int
    Number of attributes.

  Returns
  -------
  ranked : ndarray of int, shape (p,)
    Attribute indices in their original order ``0 .. p-1``.
  weight : ndarray of float, shape (p,)
    Weights, all set to NaN.
  """
  ranked = np.arange(p, dtype=int)
  # np.full always yields a 1-D array. Squeezing a (1, p) array collapses to a
  # 0-d array when p == 1, and slice assignment on a 0-d array raises
  # IndexError.
  weight = np.full(p, np.nan)
  return ranked, weight


### Preprocess the data and call the feature selection algorithm

In [4]:
def _multisurf_class_weights(prior, C, grp):
  """Turn the ``prior`` option into unnormalised per-class weights.

  Returns a 1-D float array with one entry per class, in the order of ``grp``,
  or ``None`` (after printing the reason) when ``prior`` cannot be used.
  """
  Ngrp = len(grp)

  # String / missing options are resolved first so that array-like priors are
  # never compared against strings or evaluated for truthiness.
  if prior is None or (isinstance(prior, str) and prior == 'empirical'):
    return C.sum(0)
  if isinstance(prior, str):
    if prior == 'uniform':
      return np.ones(Ngrp)
    print('prior is invalid')
    return None

  if isinstance(prior, dict):
    if not len(prior):
      print('prior dictionrary is empty')
      return None
    # grpToInd[className] = column index of that class
    grpToInd = {g: ind for ind, g in enumerate(grp)}
    if any(g not in grpToInd for g in prior):
      print('prior is invalid')
      return None
    values = [0.0] * Ngrp  # classes missing from the dict get zero weight
    for g, prob in prior.items():
      values[grpToInd[g]] = prob
  else:
    values = prior

  try:
    if not check_numeric(values):
      print('prior is invalid')
      return None
    classProb = np.array(values, dtype=float).ravel()
  except (TypeError, ValueError):
    print('prior is invalid')
    return None

  if (classProb.size != Ngrp or not np.all(np.isfinite(classProb))
      or np.any(classProb < 0) or not np.any(classProb > 0)):
    print('prior doesnot fulfil all conditions')
    return None
  return classProb


def MultiSurf_configure(X, Y, **kwargs):
  """Validate and preprocess the data, run ``MultiSurf`` and rank the attributes.

  Parameters
  ----------
  X : array-like, shape (N, p)
    One row per instance, one column per attribute.
  Y : array-like, shape (N,)
    Class label of every instance; labels are re-coded with ``pd.factorize``.
  **kwargs
    prior : None, 'empirical', 'uniform', dict or numeric sequence
      Class weights. ``None`` / ``'empirical'`` use the class counts,
      ``'uniform'`` weights all classes equally, a dict maps class label to
      weight (absent classes get zero), and a numeric sequence gives one
      non-negative weight per class in order of first appearance in ``Y``.
      Defaults to ``None``.
    numUpdates : 'all', None or positive number
      Number of target instances sampled by ``MultiSurf``; rounded up and
      capped at the number of retained instances. Defaults to ``'all'``.
    categoricalX : 'on' or 'off'
      ``'on'`` skips the min-max scaling and makes ``MultiSurf`` use Hamming
      distances. Any other value prints a message and is treated as ``'off'``.
      Defaults to ``'off'``.
    Other keyword arguments are ignored.

  Returns
  -------
  ranked : ndarray of int, shape (p,)
    Attribute indices sorted by decreasing weight, followed by the indices of
    attributes that were excluded because they are single-valued.
  weight : ndarray of float, shape (p,)
    Weight of every attribute in the original column order; NaN for excluded
    attributes.

  When the inputs are unusable a message is printed and the result of
  ``error_return(p)`` (identity ranking, all-NaN weights) is returned instead.
  """
  X = np.array(X)
  Y = np.array(Y)
  prior = kwargs.get('prior')
  numUpdates = kwargs.get('numUpdates', 'all')
  categoricalX = kwargs.get('categoricalX', 'off')

  p = X.shape[1]  # number of attributes; row filtering below never changes it

  # Check if the input sizes are consistent
  if Y.shape[0] != X.shape[0]:
    print('number of instances and output labels doesnot match')
    return error_return(p)

  # Re-code the classes as 0 .. Ngrp-1 and build the one-hot membership matrix
  Y, grp = pd.factorize(Y)
  N = X.shape[0]
  C = np.zeros((N, len(grp)))
  C[np.arange(N), Y] = True

  # Get class probs
  classProb = _multisurf_class_weights(prior, C, grp)
  if classProb is None:
    return error_return(p)

  # Normalize class probs
  classProb = classProb / classProb.sum()

  # Drop classes with zero probability together with their instances
  hasMass = classProb != 0
  keep = hasMass[Y]
  X = X[keep]
  Y = Y[keep]
  C = C[keep][:, hasMass]
  classProb = classProb[hasMass]

  # Do we have enough observations?
  numInst = X.shape[0]
  if numInst < 2:
    print('not enough instances')
    return error_return(p)

  # Check number of updates
  if numUpdates is None or (isinstance(numUpdates, str) and numUpdates == 'all'):
    numUpdates = numInst
  else:
    isNumber = isinstance(numUpdates, (int, float, np.integer, np.floating))
    # `not x > 0` also rejects NaN
    if not isNumber or not numUpdates > 0:
      print('numUpdates is invalid')
      return error_return(p)
    # The #updates cannot be more than the #observations; comparing before
    # rounding also keeps math.ceil away from infinity.
    numUpdates = numInst if numUpdates >= numInst else math.ceil(numUpdates)

  # Check the type of categoricalX; invalid values fall back to 'off'
  if not isinstance(categoricalX, str) or categoricalX not in ('on', 'off'):
    print('categoricalX is invalid')
    categoricalX = False
  else:
    categoricalX = (categoricalX == 'on')

  # Find the value range of every attribute
  Xmin = X.min(0)
  Xdiff = X.max(0) - Xmin

  # Exclude single-valued attributes
  isDiffValue = Xdiff >= 1e-9  # boolean mask over the p attributes
  if not isDiffValue.any():
    return error_return(p)
  accepted = np.flatnonzero(isDiffValue)   # 0-based indices of retained attributes
  rejected = np.flatnonzero(~isDiffValue)  # 0-based indices of excluded attributes
  X = X[:, isDiffValue]

  # Scale the retained attributes to [0, 1]
  if not categoricalX:
    X = (X - Xmin[isDiffValue]) / Xdiff[isDiffValue]

  # Call MultiSurf. By default all weights are set to NaN.
  weight = np.full(p, np.nan)
  weight[accepted] = MultiSurf(X, Y, C, classProb, numUpdates, categoricalX)

  # Assign ranks to attributes: best weight first, excluded attributes last
  order = np.argsort(-weight[accepted])
  ranked = np.concatenate((accepted[order], rejected)).astype(int)

  return ranked, weight



# MultiSurf algorithm function

In [5]:
def MultiSurf(scaledX, scaledY, C, classProb, numUpdates, categoricalX):
  """Compute MultiSurf attribute weights for a classification problem.

  For each of ``numUpdates`` randomly chosen target instances, every other
  instance whose distance to the target is below
  ``mean - std / 2`` (statistics taken over the target's distances to all
  other instances) is a neighbour. Same-class neighbours are hits; neighbours
  of other classes are misses, grouped per class. Each attribute weight is
  decreased by ``diffH`` and increased by ``diffM``, both divided by
  ``numUpdates``.

  Parameters
  ----------
  scaledX : ndarray, shape (numInstances, numAttr)
    Attribute matrix.
  scaledY : array-like, shape (numInstances,)
    Class code of every instance.
  C : ndarray
    Class-membership matrix. Accepted for interface compatibility; not used
    by this function.
  classProb : ndarray
    Class probabilities, forwarded unchanged to ``diffM``.
  numUpdates : int
    Number of target instances; must not exceed ``numInstances`` because the
    targets are drawn without replacement.
  categoricalX : bool
    Use Hamming distance when true, Manhattan distance otherwise.

  Returns
  -------
  ndarray of float, shape (numAttr,)
    The attribute weights.

  Notes
  -----
  Targets are drawn from a private ``RandomState(123)``. This yields the same
  indices as seeding the global NumPy generator with 123 would, without
  resetting the caller's global random state.
  """
  numInstances, numAttr = scaledX.shape
  attrWeights = np.zeros(numAttr)
  scaledY = np.asarray(scaledY)

  # Choose the random target instances (reproducibly, see Notes)
  rndIdx = np.random.RandomState(123).choice(numInstances, numUpdates, replace=False)

  # Selecting distance function
  distFunc = 'hamming' if categoricalX else 'manhattan'
  pairwiseDistance = pair_dist(scaledX, metric=distFunc)

  allInstIdx = np.arange(numInstances)

  # Outer loop, for updating attribute weights iteratively
  for targetIdx in rndIdx:
    thisC = scaledY[targetIdx]  # class of the target instance
    distances = pairwiseDistance[targetIdx]

    # Neighbourhood radius from the distances to every instance but the target
    notTarget = allInstIdx != targetIdx
    otherDistances = distances[notTarget]
    threshold = np.mean(otherDistances) - (np.std(otherDistances) / 2.0)

    # Find nearest hits and misses
    isNear = distances < threshold
    sameClass = scaledY == thisC
    Hits = np.flatnonzero(isNear & sameClass & notTarget)
    missIdx = np.flatnonzero(isNear & ~sameClass)
    missClass = scaledY[missIdx]
    # dict.fromkeys keeps classes in order of first appearance, so diffM adds
    # up the per-class terms in the same order as an instance-by-instance scan.
    Misses = {cls: missIdx[missClass == cls]
              for cls in dict.fromkeys(missClass.tolist())}

    # Inner loop to update the weight of each attribute
    for j in range(numAttr):
      dH = diffH(j, scaledX, targetIdx, Hits, categoricalX) / numUpdates
      dM = diffM(j, scaledX, targetIdx, Misses, categoricalX, classProb) / numUpdates
      attrWeights[j] = attrWeights[j] - dH + dM

  return attrWeights

In [6]:
def cityblock(thisX, X):
  """Return the absolute difference between ``X`` and ``thisX``.

  The subtraction is applied to ``X`` as given, so an array ``X`` yields an
  array of element-wise absolute differences.
  """
  offset = X - thisX
  return abs(offset)

In [7]:
def hamming(thisX, X):
  """Return 1 where ``X`` differs from ``thisX`` and 0 where it matches.

  The comparison result is cast to ``int``, so ``X`` is expected to be
  something whose ``!=`` result has ``astype`` (e.g. a NumPy array).
  """
  mismatch = X != thisX
  return mismatch.astype(int)


In [8]:
#--------------------------------------------------------------------------
# DIFFH (for MultiSurfClass): Function to calculate difference measure
# for an attribute between the selected instance and its hits

def diffH(a, X, targetIdx, Hits, categoricalX):
  """Mean difference on attribute ``a`` between the target and its hits.

  Parameters
  ----------
  a : int
    Column index of the attribute.
  X : ndarray
    Attribute matrix, indexed as ``X[row, a]``.
  targetIdx : int
    Row index of the target instance.
  Hits : sequence of int
    Row indices of the hits; may be empty.
  categoricalX : bool
    Use ``hamming`` differences when true, ``cityblock`` otherwise.

  Returns
  -------
  float
    Sum of the per-hit differences divided by the number of hits, or ``0.0``
    when there are no hits.
  """
  numHits = len(Hits)

  # If no hits, return zero by default
  if not numHits:
    return 0.0

  metric = hamming if categoricalX else cityblock
  distMeas = np.sum(metric(X[targetIdx, a], X[Hits, a]))

  # Builtin float: the np.float alias was removed in NumPy 1.24.
  return distMeas / float(numHits)


In [9]:
#--------------------------------------------------------------------------
# DIFFM (for MultiSurfClass) : Function to calculate difference measure
# for an attribute between the selected instance and its misses
def diffM(a, X, targetIdx, Misses, categoricalX, classProb):
  """Weighted difference on attribute ``a`` between the target and its misses.

  For every miss class, the summed difference between the target and that
  class's misses is multiplied by the class's share of all misses and by the
  number of miss classes. The accumulated value is then divided by the total
  number of misses.

  Parameters
  ----------
  a : int
    Column index of the attribute.
  X : ndarray
    Attribute matrix, indexed as ``X[row, a]``.
  targetIdx : int
    Row index of the target instance.
  Misses : dict
    Maps a class to the row indices of the misses belonging to it.
  categoricalX : bool
    Use ``hamming`` differences when true, ``cityblock`` otherwise.
  classProb : any
    Accepted for interface compatibility; not used in the computation.

  Returns
  -------
  float
    The weighted miss difference, or ``0.0`` when there are no misses.
  """
  # If no misses, return zero by default
  if not len(Misses):
    return 0.0

  totalMisses = float(sum(len(missIdx) for missIdx in Misses.values()))
  # Only empty index arrays: nothing to measure, and dividing would fail.
  if not totalMisses:
    return 0.0

  metric = hamming if categoricalX else cityblock
  numMissClasses = len(Misses)
  targetValue = X[targetIdx, a]

  # Loop over the miss classes
  distMeas = 0.0
  for missIdx in Misses.values():
    classShare = len(missIdx) / totalMisses
    distMeas = distMeas + np.sum(metric(targetValue, X[missIdx, a])) * classShare * numMissClasses

  # totalMisses is already a builtin float (np.float was removed in NumPy 1.24)
  return distMeas / totalMisses


### Run feature selection

In [10]:
# from sklearn.datasets import load_iris, load_digits, load_wine
# X, Y = load_digits(return_X_y= True)
# print('dataset: digits')
# ranked, weight = MultiSurf_configure(X,Y,K=10,prior='uniform', numUpdates='all', categoricalX='off')
# print(ranked, weight)

In [11]:
# X = np.array([[0,0,0,0,1,0,0],
#               [0,1,0,1,0,1,0],
#               [1,0,0,0,1,1,0],
#               [0,0,0,1,0,0,0],
#               [1,1,0,0,0,0,0],
#               [0,1,0,0,1,0,0],
#               [0,0,0,0,0,1,0],
#               [1,0,0,1,0,0,0],
#               [1,0,1,0,0,0,1],
#               [1,1,0,0,1,0,1],
#               [0,0,0,0,0,0,1],
#               [1,0,1,1,1,1,1],
#               [1,0,0,0,0,0,0],
#               [1,0,0,1,0,1,1],
#               [0,1,1,0,0,0,0],
#               [0,0,1,1,0,0,1],
#               [1,1,0,1,0,0,0],
#               [1,0,1,0,1,0,1],
#               [0,1,1,1,0,0,1],
#               [1,1,1,1,1,1,1]])

# Y = np.array([0,1,1,0,0,1,0,1,1,0,0,1,1,1,1,0,0,1,1,0])

# MultiSurf_configure(X,Y,K=3,prior='uniform', numUpdates='all', categoricalX='on')


In [12]:
# X = np.array([[1,0,1],
#               [1,0,0],
#               [0,1,1],
#               [0,1,0],
#               [0,0,1],
#               [0,0,0],
#               [1,1,1],
#               [1,1,0]])

# Y = np.array([1,1,1,1,0,0,0,0])
# ranked, weight= MultiSurf_configure(X,Y,K=1,prior='uniform', numUpdates='all', categoricalX='on')

# print('ranked')
# print(ranked)
# print('weight')
# print(weight)


In [13]:
# def permute(x):
#   if len(x) == 7:
#     global X
#     global Y
#     Y.append((x[0] ^ x[2] ^ x[6]))
#     X.append(x)
#     return
#   x = np.append(x,0)
#   permute(x)
#   x = x[0:-1]
#   x = np.append(x,1)
#   permute(x)

In [14]:
# X = []
# Y = []
# x = np.array([], dtype=int)
# permute(x)
# X = np.array(X)
# Y = np.array(Y)
# ranked, weight=MultiSurf_configure(X,Y,prior='uniform', numUpdates=len(Y)/2, categoricalX='on')

# print('ranked')
# print(ranked)
# print('weight')
# print(weight)


In [15]:
# !pip install skrebate
# from skrebate import ReliefF, MultiSURF, SURF, SURFstar, MultiSURFstar
# # test classification dataset
# from sklearn.datasets import make_classification
# # define dataset
# X, Y = make_classification(n_samples=100, n_features=15, n_informative=10, n_redundant=0, n_classes=3,shuffle=False, random_state=1)
# # summarize the dataset
# print(X.shape, Y.shape)
# ranked, weight= MultiSurf_configure(X,Y,prior='uniform', numUpdates='all', categoricalX='off')

# print('ranked')
# print(ranked)
# print('weight')
# print(weight)

# multiSurf = MultiSURF(n_features_to_select=min(X.shape[1]-50, 50))
# multiSurf.fit(X, Y)
# weight = multiSurf.feature_importances_
# ranked = multiSurf.top_features_

# print('ranked')
# print(ranked)
# print('weight')
# print(weight)

In [16]:
# import time
# a = np.array([[1,1,1],[2,2,2], [3,3,3], [0,0,0]])
# start = time.time()
# c = pair_dist(a, metric='manhattan')
# print(np.average(c,axis = 1))
# print(np.average(c,axis = 0))
# print(time.time()-start)
# print(c,'\n')
# start = time.time()
# d = pdist(a, metric='cityblock')
# f = squareform(d)
# print(time.time()-start)
# print(d)
# print(f)

In [22]:
# # !pip install skrebate
# # from skrebate import ReliefF, MultiSURF, SURF, SURFstar, MultiSURFstar
# # # test classification dataset
# # from sklearn.datasets import make_classification

# # X= np.array([[-2,-1],[-1,0],[0,1],[1,2],[2,4],[3,6],[6,8],[4,5],[7,7],[8,9],[9,10],[10,11],[11,12],[12,13]])
# # X= np.array([[-2,-1],[-1,0],[0,1],[1,2],[2,4],[3,6],[4,8],[6,5],[7,7],[8,9],[9,10],[10,11],[11,12],[12,13]])
# X= np.array([[4,1.2],[4.4,1.4],[4.8,1.8],[4.9,2.5],[5.1,2.9],[5.4,4.5],[6.1,7.3],[5.9,6.5],[6.4,8.9],[6.8,10.9],[7.1,11.1],[7.2,11.5],[7.5,12],[7.8,12.3]])
# Y= np.array([1,1,1,1,1,1,1,2,2,2,2,2,2,2])
# ranked, weight = MultiSurf_configure(X,Y,prior='empirical', numUpdates='all', categoricalX='off')
# print(ranked, weight)

# # multiSurf = MultiSURF(n_features_to_select=2)
# # multiSurf.fit(X, Y)
# # weight = multiSurf.feature_importances_
# # ranked = multiSurf.top_features_

# # print(ranked, weight)

[1 0] [-0.10892857 -0.0703668 ]
