In [1]:
#  =============================================================================================================================
#                                         SCRIPT FOR FEATURE REDUCTION or FEATURE SELECTION
#  LAST UPDATE: 03/19/2017
#  =============================================================================================================================

In [2]:
# ------------------------------------------------------------------------------------------------------------------------------
#                                            STEP 1: Import libraries
# ------------------------------------------------------------------------------------------------------------------------------
#%reset
import matplotlib.pyplot as plt
%matplotlib inline  
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import collections
import matplotlib as mpl
from collections import OrderedDict
import time
from datetime import datetime
from sys import stdout
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.cluster import AffinityPropagation
from sklearn import metrics


In [3]:
%%javascript
// Set the auto-scroll threshold of the notebook output areas to 9999
// (presumably so long cell outputs are shown in full instead of inside a scroll box - confirm).
IPython.OutputArea.auto_scroll_threshold = 9999

<IPython.core.display.Javascript object>

In [1]:
def ismember(x, y):
    """Locate every element of list "x" inside list "y".

    Parameters:
    -----------
    -x: list with the items to look up (items are used as dictionary keys, so they must be hashable)
    -y: list that is searched

    Returns:
    --------
    List with one entry per element of "x", in the order given by "x": the index of the first
    occurrence of that element in "y", or None when "y" does not contain it.
    """
    first_position = {}
    for position, candidate in enumerate(y):
        # setdefault keeps the earliest index when "y" contains duplicates
        first_position.setdefault(candidate, position)

    located = []
    for wanted in x:
        located.append(first_position.get(wanted))  # .get() yields None for items missing from "y"
    return located

def load_numerical_table(fileName):
    """Load a .csv file that has numerical-only data and display its first rows.

    Parameters:
    -----------
    -fileName: path of the .csv file we want to load; put file name only if file is located in current path

    Returns:
    --------
    InputMatrix  = numpy array with all values, equivalent to the values in the input .csv file (but no headers)
    InputHeaders = list with headers in input .csv file

    """

    # Load .csv file into pandas dataframe and show its first rows
    # -------------------------------------------------------------
    transformed_data = pd.read_csv(fileName)
    display(transformed_data.head(3))

    # Put headers into list
    # ----------------------
    InputHeaders = list(transformed_data)

    # Turn data frame into matrix
    # ---------------------------------
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in old and new pandas.
    # np.array() makes a copy, so the returned matrix does not share memory with the data frame.
    # Values are returned exactly as read (no sentinel such as -1000 is converted to NaN).
    InputMatrix = np.array(transformed_data.values)

    return InputMatrix, InputHeaders

def NormalizeData(InputMatrix_train,InputMatrix_test,NormalizationMethod):
    """Function to normalize all columns of a train and a test matrix according to NormalizationMethod

    The scaler is fitted on the train matrix only and then applied to both matrices, so the
    test matrix is scaled with the statistics of the train matrix.

    Parameters:
    -----------
    -InputMatrix_train: matrix used to fit the scaler; it is normalized as well
    -InputMatrix_test: matrix normalized with the scaler fitted on InputMatrix_train
    -NormalizationMethod: 'MeanStd' (preprocessing.StandardScaler) or 'MinMax' (preprocessing.MinMaxScaler)

    Returns:
    --------
    X_train = normalized version of InputMatrix_train
    X_test  = normalized version of InputMatrix_test

    Raises:
    -------
    ValueError if NormalizationMethod is neither 'MeanStd' nor 'MinMax'

    """

    if NormalizationMethod == 'MeanStd':
        # OPTION 1: Mean and Std
        # Center each feature by removing its mean, then scale it by its standard deviation
        scaler = preprocessing.StandardScaler()
    elif NormalizationMethod == 'MinMax':
        # OPTION 2: Min and Max
        # Scale each feature with the minimum and maximum value observed in the train matrix
        scaler = preprocessing.MinMaxScaler()
    else:
        raise ValueError("Unknown NormalizationMethod %r; expected 'MeanStd' or 'MinMax'" % (NormalizationMethod,))

    # Fit once on the train matrix. Only transform() is used afterwards: calling fit_transform()
    # on the test matrix would re-fit the scaler on test data and discard the train statistics.
    normMap = scaler.fit(InputMatrix_train)
    X_train = normMap.transform(InputMatrix_train)
    X_test = normMap.transform(InputMatrix_test)

    return X_train, X_test


def _print_reduction_banner(title):
    """Print the title line of a feature-reduction report followed by a dashed rule."""
    print(title)
    print("----------------------------------------------------------------------------")


def _print_reduction_sizes(X, Xreduced, X_test, Xreduced_test):
    """Print the shapes of the train and test matrices before and after the reduction."""
    print("- Original train matrix has size " + str(X.shape))
    print("- Reduced train matrix has size " + str(Xreduced.shape))
    print("- Original test matrix has size " + str(X_test.shape))
    print("- Reduced test matrix has size " + str(Xreduced_test.shape))


def featureSelection(X,XHeaders,X_test,XHeaders_test,FeatureReductionMethod,ExplainedVariance,APpreference):
    """Reduce the feature space of a train matrix and apply the same reduction to a test matrix.

    Parameters:
    -----------
    -X: train matrix (rows are samples, columns are features)
    -XHeaders: list with the feature names of X
    -X_test: test matrix with the same columns as X
    -XHeaders_test: feature names of X_test; currently not used, the returned test headers are
                    derived from the train headers
    -FeatureReductionMethod: 'SVD', 'AffinityPropagation' or 'none'
    -ExplainedVariance: used by 'SVD' only; passed to PCA as n_components (the report prints it
                        multiplied by 100 as a percentage of explained variance)
    -APpreference: used by 'AffinityPropagation' only; passed as the "preference" argument

    Returns:
    --------
    Xreduced            = reduced train matrix
    HeadersReduced      = headers of the reduced train matrix: a comma-separated string
                          ("PC1,PC2,...") for 'SVD', a list of feature names otherwise
    Xreduced_test       = reduced test matrix
    HeadersReduced_test = headers of the reduced test matrix (same content as HeadersReduced)

    Raises:
    -------
    ValueError if FeatureReductionMethod is not one of the supported names
    """

    if FeatureReductionMethod == 'SVD':

        # --------------------------------------
        # OPTION 1: Singular Value Decomposition
        # --------------------------------------
        # Linear dimensionality reduction using Singular Value Decomposition of centered data, keeping only the
        # most significant singular vectors to project the data to a lower dimensional space.
        #
        # References:
        # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
        # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html

        _print_reduction_banner(" Feature reduction implemented using SVD ")

        # Fit the projection on the train matrix only, then apply it to train and test
        pca = PCA(n_components=ExplainedVariance)
        pca.fit(X)
        Xreduced = pca.transform(X)
        Xreduced_test = pca.transform(X_test)

        n_components = pca.explained_variance_ratio_.size
        _print_reduction_sizes(X, Xreduced, X_test, Xreduced_test)
        print("- Number of components needed to explain " + str(ExplainedVariance*100) + "(%) is " + str(n_components))

        # New feature headers: PC1, PC2, ... joined into a single comma-separated string
        HeadersReduced = ','.join(["PC" + str(k) for k in range(1, n_components + 1)])
        HeadersReduced_test = HeadersReduced

    elif FeatureReductionMethod == 'AffinityPropagation':

        # --------------------------------------
        # OPTION 2: Affinity Propagation
        # --------------------------------------
        # Affinity Propagation is a clustering technique that identifies "exemplars" (which will be our reduced features)
        #
        # References:
        # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html#sklearn.cluster.AffinityPropagation.fit

        _print_reduction_banner(" Feature reduction implementing using Affinity Propagation ")

        # Cluster the features (hence the transpose: each row handed to the algorithm is one feature)
        af = AffinityPropagation(preference=APpreference).fit(X.T)

        # Indices of the exemplars (i.e., features that are cluster centers)
        cluster_centers_indices = af.cluster_centers_indices_

        # Keep only the exemplar columns in train and test
        HeadersReduced = [XHeaders[i] for i in cluster_centers_indices]
        Xreduced = X[:, cluster_centers_indices]
        Xreduced_test = X_test[:, cluster_centers_indices]
        # Separate list object so that editing one header list does not change the other
        HeadersReduced_test = list(HeadersReduced)

        _print_reduction_sizes(X, Xreduced, X_test, Xreduced_test)
        print("- Number of features selected " + str(len(HeadersReduced)))

    elif FeatureReductionMethod == 'none':

        # No reduction: pass matrices and headers through
        Xreduced = X[:, :]
        HeadersReduced = XHeaders
        Xreduced_test = X_test[:, :]
        HeadersReduced_test = HeadersReduced

    else:
        # Without this guard an unsupported name would only surface as an UnboundLocalError at the return
        raise ValueError("Unknown FeatureReductionMethod %r; expected 'SVD', 'AffinityPropagation' or 'none'"
                         % (FeatureReductionMethod,))

    return Xreduced,HeadersReduced,Xreduced_test,HeadersReduced_test


In [None]:
def _export_reduced_table(outputFile, Xreduced, HeadersReduced, targetColumn, targetHeader):
    """Write the reduced features followed by the target column to a comma-delimited text file."""
    # featureSelection returns the headers either as a ready-made comma-separated string ('SVD')
    # or as a list of names ('AffinityPropagation', 'none'). str() on a list would write its
    # Python repr (e.g. "['a', 'b']") into the file header, so lists are joined explicitly.
    if isinstance(HeadersReduced, (list, tuple)):
        headerText = ",".join([str(name) for name in HeadersReduced])
    else:
        headerText = str(HeadersReduced)
    fullHeader = ",".join((headerText, targetHeader))

    # Append the target as the last column
    fullMatrix = np.append(Xreduced, targetColumn[:, None], 1)

    # NOTE(review): np.savetxt prefixes the header line with '# ' by default - confirm that the
    # consumers of this file expect that prefix.
    np.savetxt(outputFile, fullMatrix, header=fullHeader, delimiter=',')


def RunFeatureReduction(InputToFeatureReduction_train,OutputFromFeatureReduction_train,InputToFeatureReduction_test,\
                        OutputFromFeatureReduction_test,NormalizationMethod,FeatureReductionMethod,\
                       ExplainedVariance,APpreference):
    """Main function to run Feature Reduction on a train file and a test file

    The last column of each input file is treated as the output (target) column: it is left out
    of the feature reduction and appended, with its original values, to the reduced features.

    Parameters:
    -----------
    - InputToFeatureReduction_train: Input train file (.csv)
    - OutputFromFeatureReduction_train: Output train file (.csv)
    - InputToFeatureReduction_test: Input test file (.csv)
    - OutputFromFeatureReduction_test: Output test file (.csv)
    - NormalizationMethod: 'MinMax' or 'MeanStd'
    - FeatureReductionMethod: 'SVD', 'AffinityPropagation' or 'none'
    - ExplainedVariance: This applies to SVD, and it specifies the amount of variance we want to explain from input space.
    - APpreference: This applies to AffinityPropagation, it is forwarded as its "preference" argument.

    Returns:
    --------
    Nothing. Outputs are the files <OutputFromFeatureReduction_train> and <OutputFromFeatureReduction_test>
    """

    # Load train and test data from .csv numerical-only tables
    # ------------------------------------------------------------------------------------------------------------------------------
    InputMatrix_train, InputHeaders_train = load_numerical_table(InputToFeatureReduction_train)
    InputMatrix_test, InputHeaders_test = load_numerical_table(InputToFeatureReduction_test)

    # Feature space normalization (the whole matrices are handed to NormalizeData)
    # ------------------------------------------------------------------------------------------------------------------------------
    X_train, X_test = NormalizeData(InputMatrix_train, InputMatrix_test, NormalizationMethod)

    # Feature reduction: every column except the last one is a feature
    # ------------------------------------------------------------------------------------------------------------------------------
    Xinput_train = X_train[:, :-1]
    XHeaders_train = InputHeaders_train[:-1]
    Xinput_test = X_test[:, :-1]
    XHeaders_test = InputHeaders_test[:-1]
    Xreduced_train, HeadersReduced_train, Xreduced_test, HeadersReduced_test = featureSelection(
        Xinput_train, XHeaders_train, Xinput_test, XHeaders_test,
        FeatureReductionMethod, ExplainedVariance, APpreference)

    # Export .csv file for train set; the target column is taken from the un-normalized input matrix
    # -----------------------------------------------------------------------------------------------------------------------------
    _export_reduced_table(OutputFromFeatureReduction_train, Xreduced_train, HeadersReduced_train,
                          InputMatrix_train[:, -1], InputHeaders_train[-1])

    # Export .csv file for test set
    # -----------------------------------------------------------------------------------------------------------------------------
    _export_reduced_table(OutputFromFeatureReduction_test, Xreduced_test, HeadersReduced_test,
                          InputMatrix_test[:, -1], InputHeaders_test[-1])