In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import pairwise_distances 
from sklearn.feature_selection import mutual_info_classif

In [2]:
class DropDuplicate(BaseEstimator, TransformerMixin):
    """
    Drops duplicate columns
    
    """
        
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        """
        Argument: 
        X: Pandas Dataframe
        
        Identifies unique columns
        
        """
        # get the list of unique columns in idx
        _, idx = np.unique(X, axis=1, return_index=True)
        
        # set attributes as list of unique columns and length of the list 
        self.keepList_ = idx
        self.numUnique_ = len(idx)
        self.numDuplicate_ = len(X.columns) - self.numUnique_
        
        return self
    
    def transform(self, X):
        """
        Transforms X by dopping duplicate columns 
        
        """
        print("--->", self.numDuplicate_, "columns dropped due to duplicate values:\n")
        
        return X.iloc[:, self.keepList_]

In [75]:
def test_duplicate():
    test_data = pd.DataFrame({'a': [None]*9 + [1],
                              'b': np.ones(10),
                              'c': np.ones(10)})

    dropduplicate = DropDuplicate()
    
    result = dropduplicate.fit_transform(test_data)
    
    assert(list(result.columns)==['b','a'])

In [76]:
test_duplicate()

---> 1 columns dropped due to duplicate values:



In [3]:
class DropMissing(BaseEstimator, TransformerMixin):
    """
    Drops columns with missing values ratio above a threshold
    
    """
    
    def __init__(self, threshold=0.9):
        self.threshold = threshold
    
    def fit(self, X, y=None):
        """
        Argument: 
        X: Pandas dataframe
        
        """
        
        # find missing percentage for each column
        missing_pct = X.isnull().sum() / len(X)
        
        # identify columns with missing ratio above the threshold
        to_drop = list((missing_pct[missing_pct >= self.threshold]).index)
        
        # define attribute for list of columns to drop
        self.dropList_ = to_drop
        
        # define attribute for the number of columns to be dropped
        self.numMissingCol_ = len(self.dropList_)
        
        return self
    
    def transform(self, X):
        """
        Transforms X by dropping high missing columns
        
        """

        print("--->", self.numMissingCol_, "columns dropped due to missing values:\n", self.dropList_)

        # return a dataframe with dropped columns missing over threshold 
        return X.drop(columns=self.dropList_)

In [73]:
def test_missing():
    test_data = pd.DataFrame({'a': [None]*9 + [1],
                              'b': np.ones(10),
                              'c': range(10)})

    dropmissing = DropMissing(threshold=0.9)
    
    result = dropmissing.fit_transform(test_data)
    
    assert(result.equals(pd.DataFrame({'b': np.ones(10),'c': range(10)})) )

In [74]:
test_missing()

---> 1 columns dropped due to missing values:
 ['a']


In [4]:
class DropHighCorr(BaseEstimator, TransformerMixin):
    """
    Drops one of every two similar columns
    
    Acceptable metrics are: 
    
    ‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’
    ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘correlation’, ‘dice’, 
    ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, ‘minkowski’, 
    ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, 
    ‘sokalsneath’, ‘sqeuclidean’, ‘yule’
    
    """
    
    def __init__(self, threshold=0.9, metric = None):
        self.threshold = threshold
        self.metric = metric
    
    def fit(self, X, y=None):
        """
        Argument: 
        X: Pandas dataframe
        
        """        
        
        if self.metric is None:
            # Calculate correlation matrix
            sim_matrix = X.corr().abs()
        else:
            # calculate a similarity matrix
            sim_matrix = pairwise_distances(X.t, metric)

        # Subset to upper triangle of corr_matrix
        upper = sim_matrix.where(
            np.triu(np.ones(sim_matrix.shape), k=1).astype(np.bool))

        # Identify columns with correlation above threshold
        to_drop = [column for column in upper.columns if any(upper[column] >= self.threshold)]
        
        # define attribute for list of columns to drop
        self.dropList_ = to_drop
       
        # define attribute for the number of columns to be dropped
        self.numDropCols_ = len(self.dropList_)
        
        return self
    
    def transform(self, X):
        """
        Transforms X by dropping high missing columns
        
        """
        
        print("--->", self.numDropCols_, "columns dropped due to collinearity:\n", self.dropList_)

        # return a dataframe with dropped columns
        return X.drop(columns=self.dropList_)

In [71]:
def test_highcorr():
    drophighcorr = DropHighCorr(threshold=0.9)
    test_data = pd.DataFrame({'a': np.array(range(10)), 
                              'b': np.array(range(10)) + 0.05,
                              'c': np.ones(10) 
                             })
    result = drophighcorr.fit_transform(test_data)
    assert(list(result.columns) == ['a', 'c'])

In [72]:
test_highcorr()

---> 1 columns dropped due to collinearity:
 ['b']


In [5]:
class DropZeroCov(BaseEstimator, TransformerMixin):
    """
    Drops columns with a single unique value (zero variance)
    
    """
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        """
        Argument: 
        X: Pandas dataframe
        
        """                
        one_unique = X.apply(lambda x: x.nunique() == 1, axis=0)
        to_drop = list(one_unique[one_unique == True].index)
        
        self.dropList_ = to_drop
        self.numZeroCov_ = len(self.dropList_)
        
        return self
    
    def transform(self, X):
        """
        Transforms X by dropping columns with zero variance
        
        """
        print("--->", self.numZeroCov_, 'columns dropped due to zero variance:\n', self.dropList_)

        # return a dataframe without columns with a single unique value 
        return X.drop(columns=self.dropList_)

In [77]:
def test_zerocov():
    dropzerocov = DropZeroCov()
    test_data = pd.DataFrame({'a': np.array(range(10)), 
                              'b': np.array(range(10)) + 0.05,
                              'c': np.ones(10) 
                             })
    result = dropzerocov.fit_transform(test_data)
    assert(list(result.columns) == ['a', 'b'])

In [78]:
test_zerocov()

---> 1 columns dropped due to zero variance:
 ['c']


In [58]:
test_data = pd.DataFrame({'a':np.random.rand(10), 
                     'b': np.random.rand(10),
                     'c': np.random.rand(10),
                     'd': [None]*9 + [1],
                     'e': np.ones(10)*2+np.random.rand(10)/10,
                     'f': np.ones(10)*2+np.random.rand(10)/10,
                     'g': np.ones(10),
                     'h': range(10),
                     'i': range(10)})
test_data

Unnamed: 0,a,b,c,d,e,f,g,h,i
0,0.226037,0.77699,0.99764,,2.015994,2.033849,1.0,0,0
1,0.990756,0.886384,0.529057,,2.002938,2.093631,1.0,1,1
2,0.032307,0.015095,0.125357,,2.002152,2.035927,1.0,2,2
3,0.866544,0.274767,0.850547,,2.058378,2.091117,1.0,3,3
4,0.0256,0.761403,0.375346,,2.09769,2.031013,1.0,4,4
5,0.280362,0.405906,0.499647,,2.086648,2.057108,1.0,5,5
6,0.457048,0.218581,0.001806,,2.058269,2.001504,1.0,6,6
7,0.243567,0.654431,0.891285,,2.056723,2.03852,1.0,7,7
8,0.464741,0.484516,0.711038,,2.062744,2.049865,1.0,8,8
9,0.356416,0.178853,0.794608,1.0,2.051981,2.020661,1.0,9,9


In [59]:
prep_pipeline = Pipeline([
    ('drop_duplicate', DropDuplicate()),
    ('drop_missing', DropMissing(threshold=0.9)),
    ('drop_zerocov', DropZeroCov()),
    ('drop_correlated', DropHighCorr(threshold=0.6))
])

In [60]:
prep_pipeline.fit_transform(test_data)

---> 1 columns dropped due to duplicate values:

---> 1 columns dropped due to missing values:
 ['d']
---> 1 columns dropped due to zero variance:
 ['g']
---> 1 columns dropped due to collinearity:
 ['f']


Unnamed: 0,h,a,b,c,e
0,0,0.226037,0.77699,0.99764,2.015994
1,1,0.990756,0.886384,0.529057,2.002938
2,2,0.032307,0.015095,0.125357,2.002152
3,3,0.866544,0.274767,0.850547,2.058378
4,4,0.0256,0.761403,0.375346,2.09769
5,5,0.280362,0.405906,0.499647,2.086648
6,6,0.457048,0.218581,0.001806,2.058269
7,7,0.243567,0.654431,0.891285,2.056723
8,8,0.464741,0.484516,0.711038,2.062744
9,9,0.356416,0.178853,0.794608,2.051981


In [22]:
class MISelector(BaseEstimator, TransformerMixin):
    """
    Selects most important features by estimating mutual information for a discrete target variable  
    num_feat can be given as an integer or as a float
    If num_feat is given as an integer e.g. 10, it will be used to select top 10 features 
    If given as a float e.g. 0.8, it will be used to select enough features to cover 80% of total feature importance
    
    """
    
    def __init__(self, num_feat=100):
        self.num_feat = num_feat
        """
        Arguments:
        num_feat: is an integer if user specifies the number of features or a float between 0 and 1 if user 

        """
    def fit(self, X, y=None):
        """
        Use Scikit-learn's mutual_info_classif to estimate mutual information between each feature and the target.
        
        Arguments:
        X: 2D Pandas dataframe
        y: 1D Pandas dataframe
        
        """
        
        # Impute mean for missing values
        X = X.apply(lambda x: x.fillna(x.mean()),axis=0)
        
        # Random Forest model
        self.mi_ = mutual_info_classif(X, y)
        self.mi_ = self.mi_/sum(self.mi_)
        
        # Identify top features
        dic = dict(zip(X.columns, self.mi_))
        self.features_ordered_ = [k for k in sorted(dic, key=dic.get, reverse=True)]
        self.mi_ordered_ = [dic[k] for k in features_ordered]

        if isinstance(self.num_feat, int):
            # if user selected a specific number of fearures 
            self.top_feature_list_ = self.features_ordered_[:self.num_feat]
        else:
            # if the user specified a threshold of overall imporatance 
            cumsum = np.cumsum(self.mi_ordered_)
            self.num_feat_selected_ = np.argmax(cumsum>=self.num_feat) + 1
            self.top_feature_list_ = self.features_ordered_[:self.num_feat_selected]
            
        return self
    
    def transform(self, X):
        """
        Returns a reduced Dataframe
        
        """
    
        return X[self.top_feature_list_]

In [29]:
class RFSelector(BaseEstimator, TransformerMixin):
    """
    Selects most important features by training a random forest model 
    num_feat can be given as an integer or as a float
    If num_feat is given as an integer e.g. 10, it will be used to select top 10 features 
    If given as a float e.g. 0.8, it will be used to select enough features to cover 80% of total feature importance
    
    """
    
    def __init__(self, num_feat=100):
        self.num_feat = num_feat
        """
        Arguments:
        num_feat: is an integer if user specifies the number of features or a float between 0 and 1 if user 

        """
    def fit(self, X, y=None):
        """
        Fit a Random Forest model to data to find most important features
        
        Arguments:
        X: 2D Pandas dataframe
        y: 1D Pandas dataframe
        
        """
        
        # Impute mean for missing values
        X = X.apply(lambda x: x.fillna(x.mean()),axis=0)
        
        # Random Forest model
        rf = RandomForestClassifier()
        rf.fit(X, y)
        
        # Identify top features
        dic = dict(zip(X.columns, rf.feature_importances_))
        self.features_ordered_ = [k for k in sorted(dic, key=dic.get, reverse=True)]
        self.importance_ordered_ = [dic[k] for k in features_ordered]

        if isinstance(self.num_feat, int):
            # if user selected a specific number of fearures 
            self.top_feature_list_ = self.features_ordered_[:self.num_feat]
        else:
            # if the user specified a threshold of overall imporatance 
            cumsum = np.cumsum(self.importance_ordered_)
            self.num_feat_selected_ = np.argmax(cumsum>=self.num_feat) + 1
            self.top_feature_list_ = self.features_ordered_[:self.num_feat_selected]
            
        return self
    
    def transform(self, X):
        """
        Returns a reduced Dataframe
        
        """
    
        return X[self.top_feature_list_]

In [12]:
test_pipeline = Pipeline([
    ('drop_missing', DropMissing(threshold=0.9)),
    ('drop_zerocov', DropZeroCov()),
    ('drop_correlated', DropHighCorr(threshold=0.9)),
    ('RF_selector', RFSelector(num_feat=10))
])

In [4]:
# Reading output of Featuretools 
fm = pd.read_csv('featurematrix20190104')

In [5]:
fm = fm.dropna(subset=['TARGET'])

# Getting a sample
fm = fm.sample(n=50000)

# One-hot-encoding
fm = pd.get_dummies(fm)

In [6]:
print("Number of columns in data after feature engineering: ", fm.shape[1])

Number of columns in data after feature engineering:  1087


In [7]:
X = fm.drop(columns=['TARGET', 'SK_ID_CURR'])
y = fm['TARGET']

In [None]:
# test MIselector
miselector = MISelector()
X_red = miselector.fit_transform(X, y)

In [1]:
t1 = time.time()
fm_reduced = test_pipeline.fit_transform(X, y)
t2 = time.time() - t1
print(t2, 'seconds')

NameError: name 'time' is not defined

In [23]:
fm_reduced.head()

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH,EXT_SOURCE_1,DAYS_ID_PUBLISH,DAYS_REGISTRATION,MEAN(prev_app.AMT_ANNUITY),MEAN(bureau.DAYS_CREDIT),AMT_ANNUITY,MAX(bureau.DAYS_CREDIT_ENDDATE)
85199,0.514518,0.556727,-20514,,-2600,-4269.0,26099.205,-1100.5,51948.0,2242.0
165132,0.678678,0.822499,-14920,0.518546,-2920,-8885.0,14748.052,-2913.0,17874.0,-2366.0
214844,0.561105,0.301625,-14855,0.620403,-4659,-957.0,7359.39,-861.833333,20281.5,839.0
285556,0.608568,0.7463,-25139,,-4922,-9783.0,16053.593,-1434.0,23364.0,-1130.0
270656,0.724231,0.092617,-17444,,-4261,-1284.0,7683.847,-1352.583333,24750.0,1514.0


In [24]:
print(test_pipeline.named_steps['drop_missing'].numMissingCol_)
print(test_pipeline.named_steps['drop_zerocov'].numZeroCov_)
print(test_pipeline.named_steps['drop_correlated'].numDropCols_)

6
19
454
