# Feature Selection

In [1]:
import mlrun

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [3]:
# nuclio: start-code

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json

# Feature selection strategies
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel

# Model based feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Scale feature scores
from sklearn.preprocessing import MinMaxScaler

# SKLearn estimators list
from sklearn.utils import all_estimators

# MLRun utils
from mlrun.mlutils.plots import gcf_clear
from mlrun.utils.helpers import create_class
from mlrun.artifacts import PlotArtifact

In [5]:
def show_values_on_bars(axs, h_v="v", space=0.4):
    def _show_on_single_plot(ax):
        if h_v == "v":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() / 2
                _y = p.get_y() + p.get_height()
                value = int(p.get_height())
                ax.text(_x, _y, value, ha="center") 
        elif h_v == "h":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() + float(space)
                _y = p.get_y() + p.get_height()
                value = int(p.get_width())
                ax.text(_x, _y, value, ha="left")

    if isinstance(axs, np.ndarray):
        for idx, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)

In [6]:
def plot_stat(context,
              stat_name,
              stat_df):
    gcf_clear(plt)
        
    # Add chart
    ax = plt.axes()
    stat_chart = sns.barplot(x=stat_name, 
                            y='index', 
                            data=stat_df.sort_values(stat_name, ascending=False).reset_index(), 
                            ax=ax)
    plt.tight_layout()

    for p in stat_chart.patches:
        width = p.get_width()
        plt.text(5+p.get_width(), p.get_y()+0.55*p.get_height(),
                 '{:1.2f}'.format(width),
                 ha='center', va='center')

    context.log_artifact(PlotArtifact(f'{stat_name}', body=plt.gcf()), 
                         local_path=os.path.join('plots', 'feature_selection', f'{stat_name}.html'))
    gcf_clear(plt)

In [7]:
def feature_selection(context,
                      df_artifact,
                      k=2,
                      min_votes=0.5,
                      label_column: str = 'Y',
                      stat_filters = ['f_classif', 'mutual_info_classif', 'chi2', 'f_regression'],
                      model_filters = {'LinearSVC': 'LinearSVC', 
                                       'LogisticRegression': 'LogisticRegression', 
                                       'ExtraTreesClassifier': 'ExtraTreesClassifier'},
                      max_scaled_scores = True):
    """Applies selected feature selection statistical functions 
    or models on our 'df_artifact'.
    
    Each statistical function or model will vote for it's best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    :param context:           the function context
    :param k:                 number of top features to select from each statistical
                              function or model
    :param min_votes:         minimal number of votes (from a model or by statistical
                              function) needed for a feature to be selected.
                              Can be specified by percentage of votes or absolute 
                              number of votes
    :param label_column:      ground-truth (y) labels
    :param stat_filters:      statistical functions to apply to the features
                              (from sklearn.feature_selection)
    :param model_filters:     models to use for feature evaluation, can be specified by
                              model name (ex. LinearSVC), formalized json (contains 'CLASS', 
                              'FIT', 'META') or a path to such json file.
    :param max_scaled_scores: produce feature scores table scaled with max_scaler                            
    """
    
    # Read input DF
    df = df_artifact.as_df()
    
    # Drop nan's and inf's for our calculations
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    
    # Set feature vector and labels
    y = df.pop(label_column)
    X = df
    
    # Create selected statistical estimators
    stat_functions_list = {stat_name:SelectKBest(create_class(f'sklearn.feature_selection.{stat_name}'), k) 
                               for stat_name in stat_filters}
    requires_abs = ['chi2']
    
    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns)
    for stat_name, stat_func in stat_functions_list.items():
        try:
            # Compute statistics
            params = (X, y) if stat_name in requires_abs else (np.abs(X), y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(index=X.columns,
                                   columns=[stat_name],
                                   data=stat.scores_)
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
        except Exception as e:
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")
                
    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if '.json' in model:
            current_model = json.load(open(model, 'r'))
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model_name]()
        else:
            try:
                current_model = json.loads(model) if isinstance(model, str) else current_model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
            except:
                context.logger.info(f'unable to load {model}')
        
    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X,y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()
                
        # Collect model feature importance
        if hasattr(select_from_model.estimator_, 'coef_'):
            stat_df = select_from_model.estimator_.coef_
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            stat_df = select_from_model.estimator_.feature_importances_
        stat_df = pd.DataFrame(index=X.columns,
                               columns=[model_name],
                               data=stat_df[0])
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)
    
    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(key='feature_scores', 
                        df=result_matrix_df,
                        local_path='feature_scores.parquet',
                        format='parquet')
    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(data=normalized_df,
                                     columns=result_matrix_df.columns,
                                     index=result_matrix_df.index)
        context.log_dataset(key='max_scaled_scores_feature_scores', 
                            df=normalized_df,
                            local_path='max_scaled_scores_feature_scores.parquet',
                            format='parquet')
    
    # Create feature count DataFrame
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [1 if x in selected_features_agg[test_name] else 0 for x in X.columns]
    result_matrix_df.loc[:,'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(key='selected_features_count', 
                        df=result_matrix_df,
                        local_path='selected_features_count.parquet',
                        format='parquet')
    
    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')
    
    # Create final feature dataframe
    selected_features = result_matrix_df[result_matrix_df.num_votes>=votes_needed].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df,y], axis=1)
    context.log_dataset(key='selected_features',
                        df=final_df,
                        local_path='selected_features.parquet',
                        format='parquet')

In [8]:
# nuclio: end-code

## Test

In [9]:
from mlrun import code_to_function, mount_v3io, mlconf, NewTask, run_local

In [10]:
mlconf.artifact_path = os.path.abspath('./artifacts')
mlconf.db_path = 'http://mlrun-api:8080'

### Local Test

In [11]:
task = NewTask(params={'k': 2,
                       'min_votes': 0.3,
                       'label_column': 'is_error'},
               inputs={'df_artifact': os.path.abspath('data/metrics.pq')})

In [12]:
runl = run_local(task=task,
          name='feature_selection',
          handler=feature_selection,
          artifact_path=os.path.join(os.path.abspath('./'), 'artifacts'))

> 2021-06-10 12:55:47,338 [info] starting run feature_selection uid=bcf7669f839147798ff84c6e2934bdbb DB=http://mlrun-api:8080


Pass k=2 as keyword args. From version 0.25 passing these as positional arguments will result in an error
Liblinear failed to converge, increase the number of iterations.
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


> 2021-06-10 12:55:50,106 [info] votes needed to be selected: 2


Converting input from bool to <class 'numpy.uint8'> for compatibility.


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...2934bdbb,0,Jun 10 12:55:47,completed,feature_selection,v3io_user=orzkind=handlerowner=orzhost=jupyter-orz-67f66877b6-66j9z,df_artifact,k=2min_votes=0.3label_column=is_error,,f_classifmutual_info_classifchi2f_regressionLinearSVCLogisticRegressionExtraTreesClassifierfeature_scoresmax_scaled_scores_feature_scoresselected_features_countselected_features


to track results use .show() or .logs() or in CLI: 
!mlrun get run bcf7669f839147798ff84c6e2934bdbb --project default , !mlrun logs bcf7669f839147798ff84c6e2934bdbb --project default
> 2021-06-10 12:55:50,335 [info] run executed, status=completed


## Job Test

In [13]:
fn = code_to_function(name='feature_selection',
                      handler='feature_selection')
fn.spec.default_handler = 'feature_selection'
fn.spec.description = "Select features through multiple Statistical and Model filters"
fn.metadata.categories = ['data-prep', 'ml']
fn.metadata.labels = {"author": "orz"}
fn.export('function.yaml')
fn.apply(mount_v3io())

> 2021-06-10 12:55:59,218 [info] function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f25f84d4310>

In [14]:
fn_run = fn.run(task)

> 2021-06-10 12:55:59,231 [info] starting run feature-selection-feature_selection uid=843dcd7678cb4d66a258494e0dc2b7ce DB=http://mlrun-api:8080
> 2021-06-10 12:55:59,352 [info] Job is running in the background, pod: feature-selection-feature-selection-676s2
> 2021-06-10 12:56:08,394 [info] votes needed to be selected: 2
> 2021-06-10 12:56:08,502 [info] run executed, status=completed
Pass k=2 as keyword args. From version 0.25 passing these as positional arguments will result in an error
Liblinear failed to converge, increase the number of iterations.
Converting input from bool to <class 'numpy.uint8'> for compatibility.
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...0dc2b7ce,0,Jun 10 12:56:05,completed,feature-selection-feature_selection,v3io_user=orzkind=jobowner=orzhost=feature-selection-feature-selection-676s2,df_artifact,k=2min_votes=0.3label_column=is_error,,f_classifmutual_info_classifchi2f_regressionLinearSVCLogisticRegressionExtraTreesClassifierfeature_scoresmax_scaled_scores_feature_scoresselected_features_countselected_features


to track results use .show() or .logs() or in CLI: 
!mlrun get run 843dcd7678cb4d66a258494e0dc2b7ce --project default , !mlrun logs 843dcd7678cb4d66a258494e0dc2b7ce --project default
> 2021-06-10 12:56:11,578 [info] run executed, status=completed


In [15]:
mlrun.get_dataitem(fn_run.spec.inputs['df_artifact']).as_df()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,latency,packet_loss,throughput,is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-04-27 14:46:46.780,Smith_Group,Denise_Crest,5124209057231,75.598891,0.000000,0.000000,252.445971,False
2021-04-27 14:46:46.780,Smith_Group,Denise_Crest,2891755865712,50.090373,3.280849,0.000000,229.889187,False
2021-04-27 14:46:46.780,Smith_Group,Debra_Gateway,0388020295311,73.243063,9.372341,2.170138,260.883807,False
2021-04-27 14:46:46.780,Smith_Group,Debra_Gateway,9633813691441,60.830420,12.241878,2.295717,244.238613,False
2021-04-27 14:46:46.780,Ferrell_Ltd,Murphy_Meadow,1517129765931,72.647964,0.535463,0.000000,212.944943,False
...,...,...,...,...,...,...,...,...
2021-04-27 15:46:46.780,Smith_Group,Debra_Gateway,9633813691441,77.875954,3.250584,0.000000,245.150281,False
2021-04-27 15:46:46.780,Ferrell_Ltd,Murphy_Meadow,1517129765931,77.831459,0.000000,0.000000,235.109321,False
2021-04-27 15:46:46.780,Ferrell_Ltd,Murphy_Meadow,6964486699383,55.978514,2.977447,0.533963,277.622402,False
2021-04-27 15:46:46.780,Ferrell_Ltd,Nicholas_Estate,8002897098167,58.265446,4.090207,2.048268,272.717982,False


In [16]:
mlrun.get_dataitem(fn_run.outputs['feature_scores']).as_df()

Unnamed: 0,f_classif,mutual_info_classif,chi2,f_regression,LinearSVC,LogisticRegression,ExtraTreesClassifier
cpu_utilization,2520.015809,0.178281,4457.42936,2520.015809,-0.043911,0.232309,0.019217
latency,10152.151995,0.196697,272872.890194,10152.151995,0.031839,0.063963,0.019217
packet_loss,14120.490547,0.212439,157191.427524,14120.490547,0.05074,0.22373,0.019217
throughput,20421.72103,0.234061,109129.511665,20421.72103,-0.015724,-0.093029,0.019217


In [17]:
mlrun.get_dataitem(fn_run.outputs['selected_features']).as_df()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,packet_loss,throughput,is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-04-27 14:46:46.780,Smith_Group,Denise_Crest,5124209057231,75.598891,0.000000,252.445971,False
2021-04-27 14:46:46.780,Smith_Group,Denise_Crest,2891755865712,50.090373,0.000000,229.889187,False
2021-04-27 14:46:46.780,Smith_Group,Debra_Gateway,0388020295311,73.243063,2.170138,260.883807,False
2021-04-27 14:46:46.780,Smith_Group,Debra_Gateway,9633813691441,60.830420,2.295717,244.238613,False
2021-04-27 14:46:46.780,Ferrell_Ltd,Murphy_Meadow,1517129765931,72.647964,0.000000,212.944943,False
...,...,...,...,...,...,...,...
2021-04-27 15:46:46.780,Smith_Group,Debra_Gateway,9633813691441,77.875954,0.000000,245.150281,False
2021-04-27 15:46:46.780,Ferrell_Ltd,Murphy_Meadow,1517129765931,77.831459,0.000000,235.109321,False
2021-04-27 15:46:46.780,Ferrell_Ltd,Murphy_Meadow,6964486699383,55.978514,0.533963,277.622402,False
2021-04-27 15:46:46.780,Ferrell_Ltd,Nicholas_Estate,8002897098167,58.265446,2.048268,272.717982,False
