# Feature Selection

In [1]:
import nuclio

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [3]:
# nuclio: start-code

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json

# Feature selection strategies
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel

# Model based feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Scale feature scores
from sklearn.preprocessing import MinMaxScaler

# SKLearn estimators list
from sklearn.utils import all_estimators

# MLRun utils
from mlrun.mlutils import create_class, gcf_clear
from mlrun.artifacts import PlotArtifact

In [5]:
def show_values_on_bars(axs, h_v="v", space=0.4):
    def _show_on_single_plot(ax):
        if h_v == "v":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() / 2
                _y = p.get_y() + p.get_height()
                value = int(p.get_height())
                ax.text(_x, _y, value, ha="center") 
        elif h_v == "h":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() + float(space)
                _y = p.get_y() + p.get_height()
                value = int(p.get_width())
                ax.text(_x, _y, value, ha="left")

    if isinstance(axs, np.ndarray):
        for idx, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)

In [6]:
def plot_stat(context,
              stat_name,
              stat_df):
    gcf_clear(plt)
        
    # Add chart
    ax = plt.axes()
    stat_chart = sns.barplot(x=stat_name, 
                            y='index', 
                            data=stat_df.sort_values(stat_name, ascending=False).reset_index(), 
                            ax=ax)
    plt.tight_layout()

    for p in stat_chart.patches:
        width = p.get_width()
        plt.text(5+p.get_width(), p.get_y()+0.55*p.get_height(),
                 '{:1.2f}'.format(width),
                 ha='center', va='center')

    context.log_artifact(PlotArtifact(f'{stat_name}', body=plt.gcf()), 
                         local_path=os.path.join('plots', 'feature_selection', f'{stat_name}.html'))
    gcf_clear(plt)

In [1]:
def feature_selection(context,
                      df_artifact,
                      k=2,
                      min_votes=0.5,
                      label_column: str = 'Y',
                      stat_filters = ['f_classif', 'mutual_info_classif', 'chi2', 'f_regression'],
                      model_filters = {'LinearSVC': 'LinearSVC', 
                                       'LogisticRegression': 'LogisticRegression', 
                                       'ExtraTreesClassifier': 'ExtraTreesClassifier'},
                      max_scaled_scores = True):
    """Applies selected feature selection statistical functions 
    or models on our 'df_artifact'.
    
    Each statistical function or model will vote for it's best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    :param context:           the function context
    :param k:                 number of top features to select from each statistical
                              function or model
    :param min_votes:         minimal number of votes (from a model or by statistical
                              function) needed for a feature to be selected.
                              Can be specified by percentage of votes or absolute 
                              number of votes
    :param label_column:      ground-truth (y) labels
    :param stat_filters:      statistical functions to apply to the features
                              (from sklearn.feature_selection)
    :param model_filters:     models to use for feature evaluation, can be specified by
                              model name (ex. LinearSVC), formalized json (contains 'CLASS', 
                              'FIT', 'META') or a path to such json file.
    :param max_scaled_scores: produce feature scores table scaled with max_scaler                            
    """
    
    # Read input DF
    df_path = str(df_artifact)
    context.logger.info(f'input dataset {df_path}')
    if df_path.endswith('csv'):
        df = pd.read_csv(df_path)
    elif df_path.endswith('parquet') or df_path.endswith('pq'):
        df = pd.read_parquet(df_path)
    
    # Set feature vector and labels
    y = df.pop(label_column)
    X = df
    
    # Create selected statistical estimators
    stat_functions_list = {stat_name:SelectKBest(create_class(f'sklearn.feature_selection.{stat_name}'), k) 
                               for stat_name in stat_filters}
    requires_abs = ['chi2']
    
    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns)
    for stat_name, stat_func in stat_functions_list.items():
        try:
            # Compute statistics
            params = (X, y) if stat_name in requires_abs else (abs(X), y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(index=X.columns,
                                   columns=[stat_name],
                                   data=stat.scores_)
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
        except Exception as e:
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")
                
    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if '.json' in model:
            current_model = json.load(open(model, 'r'))
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model_name]()
        else:
            try:
                current_model = json.loads(model) if isinstance(model, str) else current_model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
            except:
                context.logger.info(f'unable to load {model}')
        
    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X,y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()
                
        # Collect model feature importance
        if hasattr(select_from_model.estimator_, 'coef_'):
            stat_df = select_from_model.estimator_.coef_
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            stat_df = select_from_model.estimator_.feature_importances_
        stat_df = pd.DataFrame(index=X.columns,
                               columns=[model_name],
                               data=stat_df[0])
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)
    
    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(key='feature_scores', 
                        df=result_matrix_df,
                        local_path='feature_scores.parquet',
                        format='parquet')
    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(data=normalized_df,
                                     columns=result_matrix_df.columns,
                                     index=result_matrix_df.index)
        context.log_dataset(key='max_scaled_scores_feature_scores', 
                            df=normalized_df,
                            local_path='max_scaled_scores_feature_scores.parquet',
                            format='parquet')
    
    # Create feature count DataFrame
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [1 if x in selected_features_agg[test_name] else 0 for x in X.columns]
    result_matrix_df.loc[:,'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(key='selected_features_count', 
                        df=result_matrix_df,
                        local_path='selected_features_count.parquet',
                        format='parquet')
    
    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')
    
    # Create final feature dataframe
    selected_features = result_matrix_df[result_matrix_df.num_votes>=votes_needed].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df,y], axis=1)
    context.log_dataset(key='selected_features',
                        df=final_df,
                        local_path='selected_features.parquet',
                        format='parquet')

In [8]:
# nuclio: end-code

## Test

In [3]:
from mlrun import code_to_function, mount_v3io, mlconf, NewTask, run_local

In [None]:
mlconf.artifact_path = os.path.abspath('./artifacts')
mlconf.db_path = 'http://mlrun-api:8080'

### Local Test

In [11]:
task = NewTask(params={'k': 2,
                       'min_votes': 0.3,
                       'label_column': 'is_error'},
               inputs={'df_artifact': '/User/demo-network-operations/data/metrics.parquet'})

In [12]:
runl = run_local(task=task,
          name='feature_selection',
          handler=feature_selection,
          artifact_path=os.path.join(os.path.abspath('./'), 'artifacts'))

[mlrun] 2020-04-12 12:28:08,160 starting run feature_selection uid=558aa6cf639d4e9eab6c8d6020f45962  -> http://10.194.95.255:8080


  f = msb / msw
posx and posy should be finite values
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[mlrun] 2020-04-12 12:28:08,333 input dataset /User/demo-network-operations/data/metrics.parquet
[mlrun] 2020-04-12 12:28:10,756 log artifact f_classif at /User/functions/feature_selection/artifacts/plots/feature_selection/f_classif.html, size: 22020, db: Y
[mlrun] 2020-04-12 12:28:13,489 log artifact mutual_info_classif at /User/functions/feature_selection/artifacts/plots/feature_selection/mutual_info_classif.html, size: 13738, db: Y
[mlrun] 2020-04-12 12:28:14,731 log artifact chi2 at /User/functions/feature_selection/artifacts/plots/feature_selection/chi2.html, size: 21563, db: Y
[mlrun] 2020-04-12 12:28:16,084 log artifact f_regression at /User/functions/feature_selection/artifacts/plots/feature_selection/f_regression.html, size: 23635, db: Y
[mlrun] 2020-04-12 12:28:17,914 log artifact LinearSVC at /User/functions/feature_selection/artifacts/plots/feature_selection/LinearSVC.html, size: 13296, db: Y
[mlrun] 2020-04-12 12:28:22,923 log artifact LogisticRegression at /User/functions

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...20f45962,0,Apr 12 12:28:08,completed,feature_selection,v3io_user=adminkind=handlerowner=adminhost=jupyter-76c7c6dbb5-vsqfd,df_artifact,k=2min_votes=0.3label_column=is_error,,f_classifmutual_info_classifchi2f_regressionLinearSVCLogisticRegressionExtraTreesClassifierfeature_scoresmax_scaled_scores_feature_scoresselected_features_countselected_features


to track results use .show() or .logs() or in CLI: 
!mlrun get run 558aa6cf639d4e9eab6c8d6020f45962 --project default , !mlrun logs 558aa6cf639d4e9eab6c8d6020f45962 --project default
[mlrun] 2020-04-12 12:28:26,989 run executed, status=completed


## Job Test

In [5]:
fn = code_to_function(name='feature_selection',
                      handler='feature_selection')
fn.spec.default_handler = 'feature_selection'
fn.spec.description = "Select features through multiple Statistical and Model filters"
fn.metadata.categories = ['data-prep', 'ml']
fn.metadata.labels = {"author": "orz"}
fn.export('function.yaml')
fn.apply(mount_v3io())

[mlrun] 2020-05-20 16:48:40,196 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f3a37606ef0>

In [20]:
fn.run(task)

[mlrun] 2020-04-12 13:24:01,198 starting run feature-selection-feature_selection uid=27215a4474d34055ba376f15fe092a58  -> http://10.194.95.255:8080
[mlrun] 2020-04-12 13:24:02,000 Job is running in the background, pod: feature-selection-feature-selection-8rkr5
  f = msb / msw
posx and posy should be finite values
[mlrun] 2020-04-12 13:24:47,709 input dataset /User/demo-network-operations/data/metrics.parquet
[mlrun] 2020-04-12 13:24:49,249 log artifact f_classif at /User/functions/feature_selection/artifacts/27215a4474d34055ba376f15fe092a58/plots/feature_selection/f_classif.html, size: 42884, db: Y
[mlrun] 2020-04-12 13:24:51,930 log artifact mutual_info_classif at /User/functions/feature_selection/artifacts/27215a4474d34055ba376f15fe092a58/plots/feature_selection/mutual_info_classif.html, size: 30250, db: Y
[mlrun] 2020-04-12 13:24:53,046 log artifact chi2 at /User/functions/feature_selection/artifacts/27215a4474d34055ba376f15fe092a58/plots/feature_selection/chi2.html, size: 42515, db

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...fe092a58,0,Apr 12 13:24:47,completed,feature-selection-feature_selection,host=feature-selection-feature-selection-8rkr5kind=jobowner=adminv3io_user=admin,df_artifact,k=2label_column=is_errormin_votes=0.3,,f_classifmutual_info_classifchi2f_regressionLinearSVCLogisticRegressionExtraTreesClassifierfeature_scoresmax_scaled_scores_feature_scoresselected_features_countselected_features


to track results use .show() or .logs() or in CLI: 
!mlrun get run 27215a4474d34055ba376f15fe092a58  , !mlrun logs 27215a4474d34055ba376f15fe092a58 
[mlrun] 2020-04-12 13:25:13,202 run executed, status=completed


<mlrun.model.RunObject at 0x7f4ebc1210b8>

In [21]:
pd.read_parquet(runl.spec.inputs['df_artifact'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,cpu_utilization_is_error,latency,latency_is_error,packet_loss,packet_loss_is_error,throughput,throughput_is_error,is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-03-29 19:22:10.106,Johnson-Morgan,Glenn_Port,6625659405376,69.242828,False,10.122039,False,0.000000,False,259.837367,False,False
2020-03-29 19:22:10.106,Johnson-Morgan,Glenn_Port,0306839395881,55.539413,False,0.000000,False,2.338992,False,277.264491,False,False
2020-03-29 19:22:10.106,Johnson-Morgan,Baker_Locks,9686333640344,54.234624,False,1.099465,False,0.535960,False,214.538381,False,False
2020-03-29 19:22:10.106,Johnson-Morgan,Baker_Locks,6135824620701,83.379805,False,0.000000,False,0.973531,False,210.010873,False,False
2020-03-29 19:22:10.106,Romero-Perry,Kim_Locks,9598503476170,81.713657,False,0.000000,False,0.817911,False,239.972481,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-29 20:22:10.106,Johnson-Morgan,Baker_Locks,6135824620701,66.323627,False,0.000000,False,0.169410,False,208.415531,False,False
2020-03-29 20:22:10.106,Romero-Perry,Kim_Locks,9598503476170,54.032799,False,0.000000,False,0.000000,False,249.860566,False,False
2020-03-29 20:22:10.106,Romero-Perry,Kim_Locks,6733246376493,76.085686,False,0.000000,False,0.000000,False,233.559866,False,False
2020-03-29 20:22:10.106,Romero-Perry,Moore_Gateway,5925984923860,67.469579,False,0.156262,False,0.000000,False,253.169495,False,False


In [22]:
pd.read_parquet(runl.outputs['feature_scores'])

Unnamed: 0,f_classif,mutual_info_classif,chi2,f_regression,LinearSVC,LogisticRegression,ExtraTreesClassifier
cpu_utilization,2582.004,0.189574,4639.35124,2582.004,-0.139358,-0.154484,0.006194
cpu_utilization_is_error,10259.46,0.203149,3243.240561,10259.46,0.025081,0.056447,0.006194
latency,9161.905,0.200192,263740.808988,9161.905,0.068929,0.103105,0.006194
latency_is_error,15799.42,0.221002,3769.368661,15799.42,0.025081,0.049925,0.006194
packet_loss,14263.52,0.222967,157171.40272,14263.52,0.138662,0.201053,0.006194
packet_loss_is_error,30645.12,0.243446,4391.686839,30645.12,0.083481,0.102485,0.006194
throughput,19371.76,0.238092,115449.93169,19371.76,-0.028992,-0.163124,0.006194
throughput_is_error,inf,0.282466,5298.0,-6.491939e+18,0.996251,5.250064,0.006194


In [23]:
pd.read_parquet(runl.outputs['max_scaled_scores_feature_scores'])

Unnamed: 0,f_classif,mutual_info_classif,chi2,f_regression,LinearSVC,LogisticRegression,ExtraTreesClassifier
cpu_utilization,0.0,0.0,0.005359,1.0,0.0,0.001596,0.0
cpu_utilization_is_error,0.273578,0.14614,0.0,1.0,0.144802,0.040562,0.0
latency,0.234468,0.114307,1.0,1.0,0.183414,0.049182,0.0
latency_is_error,0.470989,0.338327,0.00202,1.0,0.144802,0.039357,0.0
packet_loss,0.416259,0.359483,0.590901,1.0,0.24482,0.067276,0.0
packet_loss_is_error,1.0,0.579949,0.004409,1.0,0.196229,0.049067,0.0
throughput,0.598285,0.522302,0.43074,1.0,0.097186,0.0,0.0
throughput_is_error,,1.0,0.007888,0.0,1.0,1.0,0.0


In [24]:
pd.read_parquet(runl.outputs['selected_features_count'])

Unnamed: 0,f_classif,mutual_info_classif,chi2,f_regression,LinearSVC,LogisticRegression,ExtraTreesClassifier,num_votes
cpu_utilization,0,0,0,0,0,0,0,0
cpu_utilization_is_error,0,0,0,0,0,0,0,0
latency,0,0,1,0,0,0,0,1
latency_is_error,0,0,0,0,0,0,1,1
packet_loss,0,0,1,0,0,0,1,2
packet_loss_is_error,1,1,0,1,0,0,1,4
throughput,0,0,0,1,0,0,1,2
throughput_is_error,1,1,0,0,1,1,1,5


In [25]:
pd.read_parquet(runl.outputs['selected_features'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,packet_loss,packet_loss_is_error,throughput,throughput_is_error,is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-29 19:22:10.106,Johnson-Morgan,Glenn_Port,6625659405376,0.000000,False,259.837367,False,False
2020-03-29 19:22:10.106,Johnson-Morgan,Glenn_Port,0306839395881,2.338992,False,277.264491,False,False
2020-03-29 19:22:10.106,Johnson-Morgan,Baker_Locks,9686333640344,0.535960,False,214.538381,False,False
2020-03-29 19:22:10.106,Johnson-Morgan,Baker_Locks,6135824620701,0.973531,False,210.010873,False,False
2020-03-29 19:22:10.106,Romero-Perry,Kim_Locks,9598503476170,0.817911,False,239.972481,False,False
...,...,...,...,...,...,...,...,...
2020-03-29 20:22:10.106,Johnson-Morgan,Baker_Locks,6135824620701,0.169410,False,208.415531,False,False
2020-03-29 20:22:10.106,Romero-Perry,Kim_Locks,9598503476170,0.000000,False,249.860566,False,False
2020-03-29 20:22:10.106,Romero-Perry,Kim_Locks,6733246376493,0.000000,False,233.559866,False,False
2020-03-29 20:22:10.106,Romero-Perry,Moore_Gateway,5925984923860,0.000000,False,253.169495,False,False
