# Pre Selection - Experiment

This component pre-selects features by:
- Removing features with variability close to 0
- Removing features that are highly correlated with each other
- Handling NaN and missing values

This notebook shows:
- how to use the [SDK](https://platiagro.github.io/sdk/) to load datasets, save models and other artifacts.
- how to declare parameters and use them to build reusable components.

## Declare parameters
Components may declare (and use) these default parameters:
- dataset
- target

Use these parameters to load/save datasets, models, metrics, and figures with the help of [PlatIAgro SDK](https://platiagro.github.io/sdk/). <br />
You may also declare custom parameters to set when running an experiment.

In [None]:
# Default component parameters.
# - dataset: name used to load and save the dataset through the SDK.
# - target: name of the target column; it is excluded from the feature matrix.
# - cutoff: correlation limit handed to the Correlation transformer.
# - threshold: variance limit handed to VarianceThreshold.
# NOTE(review): the trailing `#@param` comments look like machine-read
# parameter metadata (type, label, description) — keep them verbatim; confirm
# what parses them before editing.
dataset = "iris" #@param {type:"string"}
target = "Species" #@param {type:"feature", label:"Atributo alvo", description: "Seu modelo será treinado para prever os valores do alvo."}
cutoff = 0.9 #@param {type:"number", label:"Limiar de correlação", description:"Atributos com correlação maior que o limiar serão removidos."}
threshold = 0.0 #@param {type:"number", label:"Limiar de threshold", description:"Atributos com variância menor que o limiar serão removidos."}

## Load dataset

Load the whole dataset into a `pandas.DataFrame`.

In [None]:
from platiagro import load_dataset

# Load the dataset, then split it into the target vector and the feature
# matrix (every column except the target), both as NumPy arrays.
df = load_dataset(name=dataset)
y = df[target].to_numpy()
X = df.drop(target, axis=1).to_numpy()

## Load metadata about the dataset
For example, below we get the feature type of each column in the dataset (e.g., categorical, numerical, or datetime).

In [None]:
import numpy as np
from platiagro import stat_dataset

# Dataset statistics; "featuretypes" holds one feature type per column
metadata = stat_dataset(name=dataset)

# Locate the target column so it can be removed from both the column
# names and the feature types, keeping the two arrays aligned
columns = df.columns.to_numpy()
target_index = np.argwhere(columns == target)
columns = np.delete(columns, target_index)
featuretypes = np.delete(np.array(metadata["featuretypes"]), target_index)

## Wrapping custom transformer

In [None]:
%%writefile CustomTransformer.py
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class Correlation(BaseEstimator, TransformerMixin):
    """Feature selector that removes highly correlated numerical features.

    Only the features (X) are inspected. For every pair of numerical columns
    whose absolute correlation is above ``cutoff``, the column of the pair
    with the highest mean absolute correlation is marked for removal.

    Attributes:
        categorical_indexes: a np.ndarray with the column indexes of X that
            are ignored when computing correlations.
        cutoff: float; pairs correlated above this value trigger a removal.
        drop_indexes: list of column indexes of X to be dropped (set by fit).
    """

    def __init__(self, categorical_indexes: np.ndarray, cutoff: float):
        """Inits Correlation class.

        Args:
            categorical_indexes: categorical indexes of X.
            cutoff: cutoff value.
        """
        self.categorical_indexes = categorical_indexes
        self.cutoff = cutoff

    def _drop_array(self) -> np.ndarray:
        """Returns the sorted, unique indexes to drop as an integer array.

        The explicit integer dtype matters when nothing is dropped: an empty
        list would otherwise become a float array, which np.delete rejects
        as an index.
        """
        return np.unique(np.asarray(self.drop_indexes, dtype=int))

    def transform(self, X) -> np.ndarray:
        """Reduce X to the selected features.

        Args:
            X: the input samples.

        Returns:
            np.ndarray: the input samples without the dropped features.
        """
        return np.delete(X, self._drop_array(), axis=1)

    def get_support(self) -> np.ndarray:
        """Returns the indexes of the removed features.

        Returns:
            np.ndarray: sorted, unique column indexes of X removed by the model.
        """
        return self._drop_array()

    def fit(self, X: np.ndarray, y=None) -> "Correlation":
        """Fit the model.

        Learn correlated features from X.

        Args:
            X: the input samples.
            y: ignored; present for pipeline compatibility.

        Returns:
            self
        """
        X = np.asarray(X)

        # Always leave the estimator in a fitted state, even when there is
        # nothing to compare, so transform/get_support work afterwards.
        self.drop_indexes = []

        # Positions (in X) of the numerical columns. Keeping the positions,
        # instead of only the reduced matrix, lets the result be expressed
        # as column indexes of X regardless of where the categorical
        # columns are located.
        numerical_indexes = np.delete(np.arange(X.shape[1]),
                                      self.categorical_indexes)
        X_numerical = X[:, numerical_indexes]

        # A correlation needs at least two samples and two columns
        if X_numerical.shape[0] <= 1 or X_numerical.shape[1] <= 1:
            return self

        # absolute correlation matrix between numerical columns
        corr_matrix = np.abs(np.corrcoef(X_numerical.astype(float), rowvar=False))

        # mean correlation for each column
        mean_corr = np.mean(corr_matrix, axis=1)

        # pairwise correlations above cutoff (upper triangle: each pair once)
        above_cutoff = np.argwhere(np.triu(corr_matrix, k=1) > self.cutoff)

        # for each pair above cutoff, pick the member with the highest
        # mean correlation
        worst_in_pair = np.argmax(mean_corr[above_cutoff], axis=1)
        dropped = above_cutoff[np.arange(len(above_cutoff)), worst_in_pair]

        # translate positions within the numerical sub-matrix back to
        # column indexes of X
        self.drop_indexes = numerical_indexes[dropped].tolist()

        return self

## Features configuration

In [None]:
from platiagro.featuretypes import NUMERICAL

# Boolean mask telling which features are numerical
is_numerical = featuretypes == NUMERICAL

# Positions of the numerical and of the non-numerical features
numerical_indexes = np.flatnonzero(is_numerical)
non_numerical_indexes = np.flatnonzero(~is_numerical)

# After the handle_missing_values step the numerical features are grouped
# at the beginning of the array and the non-numerical ones follow them,
# so their positions become two consecutive ranges
n_numerical = len(numerical_indexes)
numerical_indexes_after_handle_missing_values = np.arange(n_numerical)
non_numerical_indexes_after_handle_missing_values = \
    np.arange(n_numerical, len(featuretypes))

# Names of the non-numerical columns
non_numerical_columns = columns[non_numerical_indexes]

## Handle missing values and remove low-variance and highly correlated features

In [None]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from CustomTransformer import Correlation

# Preprocessing pipeline, applied in this order:
#   1. handle_missing_values: impute numerical columns with the mean and
#      non-numerical columns with the most frequent value.
#   2. handle_low_variance: apply VarianceThreshold to the numerical columns
#      and pass every other column through.
#   3. handle_correlated_features: drop highly correlated numerical columns.
pipeline = Pipeline(steps=[
    ('handle_missing_values', 
     make_column_transformer((SimpleImputer(strategy='mean'), numerical_indexes),
                             (SimpleImputer(strategy='most_frequent'), non_numerical_indexes),
                             remainder='drop')),
    ('handle_low_variance',
     make_column_transformer((VarianceThreshold(threshold=threshold),
                              numerical_indexes_after_handle_missing_values),
                             remainder='passthrough')),
    # NOTE(review): these categorical positions are computed from the column
    # count before handle_low_variance runs; if that step removes a column the
    # array gets narrower and the positions look stale — confirm.
    ('handle_correlated_features',
     Correlation(categorical_indexes=non_numerical_indexes_after_handle_missing_values,
                 cutoff=cutoff))
])

# Fit the pipeline and transform the feature matrix
X = pipeline.fit_transform(X)

# Names of the numerical columns that were not removed by VarianceThreshold
# (despite the variable name, this holds column names, not indexes)
remainder_numerical_indexes = \
    np.take(columns, 
              numerical_indexes[
                  pipeline.named_steps.handle_low_variance.named_transformers_.variancethreshold.get_support()])

# Remove the highly correlated features from the names selected above
remainder_numerical_indexes = \
    np.delete(remainder_numerical_indexes,
              pipeline.named_steps.handle_correlated_features.get_support())

# The pipeline changes the feature order, and the new order must be saved for the inference step:
# numerical features are at the beginning, and non-numerical ones at the end
features_after_pipeline = np.concatenate((remainder_numerical_indexes,
                                          non_numerical_columns))

# Convert back to DataFrame and re-attach the target column
df = pd.DataFrame(X, columns=features_after_pipeline)
df[target] = y

## Save dataset

Stores the transformed dataset in an object storage.<br>

In [None]:
from platiagro import save_dataset

# Save the transformed DataFrame under the dataset name it was loaded from
save_dataset(df=df, name=dataset)

## Save model

Stores the model artifacts in an object storage.<br>
It will make the model available for future deployments.

In [None]:
from platiagro import save_model

# Artifacts saved with the model: the fitted pipeline, the original feature
# column names, and the feature order produced by the pipeline
artifacts = {
    "pipeline": pipeline,
    "columns": columns,
    "features_after_pipeline": features_after_pipeline,
}
save_model(**artifacts)