# prototype-sequence-engineer 

Feature engineering methods for sequence data

NCH 2022

## Sub-task: full sequence feature extraction

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats

def eng_descriptive_statistics(df: pd.DataFrame, by, features) -> pd.DataFrame:
    """Engineer descriptive statistics on full length sequence data

    Groups ``df`` by ``by`` and reduces every selected feature column to a
    fixed set of summary statistics, one row per group.

    Args:
        df (pd.DataFrame): input data
        by (str or list): grouping variable(s), passed to ``df.groupby``
        features (str or list): feature column(s) to summarise

    Returns:
        pd.DataFrame: engineered features by observation (e.g., sequence).
        When ``features`` is a list the columns are a (feature, metric)
        MultiIndex; when it is a single column name the columns are the
        metric names.

    Metrics (column names):
        mean, std, min, max, median: pandas group-by reductions; ``std`` is
            the sample standard deviation (ddof=1).
        mad: median absolute deviation from the median.
        aad: mean absolute deviation from the mean.
        range: max - min.
        iqr: 75th percentile - 25th percentile.
        pc / nc: count of strictly positive / strictly negative values.
        vam: count of values strictly above the group mean.
        skew, kurt: ``scipy.stats.skew`` and ``scipy.stats.kurtosis`` with
            their default arguments.
        energy: sum of squared values, each divided by 100.

    NCH 2022

    """
    # The basic reductions are given as pandas string aliases rather than
    # NumPy callables. pandas has historically substituted its own methods
    # for np.mean/np.std/np.min/np.max/np.median inside groupby.agg (so
    # 'std' used ddof=1, not NumPy's ddof=0) and pandas >= 2.1 warns that
    # this substitution will go away. Naming the reductions explicitly keeps
    # the results stable across pandas versions and avoids the FutureWarning.
    metrics = [
        ('mean', 'mean'),
        ('std', 'std'),
        ('min', 'min'),
        ('max', 'max'),
        ('median', 'median'),
        ('mad', lambda x: np.median(np.absolute(x - np.median(x)))),
        ('aad', lambda x: np.mean(np.absolute(x - np.mean(x)))),
        ('range', lambda x: np.max(x) - np.min(x)),
        ('iqr', lambda x: np.percentile(x, 75) - np.percentile(x, 25)),
        ('pc', lambda x: np.sum(x > 0)),
        ('nc', lambda x: np.sum(x < 0)),
        ('vam', lambda x: np.sum(x > np.mean(x))),
        ('skew', stats.skew),
        ('kurt', stats.kurtosis),
        # NOTE(review): the divisor 100 is hard-coded; presumably a fixed
        # sequence/window length -- confirm before reusing on other data.
        ('energy', lambda x: np.sum((x ** 2) / 100)),
    ]
    return df.groupby(by)[features].agg(metrics)