# Feature engineering

## date and time

In [1]:
if None:
    # Disabled template (the `if None:` guard never runs): derive calendar
    # features from a datetime64 column of `df`. `datetime_column` is a
    # placeholder for the real column name.
    df.loc[:, 'year'] = df['datetime_column'].dt.year
    # `.dt.weekofyear` was removed in pandas 2.0; the ISO calendar week is
    # its replacement (same approach as the `features` dict further down).
    df.loc[:, 'weekofyear'] = df['datetime_column'].dt.isocalendar().week
    df.loc[:, 'month'] = df['datetime_column'].dt.month
    df.loc[:, 'dayofweek'] = df['datetime_column'].dt.dayofweek
    # weekday is 0 (Monday) .. 6 (Sunday), so >= 5 flags Saturday/Sunday
    df.loc[:, 'weekend'] = (df['datetime_column'].dt.weekday >= 5).astype(int)
    df.loc[:, 'hour'] = df['datetime_column'].dt.hour

In [2]:
import pandas as pd

# create a series of datetime with a frequency of 10 hours
# (lowercase "h" is the hour alias; uppercase "H" is deprecated in pandas 2.2+)
s = pd.date_range('2020-01-06', '2020-01-10', freq='10h').to_series()

# create some features based on datetime; every entry is a NumPy array
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    "is_leap_year": s.dt.is_leap_year.values,
    "quarter": s.dt.quarter.values,
    # `.dt.weekofyear` was removed in pandas 2.0. The ISO week comes back as
    # a nullable UInt32 Series, so cast it to int64 before taking `.values`
    # to get a plain NumPy array like the other entries.
    "weekofyear": s.dt.isocalendar().week.astype("int64").values,
}

## aggregated features

In [3]:
def generate_features(df):
    """Build per-customer aggregate features.

    Args:
        df: DataFrame with a datetime64 ``date`` column, a numeric ``num1``
            column and a ``customer_id`` column. The calendar columns
            ``year``, ``weekofyear``, ``month``, ``dayofweek`` and
            ``weekend`` are added to ``df`` in place.

    Returns:
        DataFrame with one row per ``customer_id`` (restored as a regular
        column by ``reset_index``) and two-level columns of the form
        ``(source column, aggregate name)``.
    """
    # create a bunch of features using the date column
    df.loc[:, 'year'] = df['date'].dt.year
    # `.dt.weekofyear` was removed in pandas 2.0; the ISO calendar week is
    # its replacement.
    df.loc[:, 'weekofyear'] = df['date'].dt.isocalendar().week
    df.loc[:, 'month'] = df['date'].dt.month
    df.loc[:, 'dayofweek'] = df['date'].dt.dayofweek
    # weekday is 0 (Monday) .. 6 (Sunday), so >= 5 flags Saturday/Sunday
    df.loc[:, 'weekend'] = (df['date'].dt.weekday >= 5).astype(int)

    # aggregates to compute for each column, per customer
    aggs = {
        # number of distinct months / weeks seen, and their mean
        'month': ['nunique', 'mean'],
        'weekofyear': ['nunique', 'mean'],
        # sum, max, min and mean of num1
        'num1': ['sum', 'max', 'min', 'mean'],
        # total row count and number of unique ids. Both go in ONE list:
        # assigning the key twice would silently drop the first aggregate.
        'customer_id': ['size', 'nunique'],
    }

    # we group by customer_id and calculate the aggregates
    agg_df = df.groupby('customer_id').agg(aggs)
    agg_df = agg_df.reset_index()
    return agg_df

## time series data (list of values)

In [4]:
if None:
    import numpy as np

    # Disabled template: summary statistics of the value sequence `x`,
    # collected under one name per feature.
    feature_dict = {
        # central tendency and extremes
        'mean': np.mean(x),
        'max': np.max(x),
        'min': np.min(x),
        # spread: standard deviation, variance, peak-to-peak (max - min)
        'std': np.std(x),
        'var': np.var(x),
        'ptp': np.ptp(x),
        # percentile features (argument on a 0-100 scale)
        'percentile_10': np.percentile(x, 10),
        'percentile_60': np.percentile(x, 60),
        'percentile_90': np.percentile(x, 90),
        # quantile features (argument on a 0-1 scale)
        'quantile_5': np.quantile(x, 0.05),
        'quantile_95': np.quantile(x, 0.95),
        'quantile_99': np.quantile(x, 0.99),
    }

In [5]:
if None:
    from tsfresh.feature_extraction import feature_calculators as fc

    # Disabled template: tsfresh based features. Each calculator is applied
    # to `x` and stored in feature_dict under the calculator's own name.
    for calc_name in (
        'abs_energy',
        'count_above_mean',
        'count_below_mean',
        'mean_abs_change',
        'mean_change',
    ):
        feature_dict[calc_name] = getattr(fc, calc_name)(x)

## polynomial features

In [6]:
import numpy as np

# generate a random dataframe with
# 2 columns (f_1, f_2) and 100 rows, values uniform in [0, 1).
# A seeded Generator makes the values identical on every
# "Restart Kernel -> Run All" without touching NumPy's global RNG state.
df = pd.DataFrame(
    np.random.default_rng(42).random((100, 2)),
    columns=[f"f_{i}" for i in range(1, 3)]
)
df.head()

Unnamed: 0,f_1,f_2
0,0.612829,0.667684
1,0.826424,0.564424
2,0.265701,0.92842
3,0.030605,0.910132
4,0.216918,0.029578


In [7]:
from sklearn import preprocessing

# Degree-2 polynomial expansion: squares and pairwise products are kept
# (interaction_only=False) and no constant column is added
# (include_bias=False).
pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# fit to the features and create the polynomial features in one step
poly_feats = pf.fit_transform(df)

# wrap the result in a dataframe, naming the columns f_1 .. f_<num_feats>
num_feats = poly_feats.shape[1]
df_transformed = pd.DataFrame(poly_feats, columns=[f"f_{i + 1}" for i in range(num_feats)])
df_transformed.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.612829,0.667684,0.37556,0.409176,0.445801
1,0.826424,0.564424,0.682977,0.466454,0.318575
2,0.265701,0.92842,0.070597,0.246682,0.861964
3,0.030605,0.910132,0.000937,0.027855,0.828341
4,0.216918,0.029578,0.047053,0.006416,0.000875


## binning (convert the numbers to categories)

In [8]:
# Bin the numerical column f_1 into equal-width intervals, once with
# 10 bins and once with 100. labels=False stores the integer bin index
# instead of an interval label.
for n_bins in (10, 100):
    df[f"f_bin_{n_bins}"] = pd.cut(df["f_1"], bins=n_bins, labels=False)

df.head()

Unnamed: 0,f_1,f_2,f_bin_10,f_bin_100
0,0.612829,0.667684,6,61
1,0.826424,0.564424,8,82
2,0.265701,0.92842,2,26
3,0.030605,0.910132,0,2
4,0.216918,0.029578,2,21


## log transformation

In [9]:
if None:
    # Disabled template: compare a column's variance before and after
    # the log(1 + x) transformation.
    # NOTE(review): `f_3` is not among the columns this notebook gives `df`
    # (it only appears in `df_transformed`) - presumably a placeholder
    # column name; confirm before enabling this cell.
    print(df.f_3.var())
    # np.log1p computes log(1 + x) over the whole column at once, replacing
    # a Python-level call per element.
    print(np.log1p(df.f_3).var())

## missing (NaN) values

In [10]:
import numpy as np
from sklearn import impute

# seeded generator so the demo data (and therefore the imputed result)
# is the same on every "Restart Kernel -> Run All"
rng = np.random.default_rng(42)

# create a random numpy array with 10 samples and 6 features,
# integer values from 1 to 14 (the upper bound 15 is exclusive),
# stored as float so that NaN can be assigned
X = rng.integers(1, 15, size=(10, 6)).astype(float)

# randomly assign 10 distinct elements to NaN (missing);
# `.flat` always writes into X itself, whereas `ravel()` may return a copy
X.flat[rng.choice(X.size, 10, replace=False)] = np.nan

# use 2 nearest neighbours to fill na values
knn_imputer = impute.KNNImputer(n_neighbors=2)
XX = knn_imputer.fit_transform(X)
X, XX

(array([[ 4., nan, 14.,  4.,  2.,  3.],
        [ 8.,  4., nan, nan, 12., 14.],
        [ 2.,  4.,  4.,  4.,  5.,  1.],
        [10.,  4., nan, nan,  4.,  4.],
        [ 3.,  1., 12., 10.,  4.,  8.],
        [nan, 13.,  6.,  2.,  2., 14.],
        [13., 10.,  8., nan,  8.,  8.],
        [12., 13., nan,  1.,  8.,  2.],
        [11.,  4.,  1., nan, nan,  2.],
        [ 5.,  6., 11.,  4., 13.,  5.]]),
 array([[ 4. ,  2.5, 14. ,  4. ,  2. ,  3. ],
        [ 8. ,  4. ,  9.5,  7. , 12. , 14. ],
        [ 2. ,  4. ,  4. ,  4. ,  5. ,  1. ],
        [10. ,  4. ,  7.5,  4. ,  4. ,  4. ],
        [ 3. ,  1. , 12. , 10. ,  4. ,  8. ],
        [12.5, 13. ,  6. ,  2. ,  2. , 14. ],
        [13. , 10. ,  8. ,  1.5,  8. ,  8. ],
        [12. , 13. ,  4.5,  1. ,  8. ,  2. ],
        [11. ,  4. ,  1. ,  2.5,  4.5,  2. ],
        [ 5. ,  6. , 11. ,  4. , 13. ,  5. ]]))