# Feature Engineering

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

import warnings
# Suppress pandas PerformanceWarning for the whole notebook.
# NOTE(review): presumably this targets the DataFrame-fragmentation warnings raised by
# the column-by-column inserts in the feature cells below — confirm nothing else is hidden.
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)   # For convenience

In [10]:
# Load cleaned train/val split and test dataset.
# Every encoder / statistic below is fitted on train_df only; valid_df and test_df
# are transform-only.
# NOTE(review): the test split is read from test_split.csv rather than a cleaned_* file —
# confirm it went through the same cleaning as train/valid.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/cleaned_train.csv',
        '../data/processed/cleaned_valid.csv',
        '../data/processed/test_split.csv',
    )
)

# Import EDA information (one row per categorical variable, indexed by its name)
summary_cat_df = pd.read_csv('../data/processed/summary_cat.csv', index_col=0)
summary_cat_df

Unnamed: 0,n_cats,most_freq_cat,most_freq_count
cat1,2,A,113312
cat2,2,A,85472
cat3,2,A,142387
cat4,2,A,102780
cat5,2,A,98990
...,...,...,...
cat112,51,E,20176
cat113,59,BM,20958
cat114,19,A,105392
cat115,22,K,35230


In [11]:
# Split the raw feature names by prefix. This must run before any engineered column is
# added: derived names such as 'cat1_freq' also start with 'cat' and would be picked up.
cat_vars = [name for name in train_df.columns if name.startswith('cat')]
cont_vars = [name for name in train_df.columns if name.startswith('cont')]

Things to notice:
- Encoders and transforms must be fitted on the train split only, then applied unchanged to the val/test splits
- Categories that never appear in the train split must be handled explicitly when transforming the val/test splits

In [12]:
# Frequency encoding
# Category counts are fitted on the train split only and then looked up in every split.
# A category that never occurs in train has no entry in `count`, so `.map` yields NaN;
# it is encoded as frequency 0 in *all* splits (previously only test_df was filled,
# leaving NaN in valid_df).
counts = []                 # per-variable train counts, kept for inspection in a later cell
n_train = len(train_df)     # normaliser for the relative frequency (always the train size)
for cat in cat_vars:
    count = train_df[cat].value_counts()
    counts.append(count)

    for split_df in (train_df, valid_df, test_df):
        split_df[f'{cat}_freq'] = split_df[cat].map(count).fillna(0)
        split_df[f'{cat}_log_freq'] = np.log1p(split_df[f'{cat}_freq'])     # take log to compress extreme counts
        split_df[f'{cat}_norm_freq'] = split_df[f'{cat}_freq'] / n_train

train_df.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cat113_norm_freq,cat114_freq,cat114_log_freq,cat114_norm_freq,cat115_freq,cat115_log_freq,cat115_norm_freq,cat116_freq,cat116_log_freq,cat116_norm_freq
0,A,B,A,B,A,A,A,A,B,A,...,0.037118,105392,11.565451,0.699559,21435,9.972827,0.142279,2576,7.854381,0.017099
1,A,B,A,A,A,A,A,A,B,B,...,0.139113,105392,11.565451,0.699559,21435,9.972827,0.142279,7411,8.910856,0.049192
2,A,B,A,B,A,A,A,A,B,B,...,0.139113,105392,11.565451,0.699559,35230,10.469682,0.233846,8188,9.010547,0.054349
3,A,B,A,A,A,A,A,A,B,A,...,0.117075,105392,11.565451,0.699559,35230,10.469682,0.233846,16117,9.687692,0.10698
4,A,A,A,A,B,A,A,A,A,A,...,0.032312,105392,11.565451,0.699559,35230,10.469682,0.233846,16117,9.687692,0.10698


In [17]:
# Show the fitted train counts of every categorical variable:
# name, then its value counts, then a blank separator line.
for var_name, var_counts in zip(cat_vars, counts):
    print(var_name, var_counts, "", sep="\n")

cat1
cat1
A    113311
B     37344
Name: count, dtype: int64

cat2
cat2
A    85471
B    65184
Name: count, dtype: int64

cat3
cat3
A    142386
B      8269
Name: count, dtype: int64

cat4
cat4
A    102780
B     47875
Name: count, dtype: int64

cat5
cat5
A    98989
B    51666
Name: count, dtype: int64

cat6
cat6
A    105392
B     45263
Name: count, dtype: int64

cat7
cat7
A    146984
B      3671
Name: count, dtype: int64

cat8
cat8
A    141860
B      8795
Name: count, dtype: int64

cat9
cat9
A    90535
B    60120
Name: count, dtype: int64

cat10
cat10
A    128138
B     22517
Name: count, dtype: int64

cat11
cat11
A    134596
B     16059
Name: count, dtype: int64

cat12
cat12
A    127819
B     22836
Name: count, dtype: int64

cat13
cat13
A    135106
B     15549
Name: count, dtype: int64

cat14
cat14
A    148831
B      1824
Name: count, dtype: int64

cat15
cat15
A    150625
B        30
Name: count, dtype: int64

cat16
cat16
A    145482
B      5173
Name: count, dtype: int64

cat17
cat17
A   

In [23]:
# Ordinal Encoding (Much easier to handle unseen data)
# The encoder is fitted on the train split only; a category that is absent from train
# is encoded as -1 instead of raising.
ord_en = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ord_en.fit(train_df[cat_vars])

# Overwrite the raw categorical columns with their ordinal codes in every split
for split_df in (train_df, valid_df, test_df):
    split_df[cat_vars] = ord_en.transform(split_df[cat_vars])

train_df.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cat113_norm_freq,cat114_freq,cat114_log_freq,cat114_norm_freq,cat115_freq,cat115_log_freq,cat115_norm_freq,cat116_freq,cat116_log_freq,cat116_norm_freq
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.037118,105392,11.565451,0.699559,21435,9.972827,0.142279,2576,7.854381,0.017099
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.139113,105392,11.565451,0.699559,21435,9.972827,0.142279,7411,8.910856,0.049192
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.139113,105392,11.565451,0.699559,35230,10.469682,0.233846,8188,9.010547,0.054349
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.117075,105392,11.565451,0.699559,35230,10.469682,0.233846,16117,9.687692,0.10698
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.032312,105392,11.565451,0.699559,35230,10.469682,0.233846,16117,9.687692,0.10698


In [24]:
# One Hot Encoding
# Only on low cardinality columns (at most 3 categories according to the EDA summary)
low_card_cols = summary_cat_df[summary_cat_df['n_cats'] <= 3].index.to_list()

# Fitted on the train split only. With handle_unknown='ignore', a value not seen during
# fit produces an all-zero row for that variable instead of raising.
onehot_en = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
onehot_en.fit(train_df[low_card_cols])
onehot_cols = onehot_en.get_feature_names_out(low_card_cols)   # same names for every split

train_onehot_encoded = onehot_en.transform(train_df[low_card_cols])
valid_onehot_encoded = onehot_en.transform(valid_df[low_card_cols])
test_onehot_encoded = onehot_en.transform(test_df[low_card_cols])

# Reuse each split's own index: pd.concat(axis=1) aligns rows on the index, so a freshly
# numbered RangeIndex would silently misalign (and NaN-pad) any split whose index is not
# the default 0..n-1.
train_onehot_df = pd.DataFrame(train_onehot_encoded, columns=onehot_cols, index=train_df.index)
valid_onehot_df = pd.DataFrame(valid_onehot_encoded, columns=onehot_cols, index=valid_df.index)
test_onehot_df = pd.DataFrame(test_onehot_encoded, columns=onehot_cols, index=test_df.index)

train_df = pd.concat([train_df, train_onehot_df], axis=1)
valid_df = pd.concat([valid_df, valid_onehot_df], axis=1)
test_df = pd.concat([test_df, test_onehot_df], axis=1)

train_df.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cat73_2.0,cat74_0.0,cat74_1.0,cat74_2.0,cat75_0.0,cat75_1.0,cat75_2.0,cat76_0.0,cat76_1.0,cat76_2.0
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [25]:
# Categorical groups numeric stats
# For every (categorical, continuous) pair, attach to each row the mean / median / std of
# the continuous feature within the row's category, computed on the train split only.
# The train-wide statistic is the fallback wherever the group statistic is missing:
#   - the category never occurs in train (lookup miss in valid/test), or
#   - the statistic is undefined for the group (std of a single-row group is NaN).
# The fallback is applied to every split, so train rows no longer keep NaN where
# valid/test rows of the same category were filled.
stat_names = {'mean': 'mean', 'med': 'median', 'std': 'std'}   # column suffix -> pandas aggregation
agg_funcs = list(stat_names.values())

# Loop-invariant: train-wide stats depend only on the continuous columns
global_stats = train_df[cont_vars].agg(agg_funcs)              # rows: aggregation, columns: cont var

for cat in cat_vars:
    # One aggregation per categorical; columns are a (cont var, aggregation) MultiIndex
    group_stats = train_df.groupby(cat)[cont_vars].agg(agg_funcs)
    for cont in cont_vars:
        for suffix, func in stat_names.items():
            fallback = global_stats.loc[func, cont]
            lookup = group_stats[(cont, func)].fillna(fallback)
            for split_df in (train_df, valid_df, test_df):
                split_df[f'{cat}_{cont}_{suffix}'] = split_df[cat].map(lookup).fillna(fallback)

train_df.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cat116_cont11_std,cat116_cont12_mean,cat116_cont12_med,cat116_cont12_std,cat116_cont13_mean,cat116_cont13_med,cat116_cont13_std,cat116_cont14_mean,cat116_cont14_med,cat116_cont14_std
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.070447,0.627966,0.594646,0.065338,0.766945,0.789182,0.064777,0.532059,0.486164,0.224767
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.01986,0.346091,0.352251,0.021235,0.416031,0.339244,0.124558,0.462671,0.383468,0.224749
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.022772,0.222776,0.220003,0.020596,0.324709,0.333292,0.065076,0.496191,0.392663,0.23105
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.04882,0.33016,0.32157,0.046048,0.41311,0.345247,0.133855,0.480627,0.388063,0.222465
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.04882,0.33016,0.32157,0.046048,0.41311,0.345247,0.133855,0.480627,0.388063,0.222465


In [26]:
# Numerics transformation

# Rank encoding for numerics
def fit_rank_transform(df, col):
    ''' Fit a rank transform on df[col].

    Returns (unique, ranks):
      unique -- sorted list of the distinct non-missing values of the column
      ranks  -- array with, for each entry of `unique`, its position in that list
                divided by the number of distinct values (so ranks lie in [0, 1))

    Missing values are ignored: NaN has no defined order, so keeping it would leave
    `unique` unsorted and violate the increasing-x requirement of np.interp, which
    transform_rank relies on.
    '''
    unique = sorted(df[col].dropna().unique())
    # Position of each distinct value in the sorted list, normalised by the count
    ranks = np.arange(len(unique)) / len(unique)
    return unique, ranks

def transform_rank(values, unique, ranks):
    ''' Map values onto the rank scale fitted by fit_rank_transform.

    values -- numbers to transform
    unique -- sorted fitted values (x-coordinates of the lookup)
    ranks  -- rank of each fitted value (y-coordinates of the lookup)

    Values between two fitted points are linearly interpolated; values outside the
    fitted range take the rank of the nearest end point (np.interp default).
    '''
    return np.interp(x=values, xp=unique, fp=ranks)

# Winsorization (cap extremes)
def fit_winsor(df, col, lower=0.01, upper=0.99):
    ''' Fit winsorization caps for df[col].

    df    -- DataFrame to fit on (pass the train split so caps are not leaked from val/test)
    col   -- name of the numeric column
    lower -- quantile used as the lower cap (default 0.01, i.e. the 1st percentile)
    upper -- quantile used as the upper cap (default 0.99, i.e. the 99th percentile)

    Returns (lower_cap, upper_cap), suitable for Series.clip(lower_cap, upper_cap).
    Raises ValueError if the quantiles are not ordered within [0, 1].
    '''
    if not 0 <= lower <= upper <= 1:
        raise ValueError(f'quantiles must satisfy 0 <= lower <= upper <= 1, got {lower} and {upper}')
    # quantile() returns a Series with one entry per requested quantile, in request order
    lower_cap, upper_cap = df[col].quantile([lower, upper])
    return lower_cap, upper_cap


splits = (train_df, valid_df, test_df)

for cont in cont_vars:
    # Fit the rank lookup and the winsorization caps on the train split only
    uniq, ranks = fit_rank_transform(train_df, cont)
    lower_cap, upper_cap = fit_winsor(train_df, cont)

    for split_df in splits:
        # Log transform (log1p keeps 0 finite)
        split_df[f'log_{cont}'] = np.log1p(split_df[cont])
        # Rank transform using the train-fitted lookup
        split_df[f'{cont}_rank'] = transform_rank(split_df[cont], uniq, ranks)
        # Winsorization: cap extremes at the train-fitted quantiles
        split_df[f'{cont}_cap'] = split_df[cont].clip(lower_cap, upper_cap)

# Transform target loss (natural log) in every split
for split_df in splits:
    split_df['log_loss'] = np.log(split_df['loss'])

train_df

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,log_cont12,cont12_rank,cont12_cap,log_cont13,cont13_rank,cont13_cap,log_cont14,cont14_rank,cont14_cap,log_loss
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.466652,0.481366,0.594646,0.600205,0.764873,0.822493,0.539322,0.748000,0.714843,7.702186
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.312111,0.298137,0.366307,0.477123,0.541076,0.611431,0.265817,0.243547,0.304496,7.157424
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.184164,0.133540,0.202213,0.219947,0.218130,0.246011,0.359495,0.477393,0.432606,7.924380
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.306932,0.291925,0.359249,0.296578,0.320113,0.345247,0.546265,0.779089,0.726792,8.545367
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.301771,0.285714,0.352251,0.294339,0.317280,0.342239,0.324205,0.389743,0.382931,7.031936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150650,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.198853,0.155280,0.220003,0.287651,0.308782,0.333292,0.189145,0.047314,0.208216,7.088926
150651,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.263841,0.239130,0.301921,0.276605,0.294618,0.318646,0.266871,0.244965,0.305872,7.010619
150652,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.366983,0.363354,0.443374,0.292105,0.314448,0.339244,0.408054,0.562376,0.503888,8.659151
150653,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.616733,0.704969,0.852865,0.503652,0.580737,0.654753,0.543316,0.764055,0.721707,7.354279


In [27]:
valid_df

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,log_cont12,cont12_rank,cont12_cap,log_cont13,cont13_rank,cont13_cap,log_cont14,cont14_rank,cont14_cap,log_loss
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.216462,0.177019,0.241676,0.261390,0.274788,0.298734,0.191620,0.054405,0.211210,8.114690
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.301771,0.285714,0.352251,0.267873,0.283286,0.307181,0.213541,0.108243,0.238054,8.937905
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.256467,0.229814,0.292356,0.252844,0.263456,0.287682,0.194784,0.065298,0.215049,7.739738
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.484716,0.500000,0.623714,0.590168,0.739377,0.804291,0.553640,0.806093,0.739573,8.529857
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.572433,0.627329,0.772574,0.620450,0.824363,0.859764,0.589501,0.883474,0.803088,8.016014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18826,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.216462,0.177019,0.241676,0.229989,0.232295,0.258586,0.322848,0.383446,0.381055,8.398916
18827,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.153277,0.083851,0.165648,0.339696,0.373938,0.404520,0.550942,0.800306,0.734887,7.458325
18828,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.380001,0.378882,0.462286,0.272227,0.288952,0.312885,0.230081,0.161060,0.258702,8.492245
18829,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.263841,0.239130,0.301921,0.548733,0.654391,0.731059,0.327338,0.403132,0.387270,6.651481


## Export engineered dataframes

In [28]:
# Store in parquet format for efficiency
# Write each engineered split to its own parquet file, without the row index
for split_df, out_path in (
    (train_df, '../data/processed/transformed_train.parquet'),
    (valid_df, '../data/processed/transformed_valid.parquet'),
    (test_df, '../data/processed/transformed_test.parquet'),
):
    split_df.to_parquet(out_path, index=False)