# Retrain - Data Preparation

1. Fit the feature encoders on the full training set (train + valid combined), then use them to transform the full training set and the test datasets
2. Retrain final model on the transformed training dataset

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

import warnings
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)   # For convinience

In [3]:
# Read the processed train/valid/test splits and the raw Kaggle test file
train_df, valid_df, test_df, k_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/cleaned_train.csv',
        '../data/processed/cleaned_valid.csv',
        '../data/processed/test_split.csv',
        '../data/raw/test.csv',   # kaggle test set
    )
)

# Stack train and valid row-wise (fresh 0..n-1 index) to form the full training set
full_train_df = pd.concat([train_df, valid_df], ignore_index=True)
full_train_df

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,A,B,A,B,A,A,A,A,B,A,...,0.718367,0.335060,0.30260,0.67135,0.83510,0.569745,0.594646,0.822493,0.714843,2213.18
1,A,B,A,A,A,A,A,A,B,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.60
2,A,B,A,B,A,A,A,A,B,B,...,0.178193,0.247408,0.24564,0.22089,0.21230,0.204687,0.202213,0.246011,0.432606,2763.85
3,A,B,A,A,A,A,A,A,B,A,...,0.364464,0.401162,0.26847,0.46226,0.50556,0.366788,0.359249,0.345247,0.726792,5142.87
4,A,A,A,A,B,A,A,A,A,A,...,0.381515,0.363768,0.24564,0.40455,0.47225,0.334828,0.352251,0.342239,0.382931,1132.22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169481,A,B,A,A,A,B,A,A,B,A,...,0.212308,0.325779,0.29758,0.34365,0.30529,0.245410,0.241676,0.258586,0.381055,4442.25
169482,A,B,A,A,A,A,A,B,B,A,...,0.183243,0.251696,0.40028,0.21374,0.19431,0.167024,0.165648,0.404520,0.734887,1734.24
169483,A,B,A,A,B,A,A,A,B,A,...,0.460158,0.521362,0.29758,0.50420,0.54983,0.453334,0.462286,0.312885,0.258702,4876.80
169484,A,B,A,A,A,A,A,A,B,A,...,0.392395,0.322536,0.36636,0.29095,0.43919,0.307628,0.301921,0.731059,0.387270,773.93


In [4]:
# Feature groups, selected by column-name prefix
cat_vars = [name for name in full_train_df.columns if name.startswith('cat')]
cont_vars = [name for name in full_train_df.columns if name.startswith('cont')]

# One row per categorical column: number of distinct levels, the most frequent
# level, and how many rows carry that level
full_summary_cat_df = (
    full_train_df[cat_vars]
    .nunique()
    .to_frame("n_cats")
    .assign(
        most_freq_cat=full_train_df[cat_vars].agg(lambda levels: levels.value_counts().idxmax()),
        most_freq_count=full_train_df[cat_vars].agg(lambda levels: levels.value_counts().max()),
    )
)

#full_summary_cat_df

In [5]:
# Frequency encoding
# Level counts are learned on the full training set only and then looked up
# for every frame; the normaliser is always the training row count.
n_train_rows = len(full_train_df)

for cat in cat_vars:
    level_counts = full_train_df[cat].value_counts()

    # (frame, whether levels missing from the training counts get a count of 0)
    for frame, fill_unseen in ((full_train_df, False), (test_df, True), (k_test_df, True)):
        freq = frame[cat].map(level_counts)
        if fill_unseen:
            freq = freq.fillna(0)

        frame[f'{cat}_freq'] = freq
        frame[f'{cat}_log_freq'] = np.log1p(frame[f'{cat}_freq'])     # take log to compress extreme counts
        frame[f'{cat}_norm_freq'] = frame[f'{cat}_freq'] / n_train_rows

for label, frame in (
    ('Train shape : ', full_train_df),
    ('Test shape  : ', test_df),
    ('Kaggle Test shape  : ', k_test_df),
):
    print(label, frame.shape)

Train shape :  (169486, 479)
Test shape  :  (18831, 479)
Kaggle Test shape  :  (125546, 479)


In [7]:
# Ordinal Encoding (Much easier to handle unseen data)
# Codes are learned from the full training set; any level not seen during fit
# is encoded as -1.
ord_en = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
).fit(full_train_df[cat_vars])

# Overwrite the categorical columns of every frame with their codes
for frame in (full_train_df, test_df, k_test_df):
    frame[cat_vars] = ord_en.transform(frame[cat_vars])

for label, frame in (
    ('Train shape : ', full_train_df),
    ('Test shape  : ', test_df),
    ('Kaggle Test shape  : ', k_test_df),
):
    print(label, frame.shape)

Train shape :  (169486, 479)
Test shape  :  (18831, 479)
Kaggle Test shape  :  (125546, 479)


In [8]:
# One Hot Encoding
# Only on low cardinality columns (at most 3 distinct levels in the full training set)
low_card_cols = full_summary_cat_df[full_summary_cat_df['n_cats'] <= 3].index.to_list()

# Levels not seen during fit are ignored, i.e. encoded as an all-zero row
onehot_en = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
onehot_en.fit(full_train_df[low_card_cols])

# Output column names are the same for every frame, so compute them once
onehot_cols = onehot_en.get_feature_names_out(low_card_cols)

# Give each one-hot frame the index of its source frame: pd.concat(axis=1)
# aligns on the index, so a default RangeIndex would silently misalign rows
# (and pad with NaN) if a source frame ever carried a non-default index.
train_onehot_df = pd.DataFrame(
    onehot_en.transform(full_train_df[low_card_cols]),
    columns=onehot_cols,
    index=full_train_df.index,
)
test_onehot_df = pd.DataFrame(
    onehot_en.transform(test_df[low_card_cols]),
    columns=onehot_cols,
    index=test_df.index,
)
k_test_onehot_df = pd.DataFrame(
    onehot_en.transform(k_test_df[low_card_cols]),
    columns=onehot_cols,
    index=k_test_df.index,
)

# Drop one-hot columns left over from a previous run of this cell before
# appending, so re-running the cell does not create duplicate column names.
full_train_df = pd.concat(
    [full_train_df.drop(columns=onehot_cols, errors='ignore'), train_onehot_df], axis=1
)
test_df = pd.concat(
    [test_df.drop(columns=onehot_cols, errors='ignore'), test_onehot_df], axis=1
)
k_test_df = pd.concat(
    [k_test_df.drop(columns=onehot_cols, errors='ignore'), k_test_onehot_df], axis=1
)

print('Train shape : ', full_train_df.shape)
print('Test shape  : ', test_df.shape)
print('Kaggle Test shape  : ', k_test_df.shape)

Train shape :  (169486, 635)
Test shape  :  (18831, 635)
Kaggle Test shape  :  (125546, 635)


In [9]:
# Categorical groups numeric stats
# For every (categorical, continuous) pair, attach the per-level mean / median /
# std of the continuous column, all learned on the full training set.

# Global fallbacks depend only on the continuous column, so compute them once
# instead of once per categorical column.
global_stats = {
    cont: {
        'mean': full_train_df[cont].mean(),
        'med': full_train_df[cont].median(),
        'std': full_train_df[cont].std(),
    }
    for cont in cont_vars
}

for cat in cat_vars:
    group = full_train_df.groupby(cat)
    for cont in cont_vars:
        # Each per-level aggregate is computed once and reused for all frames
        level_stats = {
            'mean': group[cont].mean(),
            'med': group[cont].median(),
            'std': group[cont].std(),
        }

        for suffix, per_level in level_stats.items():
            fallback = global_stats[cont][suffix]
            # The global fallback covers levels missing from the training groups
            # and levels whose statistic is NaN (e.g. the std of a single-row
            # level). It is applied to the training frame as well, so train and
            # test rows of the same level always receive the same value.
            for frame in (full_train_df, test_df, k_test_df):
                frame[f'{cat}_{cont}_{suffix}'] = frame[cat].map(per_level).fillna(fallback)

print('Train shape : ', full_train_df.shape)
print('Test shape  : ', test_df.shape)
print('Kaggle Test shape  : ', k_test_df.shape)

Train shape :  (169486, 5507)
Test shape  :  (18831, 5507)
Kaggle Test shape  :  (125546, 5507)


In [10]:
# Numerics transformation

# Rank encoding for numerics
def fit_rank_transform(df, col):
    """Fit a rank transform on the distinct values of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fit on.
    col : str
        Name of the numeric column.

    Returns
    -------
    unique : list
        Distinct non-null values of the column in ascending order.
    ranks : numpy.ndarray
        ``i / len(unique)`` for the i-th smallest distinct value, so ranks
        start at 0 and stay strictly below 1.
    """
    # Missing values are dropped before sorting: a NaN makes the sort order
    # unreliable and would break the increasing grid np.interp relies on.
    unique = sorted(df[col].dropna().unique())
    n_unique = len(unique)
    ranks = np.arange(n_unique) / n_unique
    return unique, ranks

def transform_rank(values, unique, ranks):
    """Map ``values`` to ranks using a fitted rank transform.

    ``unique`` and ``ranks`` are the grid points and their ranks as returned
    by the fitting step; values between grid points are linearly interpolated
    and values outside the grid take the rank of the nearest end point.
    """
    interpolated = np.interp(values, xp=unique, fp=ranks)
    return interpolated

# Winsorization (cap extremes)
def fit_winsor(df, col, lower=0.01, upper=0.99):
    """Compute the clipping bounds used to winsorize ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fit on.
    col : str
        Name of the numeric column.
    lower, upper : float, optional
        Quantile levels in [0, 1] for the lower and upper bound. The defaults
        keep the original 1% / 99% behaviour.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` — the requested quantiles of the column.

    Raises
    ------
    ValueError
        If the levels are not ordered as ``0 <= lower <= upper <= 1``.
    """
    if not 0 <= lower <= upper <= 1:
        raise ValueError(
            f"quantile levels must satisfy 0 <= lower <= upper <= 1, got lower={lower}, upper={upper}"
        )
    bounds = df[col].quantile([lower, upper])
    # Positional access, because the two levels may coincide (duplicate index labels)
    return bounds.iloc[0], bounds.iloc[1]


for cont in cont_vars:
    # Rank grid and winsor bounds are learned on the full training set only
    uniq, ranks = fit_rank_transform(full_train_df, cont)
    lower_cap, upper_cap = fit_winsor(full_train_df, cont)

    for frame in (full_train_df, test_df, k_test_df):
        # Log transform
        frame[f'log_{cont}'] = np.log1p(frame[cont])
        # Rank transform
        frame[f'{cont}_rank'] = transform_rank(frame[cont], uniq, ranks)
        # Winsorization
        frame[f'{cont}_cap'] = frame[cont].clip(lower_cap, upper_cap)

# Transform target loss (only the frames that are given a log_loss column here)
for frame in (full_train_df, test_df):
    frame['log_loss'] = np.log(frame['loss'])

for label, frame in (
    ('Train shape : ', full_train_df),
    ('Test shape  : ', test_df),
    ('Kaggle Test shape  : ', k_test_df),
):
    print(label, frame.shape)

Train shape :  (169486, 5550)
Test shape  :  (18831, 5550)
Kaggle Test shape  :  (125546, 5549)


In [11]:
# Export transformed retrain data
# Each frame is written to its own parquet file, without the index.
for frame, parquet_path in (
    (full_train_df, '../data/retrain/final_transformed_train.parquet'),
    (test_df, '../data/retrain/final_transformed_test.parquet'),
    (k_test_df, '../data/retrain/final_k_test.parquet'),
):
    frame.to_parquet(parquet_path, index=False)