# stage_3_build_feature_engineering
This notebook transforms the cleaned data from stage 2 into a set of all-numeric (float) features, then saves it to the `data/processed` folder. 

Conscious of time, I'm not going to experiment with complex feature engineering ideas; instead I'll focus on just 2 things:
- Convert lifetime transactional metrics (orders to date, items to date, etc.) into orders/items per month
- Convert categorical features into numeric if they aren't numeric already

# Imports

In [22]:
import pandas as pd
import numpy as np

# Load cleaned data from stage2

In [5]:
# Location of the cleaned dataset (per the section heading, the stage 2 output).
input_data_path = '../data/processed/clean_data.parquet'
# Load it into a DataFrame; every later cell works off `input_data`.
input_data = pd.read_parquet(path=input_data_path)

In [11]:
# Show 20 randomly sampled rows, raising the column display limit to 999
# for this cell only so the wide table is not truncated.
with pd.option_context('display.max_columns', 999):
    display(input_data.sample(n=20))

Unnamed: 0,customer_id,days_since_first_order,days_since_last_order,is_newsletter_subscriber,orders,items,cancels,returns,different_addresses,shipping_addresses,devices,vouchers,cc_payments,paypal_payments,afterpay_payments,apple_payments,female_items,male_items,unisex_items,wapp_items,wftw_items,mapp_items,wacc_items,macc_items,mftw_items,wspt_items,mspt_items,curvy_items,sacc_items,msite_orders,desktop_orders,android_orders,ios_orders,other_device_orders,work_orders,home_orders,parcelpoint_orders,other_collection_orders,redpen_discount_used,coupon_discount_applied,average_discount_onoffer,average_discount_used,revenue,tenure_months
31070,2addd25f15d30b6a52d54d6250a10d89,1756,902.0,N,4,6,0,1,0,2,1,0,0,1,0,0,6,0,0,4,0,0,2,2,0,0,0,0,0,0,4,0,0,0,0,0,0,4,0.0,0.0,0.0,0.0,823.98,28.466667
4311,16ecdcc195427a632d05047fc51bb45f,1865,1833.0,Y,2,6,0,0,1,2,2,1,1,0,0,0,4,0,2,0,4,2,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,2,378.18,173.68,0.0818,0.241834,1240.28,1.066667
34322,757728f02d010c886d905ead3df8756e,299,299.0,N,1,1,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0.0,13.63,0.0,0.29989,31.82,0.0
46218,1148f9c5b6b5fea41c848391697ef97b,582,582.0,N,1,1,0,0,0,1,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,54.52,12.72,0.2999,0.369918,114.52,0.0
28932,2474c4037e77cf5606f3a81f72ea6639,547,161.0,Y,2,3,0,0,0,1,2,0,1,0,0,0,1,1,1,0,0,1,1,1,1,0,0,0,0,1,1,0,0,0,0,2,0,0,0.0,,0.0,0.0,825.35,12.866667
22731,d43915e347b375b96ea8fd6b51e0a984,1872,1365.0,Y,3,4,0,0,0,1,1,1,1,0,0,0,4,0,0,2,1,0,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,3,110.89,8.72,0.226,0.252605,307.5,16.9
10640,de3057b65ab82649a1da4b321be0138b,1779,1779.0,N,2,6,0,0,0,1,1,0,1,1,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0.0,0.0,0.0,0.0,231.42,0.0
23652,5c828bc7e18ca825e4b2a07281cf8872,1716,813.0,N,3,10,0,2,0,1,1,0,1,0,0,0,10,0,0,1,3,0,6,6,0,0,0,0,0,0,3,0,0,0,0,1,0,2,432.88,0.0,0.1006,0.100612,1508.48,30.1
23292,9f15c84cf77e0e996feedffbfc7588f1,1637,1071.0,N,3,6,0,2,0,2,1,2,0,1,0,0,4,2,0,0,2,1,2,2,0,0,0,0,0,0,3,0,0,0,0,0,0,3,133.94,36.34,0.2999,0.367923,330.92,18.866667
5176,32036448f64c53dd40125fdd7b60e268,1793,1793.0,N,1,1,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,81.77,0.0


# Calculate customer tenure

In [9]:
# Tenure proxy: days elapsed between the first and the last order,
# expressed in 30-day months.
input_data = input_data.assign(
    tenure_months=lambda df: (
        df['days_since_first_order'] - df['days_since_last_order']
    ).div(30.0)
)

As a naive approach, I simply took the gap between the first and last order and divided it by 30 days; hopefully that's a good enough proxy for tenure.

In [10]:
# Summary statistics of the tenure column, to inspect its range.
input_data.tenure_months.describe()

count    46279.000000
mean        11.644888
std         18.461401
min          0.000000
25%          0.000000
50%          0.033333
75%         17.966667
max         69.566667
Name: tenure_months, dtype: float64

A tenure of 0 or 0.03 months won't work as a divisor: the division would produce huge numbers or infinity.

I'll just round them up to the closest integer

In [23]:
# Round partial months up to the next whole month, then enforce a minimum of
# one month. np.ceil alone leaves a tenure of exactly 0 (first and last order
# on the same day) at 0, which would yield inf/NaN once the transactional
# counts are divided by tenure_months.
input_data['tenure_months'] = np.ceil(input_data.tenure_months).clip(lower=1.0)

In [24]:
# Re-check the tenure distribution after rounding.
input_data.tenure_months.describe()

count    46279.000000
mean        12.394455
std         18.291692
min          1.000000
25%          1.000000
50%          1.000000
75%         18.000000
max         70.000000
Name: tenure_months, dtype: float64

That looks much better

# Convert transactional features into monthly

In [56]:
# Counters and amounts that are divided by tenure_months further down;
# missing values in these columns are replaced with 0.
features_numeric_trasactional = input_data.loc[:, [
    'orders', 'items', 'cancels', 'returns',
    'vouchers',
    'female_items', 'male_items', 'unisex_items', 'wapp_items', 'wftw_items',
    'mapp_items', 'wacc_items', 'macc_items', 'mftw_items', 'wspt_items', 'mspt_items',
    'curvy_items', 'sacc_items',
    'msite_orders', 'desktop_orders', 'android_orders', 'ios_orders', 'other_device_orders',
    'work_orders', 'home_orders', 'parcelpoint_orders', 'other_collection_orders',
    'redpen_discount_used', 'coupon_discount_applied',
    'revenue',
]].fillna(value=0.0)


Divide those features by `tenure_months`

In [27]:
# Promote to float, then divide each row by that customer's tenure
# (row-wise alignment on the index) to get per-month rates.
features_numeric_monthly = (
    features_numeric_trasactional
    .mul(1.0)
    .div(input_data['tenure_months'], axis=0)
)

In [57]:
# Summary statistics of the per-month transactional features.
features_numeric_monthly.describe()

Unnamed: 0,orders,items,cancels,returns,vouchers,female_items,male_items,unisex_items,wapp_items,wftw_items,...,android_orders,ios_orders,other_device_orders,work_orders,home_orders,parcelpoint_orders,other_collection_orders,redpen_discount_used,coupon_discount_applied,revenue
count,46279.0,46279.0,46279.0,46279.0,46279.0,46279.0,46279.0,46279.0,46279.0,46279.0,...,46279.0,46279.0,46279.0,46279.0,46279.0,46279.0,46279.0,46279.0,36074.0,46279.0
mean,0.907217,1.629684,0.014082,0.190153,0.158151,1.035252,0.464915,0.129517,0.467542,0.364079,...,0.007025,0.046042,2.2e-05,0.035911,0.241882,0.00358,0.625845,74.69916,18.281998,201.831157
std,0.981241,2.770734,0.140664,0.621776,0.357911,1.996385,1.614851,0.624559,1.39715,0.88177,...,0.110476,0.264897,0.004648,0.238884,0.791259,0.072521,0.789673,391.11128,135.208842,1874.155354
min,0.028986,0.029851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.333333,0.619048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,30.435114
50%,1.0,1.0,0.0,0.0,0.0,0.642857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.388889,11.034286,0.0,72.64
75%,1.0,2.0,0.0,0.0625,0.083333,1.0,0.263158,0.0,0.444444,0.3,...,0.0,0.0,0.0,0.0,0.111111,0.0,1.0,54.0,9.09,163.5
max,107.0,232.0,14.83871,56.0,7.5,139.0,116.0,40.0,103.0,38.0,...,6.5,13.0,1.0,17.0,107.0,6.0,49.0,30988.68,15156.88,354700.16


In [58]:
# Check dtypes and non-null counts of the per-month features.
features_numeric_monthly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46279 entries, 0 to 46278
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   orders                   46279 non-null  float64
 1   items                    46279 non-null  float64
 2   cancels                  46279 non-null  float64
 3   returns                  46279 non-null  float64
 4   vouchers                 46279 non-null  float64
 5   female_items             46279 non-null  float64
 6   male_items               46279 non-null  float64
 7   unisex_items             46279 non-null  float64
 8   wapp_items               46279 non-null  float64
 9   wftw_items               46279 non-null  float64
 10  mapp_items               46279 non-null  float64
 11  wacc_items               46279 non-null  float64
 12  macc_items               46279 non-null  float64
 13  mftw_items               46279 non-null  float64
 14  wspt_items            

Looks good

# Get other numeric features

In [39]:
# Numeric columns that are kept as-is (not divided by tenure).
features_numeric_other = input_data.loc[:, [
    'days_since_first_order', 'days_since_last_order', 'tenure_months',
    'different_addresses', 'shipping_addresses', 'devices',
    'average_discount_onoffer', 'average_discount_used',
]]


In [76]:
# Check dtypes and non-null counts before the float conversion.
features_numeric_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46279 entries, 0 to 46278
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   days_since_first_order    46279 non-null  int64  
 1   days_since_last_order     46279 non-null  float64
 2   tenure_months             46279 non-null  float64
 3   different_addresses       46279 non-null  int64  
 4   shipping_addresses        46279 non-null  int64  
 5   devices                   46279 non-null  int64  
 6   average_discount_onoffer  46279 non-null  float64
 7   average_discount_used     46279 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 2.8 MB


Convert everything to float:

In [77]:
# Multiplying by 1.0 promotes the integer columns to float.
features_numeric_other = features_numeric_other.mul(1.0)

In [78]:
# Confirm the dtypes after the float conversion.
features_numeric_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46279 entries, 0 to 46278
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   days_since_first_order    46279 non-null  float64
 1   days_since_last_order     46279 non-null  float64
 2   tenure_months             46279 non-null  float64
 3   different_addresses       46279 non-null  float64
 4   shipping_addresses        46279 non-null  float64
 5   devices                   46279 non-null  float64
 6   average_discount_onoffer  46279 non-null  float64
 7   average_discount_used     46279 non-null  float64
dtypes: float64(8)
memory usage: 2.8 MB


Done

# Handle the categorical features

In [43]:
# Columns handled as categorical features: the Y/N newsletter flag plus the
# payment-method columns.
features_categorical = input_data.loc[:, [
    'is_newsletter_subscriber',
    'cc_payments',
    'paypal_payments',
    'afterpay_payments',
    'apple_payments',
]]

In [66]:
# Preview the columns selected as categorical features.
features_categorical

Unnamed: 0,is_newsletter_subscriber,cc_payments,paypal_payments,afterpay_payments,apple_payments
0,N,1,0,0,0
1,Y,0,1,0,0
2,Y,1,0,1,0
3,Y,1,0,0,0
4,Y,1,0,0,0
...,...,...,...,...,...
46274,N,1,1,0,0
46275,Y,1,1,0,0
46276,N,1,0,0,0
46277,Y,1,0,0,0


Convert Y/N to float

In [67]:
# Encode the newsletter flag as 1.0 for 'Y' and 0.0 for anything else.
features_categorical = features_categorical.assign(
    is_newsletter_subscriber=lambda df: df['is_newsletter_subscriber'].eq('Y').astype(float)
)

In [68]:
# Preview the categorical features after encoding the Y/N flag.
features_categorical

Unnamed: 0,is_newsletter_subscriber,cc_payments,paypal_payments,afterpay_payments,apple_payments
0,0.0,1,0,0,0
1,1.0,0,1,0,0
2,1.0,1,0,1,0
3,1.0,1,0,0,0
4,1.0,1,0,0,0
...,...,...,...,...,...
46274,0.0,1,1,0,0
46275,1.0,1,1,0,0
46276,0.0,1,0,0,0
46277,1.0,1,0,0,0


Everything else to float as well:

In [69]:
# Multiplying by 1.0 promotes the remaining integer columns to float.
features_categorical = features_categorical.mul(1.0)

In [70]:
# Confirm the dtypes after the float conversion.
features_categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46279 entries, 0 to 46278
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   is_newsletter_subscriber  46279 non-null  float64
 1   cc_payments               46279 non-null  float64
 2   paypal_payments           46279 non-null  float64
 3   afterpay_payments         46279 non-null  float64
 4   apple_payments            46279 non-null  float64
dtypes: float64(5)
memory usage: 1.8 MB


# Combine the feature sets and save to disk

Put `customer_id` in there in case we need an additional index or reference

In [81]:
# Stitch the feature groups together side by side (aligned on the row index),
# with customer_id as the leading column.
features = pd.concat(
    objs=[
        input_data[['customer_id']],
        features_categorical,
        features_numeric_monthly,
        features_numeric_other,
    ],
    axis='columns',
)

In [82]:
# Preview the combined feature table.
features

Unnamed: 0,customer_id,is_newsletter_subscriber,cc_payments,paypal_payments,afterpay_payments,apple_payments,orders,items,cancels,returns,...,coupon_discount_applied,revenue,days_since_first_order,days_since_last_order,tenure_months,different_addresses,shipping_addresses,devices,average_discount_onoffer,average_discount_used
0,64f7d7dd7a59bba7168cc9c960a5c60e,0.0,1.0,0.0,0.0,0.0,0.354167,1.041667,0.000000,0.020833,...,5.180208,144.715417,2091.0,653.0,48.0,0.0,4.0,1.0,0.3364,0.358448
1,fa7c64efd5c037ff2abcce571f9c1712,1.0,0.0,1.0,0.0,0.0,0.188406,0.376812,0.000000,0.072464,...,0.000000,77.235942,2082.0,22.0,69.0,0.0,4.0,2.0,0.1404,0.140410
2,18923c9361f27583d2320951435e4888,1.0,1.0,0.0,1.0,0.0,1.028986,2.202899,0.028986,0.028986,...,1.564058,204.838696,2072.0,6.0,69.0,1.0,6.0,2.0,0.1851,0.189973
3,aa21f31def4edbdcead818afcdfc4d32,1.0,1.0,0.0,0.0,0.0,2.000000,2.000000,0.000000,0.000000,...,90.900000,143.640000,2054.0,2050.0,1.0,0.0,1.0,1.0,0.0000,0.387567
4,668c6aac52ff54d4828ad379cdb38e7d,1.0,1.0,0.0,0.0,0.0,1.000000,1.000000,0.000000,0.000000,...,0.000000,0.000000,2053.0,2053.0,1.0,0.0,1.0,1.0,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46274,5b34391ec6fbc0f189cb8d3d88806199,0.0,1.0,1.0,0.0,0.0,0.400000,0.888889,0.000000,0.000000,...,39.807333,84.952000,1372.0,50.0,45.0,0.0,7.0,2.0,0.0091,0.352567
46275,198fd2f143f70b149344bcaf7eddee12,1.0,1.0,1.0,0.0,0.0,1.055556,1.055556,0.055556,0.333333,...,13.367778,76.871111,646.0,124.0,18.0,1.0,2.0,2.0,0.1210,0.209202
46276,338b5c8ade4af1a562d55d4036710630,0.0,1.0,0.0,0.0,0.0,0.181818,0.181818,0.000000,0.000000,...,,47.437273,1308.0,998.0,11.0,1.0,2.0,1.0,0.1500,0.150000
46277,2115c065bfc1f3b39e4c87c202e80fa5,1.0,1.0,0.0,0.0,0.0,2.800000,3.000000,0.000000,0.400000,...,50.990000,142.458000,1410.0,1287.0,5.0,0.0,1.0,2.0,0.1824,0.320760


In [87]:
# List only the columns that contain at least one null: `any()` gives one
# boolean per column and the callable indexer keeps the True entries.
features.isnull().any()[lambda s: s]

coupon_discount_applied    True
dtype: bool

`coupon_discount_applied` still has null values in it; I must have missed it somehow, so fill the remaining nulls with 0.

In [88]:
# Replace every remaining null in the feature table with 0.0.
features = features.fillna(value=0.0)

In [89]:
# Re-run the null check; an empty Series means no column contains nulls.
features.isnull().any()[lambda s: s]

Series([], dtype: bool)

All good now

In [90]:
# Destination for the engineered feature table.
output_file_path = '../data/processed/features.parquet'

# Write the features to disk in parquet format.
features.to_parquet(path=output_file_path)