# FEATURE ENGINEERING OPERATIONS - {"BLACK FRIDAY SALES" DATASET}

## 1. Importing Modules

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import PowerTransformer, FunctionTransformer

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_selection import SelectKBest, mutual_info_regression

from sklearn.pipeline import Pipeline

from pickle import dump, load

In [2]:
# PD Options

pd.set_option('display.min_rows', 5)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 4)

In [3]:
# SB Options

sb.set_theme(context='notebook', style='whitegrid', palette='pastel', font='times new roman', font_scale=1.25)

In [4]:
from sklearn import set_config
set_config(display='diagram')

## 2. Feature Engineering Pipeline

### 2.1 Target Feature Transformation
#### Applied to the train dataset only, and will be used to inverse map the results on train, validation and test dataset to actual predictions. 

In [5]:
pt = PowerTransformer()

### 2.2 Categorical Feature Transformation
#### Applied to the train, validation, and test dataset

In [6]:
def rem_cols(ds):
    cols = ['User_ID','Product_ID']
    ds = ds.drop(columns=cols)
    
    return ds

In [7]:
pre_proc_cat = ColumnTransformer(transformers=[
                ('ft',FunctionTransformer(rem_cols),[0,1]),
                ('oe',OrdinalEncoder(categories=[['0-17','18-25','26-35','36-45','46-50','51-55','55+'],
                                ['A','B','C'],
                                ['0','1','2','3','4+']],dtype='int'),[3,5,6]),
                ('ohe',OneHotEncoder(drop='first', sparse_output=False, dtype='int8'),[2,4,7,8,9,10]),
                ],
                remainder='passthrough')

In [8]:
steps = [
         ('cat',pre_proc_cat)
        ]

pipe = Pipeline(steps)

## 3. Importing Dataset

### 3.1 Train Dataset

In [9]:
tr = pd.read_pickle('bfs_train_pp.pkl')

print(f'Shape of the train dataset : {tr.shape}')
tr.head(5)

Shape of the train dataset : (549868, 12)


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
94454,1002589,P00296042,M,26-35,0,C,3,0,8,13,16,5820
6914,1001101,P00251842,M,36-45,1,A,1,0,1,15,16,11715
167574,1001861,P00154642,M,51-55,16,A,1,1,8,14,17,8075
323135,1001746,P00251842,M,26-35,4,C,1,1,1,15,16,4336
31253,1004786,P00025942,M,46-50,6,B,1,1,8,14,17,10057


In [10]:
Xtr = tr.drop(columns=['Purchase'])
Xtr.head(1)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
94454,1002589,P00296042,M,26-35,0,C,3,0,8,13,16


#### Target Feature Transformation (Train Dataset)

In [11]:
tr['tr_Purchase'] = pt.fit_transform(tr['Purchase'].values.reshape(-1,1))
dump(pt, open('bfs_target_PT.pkl','wb'))
print('Power Transformer object Saved Successfully \n')

# ptt = load(open('bfs_target_PT.pkl','rb'))
# tr['inv'] = ptt.inverse_transform(tr['tr_Purchase'].values.reshape(-1,1))
# tr[['Purchase','tr_Purchase','inv']].head()

Power Transformer object Saved Successfully 



#### Categorical Feature Encoding (Train Dataset)

In [12]:
Xtr = pipe.fit_transform(Xtr)
dump(pipe, open('bfs_pipe_FE.pkl','wb'))
print('FE Pipe object Saved Successfully \n')

print(Xtr.shape)
Xtr[:1]

FE Pipe object Saved Successfully 

(549868, 74)


array([[2., 2., 3., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]])

#### Saving the train dataset after FE

In [13]:
Xtr = pd.DataFrame(Xtr)
Xtr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
0,2.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,5.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
ytr = tr[['Purchase','tr_Purchase']].values
ytr = pd.DataFrame(ytr, columns=['Purchase','Purchase_trans'])   
# use 'Purchase_trans' column as target during training and drop 'Purchase' column.
ytr.head()

Unnamed: 0,Purchase,Purchase_trans
0,5820.0,-0.6056
1,11715.0,0.5769
2,8075.0,-0.109
3,4336.0,-0.981
4,10057.0,0.2784


In [15]:
tr = pd.concat([Xtr,ytr], axis=1)
tr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,Purchase,Purchase_trans
0,2.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5820.0,-0.6056
1,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11715.0,0.5769
2,5.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8075.0,-0.109
3,2.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4336.0,-0.981
4,4.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10057.0,0.2784


In [16]:
print('Writing the dataframe to CSV and PKL files ... \n')

tr.to_csv('bfs_train_FE.csv', index=False)
tr.to_pickle('bfs_train_FE.pkl')

print('Train File Saved Successfully ...')

Writing the dataframe to CSV and PKL files ... 

Train File Saved Successfully ...


### 3.2 Validation Dataset

In [17]:
val = pd.read_pickle('bfs_valid_pp.pkl')

print(f'Shape of the validation dataset : {val.shape}')
val.head(5)

Shape of the validation dataset : (100, 12)


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
54386,1002279,P00221542,M,51-55,7,B,2,1,5,15,14,7058
536560,1004577,P0095842,M,36-45,7,C,0,1,3,4,12,8198
128570,1001764,P00044742,M,18-25,0,B,1,0,5,8,14,5426
409943,1003128,P00120042,M,18-25,2,B,3,0,1,2,15,11930
35044,1005406,P00050242,F,36-45,0,C,3,1,8,14,17,6164


In [18]:
Xval = val.drop(columns=['Purchase'])
Xval = pipe.transform(Xval)

yval = val['Purchase'].values

In [19]:
Xval = pd.DataFrame(Xval)
yval = pd.DataFrame(yval, columns=['Purchase'])

In [20]:
val = pd.concat([Xval,yval], axis=1)
val.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,Purchase
0,5.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7058
1,3.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8198
2,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5426
3,1.0,1.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,11930
4,3.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6164


In [21]:
print('Writing the dataframes to CSV and PKL files ... \n')

val.to_csv('bfs_valid_FE.csv', index=False)
val.to_pickle('bfs_valid_FE.pkl')

print('Validation File Saved Successfully ...')

Writing the dataframes to CSV and PKL files ... 

Validation File Saved Successfully ...


### 3.3 Test Dataset

In [22]:
te = pd.read_pickle('bfs_test_pp.pkl')

print(f'Shape of the test dataset : {te.shape}')
te.head(5)

Shape of the test dataset : (100, 12)


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
264361,1004682,P00362042,M,26-35,7,B,3,1,5,7,14,8880
177205,1003441,P00157642,F,26-35,14,A,2,0,1,14,16,15173
542026,1005463,P00085442,M,36-45,0,B,3,1,12,14,17,1433
481858,1002143,P00110742,M,18-25,4,B,2,0,1,2,8,19551
226813,1004952,P00091142,F,26-35,14,A,1,1,8,14,17,8069


In [23]:
Xte = te.drop(columns=['Purchase'])
Xte = pipe.transform(Xte)

yte = te['Purchase'].values

In [24]:
Xte = pd.DataFrame(Xte)
yte = pd.DataFrame(yte, columns=['Purchase'])

In [25]:
te = pd.concat([Xte,yte], axis=1)
te.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,Purchase
0,2.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8880
1,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,15173
2,3.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1433
3,1.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19551
4,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8069


In [26]:
print('Writing the dataframes to CSV and PKL files ... \n')

te.to_csv('bfs_test_FE.csv', index=False)
te.to_pickle('bfs_test_FE.pkl')

print('Test File Saved Successfully ...')

Writing the dataframes to CSV and PKL files ... 

Test File Saved Successfully ...
