# Feature Engineering 

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pylab as plt

In [2]:
# Show the current working directory; the relative file names used in this notebook
# are resolved against it.
# NOTE(review): the saved output exposes a machine-specific absolute path -- consider
# clearing this output before sharing the notebook.
os.getcwd()

'C:\\Users\\vijay.agrawal\\Final Project'

 # Data Preparation for Modeling
 
 ### Some of the columns might have to be dropped at this stage before running the model. This will be carried out in the subsequent steps:

In [3]:
# Load the merged transactions file produced by the data-cleanup and exploratory stages.
df_final = pd.read_csv(filepath_or_buffer="transactions_full_merged.csv")

In [4]:
# List every column name present in the loaded dataset.
list(df_final.columns)

['DAY_DT',
 'LOC_INDT',
 'DBSKU',
 'ONLINE_FLAG',
 'TOTAL_SALES',
 'TOTAL_UNITS_SOLD',
 'TOTAL_SALES_PRFT',
 'TOTAL_COST',
 'HOLIDAY',
 'MONTH',
 'SEASON',
 'UNIT_COST_PRICE',
 'UNIT_SELLING_PRICE',
 'PROFIT_PER_UNIT',
 'FULL_PRICE_IND',
 'DEPARTMENT',
 'PRODUCT_CLASS',
 'LOC_INDT_SEQ',
 'CITY',
 'STATE',
 'POSTAL_CD',
 'STORE_SIZE',
 'STORE_TYPE_MALL',
 'STORE_TYPE_STRIP',
 'STORE_TYPE_OTHER',
 'STATE_NAME',
 'MEDIAN_HOUSEHLD_INCOME',
 'PCNT_WOMEN_EMPLOYED',
 'MEDIAN_HOUSEHLD_INCOME_BINS']

 # Target variable is UNIT_SELLING_PRICE
 
 ## The features that are of interest for modelling are as follows:
 ### 
 
ONLINE_FLAG                        
FULL_PRICE_IND                     
UNIT_COST_PRICE   
*UNIT_SELLING_PRICE* [*target*]     
TOTAL_UNITS_SOLD 
HOLIDAY                            
DEPARTMENT                         
LOC_INDT_SEQ                       
STORE_SIZE                         
STORE_TYPE_MALL                    
STORE_TYPE_STRIP                   
STORE_TYPE_OTHER                   
PCNT_WOMEN_EMPLOYED                
MEDIAN_HOUSEHLD_INCOME_BINS    
SEASON                                                          
PRODUCT_CLASS

 ## The features that are NOT of interest for modelling are as follows:
 ###

DAY_DT   
LOC_INDT   
DBSKU (listed here, but not dropped in the next step; it remains in the saved datasets)    
PROFIT_PER_UNIT    
TOTAL_COST    
TOTAL_SALES    
TOTAL_SALES_PRFT    
MONTH    
CITY     
STATE    
POSTAL_CD    
STATE_NAME    
MEDIAN_HOUSEHLD_INCOME    

 ## Drop the columns that are not required for modeling:

In [5]:
# Remove the columns that are not used for modeling.
# Note: DBSKU is intentionally absent from this list, so it stays in the frame.
df_final = df_final.drop(
    columns=[
        'DAY_DT',
        'LOC_INDT',
        'TOTAL_SALES',
        'TOTAL_COST',
        'PROFIT_PER_UNIT',
        'TOTAL_SALES_PRFT',
        'MONTH',
        'CITY',
        'STATE',
        'POSTAL_CD',
        'STATE_NAME',
        'MEDIAN_HOUSEHLD_INCOME',
    ]
)

In [6]:
# Verify that the columns have been dropped by previewing the first three rows:
df_final.head(3)

Unnamed: 0,DBSKU,ONLINE_FLAG,TOTAL_UNITS_SOLD,HOLIDAY,SEASON,UNIT_COST_PRICE,UNIT_SELLING_PRICE,FULL_PRICE_IND,DEPARTMENT,PRODUCT_CLASS,LOC_INDT_SEQ,STORE_SIZE,STORE_TYPE_MALL,STORE_TYPE_STRIP,STORE_TYPE_OTHER,PCNT_WOMEN_EMPLOYED,MEDIAN_HOUSEHLD_INCOME_BINS
0,466896,0,1.0,0,4,15.5,16.8,0,10,10_31,1027,3100,0,0,1,0.57,2
1,466896,0,2.0,0,3,15.5,16.8,0,10,10_31,1027,3100,0,0,1,0.57,2
2,466896,0,1.0,0,4,15.5,21.0,0,10,10_31,1027,3100,0,0,1,0.57,2


# Feature Engineering - for Linear Regression:

 ## One Hot encoding - create dummy variables:

In [7]:
# Start the linear-regression dataset (df_final_lr) from an independent deep copy,
# so the one-hot encoding steps do not touch df_final.
df_final_lr = df_final.copy(deep=True)

In [8]:
# Verify that the data has been copied to df_final_lr (first three rows):
df_final_lr.head(3)

Unnamed: 0,DBSKU,ONLINE_FLAG,TOTAL_UNITS_SOLD,HOLIDAY,SEASON,UNIT_COST_PRICE,UNIT_SELLING_PRICE,FULL_PRICE_IND,DEPARTMENT,PRODUCT_CLASS,LOC_INDT_SEQ,STORE_SIZE,STORE_TYPE_MALL,STORE_TYPE_STRIP,STORE_TYPE_OTHER,PCNT_WOMEN_EMPLOYED,MEDIAN_HOUSEHLD_INCOME_BINS
0,466896,0,1.0,0,4,15.5,16.8,0,10,10_31,1027,3100,0,0,1,0.57,2
1,466896,0,2.0,0,3,15.5,16.8,0,10,10_31,1027,3100,0,0,1,0.57,2
2,466896,0,1.0,0,4,15.5,21.0,0,10,10_31,1027,3100,0,0,1,0.57,2


In [9]:
# Replace SEASON with one indicator column per season value (SEASON_<value>).
df_final_lr = pd.get_dummies(data=df_final_lr, columns=['SEASON'])

In [10]:
# Replace DEPARTMENT with one indicator column per department value (DEPARTMENT_<value>).
df_final_lr = pd.get_dummies(data=df_final_lr, columns=['DEPARTMENT'])

In [11]:
# Replace PRODUCT_CLASS with one indicator column per class value (PRODUCT_CLASS_<value>).
df_final_lr = pd.get_dummies(data=df_final_lr, columns=['PRODUCT_CLASS'])

In [12]:
# Verify that the required features have been one hot encoded (first three rows):
df_final_lr.head(3)

Unnamed: 0,DBSKU,ONLINE_FLAG,TOTAL_UNITS_SOLD,HOLIDAY,UNIT_COST_PRICE,UNIT_SELLING_PRICE,FULL_PRICE_IND,LOC_INDT_SEQ,STORE_SIZE,STORE_TYPE_MALL,...,PRODUCT_CLASS_12_32,PRODUCT_CLASS_12_40,PRODUCT_CLASS_12_41,PRODUCT_CLASS_12_42,PRODUCT_CLASS_12_5,PRODUCT_CLASS_12_50,PRODUCT_CLASS_12_51,PRODUCT_CLASS_12_52,PRODUCT_CLASS_12_6,PRODUCT_CLASS_12_99
0,466896,0,1.0,0,15.5,16.8,0,1027,3100,0,...,0,0,0,0,0,0,0,0,0,0
1,466896,0,2.0,0,15.5,16.8,0,1027,3100,0,...,0,0,0,0,0,0,0,0,0,0
2,466896,0,1.0,0,15.5,21.0,0,1027,3100,0,...,0,0,0,0,0,0,0,0,0,0


The final feature engineering file should have n-1 features for each one hot encoded feature - 
so drop one column accordingly:

In [13]:
# Drop one DEPARTMENT dummy (DEPARTMENT_12) so it acts as the reference level.
# NOTE(review): PRODUCT_CLASS codes are prefixed by the department (10_*, 12_*), so
# DEPARTMENT_10 may still be a linear combination of the PRODUCT_CLASS_10_* dummies --
# confirm whether it should be dropped for linear regression.
df_final_lr = df_final_lr.drop(columns=['DEPARTMENT_12'])

In [14]:
# Drop one SEASON dummy (SEASON_4) so it acts as the reference level.
df_final_lr = df_final_lr.drop(columns=['SEASON_4'])

In [15]:
# Drop one PRODUCT_CLASS dummy (PRODUCT_CLASS_12_99) so it acts as the reference level.
df_final_lr = df_final_lr.drop(columns=['PRODUCT_CLASS_12_99'])

In [16]:
# Verify that the reference dummy columns have been dropped (first three rows):
df_final_lr.head(3)

Unnamed: 0,DBSKU,ONLINE_FLAG,TOTAL_UNITS_SOLD,HOLIDAY,UNIT_COST_PRICE,UNIT_SELLING_PRICE,FULL_PRICE_IND,LOC_INDT_SEQ,STORE_SIZE,STORE_TYPE_MALL,...,PRODUCT_CLASS_12_31,PRODUCT_CLASS_12_32,PRODUCT_CLASS_12_40,PRODUCT_CLASS_12_41,PRODUCT_CLASS_12_42,PRODUCT_CLASS_12_5,PRODUCT_CLASS_12_50,PRODUCT_CLASS_12_51,PRODUCT_CLASS_12_52,PRODUCT_CLASS_12_6
0,466896,0,1.0,0,15.5,16.8,0,1027,3100,0,...,0,0,0,0,0,0,0,0,0,0
1,466896,0,2.0,0,15.5,16.8,0,1027,3100,0,...,0,0,0,0,0,0,0,0,0,0
2,466896,0,1.0,0,15.5,21.0,0,1027,3100,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Print the column names, dtypes and row count of the linear-regression frame.
# NOTE(review): STORE_TYPE_MALL, STORE_TYPE_STRIP and STORE_TYPE_OTHER are all still
# present; if they are mutually exclusive and exhaustive, one of them should also be
# dropped to follow the n-1 rule -- confirm.
df_final_lr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13043875 entries, 0 to 13043874
Data columns (total 45 columns):
DBSKU                          int64
ONLINE_FLAG                    int64
TOTAL_UNITS_SOLD               float64
HOLIDAY                        int64
UNIT_COST_PRICE                float64
UNIT_SELLING_PRICE             float64
FULL_PRICE_IND                 int64
LOC_INDT_SEQ                   int64
STORE_SIZE                     int64
STORE_TYPE_MALL                int64
STORE_TYPE_STRIP               int64
STORE_TYPE_OTHER               int64
PCNT_WOMEN_EMPLOYED            float64
MEDIAN_HOUSEHLD_INCOME_BINS    int64
SEASON_1                       uint8
SEASON_2                       uint8
SEASON_3                       uint8
DEPARTMENT_10                  uint8
PRODUCT_CLASS_10_20            uint8
PRODUCT_CLASS_10_21            uint8
PRODUCT_CLASS_10_30            uint8
PRODUCT_CLASS_10_31            uint8
PRODUCT_CLASS_10_32            uint8
PRODUCT_CLASS_10_40       

# Feature Engineering - for Tree based models

For tree based models, One Hot Encoding is not needed.

In [18]:
# We will call the tree-model dataset df_final_tr.
# Take an independent copy: a plain assignment would only alias df_final, so the
# in-place PRODUCT_CLASS edits below would silently modify df_final as well.
df_final_tr = df_final.copy()

In [19]:
# Preview the first three rows of df_final_tr:
df_final_tr.head(3)

Unnamed: 0,DBSKU,ONLINE_FLAG,TOTAL_UNITS_SOLD,HOLIDAY,SEASON,UNIT_COST_PRICE,UNIT_SELLING_PRICE,FULL_PRICE_IND,DEPARTMENT,PRODUCT_CLASS,LOC_INDT_SEQ,STORE_SIZE,STORE_TYPE_MALL,STORE_TYPE_STRIP,STORE_TYPE_OTHER,PCNT_WOMEN_EMPLOYED,MEDIAN_HOUSEHLD_INCOME_BINS
0,466896,0,1.0,0,4,15.5,16.8,0,10,10_31,1027,3100,0,0,1,0.57,2
1,466896,0,2.0,0,3,15.5,16.8,0,10,10_31,1027,3100,0,0,1,0.57,2
2,466896,0,1.0,0,4,15.5,21.0,0,10,10_31,1027,3100,0,0,1,0.57,2


In [20]:
# Remove the underscore from product class (e.g. '10_31' -> '1031').
# astype(str) keeps this cell re-runnable after the column has been converted to an
# integer dtype, and regex=False makes the literal replacement explicit instead of
# relying on the pandas-version-dependent default.
df_final_tr['PRODUCT_CLASS'] = (
    df_final_tr['PRODUCT_CLASS'].astype(str).str.replace('_', '', regex=False)
)

In [21]:
# Cast the underscore-free PRODUCT_CLASS codes to 64-bit integers.
df_final_tr['PRODUCT_CLASS'] = df_final_tr.PRODUCT_CLASS.astype(np.int64)

In [22]:
# Verify that PRODUCT_CLASS is now an integer column by printing the frame's dtypes:
df_final_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13043875 entries, 0 to 13043874
Data columns (total 17 columns):
DBSKU                          int64
ONLINE_FLAG                    int64
TOTAL_UNITS_SOLD               float64
HOLIDAY                        int64
SEASON                         int64
UNIT_COST_PRICE                float64
UNIT_SELLING_PRICE             float64
FULL_PRICE_IND                 int64
DEPARTMENT                     int64
PRODUCT_CLASS                  int64
LOC_INDT_SEQ                   int64
STORE_SIZE                     int64
STORE_TYPE_MALL                int64
STORE_TYPE_STRIP               int64
STORE_TYPE_OTHER               int64
PCNT_WOMEN_EMPLOYED            float64
MEDIAN_HOUSEHLD_INCOME_BINS    int64
dtypes: float64(4), int64(13)
memory usage: 1.7 GB


In [23]:
# Verify the product class transformation on the first three rows:
df_final_tr.head(3)

Unnamed: 0,DBSKU,ONLINE_FLAG,TOTAL_UNITS_SOLD,HOLIDAY,SEASON,UNIT_COST_PRICE,UNIT_SELLING_PRICE,FULL_PRICE_IND,DEPARTMENT,PRODUCT_CLASS,LOC_INDT_SEQ,STORE_SIZE,STORE_TYPE_MALL,STORE_TYPE_STRIP,STORE_TYPE_OTHER,PCNT_WOMEN_EMPLOYED,MEDIAN_HOUSEHLD_INCOME_BINS
0,466896,0,1.0,0,4,15.5,16.8,0,10,1031,1027,3100,0,0,1,0.57,2
1,466896,0,2.0,0,3,15.5,16.8,0,10,1031,1027,3100,0,0,1,0.57,2
2,466896,0,1.0,0,4,15.5,21.0,0,10,1031,1027,3100,0,0,1,0.57,2


In [24]:
# Verify the product class transformation on the last three rows:
df_final_tr.tail(3)

Unnamed: 0,DBSKU,ONLINE_FLAG,TOTAL_UNITS_SOLD,HOLIDAY,SEASON,UNIT_COST_PRICE,UNIT_SELLING_PRICE,FULL_PRICE_IND,DEPARTMENT,PRODUCT_CLASS,LOC_INDT_SEQ,STORE_SIZE,STORE_TYPE_MALL,STORE_TYPE_STRIP,STORE_TYPE_OTHER,PCNT_WOMEN_EMPLOYED,MEDIAN_HOUSEHLD_INCOME_BINS
13043872,714022,0,1.0,0,3,16.5,29.4,0,10,1020,1178,2900,0,1,0,0.58,3
13043873,714162,0,1.0,0,3,14.0,26.4,0,10,1020,1178,2900,0,1,0,0.58,3
13043874,2183830,0,1.0,0,2,17.0,31.15,0,12,1220,1178,2900,0,1,0,0.58,3


### Profit Optimization Feature Engineering

In [25]:
# Build the profit-optimisation dataset from an independent deep copy of the
# one-hot-encoded linear-regression frame.
df_final_optimal = df_final_lr.copy(deep=True)

In [26]:
# Profit as a percentage of unit cost: (selling price - cost) / cost * 100.
# A zero UNIT_COST_PRICE would produce inf/NaN here rather than raising.
df_final_optimal['PROFIT_PERCENT'] = (
    df_final_optimal['UNIT_SELLING_PRICE']
    .sub(df_final_optimal['UNIT_COST_PRICE'])
    .div(df_final_optimal['UNIT_COST_PRICE'])
    .mul(100)
)

In [27]:
# Count loss or break-even transactions (PROFIT_PERCENT <= 0).
# Summing the boolean mask gives the same count without materialising a filtered
# copy of the full frame.
int((df_final_optimal['PROFIT_PERCENT'] <= 0).sum())

988142

How many profitable transactions are in the sample?

In [28]:
# Count profitable transactions (PROFIT_PERCENT > 0).
# Summing the boolean mask gives the same count without materialising a filtered
# copy of the full frame.
int((df_final_optimal['PROFIT_PERCENT'] > 0).sum())

12055733

How many transactions have a profit percentage greater than 0 and up to 100?

In [29]:
# Count transactions with 0 < PROFIT_PERCENT <= 100.
# Summing the combined boolean mask gives the same count without materialising a
# filtered copy of the full frame.
int(
    (
        (df_final_optimal['PROFIT_PERCENT'] > 0)
        & (df_final_optimal['PROFIT_PERCENT'] <= 100)
    ).sum()
)

3792467

How many transactions have a profit percentage of more than 100?

In [30]:
# List the columns of the optimisation frame.
# NOTE(review): the heading before this cell asks how many transactions have a
# PROFIT_PERCENT above 100, which this cell does not compute -- confirm whether a
# count cell is missing here.
df_final_optimal.columns

Index(['DBSKU', 'ONLINE_FLAG', 'TOTAL_UNITS_SOLD', 'HOLIDAY',
       'UNIT_COST_PRICE', 'UNIT_SELLING_PRICE', 'FULL_PRICE_IND',
       'LOC_INDT_SEQ', 'STORE_SIZE', 'STORE_TYPE_MALL', 'STORE_TYPE_STRIP',
       'STORE_TYPE_OTHER', 'PCNT_WOMEN_EMPLOYED',
       'MEDIAN_HOUSEHLD_INCOME_BINS', 'SEASON_1', 'SEASON_2', 'SEASON_3',
       'DEPARTMENT_10', 'PRODUCT_CLASS_10_20', 'PRODUCT_CLASS_10_21',
       'PRODUCT_CLASS_10_30', 'PRODUCT_CLASS_10_31', 'PRODUCT_CLASS_10_32',
       'PRODUCT_CLASS_10_40', 'PRODUCT_CLASS_10_41', 'PRODUCT_CLASS_10_42',
       'PRODUCT_CLASS_10_5', 'PRODUCT_CLASS_10_50', 'PRODUCT_CLASS_10_51',
       'PRODUCT_CLASS_10_52', 'PRODUCT_CLASS_10_6', 'PRODUCT_CLASS_10_99',
       'PRODUCT_CLASS_12_20', 'PRODUCT_CLASS_12_21', 'PRODUCT_CLASS_12_30',
       'PRODUCT_CLASS_12_31', 'PRODUCT_CLASS_12_32', 'PRODUCT_CLASS_12_40',
       'PRODUCT_CLASS_12_41', 'PRODUCT_CLASS_12_42', 'PRODUCT_CLASS_12_5',
       'PRODUCT_CLASS_12_50', 'PRODUCT_CLASS_12_51', 'PRODUCT_CLASS_12_52

Describe the quantiles of the UNIT_SELLING_PRICE and PROFIT_PERCENT columns

In [31]:
# Summary statistics of the selling price at the listed quantiles (describe also
# reports the median), rounded to one decimal place.
df_final_optimal['UNIT_SELLING_PRICE'].describe(
    percentiles=[0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 0.999]
).round(1)

count    13043875.0
mean           37.2
std            13.4
min             0.0
10%            19.8
20%            24.5
30%            30.0
40%            34.8
50%            38.4
60%            40.5
70%            44.2
80%            48.0
90%            54.0
99.9%          78.0
max           150.0
Name: UNIT_SELLING_PRICE, dtype: float64

In [32]:
# Summary statistics of the profit percentage at the listed quantiles (describe also
# reports the median), rounded to one decimal place.
df_final_optimal['PROFIT_PERCENT'].describe(
    percentiles=[0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 0.999]
).round(1)

count    13043875.0
mean          118.9
std           111.2
min          -100.0
10%             9.5
20%            40.8
30%            79.0
40%           109.1
50%           126.7
60%           145.4
70%           166.7
80%           182.9
90%           208.5
99.9%         348.0
max        189800.0
Name: PROFIT_PERCENT, dtype: float64

In [33]:
# Show (rows, columns) of the optimisation frame after adding PROFIT_PERCENT.
df_final_optimal.shape

(13043875, 46)

We cannot build the model on all transactions, because it would then just predict the price irrespective of profit or loss.

We could build the model on profitable transactions alone, but our objective is to maximize profit.

So we can apply a minimum non-zero profit percentage cut-off before building the model. Note that as the cut-off increases, the number of transactions decreases, which in turn reduces the number of customers. So there should be some trade-off between the profit percentage cut-off and the number of customers. For this model I chose the 0.1 percentile as the cut-off, to stay on the safer side. 

 # Save the datasets that were created for modeling:

In [34]:
# Persist the one-hot-encoded dataset for linear regression (row index omitted).
df_final_lr.to_csv(path_or_buf='lr_txn_modelingdata_full.csv', index=False)

In [35]:
# Persist the integer-coded dataset for tree based techniques (row index omitted).
df_final_tr.to_csv(path_or_buf='tr_txn_modelingdata_full.csv', index=False)

In [36]:
# Persist the profit-optimisation dataset, including PROFIT_PERCENT (row index omitted).
df_final_optimal.to_csv(path_or_buf='optimization_modelingdata_full.csv', index=False)