# BASIC PRE-PROCESSING - {"BLACK FRIDAY SALES" DATASET}

## 1. Import Modules and Configuration Settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from pickle import dump, load

In [2]:
# Pandas display configuration: rows shown once a frame is truncated, the row
# count above which truncation starts, and the number of decimals displayed.
DISPLAY_OPTIONS = {
    'display.min_rows': 5,
    'display.max_rows': 25,
    'display.precision': 4,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

## 2. Import Dataset

### 2.1 Train Dataset

In [3]:
# Load the initial training split. `df` is left untouched (the pre-computed
# statistics further down are derived from it), while `tr` is the working copy
# that the pre-processing pipeline modifies.
df = pd.read_pickle('bfs_train_init.pkl')
tr = df.copy()

In [4]:
# Show column dtypes and non-null counts of the training copy before pre-processing.
tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 549868 entries, 94454 to 146754
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     549868 non-null  int64  
 1   Product_ID                  549868 non-null  object 
 2   Gender                      549868 non-null  object 
 3   Age                         549868 non-null  object 
 4   Occupation                  549868 non-null  int64  
 5   City_Category               549868 non-null  object 
 6   Stay_In_Current_City_Years  549868 non-null  object 
 7   Marital_Status              549868 non-null  int64  
 8   Product_Category_1          549868 non-null  int64  
 9   Product_Category_2          376297 non-null  float64
 10  Product_Category_3          166768 non-null  float64
 11  Purchase                    549868 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 54.5+ MB


In [5]:
# Preview the first rows of the training copy before pre-processing.
tr.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
94454,1002589,P00296042,M,26-35,0,C,3,0,8,13.0,16.0,5820
6914,1001101,P00251842,M,36-45,1,A,1,0,1,15.0,16.0,11715
167574,1001861,P00154642,M,51-55,16,A,1,1,8,,,8075
323135,1001746,P00251842,M,26-35,4,C,1,1,1,15.0,16.0,4336
31253,1004786,P00025942,M,46-50,6,B,1,1,8,14.0,,10057


## 3. Pre-Processing Level - II, After Data Assessment Process

### 0. Pre_Computed Values
- Pre-computed values are calculated once, separately, and are then applied to the Train, Validation, and Test datasets.
- They are created from the Train dataset only, and must be reused later in the pipeline to transform the Train, Validation, and Test datasets.
- In short: learn from Train, then apply to Train, Validation, and Test.

In [6]:
# Global modes learned from the Train dataset. They act as the fallback when a
# "Product_Category_1" group has no observed value at all for the target column.
c2 = df['Product_Category_2'].mode()[0]
print(f'Mode of "Product_Category_2" is : {c2}')

c3 = df['Product_Category_3'].mode()[0]
print(f'Mode of "Product_Category_3" is : {c3} \n')


def _group_mode(values, fallback):
    """Return a single scalar mode of `values`, or `fallback` if none exists.

    `Series.mode()` skips NaN and returns *all* tied modes in sorted order, so
    the first entry is taken to guarantee exactly one value per group. An
    all-missing group yields an empty result, in which case `fallback` is used.
    """
    modes = values.mode()
    return fallback if modes.empty else modes.iloc[0]


# Mode Values for "Product_Category_2" and "Product_Category_3" using "Product_Category_1"
_by_cat1 = df.groupby('Product_Category_1')
c123_mode = pd.DataFrame({
    'Product_Category_2': _by_cat1['Product_Category_2'].agg(lambda s: _group_mode(s, c2)),
    'Product_Category_3': _by_cat1['Product_Category_3'].agg(lambda s: _group_mode(s, c3)),
}).reset_index()
print('Mode Values for "Product_Category_2" and "Product_Category_3" using "Product_Category_1" \n',c123_mode,'\n')

# Lookup tables: "Product_Category_1" -> mode of the target column.
c12_mode_map = dict(zip(c123_mode['Product_Category_1'], c123_mode['Product_Category_2']))
c13_mode_map = dict(zip(c123_mode['Product_Category_1'], c123_mode['Product_Category_3']))

# Persist the maps for the imputation step, which reads them from disk.
# Mode 'xb' creates the file only when it does not exist yet, so maps written
# by an earlier run are kept as-is; delete the .pkl files to rebuild them.
for _path, _mapping, _target in (
    ('c12_mode_map.pkl', c12_mode_map, 'Product_Category_2'),
    ('c13_mode_map.pkl', c13_mode_map, 'Product_Category_3'),
):
    try:
        with open(_path, 'xb') as _fh:
            dump(_mapping, _fh)
        print(f'Dictionary Map created between "Product_Category_1" and "{_target}"')
    except FileExistsError:
        print(f'"{_path}" already exists - reusing it (delete the file to rebuild the map)')

Mode of "Product_Category_2" is : 8.0
Mode of "Product_Category_3" is : 16.0 

Mode Values for "Product_Category_2" and "Product_Category_3" using "Product_Category_1" 
     Product_Category_1  Product_Category_2  Product_Category_3
0                    1                 2.0                15.0
1                    2                 4.0                15.0
2                    3                 4.0                 5.0
3                    4                 5.0                 9.0
4                    5                 8.0                14.0
5                    6                 8.0                16.0
6                    7                12.0                16.0
7                    8                14.0                17.0
8                    9                15.0                16.0
9                   10                13.0                16.0
10                  11                15.0                16.0
11                  12                14.0                17.0
12         

### 3.1 Missing value imputation for features "Product_Category_2" and "Product_Category_3"

In [7]:
def mvi_pc2_pc3(dff):
    """Impute missing "Product_Category_2" / "Product_Category_3" values.

    Every missing value is replaced by the value that the pre-computed maps
    ('c12_mode_map.pkl' and 'c13_mode_map.pkl', read from the working
    directory) associate with the row's "Product_Category_1". Rows whose
    "Product_Category_1" is not a key of the map remain missing.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame holding the three "Product_Category_*" columns. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same object as `dff`, after imputation.

    Raises
    ------
    FileNotFoundError
        If either map file is absent.
    """
    print('Missing Values in the Dataset before Imputation: ')
    print(dff.isna().sum(),'\n')

    print('Processing Missing Values ----------------------------------------------------- \n')

    # Load both maps before touching the frame so a missing file cannot leave
    # it half-imputed. The context manager closes each handle deterministically.
    # NOTE: unpickling can execute arbitrary code - only load map files that
    # were produced by this project.
    map_files = {
        'Product_Category_2': 'c12_mode_map.pkl',
        'Product_Category_3': 'c13_mode_map.pkl',
    }
    mode_maps = {}
    for column, path in map_files.items():
        with open(path, 'rb') as fh:
            mode_maps[column] = load(fh)

    for column, mode_map in mode_maps.items():
        # Per-row replacement value, looked up through "Product_Category_1".
        replacement = dff.loc[:, 'Product_Category_1'].map(mode_map)
        dff.loc[:, column] = dff.loc[:, column].fillna(replacement)

    print('Missing Values in the Dataset after Imputation: ')
    print(dff.isna().sum(),'\n')

    return dff

### 3.2 Correcting datatypes for certain features

In [8]:
def correct_dt(dff):
    """Cast the numerically coded categorical columns to ``object`` dtype.

    "User_ID", "Occupation", "Marital_Status" and "Product_Category_1" are cast
    directly. "Product_Category_2" and "Product_Category_3" are first cast to
    integers, so their values are stored as e.g. 13 rather than 13.0.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame containing the six columns above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object as `dff`, with corrected dtypes.

    Raises
    ------
    ValueError
        If "Product_Category_2" or "Product_Category_3" still contain missing
        values (they cannot be cast to integers). The check runs before any
        column is changed, so the frame is not left partially converted.
    """
    direct_columns = ('User_ID', 'Occupation', 'Marital_Status', 'Product_Category_1')
    via_int_columns = ('Product_Category_2', 'Product_Category_3')

    print('Datatypes of the features before Correction: ')
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None").
    dff.info()
    print('\n')

    print('Processing Datatypes ----------------------------------------------------- \n')

    still_missing = [col for col in via_int_columns if dff[col].isna().any()]
    if still_missing:
        raise ValueError(
            f'Cannot cast {still_missing} to integer: missing values present; '
            'run the imputation step first.'
        )

    for col in direct_columns:
        dff[col] = dff[col].astype('object')

    for col in via_int_columns:
        dff[col] = dff[col].astype('int64').astype('object')

    print('Datatypes of the features after Correction: ')
    dff.info()

    return dff

## 4. PreProcess Dataset

### 4.1 PreProcess Pipeline

In [9]:
def preprocess_dataset(ppdf):
    """Run the complete pre-processing pipeline on one dataset.

    The frame goes through `mvi_pc2_pc3` (missing-value imputation for
    'Product_Category_2' and 'Product_Category_3') and then `correct_dt`
    (datatype correction). A separator line is printed after each stage, and
    the result of the last stage is returned.
    """
    divider = '\n --------------------------------------------------------------------------------------------------------------- \n'

    for stage in (mvi_pc2_pc3, correct_dt):
        ppdf = stage(ppdf)
        print(divider)

    return ppdf

### 4.2 PreProcess Train Dataset

In [10]:
# Run the training copy through the pipeline (imputation, then dtype correction).
tr = preprocess_dataset(tr)

Missing Values in the Dataset before Imputation: 
User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173571
Product_Category_3            383100
Purchase                           0
dtype: int64 

Processing Missing Values ----------------------------------------------------- 

Missing Values in the Dataset after Imputation: 
User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase

In [11]:
# Re-check dtypes and non-null counts of the training data after pre-processing.
tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 549868 entries, 94454 to 146754
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   User_ID                     549868 non-null  object
 1   Product_ID                  549868 non-null  object
 2   Gender                      549868 non-null  object
 3   Age                         549868 non-null  object
 4   Occupation                  549868 non-null  object
 5   City_Category               549868 non-null  object
 6   Stay_In_Current_City_Years  549868 non-null  object
 7   Marital_Status              549868 non-null  object
 8   Product_Category_1          549868 non-null  object
 9   Product_Category_2          549868 non-null  object
 10  Product_Category_3          549868 non-null  object
 11  Purchase                    549868 non-null  int64 
dtypes: int64(1), object(11)
memory usage: 54.5+ MB


In [12]:
# Preview the first rows of the pre-processed training data.
tr.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
94454,1002589,P00296042,M,26-35,0,C,3,0,8,13,16,5820
6914,1001101,P00251842,M,36-45,1,A,1,0,1,15,16,11715
167574,1001861,P00154642,M,51-55,16,A,1,1,8,14,17,8075
323135,1001746,P00251842,M,26-35,4,C,1,1,1,15,16,4336
31253,1004786,P00025942,M,46-50,6,B,1,1,8,14,17,10057


### 4.3 PreProcess Validation Dataset

In [13]:
# Load the initial validation split and preview its first rows.
va = pd.read_pickle('bfs_valid_init.pkl')

va.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
54386,1002279,P00221542,M,51-55,7,B,2,1,5,15.0,,7058
536560,1004577,P0095842,M,36-45,7,C,0,1,3,4.0,12.0,8198
128570,1001764,P00044742,M,18-25,0,B,1,0,5,,,5426
409943,1003128,P00120042,M,18-25,2,B,3,0,1,2.0,,11930
35044,1005406,P00050242,F,36-45,0,C,3,1,8,,,6164


In [14]:
# Run the validation split through the same pipeline used for the training
# data, then preview the result.
va = preprocess_dataset(va)

va.head()

Missing Values in the Dataset before Imputation: 
User_ID                        0
Product_ID                     0
Gender                         0
Age                            0
Occupation                     0
City_Category                  0
Stay_In_Current_City_Years     0
Marital_Status                 0
Product_Category_1             0
Product_Category_2            28
Product_Category_3            78
Purchase                       0
dtype: int64 

Processing Missing Values ----------------------------------------------------- 

Missing Values in the Dataset after Imputation: 
User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64 


 -------

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
54386,1002279,P00221542,M,51-55,7,B,2,1,5,15,14,7058
536560,1004577,P0095842,M,36-45,7,C,0,1,3,4,12,8198
128570,1001764,P00044742,M,18-25,0,B,1,0,5,8,14,5426
409943,1003128,P00120042,M,18-25,2,B,3,0,1,2,15,11930
35044,1005406,P00050242,F,36-45,0,C,3,1,8,14,17,6164


### 4.4 PreProcess Test Dataset

In [15]:
# Load the initial test split and preview its first rows.
te = pd.read_pickle('bfs_test_init.pkl')

te.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
264361,1004682,P00362042,M,26-35,7,B,3,1,5,7.0,,8880
177205,1003441,P00157642,F,26-35,14,A,2,0,1,14.0,16.0,15173
542026,1005463,P00085442,M,36-45,0,B,3,1,12,14.0,,1433
481858,1002143,P00110742,M,18-25,4,B,2,0,1,2.0,8.0,19551
226813,1004952,P00091142,F,26-35,14,A,1,1,8,,,8069


In [16]:
# Run the test split through the same pipeline used for the training data,
# then preview the result.
te = preprocess_dataset(te)

te.head()

Missing Values in the Dataset before Imputation: 
User_ID                        0
Product_ID                     0
Gender                         0
Age                            0
Occupation                     0
City_Category                  0
Stay_In_Current_City_Years     0
Marital_Status                 0
Product_Category_1             0
Product_Category_2            39
Product_Category_3            69
Purchase                       0
dtype: int64 

Processing Missing Values ----------------------------------------------------- 

Missing Values in the Dataset after Imputation: 
User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64 


 -------

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
264361,1004682,P00362042,M,26-35,7,B,3,1,5,7,14,8880
177205,1003441,P00157642,F,26-35,14,A,2,0,1,14,16,15173
542026,1005463,P00085442,M,36-45,0,B,3,1,12,14,17,1433
481858,1002143,P00110742,M,18-25,4,B,2,0,1,2,8,19551
226813,1004952,P00091142,F,26-35,14,A,1,1,8,14,17,8069


## 5. Saving the Pre-Processed Data to CSV File and PKL File

In [17]:
def train_valid_test_save_CSV_PKL(trdf, vadf, tedf):
    """Write the Train, Validation and Test frames to CSV and pickle files.

    Each frame is saved twice in the working directory: as
    'bfs_<split>_pp.csv' (without the index) and as 'bfs_<split>_pp.pkl'.
    A confirmation message is printed once all six files are written.
    """
    targets = (
        (trdf, 'bfs_train_pp.csv', 'bfs_train_pp.pkl'),
        (vadf, 'bfs_valid_pp.csv', 'bfs_valid_pp.pkl'),
        (tedf, 'bfs_test_pp.csv', 'bfs_test_pp.pkl'),
    )
    for frame, csv_name, pkl_name in targets:
        frame.to_csv(csv_name, index=False)
        frame.to_pickle(pkl_name)

    print(f'Train, Validation, and Test files created successfully ...')

In [18]:
# Write the pre-processed Train, Validation and Test frames to disk (CSV + pickle).
train_valid_test_save_CSV_PKL(tr, va, te)

Train, Validation, and Test files created successfully ...
