# BASIC PRE-PROCESSING - {"BLACK FRIDAY SALES" DATASET}

## 1. Import Modules and Configuration Settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Pandas display options, applied from a single table so they are easy to tune.
DISPLAY_OPTIONS = {
    'display.min_rows': 5,     # rows shown once a frame is truncated
    'display.max_rows': 25,    # frames longer than this are truncated
    'display.precision': 4,    # decimal places shown for floats
}

for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

## 2. Import Dataset

In [3]:
# Read the raw training CSV. `data` is kept as the untouched original;
# the rest of the notebook works on the separate copy `df`.
data = pd.read_csv('bfs_train.csv')
df = data.copy()

In [4]:
# Summarise column dtypes and non-null counts of the working dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [5]:
# Preview the first rows of the working dataframe.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


## 3. Pre-Processing Level - I, After Data Assessment Process

##### Note: No pre-processing is required at this level.
##### Split the original dataset into train, validation, and test sets.

### 3.1 Splitting Original Data into Train, Validation, and Test datasets

In [6]:
def train_valid_test_split_REG(fdf, target, vs, ts, random_state=46):
    """Split a dataframe into train, validation, and test dataframes.

    The test set is held out from the full data first; the validation set is
    then held out from the remaining rows. ``vs`` is therefore interpreted
    relative to the train+validation remainder, not to ``fdf`` itself.

    Parameters
    ----------
    fdf : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the target column in ``fdf``.
    vs : int or float
        Validation size, forwarded as ``test_size`` to the second
        ``train_test_split`` call (an int is an absolute number of rows,
        a float is a fraction of the remaining rows).
    ts : int or float
        Test size, forwarded as ``test_size`` to the first
        ``train_test_split`` call (same int/float convention).
    random_state : int, default 46
        Seed passed to both ``train_test_split`` calls so the split is
        reproducible. The default keeps the previously hard-coded value.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_valid, df_test)``; each frame holds the feature
        columns followed by the target column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``fdf`` (raised by ``DataFrame.drop``).
    """
    print('Separating X and y from dataframe ----------------------------------------------------------------------------- \n')
    X = fdf.drop(columns=[target])
    y = fdf[target]

    print('Creating Main[Train + Validation] and Test datasets ----------------------------------------------------------- \n')
    # First cut: hold out the test rows from the complete dataset.
    Xmain, Xtest, ymain, ytest = train_test_split(
        X, y, test_size=ts, random_state=random_state
    )

    print('Creating Train and Validation datasets ------------------------------------------------------------------------ \n')
    # Second cut: hold out the validation rows from what is left.
    Xtrain, Xval, ytrain, yval = train_test_split(
        Xmain, ymain, test_size=vs, random_state=random_state
    )

    print('Creating Train, Validation, and Test dataframes as outputs --------------------------------------------------- \n')
    # Re-attach the target as the last column of each split.
    df_train = pd.concat([Xtrain, ytrain], axis=1)
    df_valid = pd.concat([Xval, yval], axis=1)
    df_test = pd.concat([Xtest, ytest], axis=1)

    print('Data Splitting Done ... --------------------------------------------------------------------------------------- \n')

    return df_train, df_valid, df_test

In [7]:
# Split on the 'Purchase' target. Both 100s are forwarded as `test_size`
# (validation size, then test size), which the function's own comments
# describe as a number of samples.
train, valid, test = train_valid_test_split_REG(df, 'Purchase', 100, 100)

Separating X and y from dataframe ----------------------------------------------------------------------------- 

Creating Main[Train + Validation] and Test datasets ----------------------------------------------------------- 

Creating Train and Validation datasets ------------------------------------------------------------------------ 

Creating Train, Validation, and Test dataframes as outputs --------------------------------------------------- 

Data Splitting Done ... --------------------------------------------------------------------------------------- 



### 3.2 Saving the Split Data into CSV and PKL Files

In [8]:
def train_valid_test_save_CSV_PKL(tr, va, te):
    """Write the train, validation, and test dataframes to disk.

    Each dataframe is saved twice, using relative paths: once as a CSV
    (without the index column) and once as a pickle.

    Parameters
    ----------
    tr : pandas.DataFrame
        Training split, saved as ``bfs_train_init.csv`` / ``.pkl``.
    va : pandas.DataFrame
        Validation split, saved as ``bfs_valid_init.csv`` / ``.pkl``.
    te : pandas.DataFrame
        Test split, saved as ``bfs_test_init.csv`` / ``.pkl``.
    """
    # (frame, csv path, pickle path) in the order the files are written.
    outputs = (
        (tr, 'bfs_train_init.csv', 'bfs_train_init.pkl'),
        (va, 'bfs_valid_init.csv', 'bfs_valid_init.pkl'),
        (te, 'bfs_test_init.csv', 'bfs_test_init.pkl'),
    )

    for frame, csv_path, pkl_path in outputs:
        frame.to_csv(csv_path, index=False)
        frame.to_pickle(pkl_path)

    print('Train, Validation, and Test files created successfully ...')

In [9]:
# Write the three splits to CSV and pickle files.
train_valid_test_save_CSV_PKL(train, valid, test)

Train, Validation, and Test files created successfully ...
