# BASIC PRE-PROCESSING - {"BIGMART SALES" DATASET}

## 1. Import Modules and Configuration Settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Pandas display options (rendering only; the underlying data is not changed).

# Number of rows shown when a frame is rendered in truncated form.
pd.set_option('display.min_rows', 5)
# Frames longer than this many rows are rendered truncated.
pd.set_option('display.max_rows', 25)
# Number of decimal places used when rendering floats.
pd.set_option('display.precision', 4)

## 2. Import Dataset

In [3]:
# Load the raw training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('bms_train.csv')
# Work on a separate copy; presumably this keeps `data` available exactly as loaded.
df = data.copy()

In [4]:
# Column dtypes and non-null counts; in the output below, Item_Weight and
# Outlet_Size report fewer non-null values than the total number of entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [5]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.0193,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.0168,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## 3. Pre-Processing Level - I, After Data Assessment Process

##### Note: Nothing to pre-process here.
##### Perform train, validation, test splits on original dataset

### 3.1 Performing Train, Validation, and Test Split on Dataset

In [6]:
def train_valid_test_split_REG(fdf, target, vs, ts, random_state=46):
    """Split a regression dataframe into train, validation, and test dataframes.

    The test set is taken out of the full data first; the validation set is
    then taken out of what remains, and the leftover rows form the training set.

    Parameters
    ----------
    fdf : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the target column in ``fdf``.
    vs : int or float
        Validation size, forwarded as ``test_size`` to the second
        ``train_test_split`` call. An int is an absolute number of rows.
        NOTE: a float is a fraction of the rows left after the test split,
        not of the full dataset.
    ts : int or float
        Test size, forwarded as ``test_size`` to the first
        ``train_test_split`` call (int = number of rows).
    random_state : int, optional
        Seed passed to both ``train_test_split`` calls. Defaults to 46, the
        value that was previously hard-coded, so existing calls produce the
        same splits as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_valid, df_test)``. Each frame holds the feature
        columns followed by the target as the last column.
    """

    print('Separating X and y from dataframe ----------------------------------------------------------------------------- \n')
    X = fdf.drop(columns=[target])
    y = fdf[target]

    print('Creating Main[Train + Validation] and Test datasets ----------------------------------------------------------- \n')
    # First split: hold out the test rows from the full data.
    Xmain, Xtest, ymain, ytest = train_test_split(
        X, y, test_size=ts, random_state=random_state
    )

    print('Creating Train and Validation datasets ------------------------------------------------------------------------ \n')
    # Second split: hold out the validation rows from the remaining "main" rows.
    Xtrain, Xval, ytrain, yval = train_test_split(
        Xmain, ymain, test_size=vs, random_state=random_state
    )

    print('Creating Train, Validation, and Test dataframes as outputs --------------------------------------------------- \n')
    # Re-attach the target to its features; X and y share the same index per split.
    df_train = pd.concat([Xtrain, ytrain], axis=1)
    df_valid = pd.concat([Xval, yval], axis=1)
    df_test = pd.concat([Xtest, ytest], axis=1)

    print('Data Splitting Done ... --------------------------------------------------------------------------------------- \n')

    return df_train, df_valid, df_test

In [7]:
# Arguments are (frame, target, validation size, test size): hold out 100 rows
# for validation and 100 rows for test; the remaining rows form the training set.
train, valid, test = train_valid_test_split_REG(df, 'Item_Outlet_Sales', 100, 100)

Separating X and y from dataframe ----------------------------------------------------------------------------- 

Creating Main[Train + Validation] and Test datasets ----------------------------------------------------------- 

Creating Train and Validation datasets ------------------------------------------------------------------------ 

Creating Train, Validation, and Test dataframes as outputs --------------------------------------------------- 

Data Splitting Done ... --------------------------------------------------------------------------------------- 



### 3.2 Saving the Split Data into CSV and PKL Files

In [8]:
def train_valid_test_save_CSV_PKL(tr, va, te):
    """Write the train, validation, and test frames to disk as CSV and pickle.

    Each frame is saved twice in the current working directory: once as a CSV
    (without the index column) and once as a pickle. Files are written in the
    order train, validation, test, and a confirmation line is printed at the end.

    Parameters
    ----------
    tr, va, te : pandas.DataFrame
        Train, validation, and test frames, respectively.
    """
    # (frame, csv path, pickle path) for every split, in write order.
    targets = (
        (tr, 'bms_train_init.csv', 'bms_train_init.pkl'),
        (va, 'bms_valid_init.csv', 'bms_valid_init.pkl'),
        (te, 'bms_test_init.csv', 'bms_test_init.pkl'),
    )

    for frame, csv_path, pkl_path in targets:
        frame.to_csv(csv_path, index=False)
        frame.to_pickle(pkl_path)

    print('Train, Validation, and Test files created successfully ...')

In [9]:
# Persist each split to the working directory as both a CSV and a pickle file.
train_valid_test_save_CSV_PKL(train, valid, test)

Train, Validation, and Test files created successfully ...
