# **Dataset Pre-Processing**

In [1]:
import os

import pandas as pd
# Location of the raw inventory CSV. The default is the original local copy;
# set the RETAIL_INVENTORY_CSV environment variable to point at the file on any
# other machine instead of editing this cell.
RAW_DATA_PATH = os.environ.get(
    'RETAIL_INVENTORY_CSV',
    'C:/Users/nextn/Downloads/Git/demand_prediction/data/raw/retail_store_inventory.csv',
)
dataset = pd.read_csv(RAW_DATA_PATH)

In [2]:
# Preview the freshly loaded frame; the rendered output elides the middle rows.
dataset

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Holiday/Promotion,Competitor Pricing,Seasonality
0,2022-01-01,S001,P0001,Groceries,North,231,127,55,135.47,33.50,20,Rainy,0,29.69,Autumn
1,2022-01-01,S001,P0002,Toys,South,204,150,66,144.04,63.01,20,Sunny,0,66.16,Autumn
2,2022-01-01,S001,P0003,Toys,West,102,65,51,74.02,27.99,10,Sunny,1,31.32,Summer
3,2022-01-01,S001,P0004,Toys,North,469,61,164,62.18,32.72,10,Cloudy,1,34.74,Autumn
4,2022-01-01,S001,P0005,Electronics,East,166,14,135,9.26,73.64,0,Sunny,0,68.95,Summer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73095,2024-01-01,S005,P0016,Furniture,East,96,8,127,18.46,73.73,20,Snowy,0,72.45,Winter
73096,2024-01-01,S005,P0017,Toys,North,313,51,101,48.43,82.57,10,Cloudy,0,83.78,Autumn
73097,2024-01-01,S005,P0018,Clothing,West,278,36,151,39.65,11.11,10,Rainy,0,10.91,Winter
73098,2024-01-01,S005,P0019,Toys,East,374,264,21,270.52,53.14,20,Rainy,0,55.80,Spring


In [3]:
# Summarise column names, non-null counts and dtypes.
dataset.info()
# Lift the cap on how many columns pandas renders when a frame is displayed.
pd.options.display.max_columns = None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                73100 non-null  object 
 1   Store ID            73100 non-null  object 
 2   Product ID          73100 non-null  object 
 3   Category            73100 non-null  object 
 4   Region              73100 non-null  object 
 5   Inventory Level     73100 non-null  int64  
 6   Units Sold          73100 non-null  int64  
 7   Units Ordered       73100 non-null  int64  
 8   Demand Forecast     73100 non-null  float64
 9   Price               73100 non-null  float64
 10  Discount            73100 non-null  int64  
 11  Weather Condition   73100 non-null  object 
 12  Holiday/Promotion   73100 non-null  int64  
 13  Competitor Pricing  73100 non-null  float64
 14  Seasonality         73100 non-null  object 
dtypes: float64(3), int64(5), object(7)
memory usage: 8.4+

In [4]:
# Count missing entries per column (isnull is pandas' alias of isna).
dataset.isnull().sum()

Date                  0
Store ID              0
Product ID            0
Category              0
Region                0
Inventory Level       0
Units Sold            0
Units Ordered         0
Demand Forecast       0
Price                 0
Discount              0
Weather Condition     0
Holiday/Promotion     0
Competitor Pricing    0
Seasonality           0
dtype: int64

In [5]:
# Remove the identifier columns 'Store ID' and 'Product ID'.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without a KeyError once the columns are already gone.
dataset = dataset.drop(columns=['Store ID', 'Product ID'], errors='ignore')

In [6]:
# Shorten the verbose column labels; columns not listed keep their names.
dataset = dataset.rename(
    columns={
        'Inventory Level': 'Inventory',
        'Units Sold': 'Sales',
        'Units Ordered': 'Order',
        'Demand Forecast': 'Demand',
        'Weather Condition': 'Weather',
        'Holiday/Promotion': 'Promotion',
        'Competitor Pricing': 'Competitor Price',
    }
)

In [7]:
# Parse the 'Date' strings into datetime values and store them back in place.
parsed_dates = pd.to_datetime(dataset['Date'])
dataset['Date'] = parsed_dates
dataset

Unnamed: 0,Date,Category,Region,Inventory,Sales,Order,Demand,Price,Discount,Weather,Promotion,Competitor Price,Seasonality
0,2022-01-01,Groceries,North,231,127,55,135.47,33.50,20,Rainy,0,29.69,Autumn
1,2022-01-01,Toys,South,204,150,66,144.04,63.01,20,Sunny,0,66.16,Autumn
2,2022-01-01,Toys,West,102,65,51,74.02,27.99,10,Sunny,1,31.32,Summer
3,2022-01-01,Toys,North,469,61,164,62.18,32.72,10,Cloudy,1,34.74,Autumn
4,2022-01-01,Electronics,East,166,14,135,9.26,73.64,0,Sunny,0,68.95,Summer
...,...,...,...,...,...,...,...,...,...,...,...,...,...
73095,2024-01-01,Furniture,East,96,8,127,18.46,73.73,20,Snowy,0,72.45,Winter
73096,2024-01-01,Toys,North,313,51,101,48.43,82.57,10,Cloudy,0,83.78,Autumn
73097,2024-01-01,Clothing,West,278,36,151,39.65,11.11,10,Rainy,0,10.91,Winter
73098,2024-01-01,Toys,East,374,264,21,270.52,53.14,20,Rainy,0,55.80,Spring


In [8]:
# Split the parsed date into numeric Year / Month / Day columns. Each new
# column is read from the matching lower-case attribute of the .dt accessor.
for part in ('Year', 'Month', 'Day'):
    dataset[part] = getattr(dataset['Date'].dt, part.lower())

In [9]:
# Discard the raw 'Date' column. Rebinding the result (instead of inplace=True)
# together with errors='ignore' keeps this cell from raising a KeyError if it is
# executed again after the column has already been removed.
dataset = dataset.drop(columns=['Date'], errors='ignore')

In [10]:
# One-hot encode the categorical columns, dropping the first level of each.
# dtype=int makes only the new indicator columns integer-valued. Casting the
# whole frame with .astype(int) would also truncate the float columns
# (Demand, Price, Competitor Price) to whole numbers.
dataset = pd.get_dummies(
    dataset,
    columns=['Category', 'Region', 'Weather', 'Seasonality'],
    drop_first=True,
    dtype=int,
)

# **Feature Scaling and PCA**

In [12]:
# Target is the 'Demand' column; the features are every remaining column.
y = dataset['Demand']
X = dataset.drop(columns='Demand')

In [13]:
# Record the feature column labels, in column order, as a plain list.
feature_names = X.columns.tolist()
print(feature_names)

['Inventory', 'Sales', 'Order', 'Price', 'Discount', 'Promotion', 'Competitor Price', 'Year', 'Month', 'Day', 'Category_Electronics', 'Category_Furniture', 'Category_Groceries', 'Category_Toys', 'Region_North', 'Region_South', 'Region_West', 'Weather_Rainy', 'Weather_Snowy', 'Weather_Sunny', 'Seasonality_Spring', 'Seasonality_Summer', 'Seasonality_Winter']


In [14]:
# Scaler Transform
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X, then apply the fitted scaling to the same data.
# NOTE(review): the scaler is fitted on every row of X; if a train/test split
# happens later in the notebook, confirm whether fitting should move after it.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [15]:
# Inspect the standardised feature matrix (NumPy abbreviates the printed array).
X_scaled

array([[-0.33451587, -0.08689852, -1.05217166, ..., -0.57823486,
        -0.57798212, -0.57756088],
       [-0.54229028,  0.12426824, -0.84175444, ..., -0.57823486,
        -0.57798212, -0.57756088],
       [-1.32721586, -0.65613063, -1.12868701, ..., -0.57823486,
         1.73015734, -0.57756088],
       ...,
       [ 0.02716553, -0.92238436,  0.78419679, ..., -0.57823486,
        -0.57798212,  1.73141919],
       [ 0.76591902,  1.17092084, -1.70255215, ...,  1.72940111,
        -0.57798212, -0.57756088],
       [-1.21178563, -1.19781926,  1.05200052, ...,  1.72940111,
        -0.57798212, -0.57756088]])

In [16]:
# PCA
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Report the dimensionality before and after the projection.
print(
    f"Original number of features: {X_scaled.shape[1]}",
    f"Number of components after PCA: {X_pca.shape[1]}",
    sep="\n",
)

Original number of features: 23
Number of components after PCA: 19
