## Retail Dataset Project (EDA + Business Insights)

#### Required Libraries

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# style for our graphs
plt.style.use('ggplot')

In [29]:
# Load dataset
sales = pd.read_csv("Retail_Dataset/sales.csv")
features = pd.read_csv("Retail_Dataset/features.csv")
stores = pd.read_csv("Retail_Dataset/stores.csv")

In [30]:
sales.shape , features.shape , stores.shape

((421570, 5), (8190, 12), (45, 3))

In [31]:
sales.head(3)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,05/02/2010,24924.5,False
1,1,1,12/02/2010,46039.49,True
2,1,1,19/02/2010,41595.55,False


In [32]:
features.head(3)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False
1,1,12/02/2010,38.51,2.548,,,,,,211.24217,8.106,True
2,1,19/02/2010,39.93,2.514,,,,,,211.289143,8.106,False


In [33]:
stores.head(3)

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392


In [34]:
print("============ SALES INFO ===============")
print(sales.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Dept          421570 non-null  int64  
 2   Date          421570 non-null  object 
 3   Weekly_Sales  421570 non-null  float64
 4   IsHoliday     421570 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(1)
memory usage: 13.3+ MB
None


In [35]:
print("============ FEATURES INFO ===============")
print(features.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         8190 non-null   int64  
 1   Date          8190 non-null   object 
 2   Temperature   8190 non-null   float64
 3   Fuel_Price    8190 non-null   float64
 4   MarkDown1     4032 non-null   float64
 5   MarkDown2     2921 non-null   float64
 6   MarkDown3     3613 non-null   float64
 7   MarkDown4     3464 non-null   float64
 8   MarkDown5     4050 non-null   float64
 9   CPI           7605 non-null   float64
 10  Unemployment  7605 non-null   float64
 11  IsHoliday     8190 non-null   bool   
dtypes: bool(1), float64(9), int64(1), object(1)
memory usage: 712.0+ KB
None


In [36]:
print("============ STORES INFO ===============")
print(stores.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Store   45 non-null     int64 
 1   Type    45 non-null     object
 2   Size    45 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ KB
None


In [46]:
# Convert Date Columns
sales['Date'] = pd.to_datetime(sales['Date'], format="%d/%m/%Y", dayfirst=True)
features['Date'] = pd.to_datetime(features['Date'], format="%d/%m/%Y", dayfirst=True)

In [47]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         421570 non-null  int64         
 1   Dept          421570 non-null  int64         
 2   Date          421570 non-null  datetime64[ns]
 3   Weekly_Sales  421570 non-null  float64       
 4   IsHoliday     421570 non-null  bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2)
memory usage: 13.3 MB


In [48]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         8190 non-null   int64         
 1   Date          8190 non-null   datetime64[ns]
 2   Temperature   8190 non-null   float64       
 3   Fuel_Price    8190 non-null   float64       
 4   MarkDown1     4032 non-null   float64       
 5   MarkDown2     2921 non-null   float64       
 6   MarkDown3     3613 non-null   float64       
 7   MarkDown4     3464 non-null   float64       
 8   MarkDown5     4050 non-null   float64       
 9   CPI           7605 non-null   float64       
 10  Unemployment  7605 non-null   float64       
 11  IsHoliday     8190 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(9), int64(1)
memory usage: 712.0 KB


In [49]:
# Merge dataset stores , features
storeFeature = pd.merge(features,stores, on='Store' , how='inner')

In [50]:
storeFeature.shape

(8190, 14)

In [51]:
storeFeature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         8190 non-null   int64         
 1   Date          8190 non-null   datetime64[ns]
 2   Temperature   8190 non-null   float64       
 3   Fuel_Price    8190 non-null   float64       
 4   MarkDown1     4032 non-null   float64       
 5   MarkDown2     2921 non-null   float64       
 6   MarkDown3     3613 non-null   float64       
 7   MarkDown4     3464 non-null   float64       
 8   MarkDown5     4050 non-null   float64       
 9   CPI           7605 non-null   float64       
 10  Unemployment  7605 non-null   float64       
 11  IsHoliday     8190 non-null   bool          
 12  Type          8190 non-null   object        
 13  Size          8190 non-null   int64         
dtypes: bool(1), datetime64[ns](1), float64(9), int64(2), object(1)
memory usage: 839.9+ KB


In [52]:
# Sales + storeFeature dataset (merge)
df = pd.merge(sales, storeFeature , on='Store' , how='inner')

In [54]:
# final data set
print("============== FINAL DATASET =================")



In [55]:
df.head(3)

Unnamed: 0,Store,Dept,Date_x,Weekly_Sales,IsHoliday_x,Date_y,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y,Type,Size
0,1,1,2010-02-05,24924.5,False,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
1,1,1,2010-02-05,24924.5,False,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True,A,151315
2,1,1,2010-02-05,24924.5,False,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False,A,151315


In [56]:
print("============== FINAL DATASET =================")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76725740 entries, 0 to 76725739
Data columns (total 18 columns):
 #   Column        Dtype         
---  ------        -----         
 0   Store         int64         
 1   Dept          int64         
 2   Date_x        datetime64[ns]
 3   Weekly_Sales  float64       
 4   IsHoliday_x   bool          
 5   Date_y        datetime64[ns]
 6   Temperature   float64       
 7   Fuel_Price    float64       
 8   MarkDown1     float64       
 9   MarkDown2     float64       
 10  MarkDown3     float64       
 11  MarkDown4     float64       
 12  MarkDown5     float64       
 13  CPI           float64       
 14  Unemployment  float64       
 15  IsHoliday_y   bool          
 16  Type          object        
 17  Size          int64         
dtypes: bool(2), datetime64[ns](2), float64(10), int64(3), object(1)
memory usage: 9.3+ GB
None
