In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
!pip install pmdarima
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import *

In [None]:
# Colab-specific: mount Google Drive so files under it are reachable at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Path of the Walmart sales CSV inside the mounted Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/Walmart.csv'
# Raw dataset; `data` is the frame the exploration and EDA cells below read from.
data = pd.read_csv(file_path)

#**Data Exploration**

In [None]:
# Preview the first rows to see the column layout.
data.head()

In [None]:
# Column dtypes as inferred by read_csv.
data.dtypes

In [None]:
# (rows, columns) of the raw dataset.
data.shape

In [None]:
# Per-column non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics (describe() defaults).
data.describe()

In [None]:
#Checking NULL values (count of missing entries per column)
data.isnull().sum()

In [None]:
#Checking for duplicate values (number of fully duplicated rows)
data.duplicated().sum()

In [None]:
#Checking for unique values in each column, smallest cardinality first
data.nunique().sort_values()

In [None]:
# Pairwise correlations between the numeric columns only.
# 'Date' is still a raw string column at this point and pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them, so select numeric dtypes explicitly.
data.select_dtypes(include='number').corr()

#**Data Transformation**

In [None]:
# Conversion of 'Date' column: the 'Date' column is converted from string format to datetime format using the pd.to_datetime() function.
# Creating new columns for week, day, month and year.
# Sorting by 'Date': not applied in this section; each store's series is sorted chronologically later, inside select_store().

In [None]:
#Converting Date Column
# NOTE(review): if the CSV stores dates as DD-MM-YYYY, pass dayfirst=True (or an explicit
# format=) here, otherwise day and month can be swapped on ambiguous dates. Confirm against the file.
data['Date'] = pd.to_datetime(data['Date'])

#Creating New Columns
# Series.dt.week was removed in pandas 2.0; dt.isocalendar().week is the supported
# replacement and gives the same ISO week number. Cast back to a plain integer dtype.
data['week'] = data['Date'].dt.isocalendar().week.astype(int)
data['day'] = data['Date'].dt.day
data['month'] = data['Date'].dt.month
data['year'] = data['Date'].dt.year

# Rows are left in file order here; select_store() sorts each store's series
# chronologically before modelling.
data.head()

In [None]:
# Last rows of the frame, now including the derived week/day/month/year columns.
data.tail()

In [None]:
# Correlation heatmap of the numeric columns.
# 'Date' is datetime64 after the conversion cell, and DataFrame.corr() in pandas >= 2.0
# no longer drops non-numeric columns on its own, so restrict to numeric dtypes first.
numeric_corr = data.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 7))
sns.heatmap(numeric_corr, annot=True, fmt='.2f', cmap='Reds')
plt.title('Correlation Between Numeric Columns')
plt.show()

In [None]:
# Observations
# 1. Based solely on correlation, we can infer that there is a positive correlation between Fuel Price and year
# 2. Temperature, Fuel Price, CPI and Unemployment are very weakly correlated with the weekly sales

In [None]:
# Histogram (with KDE overlay) of Weekly_Sales across all rows
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data, x='Weekly_Sales', kde=True, ax=ax)
ax.set_title('Distribution of Weekly Sales')
plt.show()

#**EDA**

In [None]:
# Mean weekly sales for each store, one bar per store
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=data, x='Store', y='Weekly_Sales', estimator='mean', ax=ax)
ax.grid()
ax.set_title('Average Weekly Sales per Store', fontsize=12)
ax.set_ylabel('Sales')
ax.set_xlabel('Store')
plt.show()

In [None]:
# Average weekly sales of 80% of stores is below 1500000

In [None]:
# Mean weekly sales grouped by calendar month
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=data, x='month', y='Weekly_Sales', ax=ax)
ax.set_ylabel('Sales', fontsize=14)
ax.set_xlabel('Months', fontsize=14)
ax.set_title('Average Monthly Sales', fontsize=16)
ax.grid()
plt.show()

In [None]:

# Mean weekly sales per month, one bar colour per year
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=data, x='month', y='Weekly_Sales', hue='year', ax=ax)
ax.set_ylabel('Sales', fontsize=14)
ax.set_xlabel('Months', fontsize=14)
ax.set_title('Average Monthly Sales by Year', fontsize=16)
ax.grid()
ax.legend(title='Year', title_fontsize='12', fontsize='12')
plt.show()

In [None]:
#The month of January witnessed the lowest sales
#From February till October the weekly sales remain nearly constant
#November and December showed the highest sales every year

In [None]:
#Average weekly sales VS year

# For each calendar year, the mean of 'Weekly_Sales' per week-of-year
yearly_week_means = {
    yr: data.loc[data['year'] == yr].groupby('week')['Weekly_Sales'].mean()
    for yr in (2010, 2011, 2012)
}
weekly_sales_2010 = yearly_week_means[2010]
weekly_sales_2011 = yearly_week_means[2011]
weekly_sales_2012 = yearly_week_means[2012]

# One line per year; the legend label comes from the line itself
plt.figure(figsize=(15, 8))
for yr, week_means in yearly_week_means.items():
    plt.plot(week_means.index, week_means.values, label=str(yr))

plt.xticks(np.arange(1, 53, step=1), fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('Week of Year', fontsize=10)
plt.ylabel('Sales', fontsize=10)

plt.title("Average Weekly Sales - Per Year", fontsize=24)
plt.legend(fontsize=20);
plt.show()

In [None]:
# Weekly sales against date (seaborn aggregates the rows that share a date)
fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(data=data, x='Date', y='Weekly_Sales', ax=ax)
ax.set(title='Weekly Sales Over Time', xlabel='Date', ylabel='Weekly Sales')
plt.xticks(rotation=45)
plt.show()

In [None]:
# There's a clear pattern about the sales across the years, by the end of year the sales rise up by a huge margin.

In [None]:
# Number of rows for each Holiday_Flag value
holiday_counts = data['Holiday_Flag'].value_counts()

In [None]:
# Mean Weekly_Sales for each Holiday_Flag value
holiday_sales = data['Weekly_Sales'].groupby(data['Holiday_Flag']).mean()

In [None]:
fig, (ax_counts, ax_sales) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel - how many rows carry each Holiday_Flag value
sns.barplot(x=holiday_counts.index, y=holiday_counts.values, ax=ax_counts)
ax_counts.set(ylabel='Count', xlabel='Holiday Flag', title='Holiday Flag Counts')

# Right panel - mean weekly sales for holiday vs non-holiday weeks
sns.barplot(x=holiday_sales.index, y=holiday_sales.values, ax=ax_sales)
ax_sales.set(ylabel='Sales', xlabel='Holiday Flag', title='Holiday vs non-Holiday Sales')

fig.tight_layout()
plt.show()


In [None]:
#Although holiday weeks make up only a small share of all weeks, sales in holiday weeks are higher than in non-holiday weeks

In [None]:
# Temperature: scatter against sales (left) and its course over time (right)
fig, (ax_scatter, ax_time) = plt.subplots(1, 2, figsize=(30, 5))

sns.scatterplot(data=data, x='Temperature', y='Weekly_Sales', ax=ax_scatter)
ax_scatter.set(xlabel='Temperature', ylabel='Sales', title='Temperature vs Sales')

sns.pointplot(data=data, x="Date", y="Temperature", color='red',
              linestyles='solid', errorbar=None, ax=ax_time)
ax_time.set(xlabel='weeks', ylabel='Temperature', title='Temperature vs Time')
# One tick per date would be unreadable, so the x ticks are hidden
ax_time.set_xticks([])
plt.show()

In [None]:
# There seems to be no relationship between the temperature in the region and the weekly sales of the stores.
# At low and very high temperatures the sales seem to dip a bit, but in general there is no clear relationship
# The plot clearly shows that temperature is seasonal and repeats in cycles

In [None]:
# Fuel price: scatter against sales, course over time, and mean per year
fig, (ax_scatter, ax_time, ax_year) = plt.subplots(1, 3, figsize=(20, 5))

sns.scatterplot(data=data, x='Fuel_Price', y='Weekly_Sales', ax=ax_scatter)
ax_scatter.set(xlabel='Fuel_Price', ylabel='Sales', title='Fuel_Price vs Sales')

# Fuel price at each date
sns.pointplot(data=data, x="Date", y="Fuel_Price", color='orange',
              linestyles='solid', errorbar=None, ax=ax_time)
ax_time.set(xlabel='Date', ylabel='Fuel_Price', title='Fuel_Price over time')
ax_time.set_xticks([])

# Fuel price aggregated per year
sns.barplot(data=data, x='year', y='Fuel_Price', ax=ax_year)
ax_year.set(xlabel='Year', ylabel='Fuel_price', title='Fuel_Price over years')
plt.show()


In [None]:
# There doesn't seem to be any clear relationship between fuel price and sales
# Fuel prices increase as the years go by

In [None]:
# CPI: scatter against sales (left) and its course over time (right)
fig, (ax_scatter, ax_time) = plt.subplots(1, 2, figsize=(15, 5))

sns.scatterplot(data=data, x='CPI', y='Weekly_Sales', ax=ax_scatter)
ax_scatter.set(xlabel='CPI', ylabel='Sales', title='CPI vs Sales')

# CPI at each date
sns.pointplot(data=data, x="Date", y="CPI", color='lightgreen', ax=ax_time)
ax_time.set(xlabel='Date', ylabel='CPI', title='CPI over Time')
ax_time.set_xticks([])

# Extra horizontal gap between the two panels
fig.subplots_adjust(wspace=0.9)
plt.show()

In [None]:
# There are 3 clear clusters but there doesn't exist any clear correlation between CPI and weekly sales

In [None]:
# Unemployment: scatter against sales (left) and its course over time (right)
fig, (ax_scatter, ax_time) = plt.subplots(1, 2, figsize=(15, 5))

sns.scatterplot(data=data, x='Unemployment', y='Weekly_Sales', ax=ax_scatter)
ax_scatter.set(xlabel='Unemployment', ylabel='Sales', title='Unemployment vs Sales')

# Unemployment at each date
sns.pointplot(data=data, x="Date", y="Unemployment", color='khaki', ax=ax_time)
ax_time.set(xlabel='Time Period', ylabel='Unemployment', title='Unemployment over Time')
ax_time.set_xticks([])

# Extra horizontal gap between the two panels
fig.subplots_adjust(wspace=0.9)
plt.show()

In [None]:
# In relation to unemployment, it can be seen that the lower the unemployment, the higher the sales
# Unemployment has decreased over time

#**Model**

In [None]:
# Reminder of the available columns before narrowing down for modelling.
data.head()

In [None]:
#Extracting required data
# Only the store id, the date and the target are kept for the time-series models.
data1 = data[['Store', 'Date','Weekly_Sales']]
data1.head()

In [None]:
#preparing data for time series model
#This extract the data for the given store number and prepares the dataset for time series model

def select_store(data, store_number):
    """Return one store's rows as a date-indexed, chronologically sorted frame.

    Parameters
    ----------
    data : DataFrame with at least 'Store' and 'Date' columns.
    store_number : value matched against the 'Store' column.

    Returns
    -------
    DataFrame without 'Store' and 'Date' columns, indexed by the parsed dates
    (index name 'Date') in ascending order. The input frame is not modified.
    """
    # Keep only the requested store; the id column is constant afterwards, so drop it
    store_rows = data.loc[data['Store'] == store_number]
    store_frame = store_rows.drop(columns=['Store'])

    # Move the parsed dates into the index and remove the now-redundant column
    parsed_dates = pd.to_datetime(store_frame['Date'])
    store_frame = store_frame.set_index(parsed_dates).drop(columns=['Date'])

    # Oldest observation first
    return store_frame.sort_index()



# Call the function with the store number
#store_number = 1
#data_store = select_store(data1, store_number)

# Display the processed data for Store 1
#data_store


Store 1

In [None]:
# For Store 1
store_number = 1
# Date-indexed, chronologically sorted Weekly_Sales series for this store.
data_store_1 = select_store(data1, store_number)


In [None]:
#from statsmodels.tsa.seasonal import seasonal_decompose

# Split Store 1's weekly sales into trend / seasonal / residual with a 52-observation period.
decomposition = seasonal_decompose(data_store_1.Weekly_Sales, period=52)
# decomposition.plot() builds and returns its own figure, so no plt.figure() is created
# beforehand (the extra call only produced an empty figure in the output).
fig = decomposition.plot()
fig.set_size_inches(12, 10)
plt.show()

In [None]:
#Function to calculate and plot rolling statistics
def plot_rolling_stats(data, window_size):
    """Plot 'Weekly_Sales' together with its rolling mean and rolling standard deviation.

    Parameters
    ----------
    data : DataFrame with a 'Weekly_Sales' column; sorted by index before use.
    window_size : number of observations in each rolling window.

    Returns
    -------
    None. The figure is shown via plt.show(); the caller's frame is not modified
    because all work happens on the sorted copy.
    """
    # sort_index() returns a new frame, so the helper columns below stay local
    ordered = data.sort_index(ascending=True)

    # Both statistics share the same rolling window definition
    sales_window = ordered['Weekly_Sales'].rolling(window=window_size)
    ordered['Rolling_Mean'] = sales_window.mean()
    ordered['Rolling_Std'] = sales_window.std()

    # (column, line options) in drawing order
    line_specs = (
        ('Weekly_Sales', {'label': 'Weekly Sales'}),
        ('Rolling_Mean', {'label': 'Rolling Mean', 'linestyle': '--'}),
        ('Rolling_Std', {'label': 'Rolling Std', 'linestyle': '-.'}),
    )

    plt.figure(figsize=(10, 6))
    for column, line_options in line_specs:
        plt.plot(ordered.index, ordered[column], **line_options)
    plt.xlabel('Date')
    plt.ylabel('Weekly Sales')
    plt.title('Weekly Sales with Rolling Mean and Rolling Std')
    plt.legend()
    plt.grid(True)
    plt.show()



In [None]:
# Rolling mean/std of Store 1's weekly sales over a 4-observation window.
plot_rolling_stats(data_store_1, 4)

In [None]:
#Function to check Checking the Stationarity of data
#ADF test
def adf_test(dataset):
    """Run the Augmented Dickey-Fuller test on `dataset` and print a short report.

    Prints the test statistic, p-value, number of lags, number of observations
    used and the critical values, followed by a verdict: a p-value <= 0.05 is
    reported as stationary, anything else as non-stationary. Returns None.
    """
    # Only the first five entries of the adfuller result are reported
    adf_stat, p_value, lags_used, n_obs, critical_values = adfuller(dataset, autolag='AIC')[:5]

    print("1. ADF : ", adf_stat)
    print("2. P-Value : ", p_value)
    print("3. Num Of Lags : ", lags_used)
    print("4. Num Of Observations Used For ADF Regression:", n_obs)
    print("5. Critical Values :")
    for level in critical_values:
        print("\t", level, ": ", critical_values[level])

    # Guard clause for the non-stationary verdict; the stationary verdict falls through
    if not p_value <= 0.05:
        print("weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary ")
        return
    print("strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root and is stationary")


#If p<0.05 ; Data is stationary
#if p>0.05; Data is not stationary

In [None]:
# Stationarity check on Store 1's weekly sales.
adf_test(data_store_1['Weekly_Sales'])

In [None]:
#ACF & PACF (to find p,d,q)
#from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

# plot_acf / plot_pacf open a figure of their own unless an Axes is passed in, which
# left the pre-sized plt.figure() blank and ignored figsize. Create the Axes first and
# hand it over so size, title and labels all apply to the same plot.

# Plot ACF
fig, ax = plt.subplots(figsize=(10, 5))
plot_acf(data_store_1['Weekly_Sales'], ax=ax)
ax.set_title('Autocorrelation Function (ACF)')
ax.set_xlabel('Lags')
ax.set_ylabel('Autocorrelation')
plt.show()

# Plot PACF
fig, ax = plt.subplots(figsize=(10, 5))
plot_pacf(data_store_1['Weekly_Sales'], ax=ax)
ax.set_title('Partial Autocorrelation Function (PACF)')
ax.set_xlabel('Lags')
ax.set_ylabel('Partial Autocorrelation')
plt.show()

In [None]:
# We can use this module to get the optimum values for p, d, q

#from pmdarima import auto_arima

# trace=True prints every candidate order tried during the search.
order1 = auto_arima(data_store_1['Weekly_Sales'], trace=True)
# Summary of the model selected by the search.
order1.summary()

In [None]:
# from the output we can see that the optimum value for (p,d,q) is (0,1,2)

In [None]:
# Splitting the time series into train and test sets

#from sklearn.model_selection import train_test_split
# shuffle=False keeps row order, so the final 10% of the sorted series is the test set
train1, test1 = train_test_split(data_store_1, shuffle=False, test_size=0.10)

# Both segments on one axis, coloured by split
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(train1.index, train1, label='train', color='blue')
ax.plot(test1.index, test1, label='test', color='red')

ax.set(xlabel='Time', ylabel='Value', title='Train and Test Time Series')
ax.legend()
plt.show()

ARIMA Model


In [None]:
#from statsmodels.tsa.arima.model import ARIMA

# ARIMA on the training part only, using the (p, d, q) = (0, 1, 2) order noted from auto_arima.
model_Store1_1=ARIMA(train1['Weekly_Sales'],order=(0,1,2))
model_Store1_1_fit=model_Store1_1.fit()
#model_Store1_1_fit.summary()

In [None]:
# Positions len(train1) .. len(data_store_1)-1 are exactly the test rows that follow the training data.
prediction_Store1_1 = model_Store1_1_fit.predict(start=len(train1),end=(len(data_store_1)-1))

In [None]:
# Re-label the predictions with the test dates so they line up with test1 in plots and metrics.
test1_index = test1.index
prediction_Store1_1.index = test1_index

In [None]:
# Overlay the ARIMA predictions on the train/test split for Store 1.
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(train1.index, train1['Weekly_Sales'], label='train', color='blue')
ax.plot(test1.index, test1['Weekly_Sales'], label='test', color='orange')
# The forecast used to be labelled 'test' as well and no legend was drawn,
# so the actual and predicted series could not be told apart.
ax.plot(prediction_Store1_1.index, prediction_Store1_1, label='predictions', color='red')
ax.set(xlabel='Date', ylabel='Weekly Sales', title='Weekly Sales Prediction using ARIMA Model')
ax.legend()
plt.show()

In [None]:
# Tabular view of the ARIMA predictions for the test dates.
prediction_df = prediction_Store1_1.to_frame(name='Predicted_Weekly_Sales')
# Round before casting: a bare astype(int) truncates towards zero instead of rounding.
prediction_df['Predicted_Weekly_Sales'] = prediction_df['Predicted_Weekly_Sales'].round().astype(int)
prediction_df

In [None]:
# Error metrics of the ARIMA forecast on the hold-out weeks.
# Scored against the raw float predictions rather than the integer display
# frame, so the integer cast does not leak into the metrics.
mae = mean_absolute_error(test1['Weekly_Sales'], prediction_Store1_1)
print("ARIMA Model - MAE:",mae)
mse = mean_squared_error(test1['Weekly_Sales'], prediction_Store1_1)
print("ARIMA Model - MSE:", mse)
# RMSE is the square root of the MSE (the duplicated `rmse = rmse =` assignment is gone).
rmse = np.sqrt(mse)
print("ARIMA Model - RMSE:", rmse)

SARIMA Model

In [None]:
#from statsmodels.tsa.statespace.sarimax import SARIMAX
# Same non-seasonal order as the ARIMA model, plus a seasonal part whose last element (52) is the seasonal period.
model_Store1_2 = SARIMAX(train1['Weekly_Sales'],order=(0,1,2),seasonal_order=(0,1,2,52))
model_Store1_2_fit=model_Store1_2.fit()
#model_Store1_2_fit.summary()

In [None]:
# Predict the positions that follow the training data, i.e. the test rows.
prediction_Store1_2 = model_Store1_2_fit.predict(start=len(train1),end=(len(data_store_1)-1))

In [None]:
# Re-label the predictions with the test dates so they line up with test1 in the plot.
test1_index = test1.index
prediction_Store1_2.index = test1_index

In [None]:
# Overlay the SARIMA predictions on the train/test split for Store 1.
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(train1.index, train1['Weekly_Sales'], label='train', color='blue')
ax.plot(test1.index, test1['Weekly_Sales'], label='test', color='orange')
# The forecast used to be labelled 'test' as well and no legend was drawn,
# so the actual and predicted series could not be told apart.
ax.plot(prediction_Store1_2.index, prediction_Store1_2, label='predictions', color='red')
ax.set(xlabel='Date', ylabel='Weekly Sales', title='Weekly Sales Prediction using SARIMA Model')
ax.legend()
plt.show()


Future Prediction

In [None]:
#Forecasting values for next 12 weeks

# Create the SARIMA model and fit it to the entire dataset.
model1_f = SARIMAX(data_store_1['Weekly_Sales'], order=(0, 1, 2), seasonal_order=(0, 1, 2, 52))
model1_f_fit = model1_f.fit()

# Predict the 12 positions that directly follow the last observation.
predictionf_1 = model1_f_fit.predict(start=len(data_store_1), end=len(data_store_1) + 11)

# Forecast dates: one week after the last observed date, then every 7 days.
# Starting the range AT the last observation with freq='W' snapped the dates to
# Sundays and could repeat the last observed date instead of moving a week ahead.
next_12_weeks = pd.date_range(
    start=data_store_1.index[-1] + pd.Timedelta(weeks=1), periods=12, freq='7D'
)

# Assign the index to the predictions for plotting.
predictionf_1.index = next_12_weeks

# Convert the prediction Series to a DataFrame.
predictionf_1_df = predictionf_1.to_frame(name='Predicted_Weekly_Sales')
# Round before casting: a bare astype(int) truncates towards zero.
predictionf_1_df['Predicted_Weekly_Sales'] = predictionf_1_df['Predicted_Weekly_Sales'].round().astype(int)

# Append the forecast rows below the history; each row has a value in only one of the two columns.
data_with_predictions_1 = pd.concat([data_store_1, predictionf_1_df])

# Plot the graph.
plt.figure(figsize=(10, 6))
plt.plot(data_with_predictions_1.index, data_with_predictions_1['Weekly_Sales'], label='Actual')
plt.plot(data_with_predictions_1.index, data_with_predictions_1['Predicted_Weekly_Sales'], label='Predicted', color='red')
plt.xlabel('Date')
plt.ylabel('Weekly Sales')
plt.title('Weekly Sales Prediction for the Next 12 Weeks of Store 1')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
#Function to create 1) ARIMA Model
#                   2) SARIMA Model
#                   3) Future Forecast
#and plot their graphs

def arima_forecast(train, test, order):
    """Fit ARIMA on `train`, predict across the `test` span, plot all three series.

    Parameters
    ----------
    train, test : DataFrames with a 'Weekly_Sales' column; `test` follows `train`.
    order : (p, d, q) order passed to ARIMA.

    Returns
    -------
    Series of predictions, re-indexed to `test.index`.
    """
    with warnings.catch_warnings():
        # All warnings raised during fit / predict / plot are suppressed
        warnings.simplefilter("ignore")

        fitted = ARIMA(train['Weekly_Sales'], order=order).fit()

        # The test rows occupy the len(test) positions right after the training data
        first_step = len(train)
        last_step = first_step + len(test) - 1
        predictions = fitted.predict(start=first_step, end=last_step)

        # Label the predictions with the test dates so they align with `test`
        predictions.index = test.index

        # (x, y, label, colour) for each line, in drawing order
        lines = (
            (train.index, train['Weekly_Sales'], 'train', 'blue'),
            (test.index, test['Weekly_Sales'], 'test', 'orange'),
            (predictions.index, predictions, 'predictions', 'red'),
        )
        plt.figure(figsize=(20, 6))
        for x_values, y_values, name, colour in lines:
            plt.plot(x_values, y_values, label=name, color=colour)
        plt.title('Weekly Sales Prediction using ARIMA Model')
        plt.xlabel('Date')
        plt.ylabel('Weekly Sales')
        plt.legend()
        plt.show()

        return predictions

def sarima_forecast(train, test, seasonal_order,order):
    """Fit SARIMA on `train`, predict across the `test` span, plot all three series.

    Parameters
    ----------
    train, test : DataFrames with a 'Weekly_Sales' column; `test` follows `train`.
    seasonal_order : (P, D, Q, s) seasonal order passed to SARIMAX.
    order : (p, d, q) non-seasonal order passed to SARIMAX.

    Note the argument order: `seasonal_order` comes BEFORE `order`.

    Returns
    -------
    Series of predictions, re-indexed to `test.index`.
    """
    with warnings.catch_warnings():
        # All warnings raised during fit / predict / plot are suppressed
        warnings.simplefilter("ignore")

        fitted = SARIMAX(
            train['Weekly_Sales'], order=order, seasonal_order=seasonal_order
        ).fit()

        # The test rows occupy the len(test) positions right after the training data
        first_step = len(train)
        last_step = first_step + len(test) - 1
        predictions = fitted.predict(start=first_step, end=last_step)

        # Label the predictions with the test dates so they align with `test`
        predictions.index = test.index

        # (x, y, label, colour) for each line, in drawing order
        lines = (
            (train.index, train['Weekly_Sales'], 'train', 'blue'),
            (test.index, test['Weekly_Sales'], 'test', 'orange'),
            (predictions.index, predictions, 'predictions', 'red'),
        )
        plt.figure(figsize=(20, 6))
        for x_values, y_values, name, colour in lines:
            plt.plot(x_values, y_values, label=name, color=colour)
        plt.title('Weekly Sales Prediction using SARIMA Model')
        plt.xlabel('Date')
        plt.ylabel('Weekly Sales')
        plt.legend()
        plt.show()

        return predictions

def future_forecast(data, order, seasonal_order, steps=12):
    """Fit SARIMA on the whole series, forecast `steps` weeks ahead, plot and return the result.

    Parameters
    ----------
    data : DataFrame with a 'Weekly_Sales' column and a DatetimeIndex sorted ascending,
        one row per week.
    order : (p, d, q) non-seasonal order passed to SARIMAX.
    seasonal_order : (P, D, Q, s) seasonal order passed to SARIMAX.
    steps : number of weeks to forecast (default 12, the previous fixed horizon).

    Returns
    -------
    DataFrame: `data` with `steps` rows appended. Forecast rows carry their value in
    'Predicted_Weekly_Sales' (NaN in 'Weekly_Sales'); history rows are the other way round.
    """
    with warnings.catch_warnings():
        # All warnings raised during fit / predict / plot are suppressed
        warnings.simplefilter("ignore")

        # Create the SARIMA model and fit it to the entire dataset.
        model = SARIMAX(data['Weekly_Sales'], order=order, seasonal_order=seasonal_order)
        model_fit = model.fit()

        # Predict the `steps` positions that directly follow the last observation.
        n_obs = len(data)
        prediction = model_fit.predict(start=n_obs, end=n_obs + steps - 1)

        # Forecast dates start one week AFTER the last observed date and advance 7 days
        # at a time. Starting at the last observation with freq='W' snapped the dates to
        # Sundays and could repeat the last observed date.
        first_forecast_date = data.index[-1] + pd.Timedelta(weeks=1)
        prediction.index = pd.date_range(start=first_forecast_date, periods=steps, freq='7D')

        # Convert the prediction Series to a DataFrame; round before the integer cast
        # because a bare astype(int) truncates towards zero.
        prediction_df = prediction.to_frame(name='Predicted_Weekly_Sales')
        prediction_df['Predicted_Weekly_Sales'] = prediction_df['Predicted_Weekly_Sales'].round().astype(int)

        # Append the forecast rows below the history.
        data_with_predictions = pd.concat([data, prediction_df])

        # Plot the graph.
        plt.figure(figsize=(20, 6))
        plt.plot(data_with_predictions.index, data_with_predictions['Weekly_Sales'], label='Actual',color='blue')
        plt.plot(data_with_predictions.index, data_with_predictions['Predicted_Weekly_Sales'], label='Predicted', color='red')
        plt.xlabel('Date')
        plt.ylabel('Weekly Sales')
        plt.title(f'Weekly Sales Prediction for the Next {steps} Weeks')
        plt.legend()

        plt.show()

        return data_with_predictions




In [None]:
# For Store 25
store_number = 25
# Date-indexed, chronologically sorted series for this store.
data_store_25 = select_store(data1, store_number)
# shuffle=False keeps row order, so the final 10% of the series is the test set.
train25,test25 = train_test_split(data_store_25,test_size = 0.10,shuffle=False)

In [None]:
# Store 25: ARIMA and SARIMA on the hold-out weeks, then the forecast on the full series.
# The store frames are passed to the helpers directly. Previously this cell rebound the
# global `data` (the full Walmart dataset) to one store's series, so the earlier
# EDA / extraction cells no longer worked when re-run.
order = (0, 1, 2)
seasonal_order = (0, 1, 2, 52)
arima_pred_25 = arima_forecast(train25, test25, order)
sarima_pred_25 = sarima_forecast(train25, test25, seasonal_order, order)
forecast_25 = future_forecast(data_store_25, order, seasonal_order)
# Show only the appended forecast rows instead of dumping the whole frame.
forecast_25.tail(12)

In [None]:
# For Store 35
store_number = 35
# Date-indexed, chronologically sorted series for this store.
data_store_35 = select_store(data1, store_number)
# shuffle=False keeps row order, so the final 10% of the series is the test set.
train35,test35 = train_test_split(data_store_35,test_size = 0.10,shuffle=False)

In [None]:
# Store 35: ARIMA and SARIMA on the hold-out weeks, then the forecast on the full series.
# The store frames are passed to the helpers directly. Previously this cell rebound the
# global `data` (the full Walmart dataset) to one store's series, so the earlier
# EDA / extraction cells no longer worked when re-run.
order = (0, 1, 2)
seasonal_order = (0, 1, 2, 52)
arima_pred_35 = arima_forecast(train35, test35, order)
sarima_pred_35 = sarima_forecast(train35, test35, seasonal_order, order)
forecast_35 = future_forecast(data_store_35, order, seasonal_order)
# Show only the appended forecast rows instead of dumping the whole frame.
forecast_35.tail(12)