# Sales Forecast - Data Exploration

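Initial exploration of the Store Sales dataset: summary statistics, the distribution of sales, sales over time, by store and by product family, plus a first look at the store metadata and oil prices.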

In [None]:
import sys
sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.data_loader import (
    load_train_data,
    load_stores,
    load_oil,
    load_holidays,
    download_kaggle_data
)

# Apply seaborn's "whitegrid" theme to the figures drawn in this notebook.
sns.set_theme(style='whitegrid')
# IPython magic (not plain Python): render matplotlib figures inline in the cell output.
%matplotlib inline

## Download Data

Uncomment the call below and run this cell once to download the dataset from Kaggle.

In [None]:
# download_kaggle_data()

## Load Data

In [None]:
# Read the four source tables through the project's loader helpers
# (the calls run left to right, in the same order as before).
train, stores, oil, holidays = (
    load_train_data(),
    load_stores(),
    load_oil(),
    load_holidays(),
)

# Quick sanity check: size of the training table and the span of its dates.
first_day = train['date'].min()
last_day = train['date'].max()
print(f"Training data shape: {train.shape}")
print(f"Date range: {first_day} to {last_day}")

In [None]:
# Preview the first rows of the training table (head() defaults to five rows).
train.head()

In [None]:
# Print the column names, dtypes, non-null counts and memory usage of the training table.
train.info()

## Basic Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training table's columns.
train.describe()

In [None]:
# Cardinality of the two grouping keys, followed by the full list of family labels.
store_ids = train['store_nbr']
families = train['family']

n_stores = store_ids.nunique()
n_families = families.nunique()
family_labels = families.unique().tolist()

print(f"Number of stores: {n_stores}")
print(f"Number of product families: {n_families}")
print(f"Product families: {family_labels}")

## Sales Distribution

In [None]:
# Side-by-side histograms: raw sales (left) and log-transformed sales (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
raw_ax, log_ax = axes

# Left panel: raw sales values, zeros included.
raw_ax.hist(train['sales'], bins=50, edgecolor='black')
raw_ax.set_xlabel('Sales')
raw_ax.set_ylabel('Frequency')
raw_ax.set_title('Sales Distribution')

# Right panel: log(sales + 1) over strictly positive sales only, so rows with
# zero sales are left out of this panel.
# np.log1p is vectorized; it replaces the per-row Python lambda and the
# string-dispatched .apply('log') used previously.
positive_sales = train.loc[train['sales'] > 0, 'sales']
log_ax.hist(np.log1p(positive_sales), bins=50, edgecolor='black')
log_ax.set_xlabel('Log(Sales + 1)')
log_ax.set_ylabel('Frequency')
log_ax.set_title('Log-transformed Sales Distribution')

plt.tight_layout()
plt.show()

## Sales Over Time

In [None]:
# Collapse the training rows to one total per date, then move the date
# from the index back into a regular column.
totals_by_date = train.groupby('date')['sales'].sum()
daily_sales = totals_by_date.reset_index()

# Line chart of the daily totals.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(daily_sales['date'], daily_sales['sales'])
ax.set_xlabel('Date')
ax.set_ylabel('Total Sales')
ax.set_title('Daily Total Sales Over Time')
plt.show()

## Sales by Store

In [None]:
# Total sales per store, ordered from the largest total to the smallest.
totals_by_store = train.groupby('store_nbr')['sales'].sum()
store_sales = totals_by_store.sort_values(ascending=False)

# Vertical bar chart, one bar per store.
fig, ax = plt.subplots(figsize=(14, 5))
store_sales.plot.bar(ax=ax)
ax.set_xlabel('Store Number')
ax.set_ylabel('Total Sales')
ax.set_title('Total Sales by Store')
# Tilt the store labels (applies to the current axes, i.e. the one just drawn).
plt.xticks(rotation=45)
plt.show()

## Sales by Product Family

In [None]:
# Total sales per product family, sorted ascending so the largest bar
# ends up at the top of the horizontal chart.
totals_by_family = train.groupby('family')['sales'].sum()
family_sales = totals_by_family.sort_values(ascending=True)

# Horizontal bar chart, one bar per product family.
fig, ax = plt.subplots(figsize=(10, 8))
family_sales.plot.barh(ax=ax)
ax.set_xlabel('Total Sales')
ax.set_ylabel('Product Family')
ax.set_title('Total Sales by Product Family')
plt.show()

## Store Information

In [None]:
# Preview the first ten rows of the stores table.
stores.head(10)

In [None]:
# Distinct store types, and how many different cities and states the stores cover.
store_types = stores['type'].unique().tolist()
n_cities = stores['city'].nunique()
n_states = stores['state'].nunique()

print(f"Store types: {store_types}")
print(f"Cities: {n_cities}")
print(f"States: {n_states}")

## Oil Prices

In [None]:
# Line chart of the oil price column ('dcoilwtico') against date.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(oil['date'], oil['dcoilwtico'])
ax.set_xlabel('Date')
ax.set_ylabel('Oil Price (USD)')
ax.set_title('Oil Price Over Time')
plt.show()