# Sales Trends Forecasting - Exploratory Data Analysis

This notebook performs an exploratory data analysis (EDA) of our sales dataset. We will:
1. Load and examine the data
2. Check for and handle missing values and duplicate rows
3. Clean the dataset and save the cleaned copy
4. Perform basic statistical analysis
5. Create visualizations to understand distributions and correlations

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set display options: show every column and cap printed rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set style for visualizations.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names, so plt.style.use('seaborn')
# raises OSError there. Prefer the new name and fall back to the legacy one
# on older installations.
mpl_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(mpl_style)
sns.set_palette('husl')

In [None]:
# Read the raw sales data (path is relative to the notebook's directory)
df = pd.read_csv(filepath_or_buffer='../data/sales_data.csv')

# Overview: dimensions first, then dtypes / non-null counts, then a sample
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)

print("\nDataset Info:")
df.info()

print("\nFirst few rows:")
preview = df.head()
preview

## Data Cleaning

In [None]:
# Per-column count of null entries
missing_per_column = df.isnull().sum()
print("Missing values in each column:")
print(missing_per_column)

# Rows that exactly repeat an earlier row
duplicate_row_count = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_row_count)

In [None]:
# Descriptive statistics for the numerical columns of the raw data
print("Statistical Summary:")
numeric_summary = df.describe()
numeric_summary

In [None]:
def clean_dataset(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Drop fully duplicated rows (the surviving rows keep their
         original index labels).
      2. Fill missing values in numeric columns with the column median.
      3. Fill missing values in object-dtype columns with the column's
         most frequent value (the first one when several values tie).

    A column with no non-null values at all has nothing to impute from
    and is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data.

    Returns
    -------
    pandas.DataFrame
        New frame with duplicates removed and missing values imputed.
    """
    # Work on a copy so the caller's frame is never touched
    df_clean = df.copy().drop_duplicates()

    # 'number' selects every numeric dtype (int32, float32, ...), not just
    # float64/int64, so no numeric column is silently skipped
    numeric_columns = df_clean.select_dtypes(include='number').columns
    for col in numeric_columns:
        median_value = df_clean[col].median()
        # An all-missing column has no usable median; nothing to fill with
        if pd.isna(median_value):
            continue
        df_clean[col] = df_clean[col].fillna(median_value)

    # Fill categorical missing values with mode
    categorical_columns = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        modes = df_clean[col].mode()
        # mode() returns an empty Series for an all-missing column;
        # indexing it with [0] would raise, so skip such columns
        if modes.empty:
            continue
        df_clean[col] = df_clean[col].fillna(modes.iloc[0])

    return df_clean

# Run the cleaning routine on the raw frame; `df` itself is left as loaded
df_clean = clean_dataset(df)

# Confirm the result: per-column null counts after cleaning
remaining_missing = df_clean.isnull().sum()
print("Missing values after cleaning:")
print(remaining_missing)

## Data Visualization

In [None]:
# Create visualizations for numerical columns.
# 'number' selects every numeric dtype, not only float64/int64.
numeric_cols = df_clean.select_dtypes(include='number').columns

# Distribution plots: one row per column, histogram (with KDE) on the left
# and boxplot on the right.
if len(numeric_cols) == 0:
    # A zero-row grid would mean a zero-height figure, which matplotlib rejects
    print("No numeric columns to plot.")
else:
    # squeeze=False keeps `axes` two-dimensional even for a single column,
    # so the row-wise loop below works for any column count
    fig, axes = plt.subplots(
        nrows=len(numeric_cols),
        ncols=2,
        figsize=(15, 5 * len(numeric_cols)),
        squeeze=False,
    )
    for (hist_ax, box_ax), col in zip(axes, numeric_cols):
        sns.histplot(data=df_clean, x=col, kde=True, ax=hist_ax)
        hist_ax.set_title(f'Distribution of {col}')

        sns.boxplot(data=df_clean, x=col, ax=box_ax)
        box_ax.set_title(f'Boxplot of {col}')

    fig.tight_layout()
    plt.show()

In [None]:
# Correlation analysis: pairwise correlations of the numeric columns,
# drawn as an annotated heatmap with the colour scale centred on zero
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
correlation_matrix = df_clean[numeric_cols].corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Persist the cleaned frame next to the raw file; index=False leaves the
# row index out of the CSV
df_clean.to_csv(
    path_or_buf='../data/sales_data_cleaned.csv',
    index=False,
)
print("Cleaned dataset has been saved to '../data/sales_data_cleaned.csv'")