## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats

# Import custom modules
#
# '../src' is placed at the FRONT of sys.path on purpose: the project module
# `statistics` has the same name as a standard-library module. With
# sys.path.append() the project directory is searched after the standard
# library, so `from statistics import descriptive_statistics, ...` resolves to
# the stdlib module and fails with ImportError.
# NOTE(review): if the stdlib `statistics` was already imported in this kernel,
# the cached module still wins, and any library that imports `statistics` later
# will now get the project module instead. Renaming the project module would be
# the durable fix.
import sys

SRC_DIR = '../src'
if SRC_DIR in sys.path:
    # Re-running the cell must not leave a stale, lower-priority entry behind.
    sys.path.remove(SRC_DIR)
sys.path.insert(0, SRC_DIR)

from data_loader import load_csv, save_processed_data
from preprocessing import handle_missing_values, remove_duplicates
from visualization import plot_distribution, plot_correlation_matrix, plot_categorical_count
from statistics import descriptive_statistics, correlation_analysis, t_test_independent

# Pandas display: show every column, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot defaults: seaborn "whitegrid" theme, then a (12, 6) default figure size
sns.set_style(style="whitegrid")
plt.rc('figure', figsize=(12, 6))

print("Libraries imported successfully!")

## 2. Load and Explore the Dataset

In [None]:
# Read the raw sample sales CSV through the project loader
df = load_csv('../data/raw/sample_sales_data.csv')

row_count, column_count = df.shape
print("Dataset loaded successfully!")
print(f"Shape: {row_count} rows x {column_count} columns\n")

# Preview the top of the table (rendered as the cell output)
print("First 5 rows:")
df.head()

In [None]:
# Check data types and missing values
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
df.info()
print("\n" + "="*50 + "\n")

# Check for missing values: number of null entries per column
print("Missing Values:")
print(df.isnull().sum())
print("\n" + "="*50 + "\n")

# Basic statistics: describe() is the last expression, so the notebook
# renders its result as the cell output
print("Basic Statistics:")
df.describe()

## 3. Data Cleaning and Preprocessing

In [None]:
# Parse the Date column into datetime values
df['Date'] = pd.to_datetime(df['Date'])

# Count fully duplicated rows before handing the frame to the cleaner
duplicates = df.duplicated().sum()
print("Number of duplicate rows:", duplicates)

# Pass the frame through the project's remove_duplicates helper
df = remove_duplicates(df)

# Show the column dtypes after the datetime conversion
print("\nData types after conversion:", df.dtypes, sep="\n")

# Later steps work on a separate copy rather than on `df` itself
df_processed = df.copy()

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics
# descriptive_statistics is imported from the project's `statistics` module.
# It is the last expression in the cell, so whatever it returns is rendered as
# the cell output. NOTE(review): its return type is not visible here — confirm.
print("Descriptive Statistics:")
descriptive_statistics(df_processed)

In [None]:
# For each categorical column, print how many distinct values it has and
# the frequency of each value
categorical_columns = ('Product', 'Category', 'Region', 'Customer_Segment')

print("Unique Values in Categorical Columns:")
for col in categorical_columns:
    column_values = df_processed[col]
    print(f"\n{col}: {column_values.nunique()} unique values")
    print(column_values.value_counts())

In [None]:
# Aggregations shared by both breakdowns:
# Sales -> sum / mean / count, Quantity -> sum
sales_aggregations = {
    'Sales': ['sum', 'mean', 'count'],
    'Quantity': 'sum',
}
section_divider = f"\n{'=' * 50}\n"

# Breakdown by Category, rounded to 2 decimals
sales_by_category = df_processed.groupby('Category').agg(sales_aggregations).round(2)
print("Sales Analysis by Category:")
print(sales_by_category)

print(section_divider)

# Breakdown by Region, rounded to 2 decimals
sales_by_region = df_processed.groupby('Region').agg(sales_aggregations).round(2)
print("Sales Analysis by Region:")
print(sales_by_region)

## 5. Data Visualization

In [None]:
# Distribution of Sales
# Delegates to the project's plot_distribution helper with 20 bins and kde=True
# (presumably a histogram with a KDE overlay — confirm in the visualization module).
plot_distribution(df_processed, 'Sales', bins=20, kde=True)

In [None]:
# Records per Category
# NOTE(review): the helper is named plot_categorical_count, so this presumably
# plots row counts per category rather than summed Sales — confirm against the
# visualization module.
plot_categorical_count(df_processed, 'Category')

In [None]:
# Sales trends over time: total Sales per Date drawn as a line with markers
daily_sales = df_processed.groupby('Date')['Sales'].sum()

plt.figure(figsize=(14, 6))
plt.plot(daily_sales.index, daily_sales.values, marker='o', linewidth=2)

# Title and axis labels
axis_label_font = {'fontsize': 12}
plt.title('Daily Sales Trend', fontsize=16, fontweight='bold')
plt.xlabel('Date', **axis_label_font)
plt.ylabel('Total Sales ($)', **axis_label_font)

# Light grid, slanted date ticks, then lay out and render
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Two side-by-side bar charts: total Sales per Region and per Customer Segment
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Totals per group, largest first
region_sales = df_processed.groupby('Region')['Sales'].sum().sort_values(ascending=False)
segment_sales = df_processed.groupby('Customer_Segment')['Sales'].sum().sort_values(ascending=False)

# (axes, totals, label used in title and x-axis, bar colour) for each panel
panels = (
    (axes[0], region_sales, 'Region', 'skyblue'),
    (axes[1], segment_sales, 'Customer Segment', 'lightcoral'),
)
for ax, totals, label, bar_color in panels:
    ax.bar(totals.index, totals.values, color=bar_color)
    ax.set_title(f'Total Sales by {label}', fontsize=14, fontweight='bold')
    ax.set_xlabel(label, fontsize=12)
    ax.set_ylabel('Total Sales ($)', fontsize=12)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix over every numeric column of the processed frame
numerical_cols = df_processed.select_dtypes(include='number').columns
plot_correlation_matrix(df_processed, columns=numerical_cols.tolist())

## 6. Statistical Analysis

In [None]:
# Pearson correlation between Sales and Quantity via the project helper,
# which is unpacked here as a (coefficient, p-value) pair
corr, p_value = correlation_analysis(df_processed, 'Sales', 'Quantity', method='pearson')

print("Correlation Analysis: Sales vs Quantity")
print(f"Correlation Coefficient: {corr:.4f}")
print(f"P-value: {p_value:.4f}")

# Report significance at the 0.05 level
is_significant = p_value < 0.05
verdict = (
    "The correlation is statistically significant (p < 0.05)"
    if is_significant
    else "The correlation is not statistically significant (p >= 0.05)"
)
print(verdict)

In [None]:
# Compare average Sales between the 'Corporate' and 'Individual' segments
segment = df_processed['Customer_Segment']
corporate_sales = df_processed.loc[segment == 'Corporate', 'Sales']
individual_sales = df_processed.loc[segment == 'Individual', 'Sales']

# The project helper's result is read below as a mapping with the keys
# 'statistic', 'p_value' and 'interpretation'
result = t_test_independent(corporate_sales, individual_sales)

print("T-Test: Corporate vs Individual Customer Segments")
print(f"Corporate Average Sales: ${corporate_sales.mean():.2f}")
print(f"Individual Average Sales: ${individual_sales.mean():.2f}")
print()
print(f"T-statistic: {result['statistic']:.4f}")
print(f"P-value: {result['p_value']:.4f}")
print()
print(result['interpretation'])

## 7. Feature Engineering

In [None]:
# Extract date features from the datetime `Date` column
df_processed['Year'] = df_processed['Date'].dt.year
df_processed['Month'] = df_processed['Date'].dt.month
df_processed['Day'] = df_processed['Date'].dt.day
df_processed['DayOfWeek'] = df_processed['Date'].dt.dayofweek
df_processed['WeekOfYear'] = df_processed['Date'].dt.isocalendar().week

# Create total revenue feature
# NOTE(review): this treats Sales as a per-unit amount, while PricePerUnit
# below treats Sales as a line total — the two cannot both be right. Confirm
# what the Sales column holds and fix or drop the inconsistent feature.
df_processed['TotalRevenue'] = df_processed['Sales'] * df_processed['Quantity']

# Create price per unit
# Zero quantities are masked to NaN first so the division yields a missing
# value instead of +/-inf.
df_processed['PricePerUnit'] = df_processed['Sales'] / df_processed['Quantity'].where(
    df_processed['Quantity'] != 0
)

# Create sales category
# include_lowest=True closes the first bin on the left, so Sales == 0 is
# labelled 'Low' rather than NaN. Negative Sales still fall outside every bin
# and stay NaN.
df_processed['SalesCategory'] = pd.cut(
    df_processed['Sales'],
    bins=[0, 50, 200, 500, float('inf')],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)

print("New features created:")
print(df_processed[['Date', 'Year', 'Month', 'Day', 'DayOfWeek', 'WeekOfYear',
                     'TotalRevenue', 'PricePerUnit', 'SalesCategory']].head(10))

## 8. Save Processed Data

In [None]:
# Hand the processed frame to the project's save helper
output_filename = 'processed_sales_data.csv'
output_directory = '../data/processed'
save_processed_data(df_processed, output_filename, output_dir=output_directory)

# Final summary of the processed frame's dimensions
final_rows, final_cols = df_processed.shape
print("\nProcessing complete!")
print(f"Final dataset shape: {final_rows} rows x {final_cols} columns")