In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Setting plot style
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# later releases dropped the old names, so the new name is used whenever this
# matplotlib offers it; the legacy name is kept as a fallback for older versions.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)

# 1. Data Loading and Initial Exploration
print("Loading and exploring data...")
data = pd.read_csv('sales_data.csv', encoding='utf-8')

print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()
print("\nFirst few rows:")
print(data.head())
print("\nMissing Values:")
print(data.isnull().sum())

# 2. Data Preprocessing
print("\nPreprocessing data...")


def excel_date_to_datetime(excel_date):
    """Convert an Excel serial day count to a pandas datetime.

    The count is interpreted as a number of days and added to the base
    date 1899-12-30.
    """
    base_date = pd.to_datetime('1899-12-30')
    day_offset = pd.to_timedelta(excel_date, unit='D')
    return base_date + day_offset

# Convert Excel serial date to datetime.
# The helper is built on pd.to_timedelta, which accepts a whole column, so the
# conversion runs once on the Series instead of once per row through .apply().
data['تاریخ سفارش'] = excel_date_to_datetime(data['تاریخ سفارش'])

# Handle missing values in sales and profit.
# Missing sales are rebuilt as quantity * unit price; existing values are kept.
computed_sales = data['تعداد سفارش'] * data['قیمت واحد']
data['فروش'] = data['فروش'].fillna(computed_sales)
# Missing profit is replaced by the median profit of the same product category.
data['سود'] = data.groupby('دسته‌بندی محصول')['سود'].transform(lambda x: x.fillna(x.median()))

# Create profit margin column (percent of sales).
# A sales value of 0 makes the division yield +/-inf; those are stored as NaN
# so they do not propagate into the correlation and mean calculations later on.
profit_margin = data['سود'] / data['فروش'] * 100
data['حاشیه سود'] = profit_margin.replace([np.inf, -np.inf], np.nan).round(2)

print("Data Types After Preprocessing:")
print(data.dtypes)
print("\nMissing Values After Preprocessing:")
print(data.isnull().sum())

# 3. Exploratory Data Analysis
print("\nPerforming exploratory data analysis...")
# Summary statistics
print("Summary Statistics:")
print(data.describe())

# Plot distributions: one 50-bin histogram with a KDE overlay per column,
# arranged on a 2x2 grid and written to a single image file.
distribution_panels = [
    ('فروش', 'Distribution of Sales', 'Sales'),
    ('سود', 'Distribution of Profit', 'Profit'),
    ('حاشیه سود', 'Distribution of Profit Margin', 'Profit Margin (%)'),
    ('تعداد سفارش', 'Distribution of Order Quantity', 'Order Quantity'),
]
plt.figure(figsize=(15, 10))
for position, (column, title, x_label) in enumerate(distribution_panels, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(data[column], bins=50, kde=True)
    plt.title(title)
    plt.xlabel(x_label)

plt.tight_layout()
plt.savefig('distributions.png')
plt.close()

# Categorical variable analysis: a bar chart of value counts for each
# categorical column, side by side on a 1x3 grid.
categorical_panels = [
    ('دسته‌بندی محصول', 'Product Category Distribution'),
    ('بخش مشتری', 'Customer Segment Distribution'),
    ('روش ارسال', 'Shipping Method Distribution'),
]
plt.figure(figsize=(15, 5))
for position, (column, title) in enumerate(categorical_panels, start=1):
    plt.subplot(1, 3, position)
    counts = data[column].value_counts()
    counts.plot.bar()
    plt.title(title)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig('categorical_distributions.png')
plt.close()

# Correlation matrix of the numeric columns, drawn as an annotated heatmap
# with the colour scale centred on zero.
numeric_cols = ['تعداد سفارش', 'قیمت واحد', 'فروش', 'سود', 'حاشیه سود']
correlations = data[numeric_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

# 4. Statistical Tests
print("\nRunning statistical tests...")


def _samples_by_group(frame, group_col, value_col):
    """Return one non-empty Series of `value_col` values per `group_col` label.

    Rows with a missing label are left out by groupby, missing values are
    dropped from each sample, and samples that end up empty are skipped, so
    the result can be unpacked straight into scipy.stats.f_oneway.
    """
    grouped_values = frame.groupby(group_col, sort=False)[value_col]
    samples = [values.dropna() for _, values in grouped_values]
    return [sample for sample in samples if not sample.empty]


# Normality test (Shapiro-Wilk)
# NOTE: SciPy documents that the Shapiro-Wilk p-value may be inaccurate for
# samples larger than 5000 observations.
print("Normality Tests:")
print("Sales:", stats.shapiro(data['فروش'].dropna())[:2])
print("Profit:", stats.shapiro(data['سود'].dropna())[:2])

# ANOVA for profit across customer segments.
# Missing labels are excluded: comparing a column against NaN matches no rows,
# which previously produced an empty sample and a NaN test result.
segments = data['بخش مشتری'].dropna().unique()
profit_by_segment = _samples_by_group(data, 'بخش مشتری', 'سود')
anova_result = stats.f_oneway(*profit_by_segment)
print("\nANOVA for Profit across Customer Segments:")
print(f"F-statistic: {anova_result.statistic:.2f}, p-value: {anova_result.pvalue:.4f}")

# ANOVA for sales across product categories
categories = data['دسته‌بندی محصول'].dropna().unique()
sales_by_category = _samples_by_group(data, 'دسته‌بندی محصول', 'فروش')
anova_result = stats.f_oneway(*sales_by_category)
print("\nANOVA for Sales across Product Categories:")
print(f"F-statistic: {anova_result.statistic:.2f}, p-value: {anova_result.pvalue:.4f}")

# 5. Advanced Analysis
print("\nPerforming advanced analysis...")
# Time series analysis: sales and profit summed per calendar month of the
# order date, plotted as two line charts side by side.
order_month = data['تاریخ سفارش'].dt.to_period('M')
monthly_sales = data.groupby(order_month)['فروش'].sum()
monthly_profit = data.groupby(order_month)['سود'].sum()

trend_panels = [
    (monthly_sales, 'Monthly Sales Trend', 'Sales'),
    (monthly_profit, 'Monthly Profit Trend', 'Profit'),
]
plt.figure(figsize=(15, 5))
for position, (monthly_totals, title, y_label) in enumerate(trend_panels, start=1):
    plt.subplot(1, 2, position)
    monthly_totals.plot()
    plt.title(title)
    plt.xticks(rotation=45)
    plt.xlabel('Month')
    plt.ylabel(y_label)

plt.tight_layout()
plt.savefig('time_series.png')
plt.close()

# Top 10 customers by sales: total sales per customer name, keeping the ten
# largest totals for a bar chart.
sales_per_customer = data.groupby('نام مشتری')['فروش'].sum()
top_customers = sales_per_customer.nlargest(10)
plt.figure(figsize=(10, 6))
top_customers.plot.bar()
plt.title('Top 10 Customers by Sales')
plt.xticks(rotation=45)
plt.ylabel('Total Sales')
plt.savefig('top_customers.png')
plt.close()

# Profitability by category and segment: mean profit margin for every
# (product category, customer segment) pair, shown as an annotated heatmap.
pivot_profit = pd.pivot_table(
    data,
    values='حاشیه سود',
    index='دسته‌بندی محصول',
    columns='بخش مشتری',
    aggfunc='mean',
)
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_profit, annot=True, fmt='.2f', cmap='RdYlGn')
plt.title('Average Profit Margin by Category and Segment (%)')
plt.savefig('profitability_heatmap.png')
plt.close()

# 6. Key Insights and Recommendations
# These lines are fixed text: they are printed as written and are not derived
# from the values computed earlier in the script.
insight_lines = [
    "1. Sales Distribution: The sales distribution is right-skewed, indicating many small orders and few large orders.",
    "2. Profit Variability: Profit shows significant variability, with some categories (e.g., Technology) having higher margins.",
    "3. Customer Segments: The ANOVA test suggests differences in profit across customer segments (p-value < 0.05).",
    "4. Trends: Monthly sales show seasonal patterns that could be leveraged for inventory planning.",
    "5. Top Customers: A small number of customers contribute significantly to sales.",
]
recommendation_lines = [
    "- Focus on high-margin product categories like Technology for promotional campaigns.",
    "- Investigate negative profit orders to identify pricing or cost issues.",
    "- Develop targeted marketing for top customers to maintain loyalty.",
    "- Optimize shipping methods based on profitability analysis.",
]

print("\nKey Insights and Recommendations:")
for insight in insight_lines:
    print(insight)
print("\nRecommendations:")
for recommendation in recommendation_lines:
    print(recommendation)

In [None]:
# Plot and summarise the 'profit/sale' column separately for every product category.
# Labels are taken in order of first appearance and missing labels are skipped;
# list(set(...)) iterated in an arbitrary order and kept NaN, whose equality
# filter matches no rows and so produced an empty plot and summary.
for cat in data_set['product_category'].dropna().unique():
    filtered_data = data_set.loc[data_set['product_category'] == cat, 'profit/sale']

    # Create distribution plot
    # NOTE(review): the plotted column is 'profit/sale' while the title, axis
    # label and summary heading all say "Sales" - confirm which is intended.
    plt.figure(figsize=(10, 6))
    sns.histplot(filtered_data, bins=30, kde=True, color='#1f77b4')
    plt.title(f'Distribution of Sales for {cat}')
    plt.xlabel('Sales')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    print(f"Summary Statistics for Sales in {cat}:")
    print(filtered_data.describe())

In [None]:
# 1. Collect the 'target' values of the rows flagged in each one-hot column
category_cols = df.filter(like='category_').columns  # Adjust to match your one-hot columns
groups = [df.loc[df[col] == 1, 'target'] for col in category_cols]

# 2. Perform one-way ANOVA
f_stat, p_value = stats.f_oneway(*groups)
print(f"ANOVA Results: F-statistic={f_stat:.3f}, p-value={p_value:.4f}")

# 3. Post-hoc test (if ANOVA is significant)
if p_value < 0.05:  # Common significance threshold
    print("\nSignificant difference found. Running post-hoc analysis...")
    # Combine all data for Tukey test: one row per (observation, one-hot column)
    melted_df = df.melt(id_vars=['target'],
                        value_vars=category_cols,
                        var_name='category',
                        value_name='is_member')
    # Keep only the rows where the observation belongs to the category, and
    # take BOTH the values and the labels from that subset. Passing the full
    # melted 'target' column with the filtered labels gave the two arguments
    # different lengths.
    members = melted_df[melted_df['is_member'] == 1]
    # NOTE(review): pairwise_tukeyhsd is not imported in the visible cells;
    # presumably it comes from statsmodels.stats.multicomp - confirm.
    tukey = pairwise_tukeyhsd(endog=members['target'],
                              groups=members['category'],
                              alpha=0.05)
    print(tukey)
else:
    print("\nNo significant difference between groups")