## Importing Libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
import sqlite3
from scipy.stats import ttest_ind
import scipy.stats as stats
warnings.filterwarnings('ignore')

## Loading the Dataset

In [6]:
# Open the SQLite inventory database.
# The URI form with mode=rw refuses to create a new, empty database when
# 'inventory.db' is not present in the working directory. A plain
# sqlite3.connect('inventory.db') would silently create an empty file, and the
# query below would then fail with a misleading "no such table" error.
conn = sqlite3.connect('file:inventory.db?mode=rw', uri=True)

# Fetch the vendor sales summary data.
# NOTE(review): the table name is spelled 'Vendor_sales_summery' here — confirm
# it matches the name the table was created with.
df = pd.read_sql_query("select * from Vendor_sales_summery", conn)
df.head()

DatabaseError: Execution failed on sql 'select * from Vendor_sales_summery': no such table: Vendor_sales_summery

## Exploratory Data Analysis

- Previously, we examined the various tables in the database to identify key variables, understand their relationships, and determine which ones should be included in the final analysis.
- In this phase of EDA, we will analyze the resultant table to gain insights into the distribution of each column. This will help us understand data patterns, identify anomalies, and ensure data quality before proceeding with further analysis.

In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().transpose()

In [None]:
# Distribution plots for numerical columns
numerical_cols = df.select_dtypes(include=np.number).columns

# Derive the grid from the number of columns: a fixed 4x5 grid makes
# plt.subplot raise as soon as there are more than 20 numeric columns.
# Four rows are kept as a minimum so the panel size is unchanged for <= 20 columns.
n_grid_cols = 5
n_grid_rows = max(4, -(-len(numerical_cols) // n_grid_cols))  # ceiling division

plt.figure(figsize=(20, 15))

for i, col in enumerate(numerical_cols):
    ax = plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(col)

# Lay the figure out once, after every panel exists, instead of on each iteration.
plt.tight_layout()
plt.show()

In [None]:
# Outlier detection with boxplots, one panel per numerical column

# Derive the grid from the number of columns: a fixed 4x4 grid makes
# plt.subplot raise as soon as there are more than 16 numeric columns.
# Four rows are kept as a minimum so the panel size is unchanged for <= 16 columns.
n_grid_cols = 4
n_grid_rows = max(4, -(-len(numerical_cols) // n_grid_cols))  # ceiling division

plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols):
    ax = plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(col)
plt.tight_layout()
plt.show()

# Summary Statistics Insights:

### Negative & Zero Values:

- Gross Profit: Minimum value is -52,002.78, indicating losses. Some products or transactions may be selling at a loss due to high costs or because they are sold at discounts lower than the purchase price.
- Profit Margin: Has a minimum of -∞, which suggests cases where revenue is zero or even lower than costs.
- Total Sales Quantity & Sales Dollars: Minimum values are 0, meaning some products were purchased but never sold. These could be slow-moving or obsolete stock.

### Outliers Indicated by High Standard Deviations:

- Purchase & Actual Prices: The max values (5,681.81 & 7,499.99) are significantly higher than the mean (24.39 & 35.64), indicating potential premium
products.
- Freight Cost: Huge variation, from 0.09 to 257,032.07, suggests logistics inefficiencies or bulk shipments.
- Stock Turnover: Ranges from 0 to 274.5, implying some products sell extremely fast while others remain in stock indefinitely. A value greater than 1 indicates that the sold quantity for that product is higher than the purchased quantity, possibly because sales are being fulfilled from older stock.

In [None]:
# Reload the summary table, keeping only rows with a positive gross profit,
# a positive profit margin and a positive total sales quantity.
filtered_summary_sql = """select *
from Vendor_sales_summery
where GrossProfit > 0
and ProfitMargin > 0
and TotalSalesQuantity > 0
"""

df = pd.read_sql_query(filtered_summary_sql, conn)

In [None]:
# Display the current contents of df as the cell output.
df

In [None]:
# Distribution plots for numerical columns
numerical_cols = df.select_dtypes(include=np.number).columns

# Derive the grid from the number of columns: a fixed 4x5 grid makes
# plt.subplot raise as soon as there are more than 20 numeric columns.
# Four rows are kept as a minimum so the panel size is unchanged for <= 20 columns.
n_grid_cols = 5
n_grid_rows = max(4, -(-len(numerical_cols) // n_grid_cols))  # ceiling division

plt.figure(figsize=(20, 15))

for i, col in enumerate(numerical_cols):
    ax = plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(col)

# Lay the figure out once, after every panel exists, instead of on each iteration.
plt.tight_layout()
plt.show()

In [None]:
# Count plots for the categorical columns, limited to the ten most frequent values of each
categorical_cols = ["VendorName", "Description"]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for ax, col in zip(axes, categorical_cols):
    most_frequent = df[col].value_counts().index[:10]
    sns.countplot(y=df[col], order=most_frequent, ax=ax)
    ax.set_title(f"Count plot of {col}")
    ax.set_xlabel("Count")
    ax.set_ylabel(col)

fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the numerical columns
correlation_matrix = df[numerical_cols].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Correlation Heatmap")
plt.show()

## Correlation Insights

- PurchasePrice has weak correlations with TotalSalesDollars (-0.012) and GrossProfit (-0.016), suggesting that price variations do not significantly impact sales revenue or profit.
- Strong correlation between total purchase quantity and total sales quantity (0.999), confirming efficient inventory turnover.
- Negative correlation between profit margin & total sales price (-0.179) suggests that as sales price increases, margins decrease, possibly due to competitive pricing pressures.
- StockTurnover has weak negative correlations with both GrossProfit (-0.038) and ProfitMargin (-0.055), indicating that faster turnover does not necessarily result in higher profitability.

# Data Analysis

1. Identify brands that need promotional or pricing adjustments, i.e. brands that exhibit lower sales performance but higher profit margins.

In [None]:
# Per-brand summary: summed sales dollars and mean profit margin for each Description
brand_performance = (
    df.groupby('Description')
    .agg(
        TotalSalesDollars=('TotalSalesDollars', 'sum'),
        ProfitMargin=('ProfitMargin', 'mean'),
    )
    .reset_index()
)

In [None]:
# Thresholds: 15th percentile of brand sales and 85th percentile of brand profit margin
brand_sales = brand_performance['TotalSalesDollars']
brand_margins = brand_performance['ProfitMargin']

low_sales_thresold = brand_sales.quantile(0.15)
high_margine_thresold = brand_margins.quantile(0.85)

In [None]:
# Display the low-sales threshold value.
low_sales_thresold

In [None]:
# Display the high-margin threshold value.
high_margine_thresold

In [None]:
# Filter brands with low sales but high profit margins.
# "High margin" means at or ABOVE the margin threshold, so the margin test must
# be `>=`; a `<=` comparison would select the low-margin brands instead.
target_brands = brand_performance[
    (brand_performance['TotalSalesDollars'] <= low_sales_thresold) &
    (brand_performance['ProfitMargin'] >= high_margine_thresold)
]

print("Brands with Low Sales but High Profit Margins: ")
display(target_brands.sort_values('TotalSalesDollars'))

In [None]:
# Scatter of every brand (blue) with the target brands highlighted (red),
# plus dashed guide lines at the two thresholds.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=brand_performance, x='TotalSalesDollars', y='ProfitMargin',
                color='blue', label='All Brands', alpha=0.2, ax=ax)
sns.scatterplot(data=target_brands, x='TotalSalesDollars', y='ProfitMargin',
                color='red', label="Target Brands", ax=ax)

ax.axhline(high_margine_thresold, linestyle='--', color='black', label="high Margin Threshold")
# The vertical line marks the sales threshold, so it is labelled as such.
ax.axvline(low_sales_thresold, linestyle='--', color='black', label="Low Sales Threshold")

# Set the labels through the Axes. Assigning a string to plt.xlabel / plt.ylabel
# would overwrite the pyplot functions for the rest of the session and leave
# this plot unlabelled.
ax.set_xlabel("Total Sales ($)")
ax.set_ylabel("Profit Margin (%)")
ax.set_title("Brands for Promotional or Pricing Adjustments")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# For better visualization, keep only the brands whose total sales are below 10000
under_10k_mask = brand_performance["TotalSalesDollars"] < 10000
brand_performance = brand_performance.loc[under_10k_mask]

In [None]:
# Same scatter as before, drawn on the reduced brand_performance frame.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=brand_performance, x='TotalSalesDollars', y='ProfitMargin',
                color="blue", label="All Brands", alpha=0.2, ax=ax)
sns.scatterplot(data=target_brands, x='TotalSalesDollars', y='ProfitMargin',
                color="red", label="Target Brands", ax=ax)

# Use the threshold variables under the names they are actually defined with
# (low_sales_thresold / high_margine_thresold); the correctly spelled names
# are never assigned and would raise NameError.
ax.axhline(high_margine_thresold, linestyle='--', color='black', label="High Margin Threshold")
ax.axvline(low_sales_thresold, linestyle='--', color='black', label="Low Sales Threshold")

# Axes-level setters do not depend on the state of the pyplot module functions.
ax.set_xlabel("Total Sales ($)")
ax.set_ylabel("Profit Margin (%)")
ax.set_title("Brands for Promotional or Pricing Adjustments")
ax.legend()
ax.grid(True)
plt.show()

2. Which vendors and brands demonstrate the highest sales performance?

In [None]:
 # Top Vendors & Brands by Sales Performance

# Total sales dollars per vendor and per brand, then the ten largest of each
sales_by_vendor = df.groupby("VendorName")["TotalSalesDollars"].sum()
sales_by_brand = df.groupby("Description")["TotalSalesDollars"].sum()

top_vendors = sales_by_vendor.nlargest(10)
top_brands = sales_by_brand.nlargest(10)

In [None]:
# Display the top vendors.
top_vendors

In [None]:
# Display the top brands.
top_brands

In [None]:
def format_dollars(value):
    """Format a dollar amount as a short human-readable string.

    Values of at least one million are shown in millions with an ``M`` suffix,
    values of at least one thousand in thousands with a ``K`` suffix, both with
    two decimal places. Anything smaller (including negative values) is
    returned as ``str(value)``.

    Parameters
    ----------
    value : int or float
        The amount to format.

    Returns
    -------
    str
        For example ``"1.50M"``, ``"2.50K"`` or ``"999"``.
    """
    # ".2f" means two decimal places; "2f" would mean a minimum width of 2 with
    # the default six decimals (e.g. "1.500000M").
    if value >= 1_000_000:
        return f"{value/1_000_000:.2f}M"
    elif value >= 1_000:
        return f"{value/1_000:.2f}K"
    else:
        return str(value)

In [None]:
# Show the top-brand sales totals in the abbreviated dollar format
top_brands.apply(format_dollars)

In [None]:
def _label_bars(ax):
    """Write the formatted dollar value just past the end of every bar on `ax`."""
    for bar in ax.patches:
        ax.text(bar.get_width() + (bar.get_width() * 0.02),
                bar.get_y() + bar.get_height() / 2,
                format_dollars(bar.get_width()),
                ha='left', va='center', fontsize=10, color='black')


plt.figure(figsize=(15, 5))

# Plot for top vendors
ax1 = plt.subplot(1, 2, 1)
sns.barplot(y=top_vendors.index, x=top_vendors.values, palette="Blues_r", ax=ax1)
ax1.set_title("Top 10 vendors by Sales")
_label_bars(ax1)

# Plot for top brands
ax2 = plt.subplot(1, 2, 2)
sns.barplot(y=top_brands.index.astype(str), x=top_brands.values, palette="Reds_r", ax=ax2)
# Set the title through the Axes. Assigning a string to plt.title would
# overwrite the pyplot function (breaking every later plt.title call) and
# leave this panel untitled.
ax2.set_title("Top 10 Brands by Sales")
_label_bars(ax2)

plt.tight_layout()
plt.show()

3. Which vendors contribute the most to total purchase dollars?

In [None]:
# Per-vendor totals of purchase dollars, gross profit and sales dollars
vendor_performance = (
    df.groupby('VendorName')
    .agg(
        TotalPurchaseDollars=('TotalPurchaseDollars', 'sum'),
        GrossProfit=('GrossProfit', 'sum'),
        TotalSalesDollars=('TotalSalesDollars', 'sum'),
    )
    .reset_index()
)

vendor_performance.shape

In [None]:
# Each vendor's share of total purchase dollars, expressed as a percentage (0-100).
# The column is consumed downstream as a percentage (labels with '%', comparison
# against 100), so the ratio is scaled by 100 here, before top_vendors is derived.
vendor_performance['PurchaseContribution%'] = (
    vendor_performance['TotalPurchaseDollars']
    / vendor_performance['TotalPurchaseDollars'].sum()
    * 100
)

In [None]:
# Order vendors by purchase contribution (largest first) and round to 2 decimals
vendor_performance = (
    vendor_performance
    .sort_values('PurchaseContribution%', ascending=False)
    .round(2)
)

In [None]:
# Display top 10 vendors

# .copy() yields an independent frame, so the column assignments below act on
# top_vendors itself rather than on a slice of vendor_performance (which pandas
# flags with SettingWithCopyWarning).
top_vendors = vendor_performance.head(10).copy()

# Replace the dollar columns with their abbreviated string form for display.
for dollar_col in ['TotalSalesDollars', 'TotalPurchaseDollars', 'GrossProfit']:
    top_vendors[dollar_col] = top_vendors[dollar_col].apply(format_dollars)
top_vendors

In [None]:
# Sum of the PurchaseContribution% column over the rows of top_vendors.
top_vendors['PurchaseContribution%'].sum()

In [None]:
# Running total of the purchase contribution down the rows of top_vendors
running_contribution = top_vendors['PurchaseContribution%'].cumsum()
top_vendors['Cumulative_Contribution'] = running_contribution
top_vendors

In [None]:
# Recompute each vendor's purchase share on vendor_performance, scaled to 0-100
purchase_dollars = vendor_performance['TotalPurchaseDollars']
vendor_performance['PurchaseContribution%'] = purchase_dollars / purchase_dollars.sum() * 100

In [None]:
# Pareto chart: bars show each top vendor's purchase contribution, the dashed
# red line shows the running (cumulative) contribution on a secondary y-axis.
top_vendors['Cumulative_Contribution%'] = top_vendors['PurchaseContribution%'].cumsum()

fig, ax1 = plt.subplots(figsize=(10, 6))

# Bar plot for Purchase Contribution%
sns.barplot(x=top_vendors['VendorName'], y=top_vendors['PurchaseContribution%'], palette='mako', ax=ax1)

# Value labels, placed one unit below the top of each bar
for i, value in enumerate(top_vendors['PurchaseContribution%']):
    ax1.text(i, value - 1, str(value) + '%', ha='center', fontsize=10, color='white')

# Line plot for Cumulative Contribution%
ax2 = ax1.twinx()
ax2.plot(top_vendors['VendorName'], top_vendors['Cumulative_Contribution%'],
         color='red', marker='o', linestyle='dashed', label='Cumulative Distribution')

ax1.set_xticklabels(top_vendors['VendorName'], rotation=90)
ax1.set_ylabel('Purchase Contribution%', color='blue')
ax2.set_ylabel('Cumulative Contribution%', color='red')
ax1.set_xlabel('Vendors')
ax1.set_title('Pareto Chart: Vendor Contribution to Total Purchases')

# Reference line at 100 on the cumulative axis
ax2.axhline(y=100, color='grey', linestyle='dashed', alpha=0.7)
ax2.legend(loc='upper right')

plt.show()

4. How much of total procurement is dependent on the top vendors?

In [None]:
# Report the combined purchase contribution of the rows in top_vendors
top10_contribution = round(top_vendors['PurchaseContribution%'].sum(), 2)
print(f"Total Purchase Contribution of Top 10 Vendors is {top10_contribution} %")

In [None]:
# Donut chart of the top vendors' purchase contribution plus an
# "Other Vendors" wedge holding the remainder up to 100.
vendors = top_vendors['VendorName'].tolist()
purchase_contributions = top_vendors['PurchaseContribution%'].tolist()
total_contribution = sum(purchase_contributions)
# Clamp at zero: ax.pie rejects negative wedge sizes, which rounding could
# otherwise produce when the listed vendors already add up to 100.
remaining_contribution = max(100 - total_contribution, 0)

# Append "Other Vendors" category
vendors.append("Other Vendors")
purchase_contributions.append(remaining_contribution)

# Donut chart
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(purchase_contributions, labels=vendors, autopct='%1.1f%%',
                                  startangle=140, pctdistance=0.85, colors=plt.cm.Paired.colors)

# Draw a white circle in the center to create a "donut" effect
center_circle = plt.Circle((0, 0), 0.70, fc='white')
ax.add_artist(center_circle)

# Add total contribution annotation in the center.
# "\n" is the newline escape; "/n" would be printed literally.
ax.text(0, 0, f"Top 10 Total :\n{total_contribution:.2f}%",
        fontsize=14, fontweight='bold', ha='center', va='center')

# Axes-level setter: does not depend on the state of pyplot's title function.
ax.set_title("Top 10 Vendors Purchase Contribution (%)")
plt.show()

5. Does purchasing in bulk reduce the unit price, and what is the optimal purchase volume for cost savings?

In [None]:
# Unit purchase price = total purchase dollars divided by total purchase quantity
df['UnitPurchasePrice'] = df['TotalPurchaseDollars'].div(df['TotalPurchaseQuantity'])

In [None]:
# Bucket rows into three quantile-based order-size classes by purchase quantity
order_size_labels = ["Small", "Medium", "Large"]
df["OrderSize"] = pd.qcut(df["TotalPurchaseQuantity"], q=3, labels=order_size_labels)

In [None]:
# Show each row's order-size class next to its purchase quantity
df.loc[:, ['OrderSize', 'TotalPurchaseQuantity']]

In [None]:
# Mean unit purchase price within each order-size class
by_order_size = df.groupby('OrderSize')
by_order_size[['UnitPurchasePrice']].mean()

In [None]:
# Box plot of unit purchase price for each order-size class
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="OrderSize", y="UnitPurchasePrice", palette="Set2", ax=ax)
# pyplot has no `tile` function (calling it raises AttributeError); the title
# and axis labels are set on the Axes directly.
ax.set_title("Impact of Bulk Purchasing on Unit Price")
ax.set_xlabel("Order size")
ax.set_ylabel("Average Unit Purchase Price")
plt.show()

- Vendors buying in bulk (large order size) get the lowest unit price ($10.78 per unit), meaning higher margins if they can manage inventory efficiently.
- The price difference between small and large orders is substantial (~72% reduction in unit cost).
- This suggests that bulk pricing strategies successfully encourage vendors to purchase in large volumes, leading to higher overall sales despite lower per-unit revenue.