<a href="https://colab.research.google.com/github/mdarifahammedreza/Data-Science-practice/blob/main/Visual_Data_Analysis_on_Chocolate_Sales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the dataset through kagglehub and keep the value it returns.
chocolate_sales_handle = 'atharvasoundankar/chocolate-sales'
atharvasoundankar_chocolate_sales_path = kagglehub.dataset_download(chocolate_sales_handle)

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath /kaggle/input.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data loading and preprocessing

In [None]:
# Load the chocolate sales CSV.
# Prefer the Kaggle-mounted location; when it is absent (e.g. the notebook is
# run outside Kaggle) fall back to the directory returned by
# kagglehub.dataset_download in the first cell.
# NOTE(review): assumes the CSV sits at the top level of the downloaded
# dataset directory, as it does under /kaggle/input - confirm.
CSV_NAME = "Chocolate Sales.csv"
kaggle_csv_path = os.path.join("/kaggle/input/chocolate-sales", CSV_NAME)
if os.path.exists(kaggle_csv_path):
    csv_path = kaggle_csv_path
else:
    csv_path = os.path.join(atharvasoundankar_chocolate_sales_path, CSV_NAME)

df_choc = pd.read_csv(csv_path)


In [None]:
# Preview the first rows of the loaded frame.
df_choc.head()

## (a) Data Exploration

In [None]:
# (number of rows, number of columns) of the dataset.
df_choc.shape

In [None]:
# Column names, non-null counts and dtypes as loaded, before any conversion.
df_choc.info()

## (b) Data Transformation

In [None]:
# Parse the 'Date' strings (day, abbreviated month name, two-digit year)
# into datetime values.
df_choc['Date'] = pd.to_datetime(df_choc['Date'], format='%d-%b-%y')

# Remove "$", "," and surrounding spaces from 'Amount', then convert to float.
# The dtype guard keeps this cell re-runnable: once 'Amount' is numeric the
# .str accessor is no longer available and the conversion would raise.
if not pd.api.types.is_numeric_dtype(df_choc['Amount']):
    df_choc['Amount'] = (
        df_choc['Amount']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.strip()
        .astype(float)
    )

In [None]:
# Re-inspect the dtypes after the 'Date' and 'Amount' conversions.
df_choc.info()

In [None]:
# Show five randomly chosen rows; the fixed seed makes the displayed sample
# the same on every Restart & Run All.
df_choc.sample(5, random_state=42)

In [None]:
# Missing-value count for each column.
df_choc.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df_choc.duplicated().sum()

# 2. Exploratory Data Analysis

## (a) Sales by Country

In [None]:
# Total sales amount per country, largest first.
sales_by_country = df_choc.groupby('Country')['Amount'].sum()
country_sales = sales_by_country.sort_values(ascending=False).reset_index()

# Horizontal bar chart drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=country_sales, x='Amount', y='Country', palette='viridis', ax=ax)
ax.set_title('Total Sales by Country')
ax.set_xlabel('Sales Amount (1M)')
ax.set_ylabel('Country')
plt.show()

## (b) Salesperson Performance


In [None]:
# Rank salespeople by their summed sales and keep the ten highest.
sales_by_person = df_choc.groupby('Sales Person')['Amount'].sum()
ranked_sales_by_person = sales_by_person.sort_values(ascending=False)
top_salespeople = ranked_sales_by_person.head(10).reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_salespeople, x='Amount', y='Sales Person', palette='coolwarm', ax=ax)
ax.set(
    title='Top 10 Salespeople by Total Sales',
    xlabel='Total Sales',
    ylabel='Sales Person',
)
plt.show()

## (c) Average Revenue per Box by Product

In [None]:
# Revenue earned per shipped box for every row.
# A row with 0 boxes would turn the division into inf and drag the whole
# product mean to inf, so zero counts are mapped to NaN; the mean below
# skips NaN values.
boxes_shipped = df_choc['Boxes Shipped'].replace(0, np.nan)
df_choc['Revenue per Box'] = df_choc['Amount'] / boxes_shipped

# Mean of the per-row revenue-per-box values for each product.
avg_per_box = df_choc.groupby('Product')['Revenue per Box'].mean().reset_index()

plt.figure(figsize=(8, 6))
sns.barplot(data=avg_per_box, x='Revenue per Box', y='Product', palette='cubehelix')
plt.title('Average Revenue per Box by Product')
plt.show()

## (d) Total Boxes Shipped by Country

In [None]:
# Total boxes shipped per country, largest first.
boxes_by_country = df_choc.groupby('Country')['Boxes Shipped'].sum()
country_boxes = boxes_by_country.sort_values(ascending=False).reset_index()

# Vertical bar chart: countries along x, shipped boxes along y.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=country_boxes, y='Boxes Shipped', x='Country', palette='magma', ax=ax)
ax.set_title('Total Boxes Shipped by Country')
ax.set_xlabel('Country')
ax.set_ylabel('Boxes Shipped')
plt.show()

## (e) Product Sales Distribution

In [None]:
# Box plot of the sale amounts, one box per product.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_choc, x='Product', y='Amount', palette='Set2', ax=ax)
ax.set_title('Sales Amount Distribution by Product')
# Tilt the product names on the x axis.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## (f) Sales Distribution per Salesperson

In [None]:
# Box plot of the sale amounts, one box per salesperson.
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=df_choc, x='Sales Person', y='Amount', palette='Set3', ax=ax)
ax.set_title('Sales Distribution per Salesperson')
# Turn the salesperson names vertical on the x axis.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

## (g) Boxes Shipped vs. Amount

In [None]:
# Scatter of boxes shipped against sale amount, coloured by product.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_choc, x='Boxes Shipped', y='Amount', hue='Product', ax=ax)
ax.set(
    title='Boxes Shipped vs. Amount',
    xlabel='Boxes Shipped',
    ylabel='Sales Amount',
)
plt.show()

## (h) Sales Heatmap by Country & Product

In [None]:
# Country x Product table of summed sales; combinations with no sales become 0.
sales_by_country_product = df_choc.pivot_table(
    index='Country',
    columns='Product',
    values='Amount',
    aggfunc='sum',
)
pivot = sales_by_country_product.fillna(0)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pivot, annot=False, fmt='.0f', cmap='YlGnBu', ax=ax)
ax.set_title('Heatmap of Sales by Country and Product')
ax.set_xlabel('Product')
ax.set_ylabel('Country')
plt.show()