In [None]:
'''
Data Aggregation
Problem Statement: Analyzing Sales Performance by Region in a Retail Company
Dataset: "Retail_Sales_Data.csv"
Description: The dataset contains information about sales transactions in a retail
company. It includes attributes such as transaction date, product category, quantity
sold, and sales amount. The goal is to perform data aggregation to analyze the sales
performance by region and identify the top-performing regions.
Tasks to Perform:
1. Import the "Retail_Sales_Data.csv" dataset.
2. Explore the dataset to understand its structure and content.
3. Identify the relevant variables for aggregating sales data, such as region, sales
amount, and product category.
4. Group the sales data by region and calculate the total sales amount for each region.
5. Create bar plots or pie charts to visualize the sales distribution by region.
6. Identify the top-performing regions based on the highest sales amount.
7. Group the sales data by region and product category to calculate the total sales
amount for each combination.
8. Create stacked bar plots or grouped bar plots to compare the sales amounts across
different regions and product categories.
'''
# Trailing None so the notebook does not echo the description string above as cell output.
None

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# 1. Import the Retail_Sales_Data.csv dataset.
# The path is relative, so the CSV is looked up in the notebook's working directory.
data = pd.read_csv('Retail_Sales_Data.csv')
# Preview only the first rows rather than echoing the whole frame as cell output.
data.head()

In [None]:
# 2. Explore the dataset to understand its structure and content.
# Summary statistics (count, mean, std, min, quartiles, max); by default pandas
# reports these for the numeric columns only.
data.describe()

In [None]:
# Print a concise summary of the frame: column names, non-null counts, dtypes and memory usage.
data.info()

In [None]:
# dtype of every column, returned as a Series (the same dtype information data.info() prints).
data.dtypes

In [None]:
# 3. Identify the relevant variables for aggregating sales data, such as region, sales
# amount, and product category. (done in data understanding)
# In this case, we'll focus on Region, Sales_Amount, and Product_Category for analysis.

In [None]:
# 4. Group the sales data by region and calculate the total sales amount for each region.
# Result is a Series indexed by Region, ordered from the highest total to the lowest.
sales_by_region = (
    data.groupby("Region")["Sales_Amount"]
    .sum()
    .sort_values(ascending=False)
)
sales_by_region

In [None]:
# 5. Create bar plots or pie charts to visualize the sales distribution by region.
# Bar chart of total sales per region, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sales_by_region.plot(kind='bar', ax=ax, color=['blue', 'orange', 'green', 'red'])
ax.set_title("Sales Distribution by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Total Sales Amount")
# Tilt the region labels on the x-axis by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# 5. Create pie charts to visualize the sales distribution by region.
# Each wedge is one region; autopct prints its share of the total to one decimal place.
region_colors = ['blue', 'orange', 'green', 'red']
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    sales_by_region,
    labels=sales_by_region.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=region_colors,
)
ax.set_title("Sales Distribution by Region")
plt.show()

In [None]:
# 6. Identify the top-performing regions based on the highest sales amount.
# nlargest selects the highest totals directly, so this cell gives the right answer
# even if sales_by_region is not already sorted in descending order.
TOP_N = 2  # number of regions to report as top performers
top_regions = sales_by_region.nlargest(TOP_N)
top_regions

In [None]:
# 7. Group the sales data by region and product category to calculate the total sales
# amount for each combination.
# Sum Sales_Amount per (Region, Product_Category) pair, then pivot the categories into
# columns so each row is a region. A region/category pair with no transactions has a
# total of 0, so fill those cells with 0 instead of leaving NaN (which would also
# force integer totals to float).
sales_by_region_category = (
    data.groupby(["Region", "Product_Category"])["Sales_Amount"]
    .sum()
    .unstack(fill_value=0)
)
# Displaying the aggregated data
sales_by_region_category


In [None]:
# 8. Create stacked bar plots or grouped bar plots to compare the sales amounts across
# different regions and product categories.
# Stacked bar plot: one bar per region, with one coloured segment per product category.
ax = sales_by_region_category.plot(kind="bar", stacked=True, figsize=(10, 6), colormap="viridis")
ax.set_title("Sales by Region and Product Category")
ax.set_xlabel("Region")
ax.set_ylabel("Total Sales Amount")
# Tilt the region labels on the x-axis by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(title="Product Category")
plt.show()

In [None]:
# Done