In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Load the raw sales records; the file is comma-delimited, which is
# read_csv's default separator, so no `sep` argument is needed.
dados = pd.read_csv('./dados/sales_data.csv')

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
dados.head(5)

Unnamed: 0,Product_ID,Sale_Date,Sales_Rep,Region,Sales_Amount,Quantity_Sold,Product_Category,Unit_Cost,Unit_Price,Customer_Type,Discount,Payment_Method,Sales_Channel,Region_and_Sales_Rep
0,1052,2023-02-03,Bob,North,5053.97,18,Furniture,152.75,267.22,Returning,0.09,Cash,Online,North-Bob
1,1093,2023-04-21,Bob,West,4384.02,17,Furniture,3816.39,4209.44,Returning,0.11,Cash,Retail,West-Bob
2,1015,2023-09-21,David,South,4631.23,30,Food,261.56,371.4,Returning,0.2,Bank Transfer,Retail,South-David
3,1072,2023-08-24,Bob,South,2167.94,39,Clothing,4330.03,4467.75,New,0.02,Credit Card,Retail,South-Bob
4,1061,2023-03-24,Charlie,East,3750.2,13,Electronics,637.37,692.71,New,0.08,Credit Card,Online,East-Charlie


In [4]:
# Print the column names, non-null counts and dtypes of the frame so that
# missing values and wrongly typed columns can be spotted before analysis.
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Product_ID            1000 non-null   int64  
 1   Sale_Date             1000 non-null   object 
 2   Sales_Rep             1000 non-null   object 
 3   Region                1000 non-null   object 
 4   Sales_Amount          1000 non-null   float64
 5   Quantity_Sold         1000 non-null   int64  
 6   Product_Category      1000 non-null   object 
 7   Unit_Cost             1000 non-null   float64
 8   Unit_Price            1000 non-null   float64
 9   Customer_Type         1000 non-null   object 
 10  Discount              1000 non-null   float64
 11  Payment_Method        1000 non-null   object 
 12  Sales_Channel         1000 non-null   object 
 13  Region_and_Sales_Rep  1000 non-null   object 
dtypes: float64(4), int64(2), object(8)
memory usage: 109.5+ KB


**Product_ID**: Unique identifier for each product sold. Randomly generated for practice purposes.

**Sale_Date**: The date when the sale occurred. Randomly selected from the year 2023.


**Sales_Rep**: The sales representative responsible for the transaction. The dataset includes five random sales representatives (Alice, Bob, Charlie, David, Eve).

**Region**: The region where the sale took place. The possible regions are North, South, East, and West.

**Sales_Amount**: The total sales amount for the transaction, including discounts if any. Values range from 100 to 10,000 (in currency units).

**Quantity_Sold**: The number of units sold in that transaction, randomly generated between 1 and 50.

**Product_Category**: The category of the product sold. Categories include Electronics, Furniture, Clothing, and Food.

**Unit_Cost**: The cost per unit of the product sold, randomly generated between 50 and 5000 currency units.

**Unit_Price**: The selling price per unit of the product, calculated to be higher than the unit cost.

**Customer_Type**: Indicates whether the customer is a New or Returning customer.

**Discount**: The discount applied to the sale, randomly chosen between 0% and 30%.

**Payment_Method**: The method of payment used by the customer (e.g., Credit Card, Cash, Bank Transfer).

**Sales_Channel**: The channel through which the sale occurred. Either Online or Retail.

**Region_and_Sales_Rep**: A combined column that pairs the region and sales representative for easier tracking.

In [5]:
# Product_ID is an identifier, not a quantity: store it as text so it is
# left out of numeric summaries and arithmetic.
dados = dados.assign(Product_ID=dados['Product_ID'].astype(str))

### Estatísticas

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. Product_ID does not appear because it was cast to str beforehand.
dados.describe()

Unnamed: 0,Sales_Amount,Quantity_Sold,Unit_Cost,Unit_Price,Discount
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,5019.26523,25.355,2475.30455,2728.44012,0.15239
std,2846.790126,14.159006,1417.872546,1419.399839,0.0872
min,100.12,1.0,60.28,167.12,0.0
25%,2550.2975,13.0,1238.38,1509.085,0.08
50%,5019.3,25.0,2467.235,2696.4,0.15
75%,7507.445,38.0,3702.865,3957.97,0.23
max,9989.04,49.0,4995.3,5442.15,0.3


### Total de vendas por região

In [11]:
# Total Sales_Amount for each Region, ordered from the highest-selling
# region to the lowest.
amount_per_region = (
    dados
    .groupby('Region')[['Sales_Amount']]
    .sum()
    .reset_index()
    .sort_values('Sales_Amount', ascending=False)
)

Visualização

In [12]:
# Bar chart of total sales per region. amount_per_region is already sorted in
# descending order, so the bars appear from the highest total to the lowest.
# The title and human-readable axis labels let the figure stand on its own
# when the notebook is skimmed or the chart is exported.
px.bar(
    amount_per_region,
    x='Region',
    y='Sales_Amount',
    color='Sales_Amount',
    color_continuous_scale='peach',
    title='Total de vendas por região',
    labels={'Region': 'Região', 'Sales_Amount': 'Total de vendas'},
)

### Categoria mais vendida

In [9]:
# List the distinct product categories present in the data.
pd.unique(dados['Product_Category'])

array(['Furniture', 'Food', 'Clothing', 'Electronics'], dtype=object)

In [29]:
# Total units sold for each product category, ordered from the best-selling
# category to the worst.
quantity_per_category = (
    dados
    .groupby('Product_Category')[['Quantity_Sold']]
    .sum()
    .reset_index()
    .sort_values('Quantity_Sold', ascending=False)
)

Visualização

In [30]:
# Bar chart of units sold per product category. quantity_per_category is
# already sorted in descending order, so the best-selling category comes first.
# The title and human-readable axis labels let the figure stand on its own
# when the notebook is skimmed or the chart is exported.
px.bar(
    quantity_per_category,
    x='Product_Category',
    y='Quantity_Sold',
    color='Quantity_Sold',
    color_continuous_scale='peach',
    title='Quantidade vendida por categoria',
    labels={'Product_Category': 'Categoria', 'Quantity_Sold': 'Quantidade vendida'},
)

In [31]:
# Look at the first five rows again before starting the time-based analysis.
dados.head(5)

Unnamed: 0,Product_ID,Sale_Date,Sales_Rep,Region,Sales_Amount,Quantity_Sold,Product_Category,Unit_Cost,Unit_Price,Customer_Type,Discount,Payment_Method,Sales_Channel,Region_and_Sales_Rep
0,1052,2023-02-03,Bob,North,5053.97,18,Furniture,152.75,267.22,Returning,0.09,Cash,Online,North-Bob
1,1093,2023-04-21,Bob,West,4384.02,17,Furniture,3816.39,4209.44,Returning,0.11,Cash,Retail,West-Bob
2,1015,2023-09-21,David,South,4631.23,30,Food,261.56,371.4,Returning,0.2,Bank Transfer,Retail,South-David
3,1072,2023-08-24,Bob,South,2167.94,39,Clothing,4330.03,4467.75,New,0.02,Credit Card,Retail,South-Bob
4,1061,2023-03-24,Charlie,East,3750.2,13,Electronics,637.37,692.71,New,0.08,Credit Card,Online,East-Charlie


### Sazonalidade de `quantity_sold`

In [35]:
# Units sold per sale date: one row per distinct Sale_Date with the summed
# Quantity_Sold, keeping Sale_Date as a regular column.
data_grouped = dados.groupby('Sale_Date', as_index=False)['Quantity_Sold'].sum()

In [36]:
# Line chart of units sold per sale date, used to look for seasonality.
# The title and human-readable axis labels let the figure stand on its own
# when the notebook is skimmed or the chart is exported.
px.line(
    data_grouped,
    x='Sale_Date',
    y='Quantity_Sold',
    title='Quantidade vendida por data',
    labels={'Sale_Date': 'Data da venda', 'Quantity_Sold': 'Quantidade vendida'},
)