### **Data Cleaning and Exploratory Analysis**

#### **Import libraries**

In [15]:
# Import the project's random sales-data generator 'GenerateData' from the local 'modules' package (aliased as 'gd')
from modules.generae_random_sales_data import GenerateData as gd

# Import date and time utilities for handling timestamps and time intervals
from datetime import datetime, timedelta

# Import data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import data manipulation and numerical computation libraries
import pandas as pd
import numpy as np

# Import the 'random' module for generating random numbers
import random

#### **Create functions**

In [16]:
# Function to format numeric values into millions with a currency symbol (Brazilian Real)
# Example: 3500000 -> 'R$ 3.50 Mi'
def formatador_milhoes(y, pos):
    """Format a numeric value as Brazilian Reais expressed in millions.

    Args:
        y: Numeric value to format (e.g. 3500000).
        pos: Unused by this function. Presumably present so the function
            matches matplotlib's tick-formatter callback signature
            (value, position) -- confirm against the plotting code.

    Returns:
        A string such as 'R$ 3.50 Mi' (value divided by one million,
        two decimal places).
    """
    valor_em_milhoes = y / 1_000_000
    return f'R$ {valor_em_milhoes:.2f} Mi'

In [17]:
# Function to format percentage labels for charts (e.g., pie charts)
# It shows both the percentage and the corresponding absolute value in millions
# Example: 25%, R$ 2.50 Mi
def formatar_valores(pct, allvals):
    """Build a two-line chart label: the percentage and its value in millions.

    Args:
        pct: Share of the total, as a percentage (e.g. 25 for 25%).
        allvals: Iterable of the numeric values that make up the total;
            they are summed to turn the percentage back into an amount.

    Returns:
        A string of the form '25.0%\\nR$ 2.50 Mi': the percentage with one
        decimal place, a newline, then the corresponding absolute amount
        in millions of Reais with two decimal places.
    """
    total = sum(allvals)
    # Convert the percentage share back into the absolute amount it represents
    valor_absoluto = pct / 100. * total
    valor_em_milhoes = valor_absoluto / 1_000_000
    return f'{pct:.1f}%\nR$ {valor_em_milhoes:.2f} Mi'

#### **Create a random dataset**

In [18]:
# Build the sales DataFrame with the project's random-data generator
# NOTE(review): no random seed is set in the visible cells, so the records presumably
# differ on every run -- confirm whether generate_fake_data() seeds internally
df_sales = gd.generate_fake_data()


Starting the generation of 600 records...
Data generation completed successfully.



#### **Exploratory Data Analysis**

In [19]:
# Preview the first 5 records to sanity-check the generated columns and values
df_sales.head()

Unnamed: 0,Order_ID,Order_Date,Product_Name,Category,Unit_Price,Quantity,Customer_ID,City,State
0,1000,2026-01-01 00:00:00,Gaming Laptop,Electronics,7500.0,3,136,Rio de Janeiro,RJ
1,1001,2026-01-01 10:00:00,Ultrawide Monitor,Electronics,2800.0,5,132,Belo Horizonte,MG
2,1002,2026-01-01 11:00:00,Gaming Chair,Furniture,1200.0,7,147,Porto Alegre,RS
3,1003,2026-01-01 11:00:00,Headset 7.1,Accessories,800.0,4,141,Fortaleza,CE
4,1004,2026-01-01 23:00:00,Ultrawide Monitor,Electronics,2800.0,3,140,São Paulo,SP


In [20]:
# Show how many rows and columns the DataFrame has
df_sales.shape

(600, 9)

In [21]:
# Show each column's non-null count and data type, plus the DataFrame's memory usage
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Order_ID      600 non-null    int64         
 1   Order_Date    600 non-null    datetime64[ns]
 2   Product_Name  600 non-null    object        
 3   Category      600 non-null    object        
 4   Unit_Price    600 non-null    float64       
 5   Quantity      600 non-null    int64         
 6   Customer_ID   600 non-null    int64         
 7   City          600 non-null    object        
 8   State         600 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 42.3+ KB


In [22]:
# Show summary statistics, restricted to the two measure columns
# (Order_ID and Customer_ID are numeric too, but presumably identifiers, so their statistics would not be meaningful)
df_sales[['Unit_Price', 'Quantity']].describe()

Unnamed: 0,Unit_Price,Quantity
count,600.0,600.0
mean,2283.150533,4.103333
std,2405.377048,1.98138
min,225.2,1.0
25%,545.6325,2.0
50%,1200.0,4.0
75%,4500.0,6.0
max,7500.0,7.0


In [23]:
# Count the rows that are exact duplicates of an earlier row (all columns equal)
df_sales.duplicated().sum()

np.int64(0)

In [24]:
# Make sure 'Order_Date' is a datetime column so the .dt accessor can be used on it
# (df_sales.info() already reports datetime64[ns] for it, so this acts as a safeguard)
df_sales['Order_Date'] = pd.to_datetime(df_sales['Order_Date'])

### **Addressing business-related questions**

#### What are the top 10 best-selling products?

In [33]:
# Sum the quantity sold per product, rank products from best- to worst-selling,
# and keep only the first 10 so the result matches its name and the question being answered
# (when there are 10 or fewer distinct products, every product is kept)
top_10_products_by_quantity = (
    df_sales
    .groupby('Product_Name')['Quantity']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [34]:
# Display the ranked Series as a one-column DataFrame for a nicer rendered table
top_10_products_by_quantity.to_frame()

Unnamed: 0_level_0,Quantity
Product_Name,Unnamed: 1_level_1
Gaming Chair,336
Vertical Mouse,328
Graphics Card,322
Gaming Laptop,314
SSD 1TB,303
Ultrawide Monitor,298
Mechanical Keyboard,284
Headset 7.1,277


#### What was the revenue for each month?

In [30]:
# Add a 'Month_Year' label column (e.g. 'Jan-2026') derived from the order date
# Note: the labels are plain strings, so sorting by them is alphabetical, not chronological
df_sales['Month_Year'] = df_sales['Order_Date'].dt.strftime('%b-%Y')

In [36]:
# Add a 'Revenue_Amount' column: the revenue of each record (unit price multiplied by quantity)
df_sales['Revenue_Amount'] =  df_sales['Unit_Price'] * df_sales['Quantity']

In [40]:
# Total the revenue per month-year label and rank the months from highest to lowest revenue
total_revenu_by_month = (
    df_sales
    .groupby('Month_Year')['Revenue_Amount']
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Display the monthly revenue Series as a one-column DataFrame for a nicer rendered table
total_revenu_by_month.to_frame()

Unnamed: 0_level_0,Revenue_Amount
Month_Year,Unnamed: 1_level_1
Jan-2026,1675329.99
Mar-2026,1355235.51
Feb-2026,1347400.93
Apr-2026,1293263.85


In [None]:
# TODO: Add formatting to the values of the Revenue_Amount column in the monthly revenue result