### **Data Cleaning and Exploratory Analysis**

#### **Import libraries**

In [2]:
# Import 'GenerateData' (aliased as 'gd') from the local module 'modules.generae_random_sales_data'
from modules.generae_random_sales_data import GenerateData as gd

# Import date and time utilities for handling timestamps and time intervals
from datetime import datetime, timedelta

# Import data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import data manipulation and numerical computation libraries
import pandas as pd
import numpy as np

# Import the 'random' module for generating random numbers
import random

#### **Create functions**

In [3]:
def formatador_milhoes(y, pos=None):
    """Format a numeric value as millions of Brazilian Reais (R$).

    The ``(y, pos)`` signature matches the callback shape that
    ``matplotlib.ticker.FuncFormatter`` invokes -- presumably how this
    helper is used for axis ticks later in the notebook (TODO confirm).

    Args:
        y: Numeric value to format (e.g. an axis tick value).
        pos: Tick position; accepted for signature compatibility but never
            read. Defaults to ``None`` so the helper can also be called
            directly with just a value.

    Returns:
        str: ``y`` divided by one million, rendered with two decimals,
        e.g. ``3500000 -> 'R$ 3.50 Mi'``.
    """
    return f'R$ {y/1_000_000:.2f} Mi'

In [3]:
def formatar_valores(pct, allvals):
    """Build a two-line chart label: the percentage and its absolute value.

    Args:
        pct: Share of the total, expressed on a 0-100 scale.
        allvals: Iterable of the numeric values behind the chart; their sum
            is taken as the total that ``pct`` refers to.

    Returns:
        str: First line is ``pct`` with one decimal followed by ``%``; second
        line is the corresponding absolute amount in millions of R$ with two
        decimals (e.g. 25 % of 10,000,000 gives ``25.0%`` / ``R$ 2.50 Mi``).
    """
    total = sum(allvals)
    # Convert the percentage back into the absolute amount it represents.
    valor_absoluto = pct / 100. * total
    valor_em_milhoes = valor_absoluto / 1_000_000
    return f'{pct:.1f}%\nR$ {valor_em_milhoes:.2f} Mi'

#### **Create a random dataset**

In [4]:
# Seed the global random generators before building the dataset so that a
# "Restart Kernel -> Run All" can reproduce the recorded outputs.
# NOTE(review): this only has an effect if generate_fake_data() draws from the
# global `random` / `np.random` state; its implementation is not visible
# here -- confirm, or seed whichever generator the module actually uses.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Create the random sales DataFrame
df_sales = gd.generate_fake_data()


Starting the generation of 600 records...
Data generation completed successfully.



#### **Exploratory Data Analysis**

In [6]:
# Preview the first 5 rows (head() defaults to n=5) to inspect the columns and sample values
df_sales.head()

Unnamed: 0,Order_ID,Order_Date,Product_Name,Category,Unit_Price,Quantity,Customer_ID,City,State
0,1000,2026-01-01 15:00:00,Vertical Mouse,Accessories,241.08,5,128,São Paulo,SP
1,1001,2026-01-01 02:00:00,Graphics Card,Hardware,4500.0,1,100,Belo Horizonte,MG
2,1002,2026-01-01 09:00:00,SSD 1TB,Hardware,600.0,2,122,Salvador,BA
3,1003,2026-01-01 22:00:00,Graphics Card,Hardware,4500.0,7,106,Fortaleza,CE
4,1004,2026-01-01 15:00:00,Gaming Laptop,Electronics,7500.0,5,109,Fortaleza,CE


In [None]:
# Show how many rows and columns the DataFrame has, as a (rows, columns) tuple
df_sales.shape

(600, 9)

In [8]:
# Summarize the DataFrame structure: non-null counts, column dtypes and memory usage
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Order_ID      600 non-null    int64         
 1   Order_Date    600 non-null    datetime64[ns]
 2   Product_Name  600 non-null    object        
 3   Category      600 non-null    object        
 4   Unit_Price    600 non-null    float64       
 5   Quantity      600 non-null    int64         
 6   Customer_ID   600 non-null    int64         
 7   City          600 non-null    object        
 8   State         600 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 42.3+ KB


In [11]:
# Show summary statistics for the numeric measures only.
# Order_ID and Customer_ID are also numeric but act as identifiers, so they are left out.
df_sales[['Unit_Price', 'Quantity']].describe()

Unnamed: 0,Unit_Price,Quantity
count,600.0,600.0
mean,2253.006283,3.963333
std,2397.771961,2.007163
min,225.13,1.0
25%,543.81,2.0
50%,800.0,4.0
75%,2800.0,6.0
max,7500.0,7.0


In [12]:
# Count fully duplicated rows (rows identical to an earlier row across all columns)
df_sales.duplicated().sum()

np.int64(0)

In [13]:
# Ensure the 'Order_Date' column has a datetime dtype.
# NOTE(review): the info() output earlier in this notebook already reports
# datetime64[ns] for 'Order_Date', so this conversion looks like a no-op
# safeguard -- confirm whether it is still needed.
df_sales['Order_Date'] = pd.to_datetime(df_sales['Order_Date'])