# II. Entendimento dos dados

## 0. Libraries and functions

### 0.1. Importing libraries

In [1]:
from ucimlrepo import fetch_ucirepo 

import pandas as pd
import numpy as np
from scipy.stats import shapiro, kstest

### 0.2. Helper functions

In [2]:
# ----------------------------------------------------------
# Descrição complementar do dataset
# Subfunções especializadas (cada uma com sua responsabilidade)
# ----------------------------------------------------------

def _calc_distribution_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Compute per-column distribution statistics.

    Only numeric columns are analysed. Non-numeric columns (strings,
    categories, ...) are left out of the result instead of making the
    arithmetic below raise.

    Args:
        df: Input data; may mix numeric and non-numeric columns.

    Returns:
        DataFrame indexed by the numeric column names, with MultiIndex
        columns ``('Distribution', stat)`` for ``range``, ``skewness``,
        ``kurtosis`` and ``coef_var`` (absolute coefficient of variation,
        NaN when the column mean is zero).
    """
    numeric = df.select_dtypes(include=np.number)

    # std / mean gives +/-inf when the mean is 0; abs() folds both signs into
    # +inf, which is then reported as NaN rather than as a misleading number.
    coef_var = (numeric.std() / numeric.mean()).abs().replace(np.inf, np.nan)

    return pd.DataFrame({
        ('Distribution', 'range'): numeric.max() - numeric.min(),
        ('Distribution', 'skewness'): numeric.skew(),
        ('Distribution', 'kurtosis'): numeric.kurtosis(),
        ('Distribution', 'coef_var'): coef_var,
    })
    
def _calc_normality_stats(df: pd.DataFrame, numeric_cols: pd.Index) -> pd.DataFrame:
    """Run one normality test per column, chosen by sample size.

    Missing values are dropped before counting and testing. With ``n``
    non-missing observations:

    - ``4 <= n <= 2000``: Shapiro-Wilk p-value goes into ``shapiro_p``.
    - ``n > 2000``: Kolmogorov-Smirnov p-value against a normal distribution
      parameterised with the sample's own mean and standard deviation goes
      into ``ks_p``.
    - ``n < 4``: no test is run and both entries stay NaN.

    Args:
        df: Data holding the columns to test.
        numeric_cols: Labels of the columns to test.

    Returns:
        Float DataFrame indexed by ``numeric_cols`` with MultiIndex columns
        ``('Normality', 'shapiro_p')`` and ``('Normality', 'ks_p')``; at most
        one of the two is filled for each row.
    """
    header = pd.MultiIndex.from_tuples([('Normality', 'shapiro_p'), ('Normality', 'ks_p')])
    # Column 0 holds Shapiro-Wilk p-values, column 1 holds KS p-values.
    p_values = np.full((len(numeric_cols), 2), np.nan)

    for row, name in enumerate(numeric_cols):
        observed = df[name].dropna()
        size = len(observed)

        if size < 4:
            continue  # too few observations: leave both entries as NaN

        if size <= 2000:
            p_values[row, 0] = shapiro(observed)[1]
        else:
            # NOTE: location/scale are estimated from the same sample being tested.
            reference = (observed.mean(), observed.std())
            p_values[row, 1] = kstest(observed, 'norm', args=reference)[1]

    return pd.DataFrame(p_values, index=numeric_cols, columns=header)

def _calc_outlier_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Flag outliers per column with the 1.5 * IQR rule.

    Only numeric columns are analysed, so the quartiles and the comparisons
    always refer to the same set of columns. A frame with non-numeric
    columns, or with no numeric column at all, no longer raises.

    Args:
        df: Input data; may mix numeric and non-numeric columns.

    Returns:
        DataFrame indexed by the numeric column names, with MultiIndex
        columns ``('Outliers', stat)`` for ``has_outlier``, ``IQR``,
        ``lower_bound``, ``upper_bound``, ``n_lower`` and ``n_upper``.
    """
    numeric = df.select_dtypes(include=np.number)

    quartiles = numeric.quantile([0.25, 0.75])
    q1, q3 = quartiles.loc[0.25], quartiles.loc[0.75]
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Bounds are Series keyed by column name, so each column is compared
    # against its own limits. Missing values compare as False on both sides.
    is_lower = numeric.lt(lower_bound)
    is_upper = numeric.gt(upper_bound)

    return pd.DataFrame({
        ('Outliers', 'has_outlier'): (is_lower | is_upper).any(),
        ('Outliers', 'IQR'): iqr,
        ('Outliers', 'lower_bound'): lower_bound,
        ('Outliers', 'upper_bound'): upper_bound,
        ('Outliers', 'n_lower'): is_lower.sum(),
        ('Outliers', 'n_upper'): is_upper.sum(),
    })

def _calc_quality_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing and infinite values per column.

    Works on every column regardless of dtype.

    Args:
        df: Input data; may mix numeric and non-numeric columns.

    Returns:
        DataFrame indexed by the column names of ``df``, with MultiIndex
        columns ``('Quality', stat)`` for ``has_missing``, ``n_missing``,
        ``p_missing`` (percentage, rounded to 2 decimals), ``has_inf`` and
        ``n_inf``.
    """
    missing = df.isna()

    # DataFrame.isin compares by value and accepts any dtype, whereas
    # np.isinf raises TypeError on object (e.g. string) columns.
    infinite = df.isin([np.inf, -np.inf])

    return pd.DataFrame({
        ('Quality', 'has_missing'): missing.any(),
        ('Quality', 'n_missing'): missing.sum(),
        ('Quality', 'p_missing'): (missing.mean() * 100).round(2),
        ('Quality', 'has_inf'): infinite.any(),
        ('Quality', 'n_inf'): infinite.sum(),
    })

# ----------------------------------------------------------
# Função principal (estrutura organizadora)
# ----------------------------------------------------------

def describe_plus(df: pd.DataFrame) -> pd.DataFrame:
    """Build an extended per-column description of ``df``.

    Combines the output of the four helper functions (distribution,
    normality, outliers, quality) into one table with a fixed two-level
    column layout.

    Args:
        df: Data to describe.

    Returns:
        DataFrame indexed by ``df.columns`` with MultiIndex columns grouped
        as ``Distribution``, ``Normality``, ``Outliers`` and ``Quality``,
        rounded to 2 decimals.
    """
    numeric_cols = df.select_dtypes(include=np.number).columns

    # Fixed column layout: group -> statistics, in display order.
    layout = {
        'Distribution': ['range', 'skewness', 'kurtosis', 'coef_var'],
        'Normality': ['shapiro_p', 'ks_p'],
        'Outliers': ['has_outlier', 'IQR', 'lower_bound', 'upper_bound', 'n_lower', 'n_upper'],
        'Quality': ['has_missing', 'n_missing', 'p_missing', 'has_inf', 'n_inf'],
    }
    header = pd.MultiIndex.from_tuples(
        [(group, stat) for group, stats in layout.items() for stat in stats]
    )

    # Empty frame with the complete structure; each section fills its own columns.
    summary = pd.DataFrame(index=df.columns, columns=header)

    sections = (
        _calc_distribution_stats(df),
        _calc_normality_stats(df, numeric_cols),
        _calc_outlier_stats(df),
        _calc_quality_stats(df),
    )
    for section in sections:
        summary[section.columns] = section

    return summary.round(2)

## 1. Data

### 1.1. Loading data

In [3]:
# Fetch dataset id 9 from the UCI ML repository
# (presumably Auto MPG, going by the variable name and the columns shown below).
auto_mpg = fetch_ucirepo(id=9) 
  
# Data (as pandas DataFrames): features and target are delivered separately.
X = auto_mpg.data.features 
y = auto_mpg.data.targets 
  
# Metadata (uncomment to inspect)
#print(auto_mpg.metadata) 
  
# Variable information (uncomment to inspect)
#print(auto_mpg.variables) 

# Join X and y column-wise into a single raw DataFrame.
df_raw = pd.concat([X, y], axis=1)
df_raw.head()

Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin,mpg
0,307.0,8,130.0,3504,12.0,70,1,18.0
1,350.0,8,165.0,3693,11.5,70,1,15.0
2,318.0,8,150.0,3436,11.0,70,1,18.0
3,304.0,8,150.0,3433,12.0,70,1,16.0
4,302.0,8,140.0,3449,10.5,70,1,17.0


### 1.2. Data description

- Análise da qualidade geral dos dados:
	- Data shape;
	- Data types e info;
	- Total de missings;
	- Total de duplicatas.

- Análise descritiva por colunas:
	- Data describe:
		- Média, desvio padrão, mínimo, máximo e quartis;
	- Distribuição:
		- Amplitude, assimetria, curtose e coeficiente de variação;
		- Teste de normalidade (shapiro_p, ks_p);
	- Outliers:
		- IQR, has_outlier, lower_bound, upper_bound, n_lower, n_upper;
	- Qualidade dos dados:
		- Missing: has_missing, n_missing e p_missing (%);
		- Infinitos: has_inf e n_inf.

In [5]:
# Overall data-quality summary: row count, column count, fully duplicated rows
# and the total number of missing cells across all columns.
print('Qtde de linhas: {:,}'.format(df_raw.shape[0]))
print('Qtde de colunas: {:,}'.format(df_raw.shape[1]))
print('Qtde de registros duplicados: {:,}'.format(df_raw.duplicated().sum()))
print('Qtde total de registros faltantes: {:,}'.format(df_raw.isna().sum().sum()))

Qtde de linhas: 398
Qtde de colunas: 8
Qtde de registros duplicados: 0
Qtde total de registros faltantes: 6


In [6]:
# Per-column dtypes, non-null counts and memory usage of the raw data.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   displacement  398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   horsepower    392 non-null    float64
 3   weight        398 non-null    int64  
 4   acceleration  398 non-null    float64
 5   model_year    398 non-null    int64  
 6   origin        398 non-null    int64  
 7   mpg           398 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 25.0 KB


In [7]:
# Standard summary statistics, rounded to 2 decimals and transposed so that
# each row corresponds to one column of df_raw.
df_raw.describe().round(2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
displacement,398.0,193.43,104.27,68.0,104.25,148.5,262.0,455.0
cylinders,398.0,5.45,1.7,3.0,4.0,4.0,8.0,8.0
horsepower,392.0,104.47,38.49,46.0,75.0,93.5,126.0,230.0
weight,398.0,2970.42,846.84,1613.0,2223.75,2803.5,3608.0,5140.0
acceleration,398.0,15.57,2.76,8.0,13.82,15.5,17.18,24.8
model_year,398.0,76.01,3.7,70.0,73.0,76.0,79.0,82.0
origin,398.0,1.57,0.8,1.0,1.0,1.0,2.0,3.0
mpg,398.0,23.51,7.82,9.0,17.5,23.0,29.0,46.6


In [8]:
# Extended description (distribution, normality, outliers, quality).
# describe_plus already rounds its result to 2 decimals, so the trailing
# .round(2) does not change the values.
describe_plus(df_raw).round(2)

Unnamed: 0_level_0,Distribution,Distribution,Distribution,Distribution,Normality,Normality,Outliers,Outliers,Outliers,Outliers,Outliers,Outliers,Quality,Quality,Quality,Quality,Quality
Unnamed: 0_level_1,range,skewness,kurtosis,coef_var,shapiro_p,ks_p,has_outlier,IQR,lower_bound,upper_bound,n_lower,n_upper,has_missing,n_missing,p_missing,has_inf,n_inf
displacement,387.0,0.72,-0.75,0.54,0.0,,False,157.75,-132.38,498.62,0,0,False,0,0.0,False,0
cylinders,5.0,0.53,-1.38,0.31,0.0,,False,4.0,-2.0,14.0,0,0,False,0,0.0,False,0
horsepower,184.0,1.09,0.7,0.37,0.0,,True,51.0,-1.5,202.5,0,10,True,6,1.51,False,0
weight,3527.0,0.53,-0.79,0.29,0.0,,False,1384.25,147.38,5684.38,0,0,False,0,0.0,False,0
acceleration,16.8,0.28,0.42,0.18,0.04,,True,3.35,8.8,22.2,3,4,False,0,0.0,False,0
model_year,12.0,0.01,-1.18,0.05,0.0,,False,6.0,64.0,88.0,0,0,False,0,0.0,False,0
origin,2.0,0.92,-0.82,0.51,0.0,,False,1.0,-0.5,3.5,0,0,False,0,0.0,False,0
mpg,37.6,0.46,-0.51,0.33,0.0,,True,11.5,0.25,46.25,0,1,False,0,0.0,False,0


### 1.3. Data dictionary


| **FEATURE** | **DESCRIPTION** | **TYPE** | **DETAILS** | **COMMENTS**
|---|---|---|---|---|
| **displacement** | Cilindrada do motor (polegadas cúbicas)  | Contínua | [68, 455]    | Mede o volume total dos cilindros.
| **cylinders**    | Número de cilindros                      | Discreta | [3, 8]       | 
| **horsepower**   | Potência do motor (HP)                   | Contínua | [46, 230]    | 
| **weight**       | Peso do veículo (lbs)                    | Contínua | [1613, 5140] | 
| **acceleration** | Aceleração (0-60 mph em segundos)        | Contínua | [8.0, 24.8]  | Quanto menor, mais rápido.
| **model_year**   | Ano do modelo                            | Discreta | [70, 82]     | Codificado com os dois últimos dígitos do ano (70 = 1970).
| **origin**       | Origem do veículo                        | Nominal  | [1, 3]       | 1 = EUA, 2 = Europa, 3 = Japão (mapeamento usual deste dataset; a confirmar).
| **mpg**          | Milhas por galão (consumo)               | Contínua | [9.0, 46.6]  | Variável target. Maior = mais eficiente.


# III. Preparação dos dados