In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

def criar_transformer_categoricos(estrategia='balanceada'):
    """
    Build an (unfitted) ColumnTransformer that handles every feature column.

    Numeric columns are always passed through untouched; only the encoding
    of the categorical columns depends on the chosen strategy.

    Parameters
    ----------
    estrategia : str
        'balanceada' - mix of one-hot and ordinal encoding (default)
        'onehot'     - one-hot for every categorical column
                       (high dimensionality)
        'ordinal'    - ordinal for every categorical column
                       (low dimensionality)

    Returns
    -------
    ColumnTransformer
        A new transformer; the caller is responsible for fitting it.

    Raises
    ------
    ValueError
        If ``estrategia`` is not one of the three names above.
    """
    # Reject unsupported strategies before building anything.
    if estrategia not in ('balanceada', 'onehot', 'ordinal'):
        raise ValueError(f"Estratégia '{estrategia}' não reconhecida")

    # --- column groups -----------------------------------------------------
    colunas_numericas = [
        'normalized-losses', 'wheel-base', 'length', 'width', 'height',
        'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-ratio',
        'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'
    ]
    colunas_binarias = ['fuel-type', 'aspiration', 'num-of-doors', 'engine-location']
    colunas_nominais_pequenas = ['body-style', 'drive-wheels']
    colunas_nominais_grandes = ['make', 'engine-type', 'fuel-system']

    # The cylinder count is spelled out in words, so its natural order has to
    # be given explicitly to the ordinal encoder.
    coluna_cilindros = ['num-of-cylinders']
    ordem_cilindros = [['two', 'three', 'four', 'five', 'six', 'eight', 'twelve']]

    # Every strategy starts with the numeric columns kept as they are
    # (any scaling is left to a later step).
    etapas = [('num', 'passthrough', colunas_numericas)]

    if estrategia == 'onehot':
        # One encoder over all categorical columns, cylinders included.
        etapas.append((
            'cat',
            OneHotEncoder(drop='first', sparse_output=False),
            colunas_binarias
            + colunas_nominais_pequenas
            + colunas_nominais_grandes
            + coluna_cilindros,
        ))
        return ColumnTransformer(transformers=etapas)

    # 'balanceada' and 'ordinal' both encode the two-valued columns ordinally.
    etapas.append(('bin', OrdinalEncoder(), colunas_binarias))

    if estrategia == 'balanceada':
        # Low-cardinality nominals get one-hot columns; high-cardinality ones
        # are ordinal-encoded to avoid a dimensionality blow-up.
        etapas.append((
            'nom_small',
            OneHotEncoder(drop='first', sparse_output=False),
            colunas_nominais_pequenas,
        ))
        etapas.append(('nom_large', OrdinalEncoder(), colunas_nominais_grandes))
    else:
        # 'ordinal': a single ordinal encoder covers all nominal columns.
        etapas.append((
            'nom',
            OrdinalEncoder(),
            colunas_nominais_pequenas + colunas_nominais_grandes,
        ))

    # Cylinders always come last, encoded with their explicit ordering.
    etapas.append(('ord', OrdinalEncoder(categories=ordem_cilindros), coluna_cilindros))

    return ColumnTransformer(transformers=etapas)


In [5]:

import os  # imported in this cell: only needed to create the output folder below

base_name = "autos"

# Column names applied to every .dat file read in this cell (the files are
# read with header=None); 'symboling' is the target column.
names = [
    'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price', 'symboling'
]


def _read_dat(path, columns):
    """Read one comma-separated, header-less .dat file into a DataFrame.

    Everything from an '@' to the end of a line is treated as a comment
    (presumably the metadata header of the file - confirm against the data),
    and blanks following a comma are stripped.
    """
    return pd.read_csv(
        path,
        header=None,
        delimiter=',',
        comment='@',
        names=columns,
        skipinitialspace=True,  # strip the spaces that follow each comma
    )


completed_data = _read_dat(f"{base_name}/{base_name}/{base_name}.dat", names)

# Fit the encoders once on the complete dataset so that every fold below is
# encoded with the same category-to-code mapping.
X_complete = completed_data.drop('symboling', axis=1)
transformer = criar_transformer_categoricos()
transformer.fit(X_complete)

# to_csv does not create missing parent folders, so make sure the output
# directory exists before the first write.
output_dir = f"{base_name}/{base_name}-transformed"
os.makedirs(output_dir, exist_ok=True)

for i in range(1, 6):
    print(f"Transform-{i}")
    file_name = f"{base_name}-5-{i}"

    data_train = _read_dat(f"{base_name}/{base_name}-5-fold/{file_name}tra.dat", names)
    data_test = _read_dat(f"{base_name}/{base_name}-5-fold/{file_name}tst.dat", names)

    # Split features and target for both partitions of the fold.
    X_train = data_train.drop('symboling', axis=1)
    y_train = data_train['symboling']
    X_test = data_test.drop('symboling', axis=1)
    y_test = data_test['symboling']

    # Encode with the already-fitted transformer (no refit per fold).
    X_train_transformed = transformer.transform(X_train)
    X_test_transformed = transformer.transform(X_test)

    # Re-attach the target as the last column; .values assigns positionally
    # instead of aligning on the index.
    df_train_transformed = pd.DataFrame(X_train_transformed)
    df_train_transformed['symboling'] = y_train.values
    df_train_transformed.to_csv(f"{output_dir}/{file_name}tra.dat", index=False, header=False)

    df_test_transformed = pd.DataFrame(X_test_transformed)
    df_test_transformed['symboling'] = y_test.values
    df_test_transformed.to_csv(f"{output_dir}/{file_name}tst.dat", index=False, header=False)


Transform-1
Transform-2
Transform-3
Transform-4
Transform-5


In [40]:
# Display the encoded training matrix currently bound to X_train_transformed.
# It is assigned inside the fold loop, so this is presumably the last fold
# processed - confirm after a fresh Restart & Run All.
pd.DataFrame(X_train_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,168.0,94.5,168.7,64.0,52.6,2169.0,98.0,3.19,3.03,9.0,...,0.0,0.0,1.0,0.0,0.0,1.0,14.0,2.0,1.0,2.0
1,113.0,93.1,166.8,64.2,54.1,1950.0,91.0,3.08,3.15,9.0,...,0.0,0.0,1.0,0.0,1.0,0.0,6.0,2.0,1.0,2.0
2,87.0,95.7,158.7,63.6,54.5,1985.0,92.0,3.05,3.03,9.0,...,0.0,1.0,0.0,0.0,1.0,0.0,14.0,2.0,1.0,2.0
3,125.0,96.3,172.4,65.4,51.6,2365.0,122.0,3.35,3.46,8.5,...,0.0,0.0,1.0,0.0,1.0,0.0,8.0,2.0,1.0,2.0
4,194.0,91.3,170.7,67.9,49.7,3139.0,181.0,3.43,3.27,7.8,...,0.0,1.0,0.0,0.0,0.0,1.0,9.0,4.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,188.0,101.2,176.8,64.8,54.3,2765.0,164.0,3.31,3.19,9.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,4.0,4.0
123,94.0,97.3,171.7,65.5,55.7,2300.0,109.0,3.19,3.40,10.0,...,0.0,0.0,1.0,0.0,1.0,0.0,15.0,2.0,4.0,2.0
124,65.0,102.4,175.6,66.5,54.9,2414.0,122.0,3.31,3.54,8.7,...,0.0,0.0,1.0,0.0,1.0,0.0,14.0,2.0,4.0,2.0
125,128.0,94.5,165.3,63.8,54.5,1918.0,97.0,3.15,3.29,9.4,...,0.0,0.0,1.0,0.0,1.0,0.0,9.0,2.0,1.0,2.0


In [10]:
# Display the untransformed training fold currently bound to data_train
# (which fold it is depends on kernel state - confirm after a fresh run).
data_train

Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,symboling
0,168.0,toyota,gas,std,two,sedan,rwd,front,94.5,168.7,...,2bbl,3.19,3.03,9.0,70.0,4800.0,29.0,34.0,8058.0,1
1,113.0,mazda,gas,std,four,sedan,fwd,front,93.1,166.8,...,2bbl,3.08,3.15,9.0,68.0,5000.0,31.0,38.0,7395.0,1
2,125.0,mitsubishi,gas,std,four,sedan,fwd,front,96.3,172.4,...,2bbl,3.35,3.46,8.5,88.0,5000.0,25.0,32.0,6989.0,1
3,194.0,nissan,gas,turbo,two,hatchback,rwd,front,91.3,170.7,...,mpfi,3.43,3.27,7.8,200.0,5200.0,17.0,23.0,19699.0,3
4,128.0,nissan,gas,std,four,sedan,fwd,front,100.4,181.7,...,mpfi,3.43,3.27,9.0,152.0,5200.0,17.0,22.0,13499.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,94.0,volkswagen,gas,std,four,sedan,fwd,front,97.3,171.7,...,mpfi,3.19,3.40,10.0,100.0,5500.0,26.0,32.0,9995.0,2
123,65.0,toyota,gas,std,four,sedan,fwd,front,102.4,175.6,...,mpfi,3.31,3.54,8.7,92.0,4200.0,27.0,32.0,10898.0,-1
124,128.0,nissan,gas,std,two,sedan,fwd,front,94.5,165.3,...,2bbl,3.15,3.29,9.4,69.0,5200.0,31.0,37.0,6649.0,1
125,186.0,porsche,gas,std,two,hatchback,rwd,front,94.5,168.9,...,mpfi,3.94,3.11,9.5,143.0,5500.0,19.0,27.0,22018.0,3
