In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [7]:
def load_adult_dataset(path='adult.data', return_mappings=False):
    """Load the Adult CSV file and return a fully numeric DataFrame.

    Processing steps, in order:

    0. Read ``path`` as a header-less CSV using the 15 Adult column names
       (whitespace after each delimiter is skipped).
    1. Treat the literal string ``'?'`` as missing and drop every row that
       contains a missing value. The original row index is kept (not reset).
    2. Replace each categorical column by integer codes. Codes follow the
       sorted order of the distinct values present in the column.
    3. Add ``age_group``: ``age`` binned into (0, 30], (30, 50], (50, 100] and
       encoded so that the code follows the bin order
       (young=0, middle_aged=1, old=2). Ages outside (0, 100] get code -1.
    4. Turn ``income`` into a binary target: 1 where the raw value is exactly
       ``'>50K'``, 0 otherwise.

    Parameters
    ----------
    path : str or path-like, default 'adult.data'
        Location of the CSV file to read.
    return_mappings : bool, default False
        When True, also return the code -> label lookup for every encoded
        column so the integer codes can be decoded later.

    Returns
    -------
    pandas.DataFrame
        The processed frame (15 original columns plus ``age_group``).
    (pandas.DataFrame, dict)
        Only when ``return_mappings`` is True. The dict maps each encoded
        column name to a list of labels whose position is the integer code.
    """
    columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
               'marital-status', 'occupation', 'relationship', 'race', 'sex',
               'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
    categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                           'relationship', 'race', 'sex', 'native-country']
    age_bins = [0, 30, 50, 100]
    age_labels = ['young', 'middle_aged', 'old']

    # 0 -- load the Adult dataset
    df = pd.read_csv(path, names=columns, skipinitialspace=True)

    # 1 -- drop rows with missing data ('?' is the file's missing-value marker)
    df = df.replace('?', np.nan).dropna()

    # 2 -- turn the string columns into integer codes.
    # Categories are sorted, so the codes match what a sorted label encoding
    # would give; the lookup lists are kept so the codes stay decodable.
    mappings = {}
    for col in categorical_columns:
        as_category = df[col].astype('category')
        mappings[col] = as_category.cat.categories.tolist()
        df[col] = as_category.cat.codes.astype('int64')

    # 3 -- split age into value ranges.
    # Use the categorical codes directly: they follow the bin order, whereas
    # an alphabetical label encoding would give middle_aged=0, old=1, young=2
    # and lose the ordering of the ranges.
    age_group = pd.cut(df['age'], bins=age_bins, labels=age_labels)
    mappings['age_group'] = list(age_labels)
    df['age_group'] = age_group.cat.codes.astype('int64')

    # Binary target: the positive class is spelled '>50K' (capital K)
    df['income'] = (df['income'] == '>50K').astype(int)

    if return_mappings:
        return df, mappings
    return df

In [8]:
# Load the preprocessed dataset and print a quick overview of it:
# shape, column names, target class counts and the first five rows.
df = load_adult_dataset()

print("Shape do dataset:", df.shape)
print("Colunas:", df.columns.tolist())

# sep="\n" puts the heading on its own line, followed by the object's text form
print("Distribuição do target:", df['income'].value_counts(), sep="\n")
print("\nPrimeiras 5 linhas:", df.head(), sep="\n")

Shape do dataset: (30162, 16)
Colunas: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income', 'age_group']
Distribuição do target:
income
0    22654
1     7508
Name: count, dtype: int64

Primeiras 5 linhas:
   age  workclass  fnlwgt  education  education-num  marital-status  \
0   39          5   77516          9             13               4   
1   50          4   83311          9             13               2   
2   38          2  215646         11              9               0   
3   53          2  234721          1              7               2   
4   28          2  338409          9             13               2   

   occupation  relationship  race  sex  capital-gain  capital-loss  \
0           0             1     4    1          2174             0   
1           3             0     4    1             0             0   
2        