In [81]:
from sklearn.datasets import fetch_openml
adult = fetch_openml("adult", version=2)  
df = adult.frame
df.dropna()
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [82]:
# Scaling and Encoding the data
# Standard for numerical, label for the class, and one hot encoder for categorical (non ordinal data)
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pandas as pd

numerical_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week' ]
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output = False) # Sparse false to return dense np array
labelEncoder = LabelEncoder()

df[numerical_features] = scaler.fit_transform(df[numerical_features])

# OneHotEncoder creates sum(Ni) new columns, where N is the number of values a categorical feature can take, i is each categorical feature
# For example, a gender column (M, F) will crete 2 new columns. [M=0, F=1] for a female for example
# This causes the new data to be larger in columns than the original so a new DF must be made, you cannot simply set the dfs equal
encoded_categorical = encoder.fit_transform(df[categorical_features])

# Column names will now change too, OneHotEncoder has a method to find the new column names
encoded_categorical_cols = encoder.get_feature_names_out(categorical_features)

# Convert this transformed data into a dataframe
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoded_categorical_cols)

# Replace the old columns
df = df.drop(columns=categorical_features)
df = pd.concat([df, encoded_categorical_df], axis=1)


df['class'] = labelEncoder.fit_transform(df['class'])
print(df)

            age    fnlwgt  education-num  capital-gain  capital-loss  \
0     -0.995129  0.351675      -1.197259     -0.144804     -0.217127   
1     -0.046942 -0.945524      -0.419335     -0.144804     -0.217127   
2     -0.776316  1.394723       0.747550     -0.144804     -0.217127   
3      0.390683 -0.277844      -0.030373      0.886874     -0.217127   
4     -1.505691 -0.815954      -0.030373     -0.144804     -0.217127   
...         ...       ...            ...           ...           ...   
48837 -0.849254  0.640492       0.747550     -0.144804     -0.217127   
48838  0.098933 -0.334178      -0.419335     -0.144804     -0.217127   
48839  1.411808 -0.357510      -0.419335     -0.144804     -0.217127   
48840 -1.213941  0.111984      -0.419335     -0.144804     -0.217127   
48841  0.974183  0.930494      -0.419335      1.871315     -0.217127   

       hours-per-week  class  workclass_Federal-gov  workclass_Local-gov  \
0           -0.034087      0                    0.0        