# Combining numerical and categorical variables

In [2]:
import pandas as pd

target_name = "class"

# `"education-num"` is a duplicated column (as stated in the first notebook),
# so it is dropped right after loading.
adult_census = pd.read_csv("../datasets/adult-census.csv").drop(
    columns="education-num"
)

# Separate the column to predict from the features used to predict it.
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name])

## Using the ColumnTransformer

In [4]:
# Inspect the dtype of every feature column: the columns are split into
# numerical and categorical groups based on their dtype.
data.dtypes

age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [5]:
from sklearn.compose import make_column_selector as selector

# Columns stored with the `object` dtype are treated as categorical;
# every remaining column is treated as numerical.
categorical_columns_selector = selector(dtype_include="object")
numerical_columns_selector = selector(dtype_exclude="object")

# Calling a selector on the dataframe returns the matching column names.
categorical_columns = categorical_columns_selector(data)
numerical_columns = numerical_columns_selector(data)

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numerical features: standardize each column.
numerical_preprocessor = StandardScaler()

# Categorical features: one-hot encode; `handle_unknown="ignore"` avoids an
# error when a category not seen during `fit` shows up at transform time.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [8]:
from sklearn.compose import ColumnTransformer

# Each entry is (name, transformer, columns): the transformer is applied only
# to the listed columns and the results are concatenated side by side.
preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ],
)

In [17]:
# Fit the preprocessor on the whole dataset and print the transformed
# features. The recorded output lists one `(row, column)  value` entry per
# stored element: 1.0 for the one-hot encoded columns and scaled values for
# the numerical ones.
print(preprocessor.fit_transform(data))

  (0, 4)	1.0
  (0, 10)	1.0
  (0, 29)	1.0
  (0, 39)	1.0
  (0, 50)	1.0
  (0, 55)	1.0
  (0, 59)	1.0
  (0, 99)	1.0
  (0, 102)	-0.9951289322402804
  (0, 103)	-0.144803531037397
  (0, 104)	-0.21712709919583073
  (0, 105)	-0.03408696347500956
  (1, 4)	1.0
  (1, 20)	1.0
  (1, 27)	1.0
  (1, 37)	1.0
  (1, 47)	1.0
  (1, 57)	1.0
  (1, 59)	1.0
  (1, 99)	1.0
  (1, 102)	-0.04694150913298843
  (1, 103)	-0.144803531037397
  (1, 104)	-0.21712709919583073
  (1, 105)	0.7729297452241753
  (2, 2)	1.0
  :	:
  (48839, 105)	-0.03408696347500956
  (48840, 4)	1.0
  (48840, 20)	1.0
  (48840, 29)	1.0
  (48840, 33)	1.0
  (48840, 50)	1.0
  (48840, 57)	1.0
  (48840, 59)	1.0
  (48840, 99)	1.0
  (48840, 102)	-1.2139414144958094
  (48840, 103)	-0.144803531037397
  (48840, 104)	-0.21712709919583073
  (48840, 105)	-1.6481203808733793
  (48841, 5)	1.0
  (48841, 20)	1.0
  (48841, 27)	1.0
  (48841, 36)	1.0
  (48841, 52)	1.0
  (48841, 57)	1.0
  (48841, 58)	1.0
  (48841, 99)	1.0
  (48841, 102)	0.9741834080594799
  (48841, 103)

![columntransformer diagram](../figures/api_diagram-columntransformer.svg)

## Using the ColumnTransformer in a machine learning pipeline

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Chain the column-wise preprocessing with the classifier so that both are
# fitted and applied together through a single estimator.
# The forest is seeded (same seed as the train/test split) so that the fitted
# model, and therefore the predictions and scores computed from it, are
# reproducible from one run of the notebook to the next.
model = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))
model

In [10]:
from sklearn.model_selection import train_test_split

# Hold out part of the samples for evaluation; the fixed `random_state` keeps
# the split identical across runs.
(
    data_train,
    data_test,
    target_train,
    target_test,
) = train_test_split(data, target, random_state=42)

In [11]:
# Train the full pipeline (preprocessing + classifier) on the training set.
# The return value is assigned to `_` so it is not displayed as cell output.
_ = model.fit(data_train, target_train)

In [12]:
# Predict the class of the first five test samples.
model.predict(data_test.head())

array([' <=50K', ' <=50K', ' >50K', ' <=50K', ' >50K'], dtype=object)

In [13]:
# True classes of the same first five test samples, for comparison.
target_test.head()

7762      <=50K
23881     <=50K
30507      >50K
28911     <=50K
19484     <=50K
Name: class, dtype: object

In [14]:
# Evaluate the fitted pipeline on the held-out test set
# (for a scikit-learn classifier, `score` reports the mean accuracy).
model.score(data_test, target_test)

0.8560314470559331

In [15]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the whole pipeline on the full dataset; the
# result is a dict with per-fold fit times, score times and test scores.
cv_results = cross_validate(estimator=model, X=data, y=target, cv=5)
cv_results

{'fit_time': array([38.03345513, 44.92998934, 41.27348018, 42.25141931, 45.04457283]),
 'score_time': array([0.27126646, 0.29560232, 0.2477026 , 0.31544304, 0.28247905]),
 'test_score': array([0.84246085, 0.8414372 , 0.84643735, 0.85053235, 0.85616298])}