# Combining numerical and categorical variables

In [3]:
import pandas as pd

# Load the census data and discard the "education-num" column in a single chain.
adult_census = (
    pd.read_csv("../datasets/adult-census.csv")
    .drop(columns=["education-num"])
)

# Separate the prediction target (the "class" column) from the feature columns.
target_name = "class"
data = adult_census.drop(columns=[target_name])
target = adult_census[target_name]

## Using the ColumnTransformer

In [4]:
# Inspect the dtype of every feature column; the dtype is what the
# column selectors further down use to tell numerical from categorical columns.
data.dtypes

age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [7]:
from sklearn.compose import make_column_selector as selector

# Build dtype-based selectors: object-dtype columns are treated as categorical,
# every other dtype as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# A selector is a callable; calling it on the dataframe performs the actual
# selection and gives back the matching columns.
categorical_columns = categorical_columns_selector(data)
numerical_columns = numerical_columns_selector(data)

In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One preprocessor per kind of column:
#   - numerical columns   -> StandardScaler
#   - categorical columns -> OneHotEncoder, configured with handle_unknown="ignore"
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [13]:
from sklearn.compose import ColumnTransformer

# Each entry is a (name, transformer, columns) tuple:
#   name        - a label of your choosing for this step
#   transformer - the preprocessor object to apply
#   columns     - the columns that preprocessor is applied to
column_transformers = [
    ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ("standard_scaler", numerical_preprocessor, numerical_columns),
]
preprocessor = ColumnTransformer(column_transformers)

![columntransformer diagram](../figures/api_diagram-columntransformer.svg)

The data is first split into numerical and categorical columns.

Then we specify how each group of columns should be transformed.

Finally, the two transformed outputs are merged (concatenated) into a single data object.

## Using the ColumnTransformer in a machine learning pipeline

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Chain the column preprocessing and the classifier into a single estimator,
# so that fitting/predicting always applies the preprocessing first.
# random_state is fixed because a random forest is a randomized learner:
# without a seed, every fit builds different trees and the scores reported
# further down cannot be reproduced from one run to the next.
model = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))
model

## Split data into train and test set

In [15]:
from sklearn.model_selection import train_test_split

# Hold out part of the samples as a test set; random_state pins the shuffle
# so the same split is obtained on every run.
(data_train, data_test, target_train, target_test) = train_test_split(
    data, target, random_state=42
)

# (rows, columns) of the training and the test features
data_train.shape, data_test.shape

((36631, 12), (12211, 12))

## Train (fit) the RandomForest model

In [17]:
# Fit the whole pipeline (preprocessing + forest) on the training split only.
# The result is bound to `_` so the estimator repr is not echoed as cell output.
_ = model.fit(X=data_train, y=target_train)

In [18]:
# Predict the class of the first 5 samples of the test set
model.predict(data_test.iloc[:5])

array([' <=50K', ' <=50K', ' >50K', ' <=50K', ' >50K'], dtype=object)

In [19]:
# True labels of those same first 5 test samples, to compare with the predictions
target_test.iloc[:5]

7762      <=50K
23881     <=50K
30507      >50K
28911     <=50K
19484     <=50K
Name: class, dtype: object

## Predict and score on the entire test set

In [20]:
# Score the fitted pipeline on the complete held-out test set
test_score = model.score(data_test, target_test)
test_score

0.853165178937024

## Cross-validate this pipeline

In [21]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the full pipeline on the complete dataset
cv_result = cross_validate(estimator=model, X=data, y=target, cv=5)

In [22]:
# Show the dictionary returned by cross_validate: it holds one array each for
# 'fit_time', 'score_time' and 'test_score', with one value per fold.
cv_result

{'fit_time': array([40.30474186, 40.61124492, 40.28244829, 39.05785155, 39.7747972 ]),
 'score_time': array([0.30269694, 0.26687574, 0.30362344, 0.23479462, 0.25656652]),
 'test_score': array([0.84235848, 0.84020882, 0.8474611 , 0.85022523, 0.85462735])}

## Note on reproducibility
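`RandomForestClassifier` has a `random_state` argument that defaults to `None`, which means each fit can build a different forest and produce slightly different scores. For reproducible results, pass an integer value, e.g. `RandomForestClassifier(random_state=42)`.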