# 5.1.3 Transformación de datos heterogéneos con ColumnTransformer

`ColumnTransformer` permite transformar las columnas individualmente o por grupos. <br>
Las columnas pueden ser especificadas por nombre o por tipo. <br>
Se pueden indicar transformaciones por defecto para las columnas no especificadas.

Links: 

https://www.youtube.com/watch?v=F1o4BIuhaf4

https://scikit-learn.org/stable/modules/compose.html#columntransformer-for-heterogeneous-data

In [None]:
# Load IPython's autoreload extension and enable mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


import warnings

# Show dataframes in full: no cap on the number of displayed columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Silence every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings("ignore")

## 5.1.3.1 Dataset de prueba

In [None]:
#
# Build the sample dataframe used throughout the notebook: one row per book,
# with two text columns (city, title) and two integer rating columns.
#
X = pd.DataFrame(
    data=[
        ("London", "His Last Bow", 5, 4),
        ("London", "How Watson Learned the Trick", 3, 5),
        ("Paris", "A Moveable Feast", 4, 4),
        ("Sallisaw", "The Grapes of Wrath", 5, 3),
    ],
    columns=["city", "title", "expert_rating", "user_rating"],
)

X

## 5.1.3.2 ColumnTransformer

In [None]:
# Build and fit the transformer in one expression; fit() returns the fitted
# estimator itself, so `column_trans` ends up bound to the fitted object.
column_trans = ColumnTransformer(
    # Each entry is a (name, transformer, columns) tuple: the transformer is
    # applied only to the listed subset of the input columns.
    [
        # One-hot encode the city; the column is given as a list so the
        # encoder receives a 2D selection.
        ("categories", OneHotEncoder(dtype="int"), ["city"]),
        # Bag-of-words on the title; the column is given as a plain string so
        # the vectorizer receives a 1D sequence of documents.
        ("title_bow", CountVectorizer(), "title"),
    ],
    # Columns not named above are dropped from the output ("drop" is also the
    # default). Use remainder="passthrough" to keep them unchanged instead.
    remainder="drop",
).fit(X)

column_trans

In [None]:
#
# Names of the transformed (output) columns
#
column_trans.get_feature_names_out()

In [None]:
#
# Transformed X, converted with .toarray() to a dense array for display
#
column_trans.transform(X).toarray()

In [None]:
#
# Show the transformed matrix as a dataframe, labelling each column with the
# feature name reported by the fitted transformer.
#
pd.DataFrame(
    columns=column_trans.get_feature_names_out(),
    data=column_trans.transform(X).toarray(),
)

## 5.1.3.3 Selección de columnas basadas en su tipo

In [None]:
# Columns are chosen by selectors instead of explicit names.
ct = ColumnTransformer(
    transformers=[
        # Standardize every numeric column.
        (
            "scale",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
        # One-hot encode object-dtype columns whose name matches "city".
        (
            "onehot",
            OneHotEncoder(),
            make_column_selector(dtype_include=object, pattern="city"),
        ),
    ],
)
ct.fit_transform(X)

In [None]:
# Show the transformed data as a dataframe labelled with the output feature names.
pd.DataFrame(
    columns=ct.get_feature_names_out(),
    data=ct.transform(X),
)

## 5.1.3.4 Uso de “passthrough”

In [None]:
#
# remainder="passthrough" copies the columns that no transformer handles
# into the output matrix unchanged.
#
column_trans = ColumnTransformer(
    transformers=[
        ("city_category", OneHotEncoder(dtype="int"), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    remainder="passthrough",
)

column_trans.fit_transform(X)

## 5.1.3.5 Aplicación de un transformador por defecto

In [None]:
# Passing an estimator as `remainder` applies it to every column that is not
# listed in the transformers (here, MinMaxScaler as the default transformer).
column_trans = ColumnTransformer(
    transformers=[
        ("city_category", OneHotEncoder(), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    remainder=MinMaxScaler(),
)

# Keep only the last two output columns, presumably the two scaled rating
# columns produced by the remainder transformer.
column_trans.fit_transform(X)[:, -2:]

## 5.1.3.6 Creación de un transformador con make_column_transformer

In [None]:
# Shorthand constructor: each entry is a (transformer, columns) pair with no
# explicit name given for the step.
column_trans = make_column_transformer(
    (
        OneHotEncoder(),
        ["city"],
    ),
    (
        CountVectorizer(),
        "title",
    ),
    remainder=MinMaxScaler(),
)

# Display the (unfitted) transformer.
column_trans

In [None]:
# Final marker cell: printing this string shows the notebook ran to the end.
print("ok_")