# 0.4.4 ColumnTransformer para datos heterogeneos

https://www.youtube.com/watch?v=F1o4BIuhaf4

https://scikit-learn.org/stable/modules/compose.html#columntransformer-for-heterogeneous-data

In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

In [19]:
import sys
sys.path.append("../../../../") 

import utils.paths as path
from utils.paths2 import direcciones

## 0.4.4.1 Dataset de prueba

In [20]:
#
# Creación de un dataframe de prueba
#
X = pd.DataFrame(
    {
        "city": [
            "London",
            "London",
            "Paris",
            "Sallisaw",
        ],
        "title": [
            "His Last Bow",
            "How Watson Learned the Trick",
            "A Moveable Feast",
            "The Grapes of Wrath",
        ],
        "expert_rating": [
            5,
            3,
            4,
            5,
        ],
        "user_rating": [
            4,
            5,
            4,
            3,
        ],
    }
)

X

Unnamed: 0,city,title,expert_rating,user_rating
0,London,His Last Bow,5,4
1,London,How Watson Learned the Trick,3,5
2,Paris,A Moveable Feast,4,4
3,Sallisaw,The Grapes of Wrath,5,3


## 0.4.4.2 ColumnTransformer

In [21]:
column_trans = ColumnTransformer(
    # -------------------------------------------------------------------------
    # List of (name, transformer, columns) tuples specifying the transformer
    # objects to be applied to subsets of the data.
    transformers=[
        ("categories", OneHotEncoder(dtype="int"), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    # -------------------------------------------------------------------------
    # By default, only the specified columns in transformers are transformed
    # and combined in the output, and the non-specified columns are dropped.
    # (default of 'drop'). By specifying remainder='passthrough', all remaining
    # columns that were not specified in transformers will be automatically
    # passed through.
    remainder="drop",
)

column_trans.fit(X)

ColumnTransformer(transformers=[('categories', OneHotEncoder(dtype='int'),
                                 ['city']),
                                ('title_bow', CountVectorizer(), 'title')])

In [22]:
#
# Nombres de las columnas transformadas
#
column_trans.get_feature_names()

['categories__x0_London',
 'categories__x0_Paris',
 'categories__x0_Sallisaw',
 'title_bow__bow',
 'title_bow__feast',
 'title_bow__grapes',
 'title_bow__his',
 'title_bow__how',
 'title_bow__last',
 'title_bow__learned',
 'title_bow__moveable',
 'title_bow__of',
 'title_bow__the',
 'title_bow__trick',
 'title_bow__watson',
 'title_bow__wrath']

In [23]:
#
# X transformado
#
column_trans.transform(X).toarray()

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1]], dtype=int64)

In [25]:
#
# Visualización como un dataframe
#
pd.DataFrame(
    column_trans.transform(X).toarray(),
    columns=column_trans.get_feature_names(),
)

Unnamed: 0,categories__x0_London,categories__x0_Paris,categories__x0_Sallisaw,title_bow__bow,title_bow__feast,title_bow__grapes,title_bow__his,title_bow__how,title_bow__last,title_bow__learned,title_bow__moveable,title_bow__of,title_bow__the,title_bow__trick,title_bow__watson,title_bow__wrath
0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,1,0,0,1,1,1,0
2,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,1


## 0.4.4.3 Selección de columnas basadas en su tipo

In [26]:
ct = ColumnTransformer(
    [
        ("scale", StandardScaler(), make_column_selector(dtype_include=np.number)),
        (
            "onehot",
            OneHotEncoder(),
            make_column_selector(pattern="city", dtype_include=object),
        ),
    ]
)
ct.fit_transform(X)

array([[ 0.90453403,  0.        ,  1.        ,  0.        ,  0.        ],
       [-1.50755672,  1.41421356,  1.        ,  0.        ,  0.        ],
       [-0.30151134,  0.        ,  0.        ,  1.        ,  0.        ],
       [ 0.90453403, -1.41421356,  0.        ,  0.        ,  1.        ]])

## 0.4.4.4 Uso de “passthrough”

In [27]:
#
# Copia de las columnas no transformadas a la matriz
# de salida
#
column_trans = ColumnTransformer(
    [
        ("city_category", OneHotEncoder(dtype="int"), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    remainder="passthrough",
)

column_trans.fit_transform(X)

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 4],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 3, 5],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 4],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 5, 3]],
      dtype=int64)

## 0.4.4.5 Aplicación de un transformdor por defecto

In [28]:
column_trans = ColumnTransformer(
    [
        ("city_category", OneHotEncoder(), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    remainder=MinMaxScaler(),
)

column_trans.fit_transform(X)[:, -2:]

array([[1. , 0.5],
       [0. , 1. ],
       [0.5, 0.5],
       [1. , 0. ]])

## 0.4.4.6 Creación de un tranformador con make_column_transformer

In [29]:
column_trans = make_column_transformer(
    (OneHotEncoder(), ["city"]),
    (CountVectorizer(), "title"),
    remainder=MinMaxScaler(),
)

column_trans

ColumnTransformer(remainder=MinMaxScaler(),
                  transformers=[('onehotencoder', OneHotEncoder(), ['city']),
                                ('countvectorizer', CountVectorizer(),
                                 'title')])

In [30]:
print('ok_')

ok_
