# 5.2.1 Carga de características desde diccionarios usando DictVectorizer

In [1]:
# Load IPython's autoreload extension; mode 2 picks up edits to imported
# modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices —
# consider narrowing it to a specific warning category.
warnings.filterwarnings("ignore")

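La clase DictVectorizer permite convertir vectores de características representados como listas de diccionarios en matrices numéricas que pueden ser usadas por los estimadores de sklearn. Las características categóricas (cadenas de texto) se codifican mediante one-hot, mientras que las numéricas se conservan sin cambios.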

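Note que la lista de diccionarios de la siguiente celda tiene la misma estructura que un arreglo de objetos JSON (registros clave-valor), aunque aquí se trata de objetos nativos de Python.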



In [7]:
#
# Each dictionary is one row of the dataset: a categorical feature
# ("city") and a numeric feature ("temperature").
#
measurements = [
    dict(city=city, temperature=temperature)
    for city, temperature in (
        ("Dubai", 33.0),
        ("London", 12.0),
        ("San Francisco", 18.0),
    )
]

measurements

[{'city': 'Dubai', 'temperature': 33.0},
 {'city': 'London', 'temperature': 12.0},
 {'city': 'San Francisco', 'temperature': 18.0}]

In [4]:
#
# Build the vectorizer.
#   separator: string placed between a feature's name and its value when
#              a one-hot column is created (e.g. 'city=Dubai').
#   sort:      whether feature_names_ and vocabulary_ are sorted when
#              fitting.
#
dictVectorizer = DictVectorizer(separator="=", sort=True)

#
# Fit: learn the feature-to-column mapping from the rows
#
dictVectorizer.fit(measurements)

#
# Transform: encode the rows, then convert the result to a dense array
#
X = dictVectorizer.transform(measurements).toarray()
X

array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.]])

In [5]:
#
# Fit-Transform: fit the vectorizer and encode the same rows in a single
# call, then convert the result to a dense array for display.
#
dictVectorizer.fit_transform(measurements).toarray()

array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.]])

In [8]:
#
# Names of the output columns produced by the fitted vectorizer
#
dictVectorizer.get_feature_names_out()

array(['city=Dubai', 'city=London', 'city=San Francisco', 'temperature'],
      dtype=object)

In [10]:
# Show the encoded matrix as a labelled table. Arguments are evaluated left
# to right, so the vectorizer is refitted before its column names are read.
pd.DataFrame(
    data=dictVectorizer.fit_transform(measurements).toarray(),
    columns=dictVectorizer.get_feature_names_out(),
)

Unnamed: 0,city=Dubai,city=London,city=San Francisco,temperature
0,1.0,0.0,0.0,33.0
1,0.0,1.0,0.0,12.0
2,0.0,0.0,1.0,18.0


In [11]:
#
# Inverse transformation: map the encoded matrix X back to dictionaries.
# As the displayed output shows, one-hot columns come back under their
# encoded names (e.g. 'city=Dubai': 1.0), not as the original
# {"city": "Dubai"} pairs.
#
dictVectorizer.inverse_transform(X)

[{'city=Dubai': 1.0, 'temperature': 33.0},
 {'city=London': 1.0, 'temperature': 12.0},
 {'city=San Francisco': 1.0, 'temperature': 18.0}]

In [18]:
# NOTE(review): looks like a leftover smoke-test cell that only prints a
# marker — consider removing it from the finished notebook.
print('ok_')

ok_
