## Using XGBoost in pipelines

In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np

### Encoding categorical columns I: LabelEncoder

In [3]:
df = pd.read_csv('chapter4-data.csv')

In [4]:

from sklearn.preprocessing import LabelEncoder


df.LotFrontage = df.LotFrontage.fillna(0)


categorical_mask = (df.dtypes == object)


categorical_columns = df.columns[categorical_mask].tolist()


print(df[categorical_columns].head())


le = LabelEncoder()


df[categorical_columns] = df[categorical_columns].apply(lambda x: le.fit_transform(x))


print(df[categorical_columns].head())

  MSZoning PavedDrive Neighborhood BldgType HouseStyle
0       RL          Y      CollgCr     1Fam     2Story
1       RL          Y      Veenker     1Fam     1Story
2       RL          Y      CollgCr     1Fam     2Story
3       RL          Y      Crawfor     1Fam     2Story
4       RL          Y      NoRidge     1Fam     2Story
   MSZoning  PavedDrive  Neighborhood  BldgType  HouseStyle
0         3           2             5         0           5
1         3           2            24         0           2
2         3           2             5         0           5
3         3           2             6         0           5
4         3           2            15         0           5


### Encoding categorical columns II: OneHotEncoder

In [5]:

from sklearn.preprocessing import OneHotEncoder


ohe = OneHotEncoder(sparse=False)


df_encoded = ohe.fit_transform(df)


print(df_encoded[:5, :])


print(df.shape)


print(df_encoded.shape)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(1460, 22)
(1460, 4829)


### Encoding categorical columns III: DictVectorizer

In [6]:

from sklearn.feature_extraction import DictVectorizer


df_dict = df.to_dict("records")


dv = DictVectorizer(sparse=False)


df_encoded = dv.fit_transform(df_dict)


print(df_encoded[:5,:])


print(dv.vocabulary_)

[[3.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 2.000e+00 5.480e+02
  1.710e+03 1.000e+00 5.000e+00 8.450e+03 6.500e+01 6.000e+01 3.000e+00
  5.000e+00 5.000e+00 7.000e+00 2.000e+00 0.000e+00 2.085e+05 0.000e+00
  2.003e+03]
 [3.000e+00 0.000e+00 0.000e+00 1.000e+00 1.000e+00 2.000e+00 4.600e+02
  1.262e+03 0.000e+00 2.000e+00 9.600e+03 8.000e+01 2.000e+01 3.000e+00
  2.400e+01 8.000e+00 6.000e+00 2.000e+00 0.000e+00 1.815e+05 1.000e+00
  1.976e+03]
 [3.000e+00 0.000e+00 1.000e+00 0.000e+00 1.000e+00 2.000e+00 6.080e+02
  1.786e+03 1.000e+00 5.000e+00 1.125e+04 6.800e+01 6.000e+01 3.000e+00
  5.000e+00 5.000e+00 7.000e+00 2.000e+00 1.000e+00 2.235e+05 2.000e+00
  2.001e+03]
 [3.000e+00 0.000e+00 1.000e+00 0.000e+00 1.000e+00 1.000e+00 6.420e+02
  1.717e+03 0.000e+00 5.000e+00 9.550e+03 6.000e+01 7.000e+01 3.000e+00
  6.000e+00 5.000e+00 7.000e+00 2.000e+00 1.000e+00 1.400e+05 3.000e+00
  1.915e+03]
 [4.000e+00 0.000e+00 1.000e+00 0.000e+00 1.000e+00 2.000e+00 8.360e+02
  2.198e+03 