<a href="https://colab.research.google.com/github/pea-sys/Til/blob/master/XfeatTutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

pfnetが開発した特徴量エンジニアリングのライブラリ xfeatのTutorialの写経です。  
気に入ったら、テーブルコンペで使う目論見。  
大きいデータで試さないと分からないが、恐らく最適化も頑張っていると思われる。  
[参考: xfeat の紹介スライド](https://github.com/pfnet-research/xfeat/blob/master/_docs/xfeat_slides.pdf)

In [23]:
!pip install -q https://github.com/pfnet-research/xfeat/archive/master.zip

  Building wheel for xfeat (setup.py) ... [?25l[?25hdone


In [24]:
import seaborn as sns
import pandas as pd
import xfeat
from xfeat import SelectCategorical, LabelEncoder, Pipeline
import numpy as np

In [25]:
# List the names of the example datasets available through seaborn.
sns.get_dataset_names()


No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.





['anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'tips',
 'titanic']

In [26]:
# Load seaborn's "diamonds" example dataset and preview the first rows.
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [27]:
# Inspect column dtypes, non-null counts and memory usage of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [28]:
# Pass the frame through xfeat's `compress_df` helper and persist the result
# in the feather format as "data.ftr", which the later cells read back.
compressed_df = xfeat.utils.compress_df(df)
compressed_df.to_feather("data.ftr")

In [29]:
# Read the feather file back to check the serialized data.
# From here on `df` refers to the frame loaded from "data.ftr".
df = pd.read_feather("data.ftr")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.799999,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.900002,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.400002,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.299999,58.0,335,4.34,4.35,2.75


In [30]:
# Compare with the earlier df.info(): the reloaded numeric columns are
# float32/int16 instead of float64/int64, and the reported memory usage is lower.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float32
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float32
 5   table    53940 non-null  float32
 6   price    53940 non-null  int16  
 7   x        53940 non-null  float32
 8   y        53940 non-null  float32
 9   z        53940 non-null  float32
dtypes: float32(6), int16(1), object(3)
memory usage: 2.6+ MB


# カテゴリカラム
xfeatのデータ変換クラスは、入力としてDataFrameを使用し、出力としてDataFrameを使用します。pandasとcuDFの両方のDataFrameをサポートしています。

xfeat.SelectCategoricalは、入力データフレームからカテゴリデータの列のみを抽出します。

xfeat.Pipelineは、データ変換オブジェクトを順次結合します。

In [31]:
# Fit the categorical-column selector on the frame and preview the columns it keeps.
categorical_selector = xfeat.SelectCategorical()
categorical_selector.fit_transform(df).head()

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Good,E,VS1
3,Premium,I,VS2
4,Good,J,SI2


In [40]:
# Label-encode every categorical column of the frame.
# `output_suffix` names the column that receives the encoded values; an empty
# suffix stores them under the original column name.
label_encoding_steps = [
    SelectCategorical(exclude_cols=[]),
    LabelEncoder(output_suffix=""),
]
encoder = Pipeline(label_encoding_steps)

In [41]:
# Fit the pipeline on the full frame and preview the label-encoded columns.
encoder.fit_transform(df).head()

Unnamed: 0,cut,color,clarity
0,0,0,0
1,1,0,1
2,2,0,2
3,1,1,3
4,2,2,0


xfeat.cat_encoder.ConcatCombinationは、入力されたカラムを結合して新しいカラムを作成します。

In [50]:
from xfeat import ConcatCombination

# Concatenate the categorical columns pairwise, then label-encode the new columns.
select_categorical = SelectCategorical(exclude_cols=[])

# `r=2` sets how many columns are combined at a time. With many categorical
# columns, the `input_cols` keyword can restrict which ones are combined.
# With `drop_origin=True` the output below contains only the combined columns.
concat_pairs = ConcatCombination(drop_origin=True, output_suffix="", r=2)

encode_labels = LabelEncoder(output_suffix="")

encoder = Pipeline([select_categorical, concat_pairs, encode_labels])
encoder.fit_transform(df).head()

Unnamed: 0,cutcolor,cutclarity,colorclarity
0,0,0,0
1,1,1,1
2,2,2,2
3,3,3,3
4,4,4,4


# 数値カラム
xfeat.SelectNumericalは、入力データフレームから数値データの列のみを抽出します。

xfeat.ArithmeticCombinationsは、算術的な組み合わせを適用して新しい列を作成します。

In [53]:
from xfeat import SelectNumerical

# Keep the numerical columns, leaving out `z`, and preview the result.
numerical_selector = SelectNumerical(exclude_cols=['z'])
numerical_selector.fit_transform(df).head()

Unnamed: 0,carat,depth,table,price,x,y
0,0.23,61.5,55.0,326,3.95,3.98
1,0.21,59.799999,61.0,326,3.89,3.84
2,0.23,56.900002,65.0,327,4.05,4.07
3,0.29,62.400002,58.0,334,4.2,4.23
4,0.31,63.299999,58.0,335,4.34,4.35


In [55]:
from xfeat import ArithmeticCombinations

# Multiply the numerical columns (except `z`) two at a time.
# With `drop_origin=True` the output below contains only the product columns.
select_numerical = SelectNumerical(exclude_cols=["z"])
multiply_pairs = ArithmeticCombinations(
    drop_origin=True, operator="*", r=2, output_suffix="",
)

encoder = Pipeline([select_numerical, multiply_pairs])
encoder.fit_transform(df).head()

Unnamed: 0,caratdepth,carattable,caratprice,caratx,caraty,depthtable,depthprice,depthx,depthy,tableprice,tablex,tabley,pricex,pricey,xy
0,14.145,12.650001,74.980003,0.9085,0.9154,3382.5,20049.0,242.925003,244.770004,17930.0,217.25,218.899994,1287.700073,1297.47998,15.721001
1,12.558,12.809999,68.459999,0.8169,0.8064,3647.800049,19494.798828,232.622009,229.631989,19886.0,237.290009,234.23999,1268.140015,1251.839966,14.9376
2,13.087001,14.95,75.209999,0.9315,0.9361,3698.5,18606.300781,230.445023,231.583023,21255.0,263.25,264.550018,1324.350098,1330.890015,16.483501
3,18.096001,16.82,96.860001,1.218,1.2267,3619.200195,20841.599609,262.079987,263.951996,19372.0,243.599991,245.339996,1402.799927,1412.819946,17.765999
4,19.622999,17.98,103.849998,1.3454,1.3485,3671.399902,21205.5,274.722015,275.35498,19430.0,251.720001,252.299988,1453.900024,1457.25,18.879


# Lambda Encoder
xfeat.LambdaEncoderはラムダ関数を引数に取り、データフレームの列を変換します。

In [62]:
from xfeat import LambdaEncoder

# Multiply the numerical columns (except `z`) two at a time, then apply a
# lambda to the product columns (here: subtract 1 from the values).
encoder = Pipeline([
    SelectNumerical(exclude_cols=["z"]),
    ArithmeticCombinations(
        drop_origin=True,
        operator="*",
        r=2,
        output_suffix="",
    ),

    # Empty prefix/suffix together with `drop_origin=True`: the output below
    # keeps the same column names, now holding the transformed values.
    LambdaEncoder(
        lambda x: x - 1,
        output_prefix="",
        output_suffix="",
        drop_origin=True,
    ),
])

encoder.fit_transform(df).head()

Unnamed: 0,caratdepth,carattable,caratprice,caratx,caraty,depthtable,depthprice,depthx,depthy,tableprice,tablex,tabley,pricex,pricey,xy
0,13.145,11.650001,73.980003,-0.0915,-0.0846,3381.5,20048.0,241.925003,243.770004,17929.0,216.25,217.899994,1286.700073,1296.47998,14.721001
1,11.558,11.809999,67.459999,-0.1831,-0.1936,3646.800049,19493.798828,231.622009,228.631989,19885.0,236.290009,233.23999,1267.140015,1250.839966,13.9376
2,12.087001,13.95,74.209999,-0.0685,-0.0639,3697.5,18605.300781,229.445023,230.583023,21254.0,262.25,263.550018,1323.350098,1329.890015,15.483501
3,17.096001,15.82,95.860001,0.218,0.2267,3618.200195,20840.599609,261.079987,262.951996,19371.0,242.599991,244.339996,1401.799927,1411.819946,16.765999
4,18.622999,16.98,102.849998,0.3454,0.3485,3670.399902,21204.5,273.722015,274.35498,19429.0,250.720001,251.299988,1452.900024,1456.25,17.879


# Serialize/Deserialize
エンコーダのパラメータをPickleでシリアライズ/デシリアライズすることができます。

In [66]:
import pickle

# Read the feather file once and take both slices from the same frame.
# The first and last 10 rows stand in for a train/test split.
df_all = pd.read_feather("data.ftr")
df_train = df_all.head(10)
df_test = df_all.tail(10)

# Fit the label-encoding pipeline on the training rows only.
encoder = Pipeline([
    SelectCategorical(exclude_cols=[]),
    LabelEncoder(output_suffix=""),
])
df_train_encoded = encoder.fit_transform(df_train)

# Persist the fitted pipeline so it can be reloaded later without refitting.
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

df_train_encoded.head()

Unnamed: 0,cut,color,clarity
0,0,0,0
1,1,0,1
2,2,0,2
3,1,1,3
4,2,2,0


In [67]:
# Restore the fitted pipeline from disk. Only unpickle files from a trusted
# source; this file is the one written by the serialization cell above.
with open("label_encoder.pkl", "rb") as pkl_file:
    encoder = pickle.load(pkl_file)

# Apply the already-fitted encoder to the held-out rows without refitting.
# NOTE(review): the -1 entries in the output presumably mark labels that were
# not present in the rows used for fitting — confirm against LabelEncoder's
# handling of unseen labels.
test_encoded = encoder.transform(df_test)
test_encoded.head()

Unnamed: 0,cut,color,clarity
53930,1,0,1
53931,1,-1,1
53932,3,0,3
53933,3,0,3
53934,1,-1,1
