# 3-3. データ準備 (Feature Engineering)

In [None]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler, PowerTransformer

## 訓練データの読み込み

In [None]:
# Load the training split of the exercise data and preview its first rows.
train_csv_path = "data/train.csv"
train = pd.read_csv(train_csv_path)
train.head(n=5)

In [None]:
# Print the per-column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Split the column labels by dtype: object-typed columns are treated as
# categorical features, and every remaining column as a numeric feature.
cat_columns = train.select_dtypes(include=["object"]).columns
num_columns = train.columns.drop(cat_columns)

## Feature Engineering

In [None]:
# Categorical columns: fill missing values with each column's most frequent
# value (not with 0), then map the categories to integer codes with
# OrdinalEncoder.
# Categories that were not seen during fit (e.g. ones that appear only in the
# test data) are encoded as -1 instead of raising an error at transform time.
category_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

In [None]:
# Variant 1: numeric columns are rescaled with MinMaxScaler, while the
# categorical columns go through the shared category pipeline.
minmax_steps = [
    (category_transformer, cat_columns),
    (MinMaxScaler(), num_columns),
]
minmax_transformer = make_column_transformer(*minmax_steps)

In [None]:
# Variant 2: numeric columns are standardized with StandardScaler, while the
# categorical columns go through the shared category pipeline.
standard_steps = [
    (category_transformer, cat_columns),
    (StandardScaler(), num_columns),
]
standard_transformer = make_column_transformer(*standard_steps)

In [None]:
# Variant 3: numeric columns are transformed with PowerTransformer, while the
# categorical columns go through the shared category pipeline.
power_steps = [
    (category_transformer, cat_columns),
    (PowerTransformer(), num_columns),
]
power_transformer = make_column_transformer(*power_steps)

In [None]:
# Choose the preprocessing variant used for the rest of the notebook.
# standard_transformer or power_transformer can be assigned here instead.
transformer = minmax_transformer

In [None]:
# Column labels for the transformer output: categorical columns first, then
# numeric columns, i.e. the same order in which the column transformers list them.
transformed_columns = cat_columns.append(num_columns)

訓練データに対しては fit_transform() を使い、変換パラメータの学習と変換を同時に行う。

In [None]:
# Fit the selected transformer on the training data and transform it in one
# step, then restore the original column order of the training frame.
train_column_order = list(train.columns)
transformed_data = pd.DataFrame(
    transformer.fit_transform(train),
    columns=transformed_columns,
)
train = transformed_data.loc[:, train_column_order]

In [None]:
# Preview only the first rows of the transformed training data (the same way
# the test data is previewed) instead of rendering the whole frame.
train.head()

In [None]:
# Summary statistics of the transformed training data.
train.describe()

## テストデータの読み込み

In [None]:
# Load the test split and preview its first rows.
test_csv_path = "data/test.csv"
test = pd.read_csv(test_csv_path)
test.head(n=5)

テストデータに対しては、訓練データで学習した変換パラメータをそのまま適用するため、fit_transform() ではなく transform() を使う。

In [None]:
# Apply the already-fitted transformer to the test data (no re-fitting), then
# restore the original column order of the test frame.
test_column_order = list(test.columns)
transformed_data = pd.DataFrame(
    transformer.transform(test),
    columns=transformed_columns,
)
test = transformed_data.loc[:, test_column_order]

In [None]:
# Preview the first rows of the transformed test data.
test.head()

In [None]:
# Summary statistics of the transformed test data.
test.describe()

## データの保存

In [None]:
# Write both transformed frames to CSV without the row index.
converted_outputs = (
    (train, "data/converted_train.csv"),
    (test, "data/converted_test.csv"),
)
for converted_frame, converted_path in converted_outputs:
    converted_frame.to_csv(converted_path, index=False)