## データセットの読み込み

In [26]:
# type: ignore
import pandas as pd

train = pd.read_csv("data/train.csv", index_col="id")
# train_ext = pd.read_csv("data/training_extra.csv", index_col="id")
# train = pd.concat([train, train_ext], axis=0, ignore_index=False)
# test = pd.read_csv("data/test.csv")
# submission = pd.read_csv("data/sample_submission.csv")

## データ全体の確認

まずは、トレーニングデータセット内のデータ数や欠損値の有無、データ型をざっくりと確認します。
データ数は 30 万件で、多くの特徴量はカテゴリ変数であることがわかります。

In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300000 entries, 0 to 299999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Brand                 290295 non-null  object 
 1   Material              291653 non-null  object 
 2   Size                  293405 non-null  object 
 3   Compartments          300000 non-null  float64
 4   Laptop Compartment    292556 non-null  object 
 5   Waterproof            292950 non-null  object 
 6   Style                 292030 non-null  object 
 7   Color                 290050 non-null  object 
 8   Weight Capacity (kg)  299862 non-null  float64
 9   Price                 300000 non-null  float64
dtypes: float64(3), object(7)
memory usage: 25.2+ MB


In [28]:
train.head()

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [29]:
train.describe().round(2)

Unnamed: 0,Compartments,Weight Capacity (kg),Price
count,300000.0,299862.0,300000.0
mean,5.44,18.03,81.41
std,2.89,6.97,39.04
min,1.0,5.0,15.0
25%,3.0,12.1,47.38
50%,5.0,18.07,80.96
75%,8.0,24.0,115.02
max,10.0,30.0,150.0


## カテゴリデータについて調べる

### カテゴリ変数のユニークな値を確認

In [30]:
categorical_columns = train.select_dtypes(include=["object"]).columns
for col in categorical_columns:
    print(f"{col} 列のユニーク値: {train[col].unique()}")

Brand 列のユニーク値: ['Jansport' 'Under Armour' 'Nike' 'Adidas' 'Puma' nan]
Material 列のユニーク値: ['Leather' 'Canvas' 'Nylon' nan 'Polyester']
Size 列のユニーク値: ['Medium' 'Small' 'Large' nan]
Laptop Compartment 列のユニーク値: ['Yes' 'No' nan]
Waterproof 列のユニーク値: ['No' 'Yes' nan]
Style 列のユニーク値: ['Tote' 'Messenger' nan 'Backpack']
Color 列のユニーク値: ['Black' 'Green' 'Red' 'Blue' 'Gray' 'Pink' nan]


### 変換器

In [31]:
# type: ignore
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# カテゴリ変数と数値変数のカラム一覧を取得
categorical_columns = train.select_dtypes(include=["object"]).columns.to_list()
numerical_columns = train.select_dtypes(include=["number"]).columns.to_list()

categorical_transformer = Pipeline(
    steps=[
        ("OneHot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)
numerical_transformer = Pipeline(
    steps=[
        ("SimpleImputer", SimpleImputer(strategy="mean")),
        ("Polynomial", PolynomialFeatures(degree=2, include_bias=False)),
    ]
)

# 各カラムの変換器を作成
transformer = ColumnTransformer(
    transformers=[
        ("Categorical", categorical_transformer, categorical_columns),
        ("Numerical", numerical_transformer, numerical_columns),
    ],
    remainder="passthrough"
)

# 実際に変換してみる
data = transformer.fit_transform(train)
columns = transformer.get_feature_names_out()
train_onehot = pd.DataFrame(data=data, columns=columns)
train_onehot


Unnamed: 0,Categorical__Brand_Adidas,Categorical__Brand_Jansport,Categorical__Brand_Nike,Categorical__Brand_Puma,Categorical__Brand_Under Armour,Categorical__Brand_nan,Categorical__Material_Canvas,Categorical__Material_Leather,Categorical__Material_Nylon,Categorical__Material_Polyester,...,Categorical__Color_nan,Numerical__Compartments,Numerical__Weight Capacity (kg),Numerical__Price,Numerical__Compartments^2,Numerical__Compartments Weight Capacity (kg),Numerical__Compartments Price,Numerical__Weight Capacity (kg)^2,Numerical__Weight Capacity (kg) Price,Numerical__Price^2
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,7.0,11.611723,112.15875,49.0,81.282060,785.11125,134.832107,1302.356315,12579.585202
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,10.0,27.078537,68.88056,100.0,270.785366,688.80560,733.247143,1865.184764,4744.531546
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,2.0,16.643760,39.17320,4.0,33.287520,78.34640,277.014745,651.989337,1534.539598
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,8.0,12.937220,80.60793,64.0,103.497762,644.86344,167.371669,1042.842549,6497.638379
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,17.749338,86.02312,1.0,17.749338,86.02312,315.039016,1526.853473,7399.977175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,9.0,12.730812,129.99749,81.0,114.577306,1169.97741,162.073567,1654.973571,16899.347406
299996,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,6.0,26.633182,19.85819,36.0,159.799093,119.14914,709.326396,528.886793,394.347710
299997,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,9.0,11.898250,111.41364,81.0,107.084247,1002.72276,141.568346,1325.627309,12412.999178
299998,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,6.175738,115.89080,1.0,6.175738,115.89080,38.139739,715.711206,13430.677525
