# カテゴリデータ

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# サンプルデータの作成
df = pd.DataFrame({
    'category_column': ['red', 'blue', 'green', 'blue', 'red'],
    'ordinal_category_column': ['low', 'medium', 'high', 'low', 'medium']
})

In [2]:
# 1. ワンホットエンコーディング (One-Hot Encoding)
one_hot = pd.get_dummies(df['category_column'], prefix='category')
df = pd.concat([df, one_hot], axis=1)

df

Unnamed: 0,category_column,ordinal_category_column,category_blue,category_green,category_red
0,red,low,0,0,1
1,blue,medium,1,0,0
2,green,high,0,1,0
3,blue,low,1,0,0
4,red,medium,0,0,1


In [3]:
# 2. 順序エンコーディング (Ordinal Encoding)
encoder = OrdinalEncoder()
df['encoded_ordinal'] = encoder.fit_transform(df[['ordinal_category_column']])

df

Unnamed: 0,category_column,ordinal_category_column,category_blue,category_green,category_red,encoded_ordinal
0,red,low,0,0,1,1.0
1,blue,medium,1,0,0,2.0
2,green,high,0,1,0,0.0
3,blue,low,1,0,0,1.0
4,red,medium,0,0,1,2.0


In [4]:
# 3. ラベルエンコーディング
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['category_column'])

df

Unnamed: 0,category_column,ordinal_category_column,category_blue,category_green,category_red,encoded_ordinal,encoded_label
0,red,low,0,0,1,1.0,2
1,blue,medium,1,0,0,2.0,0
2,green,high,0,1,0,0.0,1
3,blue,low,1,0,0,1.0,0
4,red,medium,0,0,1,2.0,2


# 数値データ

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# サンプルデータの作成
df = pd.DataFrame({
    'numeric_feature': [2, 8, 15, 24, 35]
})


In [6]:
# 1. 標準化
scaler = StandardScaler()
df['scaled_feature'] = scaler.fit_transform(df[['numeric_feature']])
df

Unnamed: 0,numeric_feature,scaled_feature
0,2,-1.266485
1,8,-0.753045
2,15,-0.154032
3,24,0.616128
4,35,1.557434


In [7]:
# 2. 正規化
minmax_scaler = MinMaxScaler()
df['minmax_scaled_feature'] = minmax_scaler.fit_transform(df[['numeric_feature']])
df

Unnamed: 0,numeric_feature,scaled_feature,minmax_scaled_feature
0,2,-1.266485,0.0
1,8,-0.753045,0.181818
2,15,-0.154032,0.393939
3,24,0.616128,0.666667
4,35,1.557434,1.0


In [8]:
# 3. 対数変換
df['log_feature'] = np.log1p(df['numeric_feature'])
df


Unnamed: 0,numeric_feature,scaled_feature,minmax_scaled_feature,log_feature
0,2,-1.266485,0.0,1.098612
1,8,-0.753045,0.181818,2.197225
2,15,-0.154032,0.393939,2.772589
3,24,0.616128,0.666667,3.218876
4,35,1.557434,1.0,3.583519


In [9]:
# 4. ビニング/離散化
bins = [0, 10, 20, 30, np.inf]
labels = ['0-10', '10-20', '20-30', '30+']
df['binned_feature'] = pd.cut(df['numeric_feature'], bins=bins, labels=labels)


# 日付・時刻データ

In [11]:
import pandas as pd
import numpy as np

# サンプルデータの作成
df = pd.DataFrame({
    'datetime_column': pd.date_range('2023-01-01', periods=5, freq='D')
})

In [12]:
# 1. 基本的な日付/時刻の成分の抽出
df['year'] = df['datetime_column'].dt.year
df['month'] = df['datetime_column'].dt.month
df['day'] = df['datetime_column'].dt.day
df['weekday'] = df['datetime_column'].dt.weekday

df

Unnamed: 0,datetime_column,year,month,day,weekday
0,2023-01-01,2023,1,1,6
1,2023-01-02,2023,1,2,0
2,2023-01-03,2023,1,3,1
3,2023-01-04,2023,1,4,2
4,2023-01-05,2023,1,5,3


In [13]:
# 2. 季節性の特徴の生成
df['quarter'] = df['datetime_column'].dt.quarter

def get_season(month):
    if month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    elif month in [9, 10, 11]:
        return 'autumn'
    else:
        return 'winter'

df['season'] = df['month'].apply(get_season)

df

Unnamed: 0,datetime_column,year,month,day,weekday,quarter,season
0,2023-01-01,2023,1,1,6,1,winter
1,2023-01-02,2023,1,2,0,1,winter
2,2023-01-03,2023,1,3,1,1,winter
3,2023-01-04,2023,1,4,2,1,winter
4,2023-01-05,2023,1,5,3,1,winter


In [14]:
# 3. 時間経過に関する特徴
reference_date = pd.Timestamp('2023-01-01')
df['days_since_reference'] = (df['datetime_column'] - reference_date).dt.days

# 4. 周期的な特徴の変換
df['hour_sin'] = np.sin(2 * np.pi * df['datetime_column'].dt.hour / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['datetime_column'].dt.hour / 24)

df

Unnamed: 0,datetime_column,year,month,day,weekday,quarter,season,days_since_reference,hour_sin,hour_cos
0,2023-01-01,2023,1,1,6,1,winter,0,0.0,1.0
1,2023-01-02,2023,1,2,0,1,winter,1,0.0,1.0
2,2023-01-03,2023,1,3,1,1,winter,2,0.0,1.0
3,2023-01-04,2023,1,4,2,1,winter,3,0.0,1.0
4,2023-01-05,2023,1,5,3,1,winter,4,0.0,1.0


In [15]:
# 5. 祝日や特定のイベントまでの日数
event_date = pd.Timestamp('2023-12-25')
df['days_until_event'] = (event_date - df['datetime_column']).dt.days
df['days_since_event'] = (df['datetime_column'] - event_date).dt.days

df

Unnamed: 0,datetime_column,year,month,day,weekday,quarter,season,days_since_reference,hour_sin,hour_cos,days_until_event,days_since_event
0,2023-01-01,2023,1,1,6,1,winter,0,0.0,1.0,358,-358
1,2023-01-02,2023,1,2,0,1,winter,1,0.0,1.0,357,-357
2,2023-01-03,2023,1,3,1,1,winter,2,0.0,1.0,356,-356
3,2023-01-04,2023,1,4,2,1,winter,3,0.0,1.0,355,-355
4,2023-01-05,2023,1,5,3,1,winter,4,0.0,1.0,354,-354
