# 高度な特徴量エンジニアリングのための詳細EDA

exp001-003の結果を踏まえ、より高度な特徴量パターンを発見するための詳細分析を実施

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Apply the style sheet BEFORE choosing the font. The 'seaborn-v0_8' sheet
# carries its own font.family / font.sans-serif entries, so applying it after
# the font assignment would overwrite the Japanese-capable font and the
# Japanese titles/labels used throughout this notebook would not render.
plt.style.use('seaborn-v0_8')
plt.rcParams['font.family'] = 'IPAexGothic'

# Load the data
train_df = pd.read_csv('/Users/koki.ogai/Documents/ghq/github.com/oddgai/kaggle-projects/titanic/data/train.csv')
test_df = pd.read_csv('/Users/koki.ogai/Documents/ghq/github.com/oddgai/kaggle-projects/titanic/data/test.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

## 1. 名前(Name)からの高度な特徴量抽出

In [None]:
# 名前の詳細分析
def extract_name_features(df):
    """Add name-derived feature columns to ``df`` in place and return it.

    Columns written:
        Title: honorific captured from ``Name`` (the letters between a space
            and the following period); NaN when the pattern does not match.
        Title_Grouped: ``Title`` collapsed to Mr / Mrs / Miss / Master / Rare.
        Name_Length: character length of ``Name``.
        Surname: the part of ``Name`` before the first comma.
        Surname_Count: number of rows of ``df`` sharing that ``Surname``.

    Args:
        df: DataFrame with a string ``Name`` column.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    # expand=False returns a Series rather than a one-column DataFrame.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Collapse infrequent honorifics into a handful of groups.
    title_mapping = {
        'Mr': 'Mr',
        'Mrs': 'Mrs',
        'Miss': 'Miss',
        'Master': 'Master',
        'Dr': 'Rare',
        'Rev': 'Rare',
        'Col': 'Rare',
        'Major': 'Rare',
        'Mlle': 'Miss',
        'Countess': 'Rare',
        'Ms': 'Mrs',
        'Lady': 'Rare',
        'Jonkheer': 'Rare',
        'Don': 'Rare',
        'Dona': 'Rare',
        'Mme': 'Mrs',
        'Capt': 'Rare',
        'Sir': 'Rare',
    }
    grouped = df['Title'].map(title_mapping)
    # An honorific that is absent from the table would otherwise map to NaN
    # and silently vanish from every groupby on Title_Grouped; treat it as
    # 'Rare'. Rows where no honorific was found at all stay NaN.
    keep_as_is = grouped.notna() | df['Title'].isna()
    df['Title_Grouped'] = grouped.where(keep_as_is, 'Rare')

    # Length of the full name string.
    df['Name_Length'] = df['Name'].str.len()

    # Surname = text before the first comma.
    df['Surname'] = df['Name'].str.split(',').str[0]

    # How many rows of this frame share the surname (a family-group proxy).
    surname_counts = df['Surname'].value_counts()
    df['Surname_Count'] = df['Surname'].map(surname_counts)

    return df

# Derive the name features on each frame separately. extract_name_features
# computes Surname_Count from value_counts of the frame it receives, so the
# train and test counts are each based on that frame's own rows only.
train_df = extract_name_features(train_df)
test_df = extract_name_features(test_df)

# Survival rate by title
plt.figure(figsize=(12, 4))

# Left panel: mean of Survived per grouped title ('count' is kept for the
# printout at the end of this cell).
plt.subplot(1, 2, 1)
title_survival = train_df.groupby('Title_Grouped')['Survived'].agg(['mean', 'count'])
title_survival['mean'].plot(kind='bar')
plt.title('敬称別生存率')
plt.xlabel('敬称')
plt.ylabel('生存率')
plt.xticks(rotation=45)

# Right panel: mean of Survived for each Surname_Count value.
plt.subplot(1, 2, 2)
surname_survival = train_df.groupby('Surname_Count')['Survived'].mean()
surname_survival.plot(kind='line', marker='o')
plt.title('家族グループサイズ別生存率')
plt.xlabel('同姓人数')
plt.ylabel('生存率')

plt.tight_layout()
plt.show()

# Print the per-title mean and row count.
print("敬称別統計:")
print(title_survival)

## 2. チケット(Ticket)からの特徴量抽出

In [None]:
# チケット情報の詳細分析
def extract_ticket_features(df):
    """Add ticket-derived feature columns to ``df`` in place and return it.

    Columns written:
        Ticket_Length: character length of ``Ticket``.
        Ticket_IsNumeric: 1 when ``Ticket`` consists of numeric characters
            only, else 0.
        Ticket_Prefix: first run of ASCII letters found in ``Ticket``, or the
            string 'None' when there is none.
        Ticket_Count: number of rows of ``df`` sharing the same ``Ticket``.
        Ticket_First_Digit: first digit of the trailing ticket number as a
            float; NaN when the ticket does not end in a number.

    Args:
        df: DataFrame with a string ``Ticket`` column.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    ticket = df['Ticket']

    # Length of the ticket string.
    df['Ticket_Length'] = ticket.str.len()

    # Flag tickets made up of digits only.
    df['Ticket_IsNumeric'] = ticket.str.isnumeric().astype(int)

    # Alphabetic prefix; expand=False returns a Series.
    df['Ticket_Prefix'] = ticket.str.extract(r'([A-Za-z]+)', expand=False).fillna('None')

    # Number of rows holding the same ticket (group tickets).
    ticket_counts = ticket.value_counts()
    df['Ticket_Count'] = ticket.map(ticket_counts)

    # First digit of the ticket NUMBER. The pattern is anchored to the
    # trailing run of digits so that a digit inside the prefix
    # (e.g. the 5 in 'A/5 21171' or the 2 in 'STON/O2. 3101282') is not
    # mistaken for the start of the number.
    first_digit = ticket.str.extract(r'(\d)\d*\s*$', expand=False)
    df['Ticket_First_Digit'] = first_digit.astype(float)

    return df

train_df = extract_ticket_features(train_df)
test_df = extract_ticket_features(test_df)

# Visualize the ticket-related features (2x3 grid, five panels used)
plt.figure(figsize=(15, 8))

plt.subplot(2, 3, 1)
train_df.boxplot(column='Ticket_Length', by='Survived', ax=plt.gca())
# boxplot(by=...) stamps a figure-level "Boxplot grouped by Survived"
# suptitle, which would sit above all panels of this grid; clear it so only
# the per-panel titles remain.
plt.suptitle('')
plt.title('チケット長さ別生存状況')

# Share of each Survived value within numeric-only vs. other tickets.
plt.subplot(2, 3, 2)
pd.crosstab(train_df['Ticket_IsNumeric'], train_df['Survived'], normalize='index').plot(kind='bar', ax=plt.gca())
plt.title('数字のみチケット vs 生存率')
plt.xticks(rotation=0)

# Mean of Survived for each Ticket_Count value.
plt.subplot(2, 3, 3)
ticket_group_survival = train_df.groupby('Ticket_Count')['Survived'].mean()
ticket_group_survival.plot(kind='line', marker='o', ax=plt.gca())
plt.title('グループチケット人数別生存率')

plt.subplot(2, 3, 4)
# Survival rate for the major ticket prefixes (those with at least 10 rows)
prefix_counts = train_df['Ticket_Prefix'].value_counts()
major_prefixes = prefix_counts[prefix_counts >= 10].index
major_prefix_data = train_df[train_df['Ticket_Prefix'].isin(major_prefixes)]
prefix_survival = major_prefix_data.groupby('Ticket_Prefix')['Survived'].mean()
prefix_survival.plot(kind='bar', ax=plt.gca())
plt.title('主要チケットプレフィックス別生存率')
plt.xticks(rotation=45)

# Mean of Survived for each Ticket_First_Digit value.
plt.subplot(2, 3, 5)
first_digit_survival = train_df.groupby('Ticket_First_Digit')['Survived'].mean()
first_digit_survival.plot(kind='bar', ax=plt.gca())
plt.title('チケット最初の数字別生存率')

plt.tight_layout()
plt.show()

print("チケット関連統計:")
print(f"グループチケット人数別生存率: {ticket_group_survival}")

## 3. 客室(Cabin)の詳細分析

In [None]:
# 客室情報の詳細分析
def extract_cabin_features(df):
    """Add cabin-derived feature columns to ``df`` in place and return it.

    Columns written:
        Cabin_Deck: leading letter of ``Cabin``, or 'Unknown' when absent.
        Cabin_Number: first run of digits in ``Cabin`` as a float (NaN if none).
        Cabin_Count: number of space-separated entries in ``Cabin``; 0 when
            ``Cabin`` is missing.
        Cabin_Number_Odd: ``Cabin_Number % 2``, with -1 where the number is
            missing.
    """
    cabin = df['Cabin']

    # Deck letter at the very start of the cabin string.
    deck_letter = cabin.str.extract(r'^([A-Za-z])', expand=False)
    df['Cabin_Deck'] = deck_letter.fillna('Unknown')

    # Numeric part of the (first) cabin entry.
    cabin_no = cabin.str.extract(r'(\d+)', expand=False).astype(float)
    df['Cabin_Number'] = cabin_no

    # Entries are separated by single spaces, so spaces + 1 = entry count;
    # rows without any cabin get 0.
    entries = cabin.str.count(' ') + 1
    df['Cabin_Count'] = entries.where(cabin.notna(), 0)

    # Parity of the cabin number (1 = odd, 0 = even, -1 = unknown).
    parity = cabin_no % 2
    df['Cabin_Number_Odd'] = parity.fillna(-1)

    return df

train_df = extract_cabin_features(train_df)
test_df = extract_cabin_features(test_df)

# Visualize the cabin-related features
plt.figure(figsize=(15, 6))

# Panel 1: survival rate per deck, restricted to decks with at least 5 rows.
plt.subplot(1, 3, 1)
deck_survival = train_df.groupby('Cabin_Deck')['Survived'].agg(['mean', 'count'])
deck_survival = deck_survival[deck_survival['count'] >= 5]
deck_survival['mean'].plot(kind='bar')
plt.title('デッキ別生存率（5人以上）')
plt.xticks(rotation=45)

# Panel 2: survival rate per number of cabins held.
plt.subplot(1, 3, 2)
cabin_count_survival = train_df.groupby('Cabin_Count')['Survived'].mean()
cabin_count_survival.plot(kind='bar')
plt.title('客室保有数別生存率')

plt.subplot(1, 3, 3)
# Analysis by cabin-number range.
# .copy() makes this an independent frame, so adding Cabin_Range below is an
# ordinary column assignment rather than a write into a derived selection
# (which triggers pandas' chained-assignment warning).
train_df_with_cabin_num = train_df.dropna(subset=['Cabin_Number']).copy()
if not train_df_with_cabin_num.empty:
    train_df_with_cabin_num['Cabin_Range'] = pd.cut(train_df_with_cabin_num['Cabin_Number'], bins=5)
    # observed=False is spelled out so every bin is shown (empty ones as
    # gaps) regardless of the pandas version's default for categoricals.
    cabin_range_survival = train_df_with_cabin_num.groupby('Cabin_Range', observed=False)['Survived'].mean()
    cabin_range_survival.plot(kind='bar')
    plt.title('客室番号範囲別生存率')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

print("デッキ別統計:")
print(deck_survival)

## 4. 年齢・運賃の非線形パターン分析

In [None]:
# 年齢・運賃の詳細分析
def create_nonlinear_features(df):
    """Add nonlinear and interaction features of Age / Fare / Pclass to ``df``.

    The frame is modified in place and also returned.

    Missing-value handling differs between columns: ``Age_Squared`` and
    ``Fare_Squared`` are computed from the raw columns and therefore stay NaN
    where the input is NaN, while every other feature treats a missing Age or
    Fare as 0.
    """
    raw_age = df['Age']
    raw_fare = df['Fare']
    age0 = raw_age.fillna(0)    # Age with NaN replaced by 0
    fare0 = raw_fare.fillna(0)  # Fare with NaN replaced by 0
    pclass = df['Pclass']

    # Nonlinear transforms of age.
    df['Age_Squared'] = raw_age ** 2
    df['Age_Log'] = np.log1p(age0)
    df['Age_Sqrt'] = np.sqrt(age0)

    # Nonlinear transforms of fare.
    df['Fare_Log'] = np.log1p(fare0)
    df['Fare_Sqrt'] = np.sqrt(fare0)
    df['Fare_Squared'] = raw_fare ** 2

    # Age x fare interaction; the +1 keeps the ratio finite for a zero fare.
    df['Age_Fare_Interaction'] = age0 * fare0
    df['Age_Fare_Ratio'] = age0 / (fare0 + 1)

    # Products with passenger class.
    df['Age_Pclass_Interaction'] = age0 * pclass
    df['Fare_Pclass_Interaction'] = fare0 * pclass

    return df

# Add the nonlinear / interaction columns to both frames.
train_df = create_nonlinear_features(train_df)
test_df = create_nonlinear_features(test_df)

# Correlation analysis of the nonlinear features.
# Only the Age/Fare-based columns are listed here; the two *_Pclass_Interaction
# columns created above are not part of this matrix.
nonlinear_features = ['Age', 'Age_Squared', 'Age_Log', 'Age_Sqrt', 
                     'Fare', 'Fare_Log', 'Fare_Sqrt', 'Fare_Squared',
                     'Age_Fare_Interaction', 'Age_Fare_Ratio', 'Survived']

correlation_matrix = train_df[nonlinear_features].corr()

# Annotated heatmap of the full matrix, colour scale centred on 0.
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, fmt='.3f')
plt.title('非線形特徴量相関マトリックス')
plt.tight_layout()
plt.show()

# Rank the features by absolute correlation with Survived
# (the 'Survived' row itself is included in the ranking).
survival_corr = correlation_matrix['Survived'].abs().sort_values(ascending=False)
print("生存との相関（絶対値）:")
print(survival_corr)

## 5. 家族構成の詳細分析

In [None]:
# 家族構成の高度な分析
def create_family_features(df):
    """Add family-composition features built from SibSp and Parch to ``df``.

    The frame is modified in place and also returned.

    Note that ``HasSpouse`` and ``HasSiblings`` are the same indicator
    (``SibSp > 0``), as are ``HasChildren`` and ``HasParents``
    (``Parch > 0``); consequently ``Spouse_Children`` and
    ``Siblings_Parents`` are identical as well.
    """
    sibsp = df['SibSp']
    parch = df['Parch']

    # Family size counts the passenger too.
    family_size = sibsp + parch + 1
    df['FamilySize'] = family_size

    # Size buckets: alone (1), small (2-4), large (5+).
    df['IsAlone'] = family_size.eq(1).astype(int)
    df['IsSmallFamily'] = family_size.between(2, 4).astype(int)
    df['IsLargeFamily'] = family_size.gt(4).astype(int)

    # Presence flags for each relationship column.
    any_sibsp = sibsp.gt(0).astype(int)
    any_parch = parch.gt(0).astype(int)
    df['HasSpouse'] = any_sibsp
    df['HasChildren'] = any_parch
    df['HasSiblings'] = any_sibsp
    df['HasParents'] = any_parch

    # Both kinds of relatives aboard.
    df['Spouse_Children'] = df['HasSpouse'] * df['HasChildren']
    df['Siblings_Parents'] = df['HasSiblings'] * df['HasParents']

    # Ratios between the two counts; +1 avoids division by zero.
    df['SibSp_Parch_Ratio'] = sibsp / (parch + 1)
    df['Parch_SibSp_Ratio'] = parch / (sibsp + 1)

    return df

train_df = create_family_features(train_df)
test_df = create_family_features(test_df)

# Visualize the family-composition features (2x3 grid)
plt.figure(figsize=(15, 10))

# Survival rate per family size.
plt.subplot(2, 3, 1)
family_survival = train_df.groupby('FamilySize')['Survived'].mean()
family_survival.plot(kind='bar')
plt.title('家族サイズ別生存率')

# Travelling alone vs. with family.
plt.subplot(2, 3, 2)
alone_survival = train_df.groupby('IsAlone')['Survived'].mean()
alone_survival.plot(kind='bar')
plt.title('一人旅 vs 生存率')
plt.xticks([0, 1], ['家族連れ', '一人'], rotation=0)

# Both SibSp > 0 and Parch > 0 vs. not.
plt.subplot(2, 3, 3)
spouse_children_survival = train_df.groupby('Spouse_Children')['Survived'].mean()
spouse_children_survival.plot(kind='bar')
plt.title('配偶者&子供有無別生存率')
plt.xticks([0, 1], ['なし', 'あり'], rotation=0)

plt.subplot(2, 3, 4)
# Two-dimensional survival rate over SibSp x Parch.
# Combinations with no passengers are left as NaN so the heatmap shows them
# as blank cells; filling them with 0 would be indistinguishable from a
# genuine 0% survival rate.
sibsp_parch_survival = train_df.groupby(['SibSp', 'Parch'])['Survived'].mean().unstack()
sns.heatmap(sibsp_parch_survival, annot=True, cmap='RdYlBu_r', ax=plt.gca())
plt.title('SibSp-Parch組み合わせ別生存率')

plt.subplot(2, 3, 5)
# Relationship between age and family size.
age_family_data = train_df.dropna(subset=['Age'])
if not age_family_data.empty:
    # Bin the ages ONCE over all passengers so that x position k denotes the
    # same age range on every line; binning each family-size subset on its
    # own would give each line different bin edges.
    shared_age_bins = pd.cut(age_family_data['Age'], bins=10)
    for family_size in [1, 2, 3, 4]:
        data = age_family_data[age_family_data['FamilySize'] == family_size]
        if data.empty:
            continue
        # observed=False keeps all 10 bins (empty ones as NaN) on every line.
        survival_by_age = data.groupby(shared_age_bins.loc[data.index], observed=False)['Survived'].mean()
        plt.plot(range(len(survival_by_age)), survival_by_age.values,
                 label=f'FamilySize={family_size}', marker='o')
plt.title('年齢×家族サイズ別生存率')
plt.legend()

# Survival rate over five equal-width ranges of the SibSp/Parch ratio.
plt.subplot(2, 3, 6)
ratio_survival = train_df.groupby(pd.cut(train_df['SibSp_Parch_Ratio'], bins=5), observed=False)['Survived'].mean()
ratio_survival.plot(kind='bar')
plt.title('SibSp/Parch比率別生存率')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 6. 特徴量の重要度・相関分析

In [None]:
# Correlation analysis restricted to the numeric columns (PassengerId is an
# identifier, so it is left out).
numeric_features = [
    col
    for col in train_df.select_dtypes(include=[np.number]).columns.tolist()
    if col != 'PassengerId'
]

# Impute missing values with each column's mean.
_numeric_frame = train_df[numeric_features]
train_numeric = _numeric_frame.fillna(_numeric_frame.mean())

# Absolute correlation of every column with Survived, strongest first; keep
# those above 0.1 and drop the trivial Survived-vs-Survived entry.
_abs_corr = train_numeric.corrwith(train_numeric['Survived']).abs()
survival_correlation = _abs_corr.sort_values(ascending=False)
_is_strong = survival_correlation > 0.1
top_features = survival_correlation[_is_strong].drop('Survived')

print("生存と相関の高い特徴量（|相関| > 0.1）:")
for feature, corr in zip(top_features.index, top_features.values):
    print(f"{feature}: {corr:.4f}")

# Correlation matrix among the selected features (needs at least two).
if top_features.size > 1:
    top_feature_names = [*top_features.index, 'Survived']
    top_corr_matrix = train_numeric[top_feature_names].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        top_corr_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        fmt='.3f',
    )
    plt.title('高相関特徴量相関マトリックス（生存相関>0.1）')
    plt.tight_layout()
    plt.show()

## 7. カテゴリカル特徴量の詳細分析

In [None]:
# Detailed analysis of the categorical features
categorical_features = ['Sex', 'Embarked', 'Title_Grouped', 'Cabin_Deck', 'Ticket_Prefix']

plt.figure(figsize=(18, 12))

for i, feature in enumerate(categorical_features, 1):
    if feature in train_df.columns:
        plt.subplot(2, 3, i)
        feature_survival = train_df.groupby(feature)['Survived'].agg(['mean', 'count'])
        feature_survival = feature_survival[feature_survival['count'] >= 5]  # categories with at least 5 rows only

        if len(feature_survival) > 0:
            feature_survival['mean'].plot(kind='bar')
            plt.title(f'{feature}別生存率（5人以上）')
            plt.xticks(rotation=45)
            plt.ylabel('生存率')

plt.tight_layout()
plt.show()

# Interaction analysis between categories.
# The three tables below are DataFrames. DataFrame.plot() opens a NEW figure
# unless an axes is passed, which would leave this 1x3 grid empty and scatter
# the charts over separate figures, so each call is given the current subplot
# explicitly.
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
sex_class_survival = train_df.groupby(['Sex', 'Pclass'])['Survived'].mean().unstack()
sex_class_survival.plot(kind='bar', ax=plt.gca())
plt.title('性別×クラス別生存率')
plt.xticks(rotation=0)

plt.subplot(1, 3, 2)
sex_embarked_survival = train_df.groupby(['Sex', 'Embarked'])['Survived'].mean().unstack()
sex_embarked_survival.plot(kind='bar', ax=plt.gca())
plt.title('性別×乗船港別生存率')
plt.xticks(rotation=0)

plt.subplot(1, 3, 3)
if 'Title_Grouped' in train_df.columns:
    title_class_survival = train_df.groupby(['Title_Grouped', 'Pclass'])['Survived'].mean().unstack()
    title_class_survival.plot(kind='bar', ax=plt.gca())
    plt.title('敬称×クラス別生存率')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 8. 統計的検定による特徴量の有意性確認

In [None]:
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu

# Two-sample tests on the numeric features: survivors vs. non-survivors.
# ttest_ind is called with its default settings; Mann-Whitney U is run
# alongside it as a non-parametric counterpart.
print("=== 数値特徴量の統計的有意性（t検定） ===")
survived = train_df.loc[train_df['Survived'] == 1]
not_survived = train_df.loc[train_df['Survived'] == 0]

numerical_tests = []
for feature in ['Age', 'Fare', 'FamilySize', 'Name_Length', 'Ticket_Length', 'Surname_Count']:
    if feature not in train_df.columns:
        continue

    # Drop missing values within each group.
    survived_data = survived[feature].dropna()
    not_survived_data = not_survived[feature].dropna()
    if survived_data.empty or not_survived_data.empty:
        continue

    t_stat, t_p = ttest_ind(survived_data, not_survived_data)
    u_stat, u_p = mannwhitneyu(survived_data, not_survived_data, alternative='two-sided')

    # The Significant flag is based on the t-test p-value only.
    numerical_tests.append(dict(
        Feature=feature,
        Survived_Mean=survived_data.mean(),
        NotSurvived_Mean=not_survived_data.mean(),
        T_pvalue=t_p,
        U_pvalue=u_p,
        Significant=t_p < 0.05,
    ))

numerical_df = pd.DataFrame(numerical_tests)
print(numerical_df)

# Chi-square test of independence between each categorical feature and
# Survived.
print("\n=== カテゴリカル特徴量の統計的有意性（カイ二乗検定） ===")
categorical_tests = []
for feature in ['Sex', 'Embarked', 'Title_Grouped', 'Cabin_Deck', 'IsAlone']:
    if feature not in train_df.columns:
        continue

    # Cross-tabulate feature value x survival; the test needs at least a
    # 2x2 table.
    contingency_table = pd.crosstab(train_df[feature], train_df['Survived'])
    if min(contingency_table.shape[0], contingency_table.shape[1]) <= 1:
        continue

    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    categorical_tests.append(dict(
        Feature=feature,
        Chi2=chi2,
        P_value=p_value,
        DOF=dof,
        Significant=p_value < 0.05,
    ))

categorical_df = pd.DataFrame(categorical_tests)
print(categorical_df)

## 9. 新特徴量候補の提案

In [None]:
# Human-readable list of the candidate features explored in this notebook,
# grouped by source; empty strings produce the blank separator lines.
_candidate_summary = [
    "=== 高度な特徴量エンジニアリング候補 ===",
    "",
    "1. 名前関連特徴量:",
    "   - Title_Grouped: 敬称のグループ化",
    "   - Surname_Count: 同姓人数（家族グループ）",
    "   - Name_Length: 名前の長さ",
    "",
    "2. チケット関連特徴量:",
    "   - Ticket_Count: 同一チケット人数",
    "   - Ticket_IsNumeric: 数字のみチケットフラグ",
    "   - Ticket_Prefix: チケットのプレフィックス",
    "   - Ticket_First_Digit: チケット番号最初の数字",
    "",
    "3. 客室関連特徴量:",
    "   - Cabin_Deck: 客室デッキ",
    "   - Cabin_Count: 保有客室数",
    "   - Cabin_Number_Odd: 客室番号の偶数奇数",
    "",
    "4. 非線形・交互作用特徴量:",
    "   - Age_Fare_Interaction: 年齢×運賃",
    "   - Age_Pclass_Interaction: 年齢×クラス",
    "   - Fare_Log: 運賃の対数変換",
    "   - Age_Squared: 年齢の二乗",
    "",
    "5. 家族構成詳細特徴量:",
    "   - IsSmallFamily/IsLargeFamily: 家族サイズカテゴリ",
    "   - HasSpouse/HasChildren: 特定関係者有無",
    "   - Spouse_Children: 配偶者&子供組み合わせ",
    "   - SibSp_Parch_Ratio: 兄弟姉妹/親子比率",
]
for _line in _candidate_summary:
    print(_line)

# Statistical summary of the new features
new_features = [
    'Title_Grouped', 'Surname_Count', 'Name_Length',
    'Ticket_Count', 'Ticket_IsNumeric', 'Ticket_First_Digit',
    'Cabin_Deck', 'Cabin_Count', 'Cabin_Number_Odd',
    'Age_Fare_Interaction', 'Fare_Log', 'Age_Squared',
    'IsSmallFamily', 'IsLargeFamily', 'HasSpouse', 'HasChildren',
    'SibSp_Parch_Ratio'
]

available_features = [f for f in new_features if f in train_df.columns]
if available_features:
    print(f"\n新特徴量相関サマリー（利用可能な{len(available_features)}個）:")
    # Correlation is only defined for numeric columns. Title_Grouped and
    # Cabin_Deck hold strings, and recent pandas versions raise instead of
    # silently skipping such columns, so restrict the frame to numeric
    # dtypes before calling corrwith.
    numeric_new_features = train_df[available_features + ['Survived']].select_dtypes(include=[np.number])
    new_feature_corr = numeric_new_features.corrwith(train_df['Survived']).abs().sort_values(ascending=False)
    print(new_feature_corr)