# 2. Modeling

## 0. Setup

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, ttest_ind
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## 1. Data Preprocessing

### 1-1. Data Loading

In [52]:
# Load the raw campaign table. pd.read_csv already returns a brand-new
# DataFrame, so the former `df = df.copy()` was a redundant full copy.
# NOTE(review): the absolute path ties the notebook to one machine; consider
# moving it into a configurable constant near the imports.
df = pd.read_csv('/workspace/data/raw/marketing_campaign.csv')

### 1-2. Feature Engineering

In [53]:
# Derive the engineered features in a single `assign` chain. The keyword
# arguments are evaluated in order and each lambda receives the frame as
# updated by the preceding ones, so `is_partner` is computed from the
# already-cleaned `Marital_Status`.
df = df.assign(
    # 1 where Income is absent, 0 otherwise.
    income_is_missing=lambda frame: frame['Income'].isna().astype(int),
    # Map the 'Absurd' and 'YOLO' labels to NaN.
    Marital_Status=lambda frame: frame['Marital_Status'].replace(
        {'Absurd': np.nan, 'YOLO': np.nan}
    ),
    # NaN is not in the list, so a missing marital status yields 0 here.
    is_partner=lambda frame: (
        frame['Marital_Status'].isin(['Married', 'Together']).astype(int)
    ),
    is_high_education=lambda frame: (
        frame['Education'].isin(['Graduation', 'Master', 'PhD']).astype(int)
    ),
    # NOTE(review): 2014 is presumably the reference year of the data — TODO confirm.
    Age_at_time=lambda frame: 2014 - frame['Year_Birth'],
)

### 1-3. Feature Exclusion

In [54]:
# Columns removed outright before modeling: the ID and date fields, the raw
# columns that the engineered features were derived from, the other
# AcceptedCmp*/Response outcome columns, and the Z_* columns.
drop_cols = [
    'ID', 'Dt_Customer', 'Marital_Status', 'Education', 'Year_Birth',
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4',
    'Response', 'Z_CostContact', 'Z_Revenue'
]
df = df.drop(columns=drop_cols)

# Spending (Mnt*) and purchase-count (*Purchases) columns are removed as well;
# the variable name suggests they are regarded as leakage for this target.
is_spend_col = df.columns.str.startswith('Mnt')
is_purchase_col = df.columns.str.endswith('Purchases')
leak_cols = df.columns[is_spend_col | is_purchase_col].tolist()
df = df.drop(columns=leak_cols)

### 1-4. Encoding

In [40]:
# One-hot encode any remaining non-numeric columns, dropping the first level of
# each so the indicators are not redundant; dummies are emitted as int 0/1.
# NOTE(review): the raw categoricals listed in drop_cols are already gone at this
# point, so this is presumably a no-op unless other object columns exist — confirm.
df = pd.get_dummies(df, drop_first=True, dtype=int)

### 1-5. Function Definition

In [55]:
def get_binary_cols(df: pd.DataFrame) -> list[str]:
    """Return the names of columns that behave like 0/1 flags.

    A column qualifies when it has a boolean or numeric dtype, contains at
    least one non-missing value, and is either boolean-typed or has all of its
    non-missing values inside {0, 1}. Non-numeric columns and columns that are
    entirely missing are never reported.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list[str]
        Qualifying column names, in the frame's column order.
    """

    def _looks_binary(series) -> bool:
        is_bool = pd.api.types.is_bool_dtype(series)
        if not is_bool and not pd.api.types.is_numeric_dtype(series):
            return False

        observed = pd.unique(series.dropna())
        if len(observed) == 0:
            # Nothing but missing values: no evidence either way, so skip it.
            return False

        return is_bool or set(observed).issubset({0, 1})

    return [name for name in df.columns if _looks_binary(df[name])]

## 2. Modeling

### 2-1. Modeling

In [56]:
# The target is the AcceptedCmp5 column (presumably acceptance of the fifth
# campaign); every other remaining column is used as a predictor.
target_col = "AcceptedCmp5"
X = df.drop(columns=[target_col])
y = df[target_col]

# Example: make the flag vs. continuous split explicit (listing the columns by
# hand is the more robust option if get_binary_cols cannot be trusted)
binary_cols = get_binary_cols(X)  # if this looks unreliable, a hand-written list is fine too
numeric_cols = [c for c in X.columns if c not in binary_cols]

# Continuous features: fill missing values with the column median, then
# standardize.
numeric_preprocess = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# 0/1 flags are passed through untouched (no imputation, no scaling); any
# column that is in neither list is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_preprocess, numeric_cols),
        ("bin", "passthrough", binary_cols),
    ],
    remainder="drop",
)

# Preprocessing is part of the estimator, so it is re-fit together with the
# model whenever the pipeline is fit (including inside cross-validation).
pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=1000)),
])

### 2-2. Cross-Validation Evaluation

In [57]:
# Stratified 5-fold split, shuffled with a fixed seed so the folds are
# reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# One ROC-AUC score per fold for the full pipeline (preprocessing + model).
scores = cross_val_score(pipeline, X, y, cv=cv, scoring="roc_auc")
# Mean AUC across the folds, displayed as the cell output.
scores.mean()

np.float64(0.9213107719338893)

### 2-3. Model Interpretation

In [58]:
# Refit on the full data set: the coefficients below are meant for
# interpretation, not for out-of-sample evaluation.
pipeline.fit(X, y)

feature_name = pipeline.named_steps['preprocess'].get_feature_names_out()
coef = pipeline.named_steps['model'].coef_[0]

# Reading the table: "num" features were standardized, so their coefficient is
# the log-odds change per one standard deviation; "bin" features were passed
# through, so theirs is the change between 0 and 1.
coef_df = (
    pd.DataFrame({
        'feature': feature_name,
        'coef': coef,
        'odds_ratio': np.exp(coef),
    })
    .assign(
        # Strip only the leading ColumnTransformer prefix ("num__" / "bin__").
        # The pattern is anchored so the same token inside a column name is
        # left intact, and the lambda argument no longer shadows the
        # notebook-level `df`.
        feature=lambda table: table['feature'].str.replace(
            r'^(?:num|bin)__', '', regex=True
        )
    )
    .sort_values('coef', ascending=False)
)

coef_df

Unnamed: 0,feature,coef,odds_ratio
0,Income,1.333922,3.795903
9,is_high_education,0.393921,1.482784
8,is_partner,0.335385,1.398479
7,income_is_missing,0.24841,1.281985
6,Complain,0.119743,1.127207
4,NumWebVisitsMonth,0.080272,1.083582
3,Recency,0.020113,1.020316
5,Age_at_time,-0.090894,0.913114
2,Teenhome,-0.884091,0.413089
1,Kidhome,-0.972018,0.378319


## 3. Statistical Tests

### 3-1. t-test

In [35]:
# Median-impute Income so the series has no missing values, keeping the
# original row index so it can be masked with `y` below.
# NOTE(review): the imputed rows are counted as real observations by the test;
# consider testing the raw Income values with missing rows omitted instead.
income_imp = pd.Series(
    SimpleImputer(strategy="median").fit_transform(df[['Income']]).ravel(),
    index=df.index
)

# Split income by campaign outcome (1 = target positive, 0 = target negative).
x1 = income_imp[y == 1]
x0 = income_imp[y == 0]

# equal_var=False selects Welch's t-test, which does not assume equal variances
# in the two groups.
ttest_ind(x1, x0, equal_var=False)

TtestResult(statistic=np.float64(36.2789614644096), pvalue=np.float64(1.1698475449488947e-126), df=np.float64(387.44913662948346))

### 3-2. Chi-square test

In [59]:
# 2x2 contingency table: is_high_education (rows) vs. the target (columns).
ct = pd.crosstab(df['is_high_education'], y)
# Chi-square test of independence; with the default correction=True SciPy
# applies Yates' continuity correction when the table has one degree of freedom.
chi2_contingency(ct)

Chi2ContingencyResult(statistic=np.float64(4.381609546998435), pvalue=np.float64(0.036328676650578026), dof=1, expected_freq=array([[ 238.29866071,   18.70133929],
       [1838.70133929,  144.29866071]]))

In [60]:
# 2x2 contingency table: is_partner (rows) vs. the target (columns).
ct = pd.crosstab(df['is_partner'], y)
# Chi-square test of independence; with the default correction=True SciPy
# applies Yates' continuity correction when the table has one degree of freedom.
chi2_contingency(ct)

Chi2ContingencyResult(statistic=np.float64(0.5650874370816413), pvalue=np.float64(0.4522176685447551), dof=1, expected_freq=array([[ 738.07678571,   57.92321429],
       [1338.92321429,  105.07678571]]))

## 4. Summary

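本分析では、直近キャンペーンの結果（目的変数: `AcceptedCmp5`）を用いて
次回キャンペーンに反応しやすい顧客の特徴を分析した。
ロジスティック回帰（5分割交差検証の ROC-AUC 平均 約0.92）の結果、収入および学歴は
他の属性を調整した上でも反応確率と正の関連を持つことが示唆された。
また、単変量検定においても収入（Welch の t 検定, p < 0.001）および高学歴か否か（カイ二乗検定, p ≈ 0.036）で
有意な差が確認された。一方、パートナーの有無については有意な差は確認されなかった（カイ二乗検定, p ≈ 0.45）。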

これらの結果を踏まえ、次回キャンペーンでは
反応確率に基づく配信優先度設計を行い、
特に高収入・高学歴のセグメントを中心に
訴求内容を出し分ける施策が有効と考えられる。
