# 2. Modeling

## 0. Setup

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, ttest_ind
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

from datatools.preprocessing import get_binary_cols

## 1. Data Preprocessing

### 1-1. Data Loading

In [29]:
# Load the campaign CSV into `df`; the explicit copy from the original cell is kept.
df = pd.read_csv('/workspace/data/marketing_campaign.csv').copy()

### 1-2. Feature Engineering

In [30]:
# Derive the engineered columns in one ordered chain; each entry can see the
# columns produced by the entries above it.
#   income_is_missing  - 1 where Income is NaN, else 0
#   Marital_Status     - 'Absurd' / 'YOLO' recoded to NaN
#   is_partner         - 1 for 'Married' or 'Together', else 0 (NaN counts as 0)
#   is_high_education  - 1 for 'Graduation', 'Master' or 'PhD', else 0
#   Age_at_time        - 2014 minus Year_Birth
df = df.assign(
    income_is_missing=lambda frame: frame['Income'].isna().astype(int),
    Marital_Status=lambda frame: frame['Marital_Status'].replace(
        {'Absurd': np.nan, 'YOLO': np.nan}
    ),
    is_partner=lambda frame: (
        frame['Marital_Status'].isin(['Married', 'Together']).astype(int)
    ),
    is_high_education=lambda frame: (
        frame['Education'].isin(['Graduation', 'Master', 'PhD']).astype(int)
    ),
    Age_at_time=lambda frame: 2014 - frame['Year_Birth'],
)

### 1-3. Feature Exclusion

In [31]:
# Columns removed by name: the identifier, the sign-up date, the raw sources of
# the engineered features, the other campaign/response flags and the Z_* columns.
drop_cols = [
    'ID', 'Dt_Customer', 'Marital_Status', 'Year_Birth', 'Education',
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4',
    'Response', 'Z_CostContact', 'Z_Revenue'
]

# Columns removed by pattern: every 'Mnt*' and '*Purchases' column.
# (Named "leak" in the original cell; presumably treated as target leakage.)
# None of the names in drop_cols match either pattern, so selecting them before
# the drop yields the same list as selecting them after it.
leak_cols = df.columns[
    df.columns.str.startswith('Mnt') | df.columns.str.endswith('Purchases')
].tolist()

df = df.drop(columns=drop_cols + leak_cols)

## 2. Modeling

### 2-1. Pipeline Definition

In [32]:
target_col = "AcceptedCmp5"
y = df[target_col]
X = df.drop(columns=[target_col])

# Separate flag columns from the remaining (continuous / count) columns.
# get_binary_cols is a project helper; if its result cannot be trusted,
# listing the flag columns by hand here is the more robust option.
binary_cols = get_binary_cols(X)
numeric_cols = [col for col in X.columns if col not in binary_cols]

# Non-flag columns: fill gaps with the column median, then standardise.
numeric_preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Flag columns are passed through unchanged; any column listed in neither
# group is dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_preprocess, numeric_cols),
        ("bin", "passthrough", binary_cols),
    ],
    remainder="drop",
)

# Preprocessing followed by a logistic regression classifier.
pipeline = Pipeline([
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=1000)),
])

### 2-2. Cross-Validation Evaluation

In [33]:
# 5-fold stratified CV with a fixed shuffle seed; the cell displays the mean ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring="roc_auc", cv=cv)
np.mean(scores)

np.float64(0.9213107719338893)

### 2-3. Model Interpretation

In [34]:
# Fit the whole pipeline on the full data so its preprocessing step is fitted,
# then reuse that step to build the design matrix for a statsmodels Logit.
pipeline.fit(X, y)
preprocessor = pipeline.named_steps["preprocess"]
feature_name = preprocessor.get_feature_names_out()

X_proc = preprocessor.transform(X)

# Wrap the transformed matrix in a labelled frame (same row index as X / y) so
# statsmodels reports each coefficient under its feature name instead of an
# anonymous x1..xN, and the table below is aligned by label, not by position.
X_sm = sm.add_constant(pd.DataFrame(X_proc, columns=feature_name, index=X.index))

logit_model = sm.Logit(y, X_sm).fit()

# Drop the intercept: only the feature effects are reported.
params = logit_model.params.drop('const')
pvalues = logit_model.pvalues.drop('const')
conf = logit_model.conf_int().drop('const')

# Exponentiate log-odds coefficients and their interval bounds into odds ratios.
odds = np.exp(params)
conf_odds = np.exp(conf)

coef_df = (
    pd.DataFrame({
        'coef': params,
        'odds_ratio': odds,
        'p_values': pvalues,
        'ci_low': conf_odds[0],
        'ci_high': conf_odds[1],
    })
    .rename_axis('feature')
    .reset_index()
    .assign(
        # Strip only the leading ColumnTransformer prefix ("num__" / "bin__"),
        # never a matching substring inside the column name itself.
        feature=lambda table: table['feature'].str.replace(
            r'^(?:num|bin)__', '', regex=True
        )
    )
    .sort_values('coef', ascending=False)
)

coef_df

Optimization terminated successfully.
         Current function value: 0.178505
         Iterations 9


Unnamed: 0,feature,coef,odds_ratio,p_values,ci_low,ci_high
0,Income,1.383151,3.987448,9.506908e-15,2.80993,5.658412
7,income_is_missing,0.625404,1.869001,0.5596999,0.228532,15.285268
9,is_high_education,0.455157,1.576421,0.2133657,0.769705,3.228643
8,is_partner,0.352218,1.422219,0.07313409,0.967524,2.0906
6,Complain,0.293893,1.34164,0.7967335,0.143358,12.555921
4,NumWebVisitsMonth,0.120566,1.128135,0.4038213,0.850019,1.497248
3,Recency,0.020148,1.020352,0.826472,0.852161,1.221739
5,Age_at_time,-0.09245,0.911695,0.2608748,0.775993,1.071128
2,Teenhome,-0.910305,0.402402,3.24005e-10,0.302977,0.534453
1,Kidhome,-1.036964,0.354529,1.897974e-05,0.220423,0.570227


## 3. Statistical Tests

### 3-1. t-test

In [35]:
# Median-impute Income, keeping df's row index so it can be masked with y.
income_imp = pd.Series(
    SimpleImputer(strategy="median").fit_transform(df[['Income']])[:, 0],
    index=df.index,
)

# Imputed income split by target value (1 vs 0).
x1 = income_imp.loc[y == 1]
x0 = income_imp.loc[y == 0]

# equal_var=False selects Welch's t-test (no equal-variance assumption).
ttest_ind(a=x1, b=x0, equal_var=False)

TtestResult(statistic=np.float64(36.2789614644096), pvalue=np.float64(1.1698475449488947e-126), df=np.float64(387.44913662948346))

### 3-2. Chi-square test

In [59]:
# 2x2 contingency table of the education flag against the target, tested for
# independence (correction=True is scipy's default Yates correction for dof=1).
ct = pd.crosstab(index=df['is_high_education'], columns=y)
chi2_contingency(observed=ct, correction=True)

Chi2ContingencyResult(statistic=np.float64(4.381609546998435), pvalue=np.float64(0.036328676650578026), dof=1, expected_freq=array([[ 238.29866071,   18.70133929],
       [1838.70133929,  144.29866071]]))

In [60]:
# 2x2 contingency table of the partner flag against the target, tested for
# independence (correction=True is scipy's default Yates correction for dof=1).
ct = pd.crosstab(index=df['is_partner'], columns=y)
chi2_contingency(observed=ct, correction=True)

Chi2ContingencyResult(statistic=np.float64(0.5650874370816413), pvalue=np.float64(0.4522176685447551), dof=1, expected_freq=array([[ 738.07678571,   57.92321429],
       [1338.92321429,  105.07678571]]))

## 4. Summary

本分析では、直近キャンペーンの反応ログと顧客属性データを基に、反応確率に影響する要因を定量的に把握した。
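分析の結果、収入が高い顧客ほど反応確率が高いことが確認された（ロジスティック回帰における標準化後の収入のオッズ比は約3.99、p < 0.001。t検定でも有意）。また、同居する子ども（Kidhome / Teenhome）がいる顧客は反応確率が有意に低かった（オッズ比 約0.35 / 約0.40）。学歴は単変量のカイ二乗検定では有意（p ≈ 0.036）であったが、収入等を調整したロジスティック回帰では有意ではなく（p ≈ 0.21）、収入情報が不足する場合に学歴が反応確率の補完指標となり得るかは示唆にとどまり、追加検証が必要である。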

これらより、次回キャンペーンでは購買力（収入/学歴）に基づく配信優先度の設計が有効と考えられる。特に上位購買力層への優先配信により、費用対効果の改善が期待できる。