In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly_express as px

# Tables that need no timestamp handling in this cell.
distribution_center = pd.read_csv('distribution_center.csv')
products = pd.read_csv('products.csv')

# Tables whose `created_at` column is converted to datetime right after loading.
event = pd.read_csv('event.csv')
event['created_at'] = pd.to_datetime(event['created_at'])

inventory_items = pd.read_csv('inventory_items.csv')
inventory_items['created_at'] = pd.to_datetime(inventory_items['created_at'])

order_items = pd.read_csv('order_items.csv')
order_items['created_at'] = pd.to_datetime(order_items['created_at'])

orders = pd.read_csv('orders.csv')
orders['created_at'] = pd.to_datetime(orders['created_at'])

# low_memory=False makes pandas read the file in one pass instead of chunks.
users = pd.read_csv('users.csv', low_memory=False)
users['created_at'] = pd.to_datetime(users['created_at'])

### A/B 테스트

In [None]:
import numpy as np

# Step 1: keep only orders where both previous_purchase and purchase_interval
# are present, and make sure both date columns are datetimes.
# Step 2: remind_date = previous purchase + 70% of purchase_interval (in days).
df_valid = (
    orders
    .dropna(subset=['previous_purchase', 'purchase_interval'])
    .assign(
        created_at=lambda d: pd.to_datetime(d['created_at']),
        previous_purchase=lambda d: pd.to_datetime(d['previous_purchase']),
        # Evaluated after the two conversions above, so it sees datetimes.
        remind_date=lambda d: d['previous_purchase']
        + pd.to_timedelta(d['purchase_interval'] * 0.7, unit='D'),
    )
)

# Only use rows whose remind date falls strictly before the actual purchase date.
df_test = df_valid.loc[df_valid['remind_date'] < df_valid['created_at']].copy()

# Step 3: random, equally sized groups.
# Seeds NumPy's global RNG; later np.random.* calls depend on this.
np.random.seed(42)
test_size = len(df_test) // 2 * 2  # round down to an even row count

df_test = (
    df_test
    .sample(test_size, random_state=42)
    .reset_index(drop=True)
    # First half labelled 'A', second half 'B' ...
    .assign(group=[label for label in 'AB' for _ in range(test_size // 2)])
    # ... then shuffle the rows so the labels are interleaved.
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

def simulate_repurchase(group, base_rate=0.3051284370477569, uplift=1.23):
    """Draw one simulated repurchase outcome for a member of ``group``.

    The outcome is synthetic: it is a single Bernoulli draw, not an observed
    purchase. Group 'A' is given a success probability of
    ``base_rate * uplift``; every other group value gets ``base_rate``.

    Args:
        group: Group code. Only the exact value 'A' receives the uplift.
        base_rate: Success probability for groups other than 'A'. Defaults to
            the value that was previously hard-coded here.
        uplift: Multiplier applied to ``base_rate`` for group 'A'. Defaults to
            the previously hard-coded 1.23.

    Returns:
        True when the uniform draw falls below the group's probability,
        otherwise False.
    """
    probability = base_rate * uplift if group == 'A' else base_rate
    # Draws from NumPy's global RNG, so the result depends on any earlier
    # np.random.seed(...) call and on how many draws preceded this one.
    return np.random.rand() < probability

# One simulated draw per row (in row order), then map the boolean outcome
# to its display label.
df_test['repurchase_result'] = (
    df_test['group']
    .apply(simulate_repurchase)
    .map({True: '재구매함', False: '재구매 안 함'})
)

# Preview of the key columns. NOTE(review): this is not the last expression
# of the cell, so the notebook will not render it.
df_test[['user_id', 'group', 'repurchase_result']].head()

def a(x):
    """Translate a raw group code into its display label.

    Returns '실험군 (A)' (treatment) when ``x`` equals 'A'; any other value,
    not just 'B', falls through to '대조군 (B)' (control).
    """
    return '실험군 (A)' if x == 'A' else '대조군 (B)'
    
# Replace the raw group codes with their display labels (overwrites the column).
df_test['group'] = df_test['group'].apply(a)


In [39]:
from scipy.stats import chi2_contingency

# Contingency table: one row per group, one column per repurchase outcome.
contingency_table = pd.crosstab(
    index=df_test['group'],
    columns=df_test['repurchase_result'],
)

# Chi-square test of independence on that table, using SciPy's default settings.
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

# Cell output: the table together with the p-value.
(contingency_table, p_val)


(repurchase_result  재구매 안 함  재구매함
 group                           
 대조군 (B)               9486  4334
 실험군 (A)               8825  4995,
 4.6446485958829215e-17)

In [42]:
# Reorder the rows: treatment (A) first, then control (B).
contingency_table = contingency_table.reindex(index=['실험군 (A)', '대조군 (B)'])

In [53]:
import plotly.graph_objects as go

# Repurchase rate per group = repurchased count / row total.
group_counts = contingency_table.sum(axis='columns')
repurchase_rates = contingency_table['재구매함'] / group_counts

# One bar per group, labelled with the rate formatted to two decimals.
fig = go.Figure(
    go.Bar(
        name='재구매율',
        x=repurchase_rates.index,
        y=repurchase_rates.values,
        text=repurchase_rates.apply('{:.2%}'.format),
        textposition='auto',
        marker_color=['#636EFA', '#EF553B'],
    )
)

# Centered title, axis labels, and whole-percent ticks on the y axis.
fig.update_layout(
    title_text='A/B 그룹별 재구매율 비교',
    title_x=0.5,
    xaxis_title='그룹',
    yaxis_title='재구매율',
    yaxis_tickformat=".0%",
    template='plotly_white',
)

fig.show()
