In [1]:
# Load the raw CSV tables used by the rest of the notebook.
import pandas as pd

# Tables that later cells join by id (aisle_id / department_id / product_id).
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')
products = pd.read_csv('products.csv')

# Order tables; of these, only `train` is used by the cells visible in this part of the notebook.
orders = pd.read_csv('orders.csv')
prior = pd.read_csv('order_products__prior.csv')
train = pd.read_csv('order_products__train.csv')

In [2]:
# Average reorder rate per department, sorted ascending.
# Both joins are left joins, so departments/products with no match in `train`
# are kept with NaN in `reordered`; the mean skips those NaNs.
dep_prod = pd.merge(
    departments,
    products[['product_id', 'department_id']],
    on='department_id',
    how='left',
)
dep_reorder = pd.merge(
    dep_prod,
    train[['product_id', 'reordered']],
    on='product_id',
    how='left',
)
dep_reorder_rate = (
    dep_reorder
    .groupby('department')['reordered']
    .mean()
    .sort_values()
    .rename('reorder_rate')
    .reset_index()
)

In [3]:
# Display the per-department reorder-rate table (one row per department, sorted ascending by rate).
dep_reorder_rate

Unnamed: 0,department,reorder_rate
0,personal care,0.337089
1,pantry,0.363088
2,international,0.379936
3,missing,0.38153
4,other,0.388301
5,household,0.427166
6,canned goods,0.486805
7,dry goods pasta,0.487821
8,babies,0.541062
9,frozen,0.559297


In [4]:
import plotly_express as px
import plotly.graph_objects as go
# Horizontal bar chart of reorder rate per department.
# Bar length and bar colour both encode the same reorder rate.
dept_rates = dep_reorder_rate['reorder_rate']
dept_bar = go.Bar(
    x=dept_rates,
    y=dep_reorder_rate['department'],
    orientation='h',
    marker={'color': dept_rates, 'colorscale': 'Viridis'},
)
fig = go.Figure(data=[dept_bar])

fig.update_layout(
    template='simple_white',
    height=600,
    title='카테고리별 평균 재구매율',
    xaxis_title='재구매율',
    yaxis_title='카테고리',
)

fig.show()

In [5]:
# Reorder rate per aisle, then compare the 20 lowest and 20 highest aisles.
prod_aisle = pd.merge(
    aisles,
    products[['aisle_id', 'product_id']],
    on='aisle_id',
    how='left',
)
aisle_train = pd.merge(
    prod_aisle,
    train[['product_id', 'reordered']],
    on='product_id',
    how='left',
)

aisle_reorder = (
    aisle_train
    .groupby('aisle')['reordered']
    .mean()
    .dropna()  # aisles with no rows in `train` have no rate
    .rename('reorder_rate')
    .reset_index()
)
# Drop aisles whose rate is exactly 0 or exactly 1, then sort ascending.
is_extreme = (aisle_reorder['reorder_rate'] == 0) | (aisle_reorder['reorder_rate'] == 1)
aisle_reorder = aisle_reorder[~is_extreme].sort_values('reorder_rate')

# After the ascending sort, head = lowest rates and tail = highest rates.
lowest_20 = aisle_reorder.head(20)
highest_20 = aisle_reorder.tail(20)
aisle_reorder_df = pd.concat([lowest_20, highest_20])

aisle_rates = aisle_reorder_df['reorder_rate']
aisle_bar = go.Bar(
    x=aisle_rates,
    y=aisle_reorder_df['aisle'],
    orientation='h',
    marker={'color': aisle_rates, 'colorscale': 'RdBu'},
)
fig = go.Figure(data=[aisle_bar])

fig.update_layout(
    template='simple_white',
    height=800,
    title='재구매율 상위/하위 20개 품목 비교',
    xaxis_title='재구매율',
    yaxis_title='품목명(aisle)',
)

# 40 labels share the y axis, so use a smaller tick font.
fig.update_yaxes(tickfont={'size': 10})

fig.show()

In [6]:
# Mean cart position (add_to_cart_order) for each value of the `reordered` flag.
train.groupby('reordered')['add_to_cart_order'].mean()

reordered
0    10.085226
1     7.868062
Name: add_to_cart_order, dtype: float64

In [7]:
# Mean reorder rate by cart position, limited to the first 20 positions.
# (Author's observation: items added to the cart earlier tend to have a higher reorder rate.)
order_seq = (
    train
    .groupby('add_to_cart_order')['reordered']
    .mean()
    .to_frame('reorder_rate')
    .reset_index()
)
# Filter and cast in one chain: assign() returns a new frame, so the string cast
# is not written into a boolean-filtered slice (which raises SettingWithCopyWarning).
# The position is cast to str so that plotly treats the y axis as categorical.
order_seq = (
    order_seq[order_seq['add_to_cart_order'] <= 20]
    .assign(add_to_cart_order=lambda df: df['add_to_cart_order'].astype(str))
)

fig = go.Figure(
    data=go.Bar(
        x=order_seq['reorder_rate'],
        y=order_seq['add_to_cart_order'],
        marker=dict(
            color=order_seq['reorder_rate'],
            colorscale='Plotly3'
        ),
        orientation='h'
    )
)
fig.update_layout(
    title='장바구니 순서 상위 20개 재구매율 평균 비교',
    xaxis_title='재구매율',
    yaxis_title='장바구니 순서(add_to_cart_order)',
    height=600,
    template='simple_white'
)

# Reverse the y axis so position 1 is drawn at the top.
fig.update_yaxes(autorange='reversed')
fig.show()

In [8]:
# Rebuild the per-position mean reorder rate over ALL cart positions
# (numeric positions, no top-20 filter) and show the correlation matrix.
order_seq = (
    train
    .groupby('add_to_cart_order')['reordered']
    .mean()
    .rename('reorder_rate')
    .reset_index()
)
order_seq.corr()

Unnamed: 0,add_to_cart_order,reorder_rate
add_to_cart_order,1.0,-0.416472
reorder_rate,-0.416472,1.0


In [9]:
import statsmodels.api as sm

# Simple linear regression: mean reorder rate ~ intercept + cart position.
# NOTE(review): every row of `order_seq` is a per-position mean, so each position
# counts equally in this unweighted fit regardless of how many rows it aggregates
# — confirm this is intended.
X = sm.add_constant(order_seq[['add_to_cart_order']])  # design matrix with a constant term
y = order_seq['reorder_rate']

# Fit by ordinary least squares.
model = sm.OLS(y, X).fit()

# Print the full regression summary table.
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:           reorder_rate   R-squared:                       0.173
Model:                            OLS   Adj. R-squared:                  0.163
Method:                 Least Squares   F-statistic:                     16.37
Date:                Wed, 16 Apr 2025   Prob (F-statistic):           0.000122
Time:                        15:11:11   Log-Likelihood:                 77.428
No. Observations:                  80   AIC:                            -150.9
Df Residuals:                      78   BIC:                            -146.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.5408      0.02

In [10]:
# Label each order line as belonging to a 'high' or 'low' reorder-rate product.
# Per-product reorder rate, computed from `train` itself.
prod_reorder = train.groupby('product_id').reordered.mean().to_frame('reorder_rate').reset_index()
# merge() already returns a new frame, so `train` is left untouched without an explicit copy.
train_copy = train.merge(prod_reorder, how='left', on='product_id')

# The median is taken over order lines (one row per purchased item), not over distinct products.
median_rate = train_copy['reorder_rate'].median()
# Vectorised split: rows at or above the median are 'high', everything else 'low'.
is_high = train_copy['reorder_rate'] >= median_rate
train_copy['group'] = is_high.map({True: 'high', False: 'low'})

# NOTE(review): the groups are derived from the same `reordered` column that is
# averaged here, so a gap between the two means is expected by construction.
train_copy.groupby('group').reordered.mean()

group
high    0.727181
low     0.468325
Name: reordered, dtype: float64

In [11]:
from scipy.stats import ttest_ind

# Welch's t-test (unequal variances) on the `reordered` flag of the two product groups.
high = train_copy.loc[train_copy['group'] == 'high', 'reordered']
low = train_copy.loc[train_copy['group'] == 'low', 'reordered']

t_stat, p_value = ttest_ind(high, low, equal_var=False)
print(t_stat, p_value)

321.88335525678906 0.0


In [12]:
# Display the order-line frame with the added `reorder_rate` and `group` columns.
train_copy

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,reorder_rate,group
0,1,49302,1,1,0.875000,high
1,1,11109,2,1,0.743056,high
2,1,10246,3,0,0.535782,low
3,1,49683,4,0,0.708247,high
4,1,43633,5,1,0.416667,low
...,...,...,...,...,...,...
1384612,3421063,14233,3,1,0.785199,high
1384613,3421063,35548,4,1,0.555556,low
1384614,3421070,35951,1,1,0.773438,high
1384615,3421070,16953,2,1,0.548940,low


In [13]:
# Display the high/low label assigned to every order line.
train_copy['group']

0          high
1          high
2           low
3          high
4           low
           ... 
1384612    high
1384613     low
1384614    high
1384615     low
1384616    high
Name: group, Length: 1384617, dtype: object

In [14]:
# Bar chart comparing the mean `reordered` flag of the 'high' and 'low' product groups.
# Compute the group means once, and pin the row order explicitly so each value is
# guaranteed to line up with its hard-coded x label and colour below.
group_means = (
    train_copy
    .groupby('group')['reordered']
    .mean()
    .reindex(['high', 'low'])
)

fig = go.Figure(
    data=[
        go.Bar(
            x=['재구매율이 높은 제품', '재구매율이 낮은 제품'],
            y=group_means,
            marker_color=['#1f77b4', '#ff7f0e'],
            text=group_means.round(2),  # value label drawn on each bar
            textposition='auto'
        )
    ]
)

fig.update_layout(
    title='제품 그룹별 평균 재구매율 비교',
    xaxis_title='그룹',
    yaxis_title='재구매율',
    template='simple_white'
)

fig.show()