In [31]:
import pandas as pd
import plotly.graph_objects as go
import plotly_express as px

event = pd.read_csv('event.csv')
distribution_center = pd.read_csv('distribution_center.csv')
inventory_items = pd.read_csv('inventory_items.csv')
order_items = pd.read_csv('order_items.csv')
orders = pd.read_csv('orders.csv')
users = pd.read_csv('users.csv', low_memory=False)
products = pd.read_csv('products.csv')

event.created_at = pd.to_datetime(event.created_at)
inventory_items.created_at = pd.to_datetime(inventory_items.created_at)
order_items.created_at = pd.to_datetime(order_items.created_at)
orders.created_at = pd.to_datetime(orders.created_at)
users.created_at = pd.to_datetime(users.created_at)

In [32]:
# 회원 세션 수 
member_session = event[event.user_id!=-1].drop_duplicates('session_id')

# 비회원 세션 수
non_memeber_session = event[event.user_id==-1].drop_duplicates('session_id', keep='first')

user_ratio = pd.DataFrame({
    'User_Type': ['회원', '비회원'],
    'Count': [
        len(member_session),
        len(non_memeber_session)
    ]
})

fig = px.pie(user_ratio, names='User_Type', values='Count')

fig.update_layout(
    title=dict(
        text='회원/비회원 세션수 비교',
        x=0.5  # 가운데 정렬
    ),
    legend=dict(
        x=1,
        xanchor='left',
        yanchor='middle'
    )
)
fig.update_traces(textinfo='percent+label')
fig.show()

In [33]:
item_cnt = inventory_items.groupby('product_category').size().sort_values().to_frame('cnt').reset_index()

fig = go.Figure(
    data=[
        go.Bar(
            x=item_cnt.cnt,
            y=item_cnt.product_category,
            marker=dict(
                color=item_cnt.cnt,
                colorscale='Viridis'
            ),
            orientation='h'
        )
    ]
)

fig.update_layout(
    title=dict(text='카테고리별 남은 제품 수',
               x=0.5),
    xaxis_title='제품 수',
    yaxis_title='카테고리',
    height=600,
    template='simple_white'
)

fig.show()

In [34]:
user_gender = users.groupby('gender').size().to_frame('cnt').reset_index()
user_gender['ratio'] = (user_gender['cnt']/user_gender['cnt'].sum()*100).round(2)

fig = px.pie(user_gender, names='gender', values='ratio')
fig.update_layout(
    title=dict(
        text='유저 성별 비율',
        x=0.5  # 가운데 정렬
    ),
    legend=dict(
        x=1,
        xanchor='left',
        yanchor='middle'
    )
)
fig.update_traces(textinfo='percent+label')
fig.show()

In [35]:
traffic_cnt = users.groupby('traffic_source').size().sort_values(ascending=False).to_frame('cnt').reset_index()

fig = go.Figure(
    data=[
        go.Bar(
            x=traffic_cnt.traffic_source,
            y=traffic_cnt.cnt,
            # marker=dict(
            #     color=traffic_cnt.traffic_source
            # )
            text=traffic_cnt.cnt 
        )
    ]
)

fig.update_layout(
    title=dict(text='트래픽 소스별 유입 유저 수',
               x=0.5),
    xaxis_title='트래픽 소스',
    yaxis_title='유입 유저 수',
    height=600,
    template='simple_white'
)
fig.show()

In [36]:
user_country = users.groupby('country').size().sort_values().to_frame('cnt').reset_index()

fig = go.Figure(
    data=[
        go.Bar(
            x=user_country.cnt,
            y=user_country.country,
            marker=dict(
                color=user_country.cnt,
                colorscale='Plotly3'
            ),
            orientation='h',
            text=user_country.cnt
        )
    ]
)

fig.update_layout(
    title=dict(text='국가별 유저 수',
               x=0.5),
    xaxis_title='유저 수',
    yaxis_title='국가',
    height=600,
    template='simple_white'
)

fig.show()

In [37]:
users['created_at'] = pd.to_datetime(users.created_at)
users['year'] = users.created_at.dt.year

users.groupby('year').size()
# users.drop_duplicates(['user_id'])

year
2019    15355
2020    15647
2021    15413
2022    15498
2023    15510
2024    15598
2025     6979
dtype: int64

In [38]:
import plotly.express as px

fig = px.scatter_mapbox(
    distribution_center,
    lat="latitude",
    lon="longitude",
    hover_name="name",
    hover_data=["distribution_center_id"],
    zoom=3,  # 미국 전체가 보이도록 적당한 줌
    height=600
)

fig.update_traces(marker=dict(size=10))

fig.update_layout(
    mapbox_style="open-street-map",
    title="물류센터 위치",
    title_x=0.5,
    mapbox_center={"lat": 37.5, "lon": -95.5}  
)

fig.show()


In [39]:
order_status = orders.groupby('status').size().to_frame('cnt').reset_index().sort_values('cnt', ascending=False)

fig = px.pie(order_status, names='status', values='cnt')
fig.update_layout(
    title=dict(
        text='주문 상태별 비율',
        x=0.475 # 가운데 정렬
    )
)
fig.update_traces(textinfo='percent+label')
fig.show()

In [40]:
orders.created_at = pd.to_datetime(orders.created_at)
order_items.created_at = pd.to_datetime(orders.created_at)

# 년도 -월 
orders['year_month'] = orders['created_at'].dt.to_period('M').astype(str)
order_items['year_month'] = orders['created_at'].dt.to_period('M').astype(str)

orders_non = orders[orders.status.isin(['Cancelled', 'Returned'])]
order_items_non = order_items[~order_items.status.isin(['Cancelled', 'Returned'])]


monthly_order = orders_non.groupby('year_month').size().to_frame('cnt').reset_index().sort_values('year_month')
order_sales = order_items_non.groupby('year_month').sale_price.sum().to_frame('sales').reset_index().sort_values('year_month')

order_summary = pd.merge(monthly_order, order_sales, on='year_month', how='inner').sort_values('year_month')

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=monthly_order['year_month'],
    y=monthly_order['cnt'],
    mode='lines+markers',
    name='주문량',
    line=dict(color='royalblue', width=2),
    hovertemplate='월: %{x}<br>주문 수: %{y}건<extra></extra>'
))

# fig.add_trace(go.Scatter(
#     x=order_summary['year_month'],
#     y=order_summary['sales'],
#     name='매출액',
#     mode='lines+markers',
#     yaxis='y2',
#     line=dict(color='darkorange', width=2, dash='dot'),
#     marker=dict(size=6),
#     hovertemplate='월: %{x}<br>매출: $%{y:,.0f}<extra></extra>'
# ))


fig.update_layout(
    title=dict(text='년도-월별 주문 수 추이',
               x=0.5),
    title_font_size=20,
    xaxis=dict(
        title='년-월',
        tickangle=-45,
        showgrid=True,
    ),
    yaxis=dict(
        title='주문 수',
        showgrid=True,
    ),
    # yaxis2=dict(
    #     title='매출액',
    #     overlaying='y',
    #     side='right',
    #     showgrid=False
    # ),
    
    plot_bgcolor='white',
    margin=dict(t=80, l=60, r=40, b=80),
    hoverlabel=dict(
        bgcolor="white",
        font_size=14,
        font_family="Arial"
    )
)

fig.show()

In [41]:
order_cnt = orders_non.groupby('num_of_item').size().to_frame('cnt').reset_index()
fig = px.pie(order_cnt, names='num_of_item', values='cnt')
fig.update_layout(
    title=dict(
        text='주문당 구매 상품 개수',
        x=0.475 
    )
)
fig.update_traces(textinfo='percent+label')
fig.show()

In [42]:
product_cate = products.groupby('category').size().sort_values().to_frame('cnt').reset_index()

fig = go.Figure(
    data=[
        go.Bar(
            x=product_cate.cnt,
            y=product_cate.category,
            marker=dict(
                color=product_cate.cnt,
                colorscale='Plotly3'
            ),
            orientation='h',
            text=product_cate.cnt
        )
    ]
)

fig.update_layout(
    title=dict(text='카테고리별 등록된 제품 수',
               x=0.5),
    xaxis_title='제품 수',
    yaxis_title='카테고리',
    height=600,
    template='simple_white'
)

fig.show()

In [43]:
fig = px.histogram(products, x='retail_price', nbins=100)

fig.update_layout(
    title=dict(text='소비자 가격 분포',
               x=0.5), 
    xaxis_title='소비자 가격', 
    yaxis_title='도수', 
    template='simple_white', 
)
fig.show()

In [44]:
product_dep = products.groupby('department').size().to_frame('cnt').reset_index().sort_values('cnt', ascending=False)
fig = px.bar(product_dep, x='department', y='cnt', text_auto=True)
fig.update_layout(title=dict(text='제품 성별 구분 빈도',
                             x=0.5),
                  xaxis_title='구분',
                  yaxis_title='빈도')
fig.show()

In [45]:
dep_mean = products.groupby('department').retail_price.mean().round(2).to_frame('mean').reset_index()
dep_med = products.groupby('department').retail_price.median().round(2).to_frame('median').reset_index()

dep_price = pd.merge(dep_mean, dep_med, how='left', on='department').reset_index()
dep_price = dep_price[['department', 'mean', 'median']]

fig = go.Figure()

fig.add_trace(go.Bar(x=dep_price.department, y=dep_price['mean'],
                    name='판매 가격 평균',
                    text=dep_price['mean'],
                    textposition='outside'))

fig.add_trace(go.Bar(x=dep_price.department, y=dep_price['median'],
                    name='판매 가격 증앙값',
                    text=dep_price['median'],
                    textposition='outside'))

fig.update_layout(title=dict(text='성별 구분별 제품 가격 평균/중앙값',
                             x=0.5),
                  xaxis_title='구분',
                  yaxis_title='가격',
                  barmode='group',
                  )
fig.show()

In [46]:
brand_mean = products.groupby('brand').retail_price.mean().sort_values().to_frame('price').reset_index().tail(10)

fig = go.Figure(
    data=[
        go.Bar(
            x=brand_mean.price,
            y=brand_mean.brand,
            marker=dict(
                color=brand_mean.price,
                colorscale='Mint'
            ),
            orientation='h',
            text=brand_mean.price.round(3)
        )
    ]
)

fig.update_layout(
    title=dict(text='판매 금액 평균 상위 10개 브랜드',
               x=0.5),
    xaxis_title='판매 가격 평균',
    yaxis_title='브랜드',
    template='simple_white'
)

fig.show()

In [47]:
return_cancel = round(orders.groupby('user_id').size().mean()-1, 2)
non_return_cancel = round(orders[~orders.status.isin(['Returned', 'Cancelled'])].groupby('user_id').size().mean() -1, 2) 

repurchase = pd.DataFrame({
    '환불 취소 여부': ['환불 취소 포함한 재구매 횟수', '환불 취소 미포함 재구매 횟수'],
    '재구매 횟수': [return_cancel, non_return_cancel]
})

fig = px.bar(repurchase, x='환불 취소 여부', y='재구매 횟수', color='환불 취소 여부', text_auto='.2f',
             template='simple_white')
fig.show()

In [48]:
orders = orders[~orders.status.isin(['Cancelled', 'Returned'])]

orders.sort_values(by=['user_id', 'created_at'], ascending=[True, True])

orders.loc[:, 'previous_purchase'] = orders.groupby('user_id')['created_at'].shift(1)  # 이전 구매 날짜
orders.loc[:, 'purchase_interval'] = (orders['created_at'] - orders['previous_purchase']).dt.days  # 구매 간격(일 단위)
orders

Unnamed: 0,order_id,user_id,status,gender,created_at,returned_at,shipped_at,delivered_at,num_of_item,year_month,previous_purchase,purchase_interval
9389,13,6,Complete,F,2024-08-17 01:49:00,,2024-08-17 19:36:00,2024-08-20 05:44:00,1,2024-08,NaT,
9390,14,6,Complete,F,2024-08-08 01:49:00,,2024-08-09 03:39:00,2024-08-10 18:32:00,1,2024-08,2024-08-17 01:49:00,-9.0
9391,17,8,Complete,F,2024-09-05 18:59:00,,2024-09-08 14:49:00,2024-09-11 05:15:00,3,2024-09,NaT,
9392,20,12,Complete,F,2024-08-10 09:53:00,,2024-08-10 22:11:00,2024-08-13 13:22:00,1,2024-08,NaT,
9393,22,15,Complete,F,2025-01-21 18:46:00,,2025-01-24 09:34:00,2025-01-28 18:14:00,1,2025-01,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...
125286,125273,99985,Shipped,M,2025-04-17 12:41:00,,2025-04-20 05:54:00,,1,2025-04,2025-04-17 12:41:00,0.0
125287,125275,99986,Shipped,M,2020-07-16 11:36:00,,2020-07-17 20:24:00,,2,2020-07,NaT,
125288,125276,99986,Shipped,M,2025-03-13 11:36:00,,2025-03-13 13:38:00,,1,2025-03,2020-07-16 11:36:00,1701.0
125289,125278,99987,Shipped,M,2024-07-22 05:07:00,,2024-07-22 20:41:00,,1,2024-07,NaT,


In [49]:
event_purchase = event[event.event_type=='purchase']

event_purchase['created_at'] = pd.to_datetime(event_purchase['created_at'])

event_purchase.sort_values(by=['user_id', 'created_at'], ascending=[True, True])

event_purchase.loc[:, 'previous_purchase'] = event_purchase.groupby('user_id')['created_at'].shift(1)  # 이전 구매 날짜
event_purchase.loc[:, 'purchase_interval'] = (event_purchase['created_at'] - event_purchase['previous_purchase']).dt.days  # 구매 간격(일 단위)
event_purchase



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,event_id,user_id,sequence_number,session_id,created_at,ip_address,city,state,postal_code,browser,traffic_source,uri,event_type,previous_purchase,purchase_interval
22864,558067,42642,7,cd1ddb4b-cffb-4035-85a9-e0843fa80eaf,2024-01-24 14:10:43,177.212.221.133,São Paulo,São Paulo,02220-000,IE,Email,/purchase,purchase,NaT,
22865,730857,55898,5,46093913-975b-4a28-b9e4-7b27acedb581,2024-12-02 10:15:06,42.156.229.204,São Paulo,São Paulo,02675-031,Chrome,Email,/purchase,purchase,NaT,
22866,432412,33151,5,bce7f1c7-f1ad-4790-8ae2-68d76d4d6f0e,2023-03-21 01:56:22,75.104.239.88,São Paulo,São Paulo,02675-031,Firefox,Email,/purchase,purchase,NaT,
22867,855901,65240,7,918002d3-4bfe-4989-afa3-51211363955a,2023-07-24 00:46:29,145.89.66.155,São Paulo,São Paulo,02675-031,Chrome,Adwords,/purchase,purchase,NaT,
22868,957029,73088,13,95b621cd-b876-404c-87df-7969394fec0d,2021-11-12 10:20:20,24.32.183.102,São Paulo,São Paulo,02675-031,Safari,Facebook,/purchase,purchase,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2428997,632064,48376,13,7ecdff07-0e62-4a9a-8660-05893079e29b,2021-02-23 15:41:46,171.244.2.201,Wigan,England,WN3,IE,Facebook,/purchase,purchase,2021-02-24 17:54:51,-2.0
2428998,119494,8977,7,78246070-1db8-46f5-9ccd-7c19794d149d,2024-06-19 14:26:04,54.16.51.37,Worcester,England,WR2,Chrome,Email,/purchase,purchase,2024-07-14 12:15:54,-25.0
2428999,260465,19824,5,8922b9e4-2fb5-4785-ab88-0c9f53bd011e,2021-02-08 10:33:13,92.144.3.253,Hednesford,England,WS12,Safari,Adwords,/purchase,purchase,NaT,
2429000,831829,63450,5,46efada6-a9c9-4652-9203-343779b71286,2020-05-21 23:50:53,217.251.178.123,Bilston,England,WV14,Safari,Facebook,/purchase,purchase,2019-08-09 02:11:31,286.0


In [50]:
event_purchase = event[event.event_type == 'purchase'].copy()

# 문자열로 바꾸는 건 필요 없음 → 바로 datetime 유지
event_purchase['created_at'] = pd.to_datetime(event_purchase['created_at'])

# 정렬된 결과를 변수에 다시 저장!!
event_purchase = event_purchase.sort_values(by=['user_id', 'created_at'], ascending=[True, True])

# 이전 구매 날짜
event_purchase['previous_purchase'] = event_purchase.groupby('user_id')['created_at'].shift(1)

# 구매 간격
event_purchase['purchase_interval'] = (event_purchase['created_at'] - event_purchase['previous_purchase']).dt.days

event_purchase[(event_purchase.purchase_interval>0) & (event_purchase.purchase_interval.notna())].purchase_interval.mean()

183.82970095612666

In [51]:
repurchase_interval = event_purchase[(event_purchase.purchase_interval.notna())]

fig = px.histogram(repurchase_interval, x='purchase_interval', nbins=100)

fig.update_layout(title=dict(text='재구매 주기 분포',
                             x=0.5),
                  xaxis_title='재구매 주기',
                  template='simple_white')
fig.show()

# 당일 같이 구매한 경우 제외
# 대부분의 유저는 50일 이내 재구매를 한다
# 재구매 기간이 가장 긴 유저는 2177일로 평균 재구매 기간은 183일, 중앙값 33일

In [52]:
# 분 단위 최대 2.94 

event['created_at'] = pd.to_datetime(event['created_at'])
event = event[event.user_id != -1]

product_cart = event[event['event_type'].isin(['cart', 'home', 'department', 'product'])]

product_cart = product_cart.sort_values(['user_id', 'created_at'])

product_cart['next_event_type'] = product_cart.groupby('user_id')['event_type'].shift(-1)
product_cart['next_event_time'] = product_cart.groupby('user_id')['created_at'].shift(-1)

product_to_cart = product_cart[(product_cart.event_type.isin(['home', 'department', 'product'])) & (product_cart.next_event_type=='cart')]

product_to_cart['time_to_cart'] = (product_to_cart['next_event_time'] - product_to_cart['created_at']).dt.total_seconds()


product_to_cart_cnt = product_to_cart.groupby('time_to_cart').size().to_frame('cnt').reset_index()

fig = px.line(product_to_cart_cnt, x='time_to_cart', y='cnt')
fig.update_layout(title=dict(text='home/department/product -> cart 이벤트 전환 시간(초) 분포',
                             x=0.5),
           xaxis_title='전환 시간(초)',
           template='simple_white')
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [53]:
# 일 단위 최대 4.0

event['created_at'] = pd.to_datetime(event['created_at'])

cart_purchase = event[event['event_type'].isin(['cart', 'purchase'])]

cart_purchase = cart_purchase.sort_values(['user_id', 'created_at'])

cart_purchase['next_event_type'] = cart_purchase.groupby('user_id')['event_type'].shift(-1)
cart_purchase['next_event_time'] = cart_purchase.groupby('user_id')['created_at'].shift(-1)

cart_to_purchase = cart_purchase[(cart_purchase.event_type=='cart') & (cart_purchase.next_event_type=='purchase')]

cart_to_purchase['time_to_purchase'] = (cart_to_purchase['next_event_time'] - cart_to_purchase['created_at']).dt.total_seconds()/86400

cart_to_purchase_cnt = cart_to_purchase.groupby('time_to_purchase').size().to_frame('cnt').reset_index()

fig = px.line(cart_to_purchase_cnt, x='time_to_purchase', y='cnt')

fig.update_layout(title=dict(text='cart -> purchase 이벤트 전환 시간(일) 분포',
                             x=0.5),
                  xaxis_title='전환 시간(일)',
                  template='simple_white',
                  xaxis=dict(range=[-0.05,4.1])
                  )

fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [54]:
purchase_try = cart_to_purchase.groupby('user_id').size().to_frame('purchase_try').reset_index()

order_items = order_items[~order_items.status.isin(['Cancelled', 'Returned'])]

order_items_purchase_try = pd.merge(order_items, purchase_try, on='user_id', how='left')

order_items_purchase_try = pd.merge(order_items_purchase_try, order_items_purchase_try.groupby('user_id').size().to_frame('cnt').reset_index(), on='user_id', how='left')
order_items_purchase_try.groupby('purchase_try').cnt.mean()

purchase_try
1.0    1.678942
2.0    2.738239
3.0    3.796089
4.0    4.682392
5.0    5.766418
6.0    6.913309
7.0    7.390625
8.0    9.636364
Name: cnt, dtype: float64

In [55]:
purchase_try = cart_to_purchase.groupby('user_id').size().to_frame('purchase_try').reset_index()

order_items = order_items[~order_items.status.isin(['Cancelled', 'Returned'])]

order_items_sorted = order_items.sort_values(['user_id', 'created_at'])  
order_items_sorted['previous_order_time'] = order_items_sorted.groupby('user_id')['created_at'].shift(1)

order_items_sorted['time_diff'] = (order_items_sorted['created_at'] - order_items_sorted['previous_order_time']).dt.days

order_items_sorted['is_repurchase'] = (order_items_sorted['time_diff'] >= 1).astype(int)

repurchase_cnt = order_items_sorted.groupby('user_id')['is_repurchase'].sum().reset_index()

df = pd.merge(purchase_try, repurchase_cnt, on='user_id', how='left')
df['is_repurchase'] = df['is_repurchase'].fillna(0)

df.groupby('purchase_try')['is_repurchase'].mean()

purchase_try
1    0.129615
2    0.597812
3    1.181740
4    1.762955
5    2.422374
6    3.186567
7    3.578947
8    5.800000
Name: is_repurchase, dtype: float64

In [56]:
df.isna().sum()

user_id          0
purchase_try     0
is_repurchase    0
dtype: int64

In [57]:
from scipy.stats import pearsonr

# df: 'purchase_try', 'is_repurchase' 컬럼 포함된 테이블
corr, p_value = pearsonr(df['purchase_try'], df['is_repurchase'])
print(f'상관계수: {corr:.3f}, p-value: {p_value:.5f}')


상관계수: 0.586, p-value: 0.00000


In [58]:
import statsmodels.api as sm

X = df['purchase_try']
y = df['is_repurchase']

X = sm.add_constant(X)  # 상수항 추가
model = sm.OLS(y, X).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:          is_repurchase   R-squared:                       0.343
Model:                            OLS   Adj. R-squared:                  0.343
Method:                 Least Squares   F-statistic:                 4.178e+04
Date:                Thu, 01 May 2025   Prob (F-statistic):               0.00
Time:                        16:05:18   Log-Likelihood:                -92169.
No. Observations:               79998   AIC:                         1.843e+05
Df Residuals:                   79996   BIC:                         1.844e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.4381      0.005    -82.841   

In [59]:
mean_repurchase = df.groupby('purchase_try')['is_repurchase'].mean().reset_index()

fig = px.line(
    mean_repurchase, 
    x='purchase_try', 
    y='is_repurchase',
    markers=True,
    title='결제 시도 수별 평균 재구매 수',
    labels={'purchase_try': '결제 시도 수', 'is_repurchase': '평균 재구매 수'}
)
fig.update_layout(title=dict(text='결제 시도 수별 평균 재구매 수',
                             x=0.5),
                    template='simple_white')
fig.show()


In [60]:
import pandas as pd
import numpy as np
import plotly.express as px

# === (1) 전처리: 유저별 첫 구매일, 두 번째 구매일 ===

# order_items에는 user_id, order_date 등이 포함돼 있다고 가정
order_items = order_items.sort_values(['user_id', 'created_at'])

# 각 유저의 구매일 rank 부여
order_items['purchase_rank'] = order_items.groupby('user_id')['created_at'].rank(method='first')

# 첫 구매, 두 번째 구매 데이터 추출
first_purchase = order_items[order_items['purchase_rank'] == 1][['user_id', 'created_at']].rename(columns={'created_at': 'first_purchase_date'})
second_purchase = order_items[order_items['purchase_rank'] == 2][['user_id', 'created_at']].rename(columns={'created_at': 'second_purchase_date'})

# 병합 및 재구매까지 걸린 일수 계산
repurchase_info = pd.merge(first_purchase, second_purchase, on='user_id', how='inner')
repurchase_info['days_to_repurchase'] = (repurchase_info['second_purchase_date'] - repurchase_info['first_purchase_date']).dt.days

# 전체 구매 횟수 추가 (장기 재구매 지표)
user_total_cnt = order_items.groupby('user_id').size().reset_index(name='total_purchases')

# 병합
repurchase_info = pd.merge(repurchase_info, user_total_cnt, on='user_id', how='left')


In [61]:
order_items = order_items.sort_values(['user_id', 'created_at'])
order_items['purchase_rank'] = order_items.groupby('user_id')['created_at'].rank(method='first')

first_purchase = order_items[order_items['purchase_rank'] == 1][['user_id', 'created_at']].rename(columns={'created_at': 'first_purchase_date'})
second_purchase = order_items[order_items['purchase_rank'] == 2][['user_id', 'created_at']].rename(columns={'created_at': 'second_purchase_date'})

repurchase_info = pd.merge(first_purchase, second_purchase, on='user_id', how='left')
repurchase_info['days_to_repurchase'] = (repurchase_info['second_purchase_date'] - repurchase_info['first_purchase_date']).dt.days

results = []

total_converted = repurchase_info['second_purchase_date'].notna().sum()

for n in range(1, 91):
    converted_within_n = repurchase_info[
        (repurchase_info['second_purchase_date'].notna()) & 
        (repurchase_info['days_to_repurchase'] <= n)
    ]

    conversion_rate = len(converted_within_n) / total_converted if total_converted else 0

    results.append({'n': n, 'conversion_rate': conversion_rate * 100})  # %

result_df = pd.DataFrame(results)

result_df['conversion_rate_diff'] = result_df['conversion_rate'].diff()
result_df['smoothed_diff'] = result_df['conversion_rate_diff'].rolling(window=7).mean()

fig = px.line(result_df, x='n', y='smoothed_diff',
              title='n일별 전환률 증가량',
              labels={'n': 'n일', 'conversion_rate_diff': '전환률 증가량'})
fig.update_layout(title=dict(text='n일별 전환률 증가량',
                             x=0.5),
                    template='simple_white')
fig.update_traces(mode='lines+markers')
fig.show()