In [40]:
import os
import warnings
import pandas as pd 
import numpy as np
import datetime as dt

import seaborn as sns
sns.set_style('whitegrid')
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected = True)
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

In [41]:
# Load the raw transaction data. The path is specific to the Kaggle runtime;
# adjust it when running the notebook elsewhere.
# NOTE(review): 'unicode_escape' is presumably used because the file is not
# valid UTF-8 — confirm against the data source.
df = pd.read_csv("/kaggle/input/ecommerce-data/data.csv",encoding = 'unicode_escape')

In [42]:
# Preview the first ten rows of the loaded data.
df.head(10)

# Data Cleaning

In [43]:
# Count missing values per column.
df.isnull().sum()

In [44]:
# Drop every row containing at least one missing value (in place), then
# re-count the missing values to confirm none remain.
df.dropna(inplace = True)
df.isnull().sum()

In [45]:
# (rows, columns) remaining after the rows with missing values were dropped.
df.shape

In [46]:
# Column dtypes and non-null counts.
df.info()

In [47]:
# Summary statistics of the numeric columns.
df.describe()

In [48]:
# Convert InvoiceDate to datetime so that the .dt accessor can be used later.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [49]:
# Revenue of each row: unit price multiplied by quantity.
df['TotalPrice'] = df['UnitPrice'] * df['Quantity']
df.head()

In [50]:
# Number of rows per country, most frequent first.
df['Country'].value_counts().reset_index()

In [51]:
# Keep only United Kingdom rows with strictly positive price, quantity and
# revenue, and drop invoices whose number contains "C"
# (NOTE(review): presumably cancellations — confirm against the data dictionary).
# .copy() detaches the filtered result from the original frame, so the in-place
# column modifications performed later do not raise SettingWithCopyWarning.
df = df[(df['Country']=='United Kingdom')&
        (df['UnitPrice']>0)&
        (df['TotalPrice']>0)&
        (df['Quantity']>0)&
        (~df['InvoiceNo'].str.contains("C", na=False))].copy()

# RFM Analysis

## Remove Outliers

In [52]:
def outlier_thresholds(dataframe, variable, low_quantile=0.01, high_quantile=0.99):
    """Compute lower/upper capping limits for one column.

    The limits are placed 1.5 inter-quantile ranges outside the chosen
    quantiles. With the defaults these are the 1st and 99th percentiles
    (not the quartiles), so only very extreme values fall outside.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    variable : str
        Name of the column to analyse.
    low_quantile, high_quantile : float, optional
        Quantiles (between 0 and 1) used as the anchors of the range.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    lower_anchor = dataframe[variable].quantile(low_quantile)
    upper_anchor = dataframe[variable].quantile(high_quantile)
    interquantile_range = upper_anchor - lower_anchor
    up_limit = upper_anchor + 1.5 * interquantile_range
    low_limit = lower_anchor - 1.5 * interquantile_range
    return low_limit, up_limit

def replace_with_thresholds(dataframe, variable):
    """Cap the values of one column at its outlier limits, modifying the frame.

    Values below the lower limit are raised to it and values above the upper
    limit are lowered to it; the limits come from ``outlier_thresholds``.

    ``Series.clip`` is used instead of masked ``.loc`` assignment because the
    limits are floats: writing them element-wise into an integer column is a
    deprecated incompatible-dtype assignment in recent pandas versions,
    whereas replacing the whole column is always allowed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    variable : str
        Name of the column to cap.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)
    
# Cap extreme quantities and unit prices at their outlier limits.
replace_with_thresholds(df, "Quantity")
replace_with_thresholds(df, "UnitPrice")

# TotalPrice was derived from the uncapped columns earlier in the notebook.
# Recompute it here, otherwise the capping has no effect on the Monetary metric.
df['TotalPrice'] = df['UnitPrice'] * df['Quantity']

## Recency

In [53]:
# Calendar date of the most recent invoice; recency is measured against it.
latest_invoice = df['InvoiceDate'].max()
max_date = latest_invoice.date()
max_date

In [54]:
# Recency = whole days between the reference date and each customer's
# most recent invoice date.
last_purchase = df.groupby('CustomerID')['InvoiceDate'].max()
recency = (
    last_purchase
    .map(lambda stamp: (max_date - stamp.date()).days)
    .rename('Recency')
    .reset_index()
)
recency.head()

## Frequency

In [55]:
# One row per (customer, invoice) holding the invoice total ...
f_df = df.groupby(['CustomerID', 'InvoiceNo'])[['TotalPrice']].sum()
# ... so the number of rows per customer is the number of distinct invoices.
frequency = f_df.groupby('CustomerID').size().to_frame('Frequency')
frequency.head()

## Monetary

In [56]:
# Monetary = total revenue per customer.
# as_index=False keeps CustomerID as a regular column instead of the index.
monetary = (
    df.groupby('CustomerID', as_index=False)['TotalPrice']
    .sum()
    .rename(columns={'TotalPrice': 'Monetary'})
)
monetary.head()

In [57]:
# Combine the three per-customer metrics into a single table keyed by CustomerID.
rf = recency.merge(frequency, how='outer', on='CustomerID')
rfm = rf.merge(monetary, how='outer', on='CustomerID')
rfm.head()

In [58]:
# Distribution of the number of invoices per customer.
rfm['Frequency'].plot.hist()

In [59]:
# Distribution of days since each customer's last purchase.
rfm['Recency'].plot.hist()

In [60]:
# Distribution of total revenue per customer.
rfm['Monetary'].plot.hist()

In [61]:
# Check that the outer merges did not introduce missing values.
rfm.isnull().sum()

## RFM Segmentation

In [62]:
# Split each metric into quintiles and label them 1-5.
score_labels = [1, 2, 3, 4, 5]

# Recency is reversed: the smallest recency (most recent buyers) gets score 5.
rfm['r_score'] = pd.qcut(rfm['Recency'], q=5, labels=score_labels[::-1])

# Frequency is ranked first; rank(method='first') orders tied values by their
# position in the data, which gives qcut distinct values to bin.
frequency_rank = rfm['Frequency'].rank(method='first')
rfm['f_score'] = pd.qcut(frequency_rank, q=5, labels=score_labels)

rfm['m_score'] = pd.qcut(rfm['Monetary'], q=5, labels=score_labels)

In [63]:
# Two-character code made of the recency score followed by the frequency score
# (the monetary score is not part of it).
rfm['rfm_score'] = rfm['r_score'].astype(str) + rfm['f_score'].astype(str)

In [64]:
# Preview the table with the score columns added.
rfm.head()

## Segments Description
- Hibernating : 최근에 구매를 하지 않았으며 구매 빈도가 낮은 고객
- At Risk : 최근에 구매를 하지 않았으며 구매 빈도는 보통~낮은 편인 고객
- Can’t Lose Them : 최근에 구매를 하지 않았지만 구매 빈도는 높은 고객
- About To Sleep : 구매 시기는 보통이지만 구매 빈도는 낮은 고객
- Need Attention : 구매 시기, 구매 빈도 보통인 고객
- Loyal Customers : 최근에 구매하였고 구매 빈도는 보통~높은 편인 고객
- Promising : 최근에 구매하였지만 구매 빈도는 낮은 고객
- New Customers : 아주 최근에 구매하였지만 구매 빈도는 낮은 고객
- Potential Loyalist : 최근에 구매하였지만 구매 빈도는 보통인 고객
- Champions : 아주 최근에 구매하였으며 구매 빈도가 높은 고객

In [65]:
# Regex patterns over the two-character rfm_score (first character = recency
# score, second = frequency score) mapped to a segment name.
seg_map = {r'[1-2][1-2]': 'hibernating',
           r'[1-2][3-4]': 'at_Risk',
           r'[1-2]5': 'cant_loose',
           r'3[1-2]': 'about_to_sleep',
           r'33': 'need_attention',
           r'[3-4][4-5]': 'loyal_customers',
           r'41': 'promising',
           r'51': 'new_customers',
           r'[4-5][2-3]': 'potential_loyalists',
           r'5[4-5]': 'champions'}

rfm['segment'] = rfm['rfm_score'].replace(seg_map, regex=True) # with regex=True the dict keys are interpreted as regular expressions
rfm.head()

In [66]:
# Number of customers per segment, smallest segment first.
rfm['segment'].value_counts().sort_values()

In [67]:
# Average recency, frequency and monetary value within each segment.
rfm.groupby('segment')[['Recency', 'Frequency', 'Monetary']].agg('mean')

In [68]:
# Scatter of Recency against Frequency with one trace per rule-based segment.
fig = go.Figure()
for segment_name in rfm['segment'].unique():
    segment_rows = rfm[rfm['segment'] == segment_name]
    fig.add_trace(
        go.Scatter(
            x=segment_rows['Recency'],
            y=segment_rows['Frequency'],
            mode='markers',
            name=str(segment_name),
            showlegend=True,
            marker=dict(size=10, opacity=0.6),
        )
    )

# Title position (x, y) and the anchor point it refers to.
title_settings = dict(text="Recency & Frequency by Segments",
                      y=0.9, x=0.5,
                      xanchor='center', yanchor='top')

fig.update_layout(
    title=title_settings,
    legend=dict(x=0.9, y=1),
    xaxis=dict(title='Recency'),    # x-axis label
    yaxis=dict(title='Frequency'),  # y-axis label
    template='plotly_white',
)

fig.show()

In [69]:
# Count customers per segment once and read labels/counts from the Series.
# Wrapping value_counts() in a DataFrame and selecting the 'segment' column
# fails with KeyError on pandas >= 2.0, where the counts column is named 'count'.
segment_counts = rfm['segment'].value_counts()

data = go.Bar(y = segment_counts.index,
              x = segment_counts.values,
              name = str(segment_counts.values),
              orientation='h')

layout = go.Layout(title={'text': "Number of Customer by Segments",
                          'y':0.9, # title y position
                          'x':0.5, # title x position
                          'xanchor': 'center', # horizontal anchor of the title
                          'yanchor': 'top'}, # vertical anchor of the title
                   xaxis =dict(title='Customers'), # x-axis label
                   template = 'plotly_white')

fig=go.Figure(data=data, layout=layout)
fig.update_xaxes(range=[0,1300]) # fixed x-axis range
iplot(fig)

# K-means Clustering

### Elbow Method

In [70]:
# Cluster on the recency and frequency scores only.
kmeans_data = rfm.loc[:,['r_score','f_score']]

# Inertia (within-cluster sum of squared distances) for k = 1..9.
inertias = []
k = [1,2,3,4,5,6,7,8,9]

for i in k:
    # A fixed random_state makes the elbow curve reproducible between runs;
    # 42 matches the seed used for the final model.
    kmean = KMeans(n_clusters=i, random_state=42)
    kmean.fit(kmeans_data)
    inertias.append(kmean.inertia_)

In [71]:
# Line chart of inertia against the number of clusters.
elbow_title = dict(text="Elbow Method",
                   y=0.9, x=0.5,                      # title position
                   xanchor='center', yanchor='top')   # title anchors

data = go.Scatter(x=k, y=inertias, mode='lines+markers', marker=dict(size=10))

layout = go.Layout(
    title=elbow_title,
    width=600,
    height=470,
    xaxis=dict(title='Number of Clusters'),        # x-axis label
    yaxis=dict(title='Sum of Squared Distance'),   # y-axis label
    template='plotly_white',
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

### Silhouette Score

In [72]:
# Fit the final model with four clusters and report its silhouette score.
kmeans = KMeans(n_clusters=4, random_state=42).fit(kmeans_data)

silhouette = metrics.silhouette_score(kmeans_data, kmeans.labels_)
round(silhouette, 3)  # four clusters

군집 수를 4로 결정

### Segment Analysis

In [73]:
# Store the K-Means cluster label of each row; labels_ is in the row order of
# the data the model was fitted on.
rfm['segment2'] = kmeans.labels_
rfm.head()

In [74]:
# Per-cluster summary statistics. The aggregations are passed as a list rather
# than a set so that the output columns always appear in this order.
rfm[['Recency', 'Frequency', 'Monetary', 'segment2']].groupby('segment2').agg(['mean', 'std', 'max', 'min'])

In [75]:
# Number of customers in each K-Means cluster.
rfm['segment2'].value_counts().reset_index()

In [76]:
# Scatter of Recency against Frequency with one trace per K-Means cluster.
fig = go.Figure()
for cluster_id in rfm['segment2'].unique():
    cluster_rows = rfm[rfm['segment2'] == cluster_id]
    fig.add_trace(
        go.Scatter(
            x=cluster_rows['Recency'],
            y=cluster_rows['Frequency'],
            mode='markers',
            name=str(cluster_id),
            showlegend=True,
            marker=dict(size=10, opacity=0.6),
        )
    )

title_settings = dict(text="Recency & Frequency by Segments",
                      y=0.9, x=0.5,
                      xanchor='center', yanchor='top')

fig.update_layout(
    title=title_settings,
    legend=dict(x=0.9, y=1),
    xaxis=dict(title='Recency'),
    yaxis=dict(title='Frequency'),
    template='plotly_white',
)

fig.show()

In [77]:
# Count customers per cluster once and read labels/counts from the Series.
# Wrapping value_counts() in a DataFrame and selecting the 'segment2' column
# fails with KeyError on pandas >= 2.0, where the counts column is named 'count'.
cluster_counts = rfm['segment2'].value_counts()

data = go.Bar(y = cluster_counts.index,
              x = cluster_counts.values,
              name = str(cluster_counts.values),
              orientation='h')

layout = go.Layout(title={'text': "Number of Customer by K-Means Segments",
                          'y':0.9,
                          'x':0.5,
                          'xanchor': 'center',
                          'yanchor': 'top'},
                   xaxis =dict(title='Customers'),
                   template = 'plotly_white')

fig=go.Figure(data=data, layout=layout)
fig.update_xaxes(range=[0,1900]) # fixed x-axis range
iplot(fig)

In [78]:
# 3 x 4 grid of box plots: one row per RFM metric, one column per cluster.
fig = make_subplots(rows=3, cols=4)

metric_rows = ['Recency', 'Frequency', 'Monetary']
cluster_colors = {0: 'blue', 1: 'red', 2: 'orange', 3: 'green'}

# Traces are added row by row (metric) and, within a row, by cluster id.
for row_number, metric_name in enumerate(metric_rows, start=1):
    for cluster_id, box_color in cluster_colors.items():
        cluster_values = rfm.loc[rfm['segment2'] == cluster_id, metric_name]
        fig.add_trace(
            go.Box(y=cluster_values,
                   showlegend=False,
                   name=str(cluster_id),
                   marker_color=box_color),
            row=row_number,
            col=cluster_id + 1,
        )

fig.update_layout(
    title=dict(text="RFM by K-Means Segments",
               y=0.9, x=0.5,
               xanchor='center', yanchor='top'),
    width=800,
    height=650,
    template='plotly',
)

# Label only the first column of every row with the metric name.
for row_number, metric_name in enumerate(metric_rows, start=1):
    fig.update_yaxes(title_text=metric_name, row=row_number, col=1)

iplot(fig)

- 0번 군집은 다른 군집에 비해 Recency가 높다. 즉, 최근에 활동한 적이 없는 고객 군집이다.
- 0, 3번 군집은 다른 군집에 비해 Frequency가 낮다. 즉, 구매 빈도가 적은 고객 군집이다.
- 1번 군집은 다른 군집들에 비해 Monetary가 높다. 즉, 지출 금액이 큰 고객 군집이다.

### Suggestion
- 0번 군집 (저수익성 고객) -> 마케팅 비용 절감
- 1번 군집 (VIP 고객, R/F/M 우수) -> 프리미엄 서비스 운영, 신제품 홍보 등
- 2번 군집 (이탈 우려 고객 , F/M 우수) -> 활성화 프로모션, 할인 쿠폰 제공
- 3번 군집 (신경 써야할 고객) -> 활성화 프로모션