In [1]:
import pandas as pd
import numpy as np

In [2]:
FILE_PATH = '../../data/marketing/attribution_data.csv'
df = pd.read_csv(FILE_PATH)
df.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel
0,00000FkCnDfDDf0iC97iC703B,2018-07-03T13:02:11Z,impression,0,0.0,Instagram
1,00000FkCnDfDDf0iC97iC703B,2018-07-17T19:15:07Z,impression,0,0.0,Online Display
2,00000FkCnDfDDf0iC97iC703B,2018-07-24T15:51:46Z,impression,0,0.0,Online Display
3,00000FkCnDfDDf0iC97iC703B,2018-07-29T07:44:51Z,impression,0,0.0,Online Display
4,0000nACkD9nFkBBDECD3ki00E,2018-07-03T09:44:57Z,impression,0,0.0,Paid Search


* Cookie: 브라우저를 이용해 사이트에 방문한 사용자들에게 부여된 랜덤한 쿠키의 ID
* Timestamp: 사이트에 방문한 시간(쿠키 기록 시간)
* Interaction: 사이트에 방문한 사용자에게 광고가 노출 되었는지, 전환 까지 이루어 졌는지에 대한 카테고리 변수
* Conversion: 0 : 실제 구매가 이루어지지 않음. 1 : 실제 구매가 이루어 졌음
* Conversion Value: 전환의 가치
* Channel: 마케팅에 대한 온라인 채널


In [3]:
df_channels_per_cookie = df.groupby('cookie')['channel'].nunique().reset_index()
df_channels_per_cookie.head()

Unnamed: 0,cookie,channel
0,00000FkCnDfDDf0iC97iC703B,2
1,0000nACkD9nFkBBDECD3ki00E,1
2,0003EfE37E93D0BC03iBhBBhF,1
3,00073CFE3FoFCn70fBhB3kfon,1
4,00079hhBkDF3k3kDkiFi9EFAD,1


In [4]:
# 채널 수 별 유저 수 확인
df_channels_per_cookie.groupby('channel').count()

Unnamed: 0_level_0,cookie
channel,Unnamed: 1_level_1
1,183020
2,45964
3,9549
4,1473
5,102


## Last Interaction Attribution Model
* 마지막으로 노출/클릭된 채널의 기여도를 100%로 계산

In [5]:
TEST_USER = '00000FkCnDfDDf0iC97iC703B'
df.loc[df['cookie'] == TEST_USER]

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel
0,00000FkCnDfDDf0iC97iC703B,2018-07-03T13:02:11Z,impression,0,0.0,Instagram
1,00000FkCnDfDDf0iC97iC703B,2018-07-17T19:15:07Z,impression,0,0.0,Online Display
2,00000FkCnDfDDf0iC97iC703B,2018-07-24T15:51:46Z,impression,0,0.0,Online Display
3,00000FkCnDfDDf0iC97iC703B,2018-07-29T07:44:51Z,impression,0,0.0,Online Display


In [7]:
# 마지막으로 본 광고 선택
df_last_interaction = df.drop_duplicates('cookie', keep='last')[['cookie', 'channel', 'conversion']]
df_last_interaction.head(10)

Unnamed: 0,cookie,channel,conversion
3,00000FkCnDfDDf0iC97iC703B,Online Display,0
9,0000nACkD9nFkBBDECD3ki00E,Paid Search,0
14,0003EfE37E93D0BC03iBhBBhF,Paid Search,0
15,00073CFE3FoFCn70fBhB3kfon,Instagram,0
16,00079hhBkDF3k3kDkiFi9EFAD,Paid Search,0
20,0007iiAiFh3ifoo9Ehn3ABB0F,Instagram,0
21,0007o0nfoh9o79DDfD7DAiEnE,Facebook,0
22,0007oEBhnoF97AoEE3BCkFnhB,Paid Search,1
23,00090n9EBBEkA000C7Cik999D,Facebook,1
28,000A9AfDohfiBAFB0FDf3kDEE,Online Video,0


In [9]:
# conversion 된 경우만 확인
df_last_interaction_conv = df_last_interaction.loc[df_last_interaction['conversion'] == 1]
df_last_interaction_conv.head()

Unnamed: 0,cookie,channel,conversion
22,0007oEBhnoF97AoEE3BCkFnhB,Paid Search,1
23,00090n9EBBEkA000C7Cik999D,Facebook,1
83,000h3n9nC0hFhE3CCnkkAof7n,Facebook,1
109,000hCBnCB7oi7ADAEnEBCnBEE,Online Video,1
124,000kiDB3D0fCfDAohCDB3ohko,Facebook,1


In [14]:
last_interaction_attrib = df_last_interaction_conv.groupby('channel')['cookie'].count().reset_index()
# last interaction 만 남겼기 때문에 어차피 중복 없음 => nunique 대신 count 이용
last_interaction_attrib.rename(
    columns={
        'cookie':'interactions'
    }
)
last_interaction_attrib

Unnamed: 0,channel,cookie
0,Facebook,5301
1,Instagram,2244
2,Online Display,2139
3,Online Video,3408
4,Paid Search,4547


## First Interaction Attribution Model
* 고객이 상호작용한(노출된) 첫 번째 채널이 전환에 100% 기여했다고 간주

In [19]:
df_first_interaction = df.drop_duplicates('cookie', keep='first')[['cookie', 'channel', 'conversion']]
df_first_interaction.head(10)

Unnamed: 0,cookie,channel,conversion
0,00000FkCnDfDDf0iC97iC703B,Instagram,0
4,0000nACkD9nFkBBDECD3ki00E,Paid Search,0
10,0003EfE37E93D0BC03iBhBBhF,Paid Search,0
15,00073CFE3FoFCn70fBhB3kfon,Instagram,0
16,00079hhBkDF3k3kDkiFi9EFAD,Paid Search,0
17,0007iiAiFh3ifoo9Ehn3ABB0F,Instagram,0
21,0007o0nfoh9o79DDfD7DAiEnE,Facebook,0
22,0007oEBhnoF97AoEE3BCkFnhB,Paid Search,1
23,00090n9EBBEkA000C7Cik999D,Facebook,1
24,000A9AfDohfiBAFB0FDf3kDEE,Online Video,0


In [26]:
# 처음 본 광고와 마지막 conversion 여부
df_first_interaction_conv = df_first_interaction[['cookie', 'channel']]
df_first_interaction_conv = df_first_interaction_conv.merge(df_last_interaction_conv[['cookie', 'conversion']], on='cookie')
df_first_interaction_conv.head()

Unnamed: 0,cookie,channel,conversion
0,0007oEBhnoF97AoEE3BCkFnhB,Paid Search,1
1,00090n9EBBEkA000C7Cik999D,Facebook,1
2,000h3n9nC0hFhE3CCnkkAof7n,Facebook,1
3,000hCBnCB7oi7ADAEnEBCnBEE,Online Video,1
4,000kiDB3D0fCfDAohCDB3ohko,Facebook,1


In [32]:
first_interaction_attribute = df_first_interaction_conv.groupby('channel')['cookie'].count().reset_index()
first_interaction_attribute.rename(
    columns={
        'cookie':'attribution'
    }
)
first_interaction_attribute

Unnamed: 0,channel,cookie
0,Facebook,5177
1,Instagram,2329
2,Online Display,2160
3,Online Video,3216
4,Paid Search,4757
