## Python绘制桑基图分析广告转化数据

桑基图（Sankey Diagram）是一种特定类型的流图，用于描述一组值到另一组值的流向。

桑基图的特点如下：
* 起始流量和结束流量相同，所有主支宽度的总和与所有分出去的分支宽度总和相等，保持能量的平衡；
* 在内部，不同的线条代表了不同的流量分流情况，它的宽度成比例地显示此分支占有的流量；
* 节点不同的宽度代表了特定状态下的流量大小。

### 1. 读取广告转化数据集

In [1]:
import pandas as pd

In [2]:
# Ad Conversions Data set, downloaded from Kaggle:
# https://www.kaggle.com/loveall/clicks-conversion-tracking/data
csv_path = "./datas/KAG_conversion_data.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first 10 rows of the loaded data.
df.head(n=10)

Unnamed: 0,ad_id,xyz_campaign_id,fb_campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion
0,708746,916,103916,30-34,M,15,7350,1,1.43,2,1
1,708749,916,103917,30-34,M,16,17861,2,1.82,2,0
2,708771,916,103920,30-34,M,20,693,0,0.0,1,0
3,708815,916,103928,30-34,M,28,4259,1,1.25,1,0
4,708818,916,103928,30-34,M,28,4133,1,1.29,1,1
5,708820,916,103929,30-34,M,29,1915,0,0.0,1,1
6,708889,916,103940,30-34,M,15,15615,3,4.77,1,0
7,708895,916,103941,30-34,M,16,10951,1,1.27,1,1
8,708953,916,103951,30-34,M,27,2355,1,1.5,1,0
9,708958,916,103952,30-34,M,28,9502,3,3.16,1,0


绘图目的：展示广告条数（数据行数）沿以下路径的逐层分布：
总流量 > 不同的活动 > 性别 > 年龄

### 2. 构造桑基图数据集

In [4]:
# Data preparation: add a constant root column and prefix the values of the
# three level columns so each value becomes a readable Sankey node name.
def _add_prefix(series, prefix):
    """Return `series` as strings that start with `prefix`.

    The prefix is only added to values that do not already carry it, so
    re-running this cell does not produce names like "campaign_campaign_916".
    """
    text = series.astype(str)
    return text.where(text.str.startswith(prefix), prefix + text)


df["total"] = "全量"
df["xyz_campaign_id"] = _add_prefix(df["xyz_campaign_id"], "campaign_")
df["gender"] = _add_prefix(df["gender"], "gender_")
df["age"] = _add_prefix(df["age"], "age_")

In [5]:
# Preview the first 10 rows again to check the new and rewritten columns.
df.head(n=10)

Unnamed: 0,ad_id,xyz_campaign_id,fb_campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,total
0,708746,campaign_916,103916,age_30-34,gender_M,15,7350,1,1.43,2,1,全量
1,708749,campaign_916,103917,age_30-34,gender_M,16,17861,2,1.82,2,0,全量
2,708771,campaign_916,103920,age_30-34,gender_M,20,693,0,0.0,1,0,全量
3,708815,campaign_916,103928,age_30-34,gender_M,28,4259,1,1.25,1,0,全量
4,708818,campaign_916,103928,age_30-34,gender_M,28,4133,1,1.29,1,1,全量
5,708820,campaign_916,103929,age_30-34,gender_M,29,1915,0,0.0,1,1,全量
6,708889,campaign_916,103940,age_30-34,gender_M,15,15615,3,4.77,1,0,全量
7,708895,campaign_916,103941,age_30-34,gender_M,16,10951,1,1.27,1,1,全量
8,708953,campaign_916,103951,age_30-34,gender_M,27,2355,1,1.5,1,0,全量
9,708958,campaign_916,103952,age_30-34,gender_M,28,9502,3,3.16,1,0,全量


In [6]:
# Pyecharts Sankey input 1: the list of node names.
# Stack the four label columns into one Series and keep each distinct label once.
node_columns = ["total", "xyz_campaign_id", "age", "gender"]
nodes = pd.concat([df[column] for column in node_columns]).unique()
nodes

array(['全量', 'campaign_916', 'campaign_936', 'campaign_1178', 'age_30-34',
       'age_35-39', 'age_40-44', 'age_45-49', 'gender_M', 'gender_F'],
      dtype=object)

In [7]:
# Pyecharts Sankey input 2: the list of links between nodes.
# `type_list` holds the levels in flow order; every pair of neighbouring
# levels contributes one group of [from, to, value] links.
type_list = ["total", "xyz_campaign_id", "gender", "age"]
from_to_list = []
for from_type, to_type in zip(type_list, type_list[1:]):
    print(from_type, to_type)

    # Count the rows for every (from, to) combination of the two levels.
    df_agg = df.groupby([from_type, to_type]).size().reset_index()
    df_agg.columns = ["from", "to", "value"]

    # Convert the whole frame to [from, to, value] lists in one call
    # instead of unpacking it row by row with iterrows().
    from_to_list.extend(df_agg.values.tolist())

from_to_list

total xyz_campaign_id
xyz_campaign_id gender
gender age


[['全量', 'campaign_1178', 625],
 ['全量', 'campaign_916', 54],
 ['全量', 'campaign_936', 464],
 ['campaign_1178', 'gender_F', 276],
 ['campaign_1178', 'gender_M', 349],
 ['campaign_916', 'gender_F', 19],
 ['campaign_916', 'gender_M', 35],
 ['campaign_936', 'gender_F', 256],
 ['campaign_936', 'gender_M', 208],
 ['gender_F', 'age_30-34', 197],
 ['gender_F', 'age_35-39', 109],
 ['gender_F', 'age_40-44', 107],
 ['gender_F', 'age_45-49', 138],
 ['gender_M', 'age_30-34', 229],
 ['gender_M', 'age_35-39', 139],
 ['gender_M', 'age_40-44', 103],
 ['gender_M', 'age_45-49', 121]]

In [8]:
# Illustration of the intermediate step: the from / to / value table
# computed for one pair of adjacent levels (total -> campaign).
df_agg = (
    df.groupby(["total", "xyz_campaign_id"])
    .size()
    .reset_index(name="value")
    .rename(columns={"total": "from", "xyz_campaign_id": "to"})
)

df_agg

Unnamed: 0,from,to,value
0,全量,campaign_1178,625
1,全量,campaign_916,54
2,全量,campaign_936,464


### 3. 绘制桑基图

In [9]:
# 1. Wrap every node name in the {"name": ...} dict form passed to the Sankey chart.
pyecharts_nodes = []
for node_name in nodes:
    pyecharts_nodes.append({"name": node_name})
pyecharts_nodes

[{'name': '全量'},
 {'name': 'campaign_916'},
 {'name': 'campaign_936'},
 {'name': 'campaign_1178'},
 {'name': 'age_30-34'},
 {'name': 'age_35-39'},
 {'name': 'age_40-44'},
 {'name': 'age_45-49'},
 {'name': 'gender_M'},
 {'name': 'gender_F'}]

In [10]:
# 2. Turn each [from, to, value] triple into a link dict for the Sankey chart.
pyecharts_links = []
for src, dst, weight in from_to_list:
    pyecharts_links.append({"source": src, "target": dst, "value": weight})
pyecharts_links

[{'source': '全量', 'target': 'campaign_1178', 'value': 625},
 {'source': '全量', 'target': 'campaign_916', 'value': 54},
 {'source': '全量', 'target': 'campaign_936', 'value': 464},
 {'source': 'campaign_1178', 'target': 'gender_F', 'value': 276},
 {'source': 'campaign_1178', 'target': 'gender_M', 'value': 349},
 {'source': 'campaign_916', 'target': 'gender_F', 'value': 19},
 {'source': 'campaign_916', 'target': 'gender_M', 'value': 35},
 {'source': 'campaign_936', 'target': 'gender_F', 'value': 256},
 {'source': 'campaign_936', 'target': 'gender_M', 'value': 208},
 {'source': 'gender_F', 'target': 'age_30-34', 'value': 197},
 {'source': 'gender_F', 'target': 'age_35-39', 'value': 109},
 {'source': 'gender_F', 'target': 'age_40-44', 'value': 107},
 {'source': 'gender_F', 'target': 'age_45-49', 'value': 138},
 {'source': 'gender_M', 'target': 'age_30-34', 'value': 229},
 {'source': 'gender_M', 'target': 'age_35-39', 'value': 139},
 {'source': 'gender_M', 'target': 'age_40-44', 'value': 103},
 {'source': 'gender_M', 'target': 'age_45-49', 'value': 121}]

In [11]:
from pyecharts import options as opts
from pyecharts.charts import Sankey

# Option objects for the chart: link line style (opacity 0.2, curve 0.5,
# color="source"), node labels positioned "right", and the chart title.
line_style = opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source")
node_label = opts.LabelOpts(position="right")
chart_title = opts.TitleOpts(title="广告转化数据桑基图")

# Build the chart step by step: one unnamed series holding the nodes and links.
sankey = Sankey()
sankey.add(
    "",
    pyecharts_nodes,
    pyecharts_links,
    linestyle_opt=line_style,
    label_opts=node_label,
)
sankey.set_global_opts(title_opts=chart_title)

sankey.render_notebook()