In [6]:
from google.cloud import storage
import pandas as pd

def dt_from_gcs(gcs_bucket, gcs_path):
    """Load a CSV object stored in Google Cloud Storage into a DataFrame.

    A new ``storage.Client()`` is created on every call.

    Args:
        gcs_bucket: Bucket name, passed to ``Client.bucket``.
        gcs_path: Object path inside the bucket, passed to ``Bucket.blob``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` (default
        options) from the blob opened in text mode.
    """
    csv_blob = storage.Client().bucket(gcs_bucket).blob(gcs_path)
    with csv_blob.open("r") as csv_stream:
        frame = pd.read_csv(csv_stream)
    return frame

In [93]:
# Download both raw tables once; later cells work on copies so the
# downloads do not have to be repeated.
candidates_original = dt_from_gcs(gcs_bucket="mgaiduk", gcs_path="events12jan/candidates")
front_events_original = dt_from_gcs(gcs_bucket="mgaiduk", gcs_path="events12jan/front_events")



In [137]:
import copy
# Independent working copies: the downloaded frames stay pristine so this
# cell can be re-run to reset the analysis state.
candidates = candidates_original.copy(deep=True)
front_events = front_events_original.copy(deep=True)

In [121]:
def simplify_cgname(cgname):
    """Drop the first dash-separated token that follows a sharecone prefix.

    For names starting with ``"sharecone-weighted-"`` or
    ``"sharecone-weightedlag-"`` the token immediately after the prefix
    (everything up to and including the next ``"-"``) is removed and the
    prefix is kept. Any other name is returned unchanged.

    Args:
        cgname: Candidate-generator name (a ``str``).

    Returns:
        The simplified name.
    """
    for prefix in ("sharecone-weighted-", "sharecone-weightedlag-"):
        if not cgname.startswith(prefix):
            continue
        remainder = cgname[len(prefix):]
        # Everything after the first "-" survives; with no "-" nothing does.
        _dropped, _sep, kept = remainder.partition("-")
        cgname = prefix + kept
    return cgname

In [136]:
def simplify(dt):
    """Normalise ``cg_name`` and drop slot-based candidate generators.

    Every ``cg_name`` value is passed through ``simplify_cgname``; rows whose
    simplified name is one of the slot-based generators listed below are
    then removed.

    The input frame is NOT modified: the rewritten column is attached to a
    new frame via ``DataFrame.assign``, so callers can safely pass a frame
    they still need in its original form.

    Args:
        dt: DataFrame with a ``cg_name`` column of strings.

    Returns:
        A new DataFrame with simplified ``cg_name`` values and the
        slot-based rows filtered out.
    """
    slot_based_candidates = ["promoted-live-feed", "sharecone-weighted-minview4-[1]", "sharecone-weightedlag-minview4-[1]",
                            "sharecone-weighted-lt2k-[1 1 1 1 1]", "promoted"]
    # assign() builds a new frame instead of writing into the caller's one.
    simplified = dt.assign(cg_name=dt["cg_name"].map(simplify_cgname))
    return simplified.query('cg_name not in @slot_based_candidates')

In [138]:
# Build the working frame from the pristine download instead of from
# `candidates` itself. simplify_cgname removes one name token per
# application, so `candidates = simplify(candidates)` changed the names
# again on every re-run of this cell. Deriving from the original makes the
# cell idempotent; the .copy() keeps the downloaded frame untouched.
candidates = (
    simplify(candidates_original.copy())
    .rename(columns={"cnt": "cg_pool_cnt"})
)
candidates

Unnamed: 0,cg_pool_cnt,variant,cg_name,cg_rank
0,512,sc_extra_16_,popular,7
1,257,sc_extra_30_,popular,9
2,257,sc_extra_29_,popular,9
3,260,sc_extra_11_,popular,0
4,261,sc_extra_30_,popular,8
...,...,...,...,...
749420,4092,sc_ranker_,realTimeLabelD_overall_top {Like:1 Share:3 Vid...,17
749421,4092,sc_ranker_,realTimeLabelD_overall_top {Like:1 Share:3 Vid...,19
749422,45309,control_,realTimeLabelD_overall_top {Like:1 Share:3 Vid...,22
749423,1022,sc_extra_21_,realTimeLabelD_overall_top {Like:1 Share:3 Vid...,2


In [139]:
# Build the working frame from the pristine download instead of from
# `front_events` itself. simplify_cgname removes one name token per
# application, so `front_events = simplify(front_events)` changed the names
# again on every re-run of this cell. Deriving from the original makes the
# cell idempotent; the .copy() keeps the downloaded frame untouched.
front_events = simplify(front_events_original.copy())
front_events

Unnamed: 0,cnt,cg_name,cg_rank,variant,vplay98,isLiked,isShared,isProfileOpened,isFollowed,isDownloaded
0,3,creator-affinity-post,118,,0,0,0,0,0,0
1,1,creator-affinity-post,167,,1,0,0,0,0,0
2,3,creator-affinity-post,141,,0,0,0,0,0,0
3,6,creator-affinity-post,115,,0,0,0,0,0,0
4,1,creator-affinity-post,140,,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
647841,4,realTimeLabelD_overall_top {Like:1 Share:3 Vid...,6,sc_variable_slot_,0,0,0,0,0,0
647842,1,realTimeLabelD_overall_top {Like:1 Share:3 Vid...,2,sc_variable_slot_,0,0,0,0,0,0
647843,2,realTimeLabelD_overall_top {Like:1 Share:3 Vid...,28,sc_variable_slot_,0,0,0,0,0,0
647844,1,realTimeLabelD_overall_top {Like:1 Share:3 Vid...,21,sc_variable_slot_,0,0,0,0,0,0


In [140]:
# Total candidate-pool count per candidate generator, control variant only.
# The aggregation is named by the string "sum" rather than the Python
# builtin: passing the builtin is deprecated in recent pandas and the
# string pins the NaN-skipping GroupBy.sum implementation.
candidates_aggr = (
    candidates[candidates["variant"] == "control_"]
    .groupby(["cg_name"])
    .agg(cg_pool_cnt=pd.NamedAgg(column="cg_pool_cnt", aggfunc="sum"))
)
candidates_aggr

Unnamed: 0_level_0,cg_pool_cnt
cg_name,Unnamed: 1_level_1
audio-id-affinity-manual-post,102552
audio-id-affinity-top-post,3135990
audio-id-trending-post,3495600
cluster-affinity-postv2,1283068
creator-affinity-live,991
creator-affinity-post,4484787
creator-affinity-post-backPress,9924
engaged-creator-affinity-post,2997565
ffm-session-embedding,4518700
ffm-vplay98-tf_recall_24hr_interaction,433000


In [153]:
# Candidate-pool count per (generator, rank) for the control variant, plus
# a running total over ranks within each generator.
# "sum" is passed as a string: the Python builtin as aggfunc is deprecated
# in recent pandas and the string pins the GroupBy.sum implementation.
candidates_aggr2 = (
    candidates[candidates["variant"] == "control_"]
    .groupby(["cg_name", "cg_rank"])
    .agg(cg_pool_cnt=pd.NamedAgg(column="cg_pool_cnt", aggfunc="sum"))
)
# groupby sorts its keys by default, so the cumulative sum walks each
# generator's ranks in ascending order.
candidates_aggr2["cg_pool_cumsum"] = candidates_aggr2.groupby("cg_name")["cg_pool_cnt"].cumsum()
candidates_aggr2

Unnamed: 0_level_0,Unnamed: 1_level_0,cg_pool_cnt,cg_pool_cumsum
cg_name,cg_rank,Unnamed: 2_level_1,Unnamed: 3_level_1
audio-id-affinity-manual-post,100,1572,1572
audio-id-affinity-manual-post,101,1643,3215
audio-id-affinity-manual-post,102,1708,4923
audio-id-affinity-manual-post,103,1744,6667
audio-id-affinity-manual-post,104,1765,8432
...,...,...,...
similar-engaged-creator-affinity-post,515,5,5131982
similar-engaged-creator-affinity-post,516,5,5131987
similar-engaged-creator-affinity-post,517,4,5131991
similar-engaged-creator-affinity-post,518,3,5131994


In [141]:
# Number of liked front events per candidate generator, control variant only.
# "sum" is passed as a string: the Python builtin as aggfunc is deprecated
# in recent pandas and the string pins the GroupBy.sum implementation.
front_events_aggr = (
    front_events[(front_events["variant"] == "control_") & (front_events["isLiked"] == 1)]
    .groupby(["cg_name"])
    .agg(front_events_cnt=pd.NamedAgg(column="cnt", aggfunc="sum"))
)
front_events_aggr

Unnamed: 0_level_0,front_events_cnt
cg_name,Unnamed: 1_level_1
audio-id-affinity-manual-post,5
audio-id-affinity-top-post,166
audio-id-trending-post,44
cluster-affinity-postv2,206
creator-affinity-post,1256
engaged-creator-affinity-post,640
ffm-session-embedding,2047
ffm-vplay98-tf_recall_24hr_interaction,218
ffm-weighted-play-realtimecosine-[1 1 1 1 1 1],9
ffm-weighted-vplay98-tf_recall_multilingual_durationsampling_24hr_60d-[1 1],242


In [142]:
# Per-generator comparison of liked-event share against candidate-pool share.
# The outer join keeps generators that appear on only one side, so both
# count columns may contain NaN.
dt = front_events_aggr.join(candidates_aggr, how="outer")
# Series.sum()/Series.max() skip NaN. The Python builtin max() does not:
# its result depends on row order and is NaN whenever the first row is NaN,
# which would turn the whole normalised column into NaN.
dt["front_events_share"] = dt["front_events_cnt"] / dt["front_events_cnt"].sum()
dt["cg_pool_share"] = dt["cg_pool_cnt"] / dt["cg_pool_cnt"].sum()
dt["front_event_per_cg_slot"] = dt["front_events_cnt"] / dt["cg_pool_cnt"]
dt["front_event_per_cg_slot_normalized"] = dt["front_event_per_cg_slot"] / dt["front_event_per_cg_slot"].max()
dt = dt.sort_values(by="front_event_per_cg_slot_normalized", ascending=False)
pd.set_option("display.max_rows", 200)
with pd.option_context('display.float_format', '{:,.4f}'.format):
    display(dt[["front_events_share", "cg_pool_share", "front_event_per_cg_slot_normalized"]])

Unnamed: 0_level_0,front_events_share,cg_pool_share,front_event_per_cg_slot_normalized
cg_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sharecone-weighted-interaction_2kth_3day-[1],0.3856,0.2074,1.0
ffm-weighted-vplay98-tf_recall_multilingual_durationsampling_24hr_60d-[1 1],0.0055,0.0039,0.7468
sharecone-weighted-multilingual_2kth_3d_plc-[1 1],0.2721,0.2072,0.7061
ffm-vplay98-tf_recall_24hr_interaction,0.0049,0.0042,0.6377
ffm-session-embedding,0.0462,0.0433,0.5738
interest-creator-affinity-post,0.0216,0.024,0.4842
sharecone-weighted-realtimecosine_150k-[1 1 1 1 1 1],0.163,0.212,0.4134
creator-affinity-post,0.0284,0.043,0.3548
engaged-creator-affinity-post,0.0145,0.0288,0.2705
similar-engaged-creator-affinity-post,0.0219,0.0492,0.2389


In [156]:
# Liked front events per (generator, rank) for the control variant, plus a
# running total over ranks within each generator.
# "sum" is passed as a string: the Python builtin as aggfunc is deprecated
# in recent pandas and the string pins the GroupBy.sum implementation.
front_events_aggr2 = (
    front_events[(front_events["variant"] == "control_") & (front_events["isLiked"] == 1)]
    .groupby(["cg_name", "cg_rank"])
    .agg(front_events_cnt=pd.NamedAgg(column="cnt", aggfunc="sum"))
)
# groupby sorts its keys by default, so the cumulative sum walks each
# generator's ranks in ascending order.
front_events_aggr2["front_events_cumsum"] = front_events_aggr2.groupby("cg_name")["front_events_cnt"].cumsum()
front_events_aggr2

Unnamed: 0_level_0,Unnamed: 1_level_0,front_events_cnt,front_events_cumsum
cg_name,cg_rank,Unnamed: 2_level_1,Unnamed: 3_level_1
audio-id-affinity-manual-post,110,1,1
audio-id-affinity-manual-post,135,1,2
audio-id-affinity-manual-post,140,1,3
audio-id-affinity-manual-post,165,1,4
audio-id-affinity-manual-post,170,1,5
...,...,...,...
similar-engaged-creator-affinity-post,418,1,964
similar-engaged-creator-affinity-post,422,1,965
similar-engaged-creator-affinity-post,423,1,966
similar-engaged-creator-affinity-post,447,1,967


In [201]:
# Rank-level view: cumulative liked events vs. cumulative pool size for each
# (generator, rank) pair present on both sides of the join.
dt2 = front_events_aggr2.join(candidates_aggr2, how="inner")
# NOTE(review): the cumulative sums were computed before the inner join (over
# all ranks), while these denominators only add up the rows that survived
# it -- confirm that this is the intended base for the shares.
dt2["cg_pool_cumshare"] = dt2["cg_pool_cumsum"] / dt2["cg_pool_cnt"].sum()
pd.set_option("display.max_rows", 200)
# Fixed: the denominator used to be the sum of the *cumulative* sums, which
# is not a total of anything; it now mirrors cg_pool_cumshare above and
# divides by the summed per-rank counts.
dt2["front_events_share"] = dt2["front_events_cumsum"] / dt2["front_events_cnt"].sum()
dt2["front_event_per_cg_slot"] = dt2["front_events_cumsum"] / dt2["cg_pool_cumsum"]
# Series.max() skips NaN, so no explicit dropna() is needed.
dt2["front_event_per_cg_slot_normalized"] = dt2["front_event_per_cg_slot"] / dt2["front_event_per_cg_slot"].max()
# Read cg_rank straight from the index level instead of a
# reset_index()/set_index() round-trip; +1 turns the rank into a position
# count so rank 0 does not divide by zero.
dt2["cg_pool_cumsum_per_pos"] = dt2["cg_pool_cumsum"] / (dt2.index.get_level_values("cg_rank").to_numpy() + 1)
report_columns = ["front_events_cumsum", "cg_pool_cumsum", "cg_pool_cumshare", "front_event_per_cg_slot_normalized", "cg_pool_cumsum_per_pos"]
with pd.option_context('display.float_format', '{:,.4f}'.format):
    display(dt2[report_columns])
dt2[report_columns].to_csv("results2.csv", sep="\t", float_format="%.3f")


Unnamed: 0_level_0,Unnamed: 1_level_0,front_events_cumsum,cg_pool_cumsum,cg_pool_cumshare,front_event_per_cg_slot_normalized,cg_pool_cumsum_per_pos
cg_name,cg_rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
audio-id-affinity-manual-post,110,1,19264,0.0002,0.0150,173.5495
audio-id-affinity-manual-post,135,2,45296,0.0005,0.0127,333.0588
audio-id-affinity-manual-post,140,3,51483,0.0005,0.0168,365.1277
audio-id-affinity-manual-post,165,4,70260,0.0007,0.0164,423.2530
audio-id-affinity-manual-post,170,5,74880,0.0008,0.0193,437.8947
...,...,...,...,...,...,...
similar-engaged-creator-affinity-post,418,964,5102263,0.0534,0.0545,12177.2387
similar-engaged-creator-affinity-post,422,965,5105858,0.0534,0.0546,12070.5863
similar-engaged-creator-affinity-post,423,966,5106693,0.0534,0.0546,12044.0873
similar-engaged-creator-affinity-post,447,967,5121432,0.0536,0.0545,11431.7679


In [205]:
# For every generator keep the normalised ratio found in its last row of
# dt2, then rank generators by that value (highest first).
dt3 = dt2.groupby(level="cg_name").agg(
    last_front_event_per_cg_slot_normalized=(
        "front_event_per_cg_slot_normalized",
        lambda ratios: ratios.iloc[-1],
    ),
)
dt3 = dt3.sort_values(by="last_front_event_per_cg_slot_normalized", ascending=False)
dt3

Unnamed: 0_level_0,last_front_event_per_cg_slot_normalized
cg_name,Unnamed: 1_level_1
sharecone-weighted-interaction_2kth_3day-[1],0.227885
ffm-weighted-vplay98-tf_recall_multilingual_durationsampling_24hr_60d-[1 1],0.171203
sharecone-weighted-multilingual_2kth_3d_plc-[1 1],0.160915
ffm-vplay98-tf_recall_24hr_interaction,0.155271
ffm-session-embedding,0.130768
interest-creator-affinity-post,0.11034
sharecone-weighted-realtimecosine_150k-[1 1 1 1 1 1],0.094218
creator-affinity-post,0.080954
ffm-weighted-play-realtimecosine-[1 1 1 1 1 1],0.063981
engaged-creator-affinity-post,0.061853


In [209]:
# Put the generator-level ratio (dt) next to the last-rank cumulative ratio
# (dt3), rescaling the latter so that its largest value is 1.
dt4 = dt.join(dt3, how="inner").assign(
    last_front_event_per_cg_slot_normalized=lambda joined: (
        joined["last_front_event_per_cg_slot_normalized"]
        / max(joined["last_front_event_per_cg_slot_normalized"].dropna())
    )
)
pd.set_option("display.max_rows", 200)
with pd.option_context('display.float_format', '{:,.4f}'.format):
    display(dt4[["front_events_share", "cg_pool_share", "front_event_per_cg_slot_normalized", "last_front_event_per_cg_slot_normalized"]])
    
    
    


Unnamed: 0_level_0,front_events_share,cg_pool_share,front_event_per_cg_slot_normalized,last_front_event_per_cg_slot_normalized
cg_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sharecone-weighted-interaction_2kth_3day-[1],0.3856,0.2074,1.0,1.0
ffm-weighted-vplay98-tf_recall_multilingual_durationsampling_24hr_60d-[1 1],0.0055,0.0039,0.7468,0.7513
sharecone-weighted-multilingual_2kth_3d_plc-[1 1],0.2721,0.2072,0.7061,0.7061
ffm-vplay98-tf_recall_24hr_interaction,0.0049,0.0042,0.6377,0.6814
ffm-session-embedding,0.0462,0.0433,0.5738,0.5738
interest-creator-affinity-post,0.0216,0.024,0.4842,0.4842
sharecone-weighted-realtimecosine_150k-[1 1 1 1 1 1],0.163,0.212,0.4134,0.4134
creator-affinity-post,0.0284,0.043,0.3548,0.3552
engaged-creator-affinity-post,0.0145,0.0288,0.2705,0.2714
similar-engaged-creator-affinity-post,0.0219,0.0492,0.2389,0.239
