In [38]:
import pandas as pd
import numpy as np
import json
import re
from pyathena import connect
from pyathena.pandas.util import as_pandas
from pyathena.pandas.cursor import PandasCursor
import multiprocessing
import concurrent

In [39]:
client_df = pd.read_excel("./Tobii underlag.xlsx", converters={"Placement Id": int})
client_placement_ids = list(client_df["Placement Id"].unique())

In [40]:
def run_query(placement_id):
#for i, placement_id in enumerate(client_placement_ids):
    #if i % 10 == 0:
    #    print(i)
    cursor.execute(f'''SELECT 
        "impression_id",
        "is_fixated",
        "is_iab_inview",
        "hostname",
        {placement_id} as pid,
        CONCAT(cast(width as VARCHAR), 'x', cast(height as VARCHAR)) AS size
    FROM "data_pipelines_storage_nomarket_prod"."extension_impression"
    WHERE regexp_like(json_format(cast(transform(html_metadata, x -> ROW(x."inline_scripts", x."non_standard_attributes")) as JSON)), '(\D)({placement_id})(\D)')
    ''')

    return as_pandas(cursor)

In [None]:
num_cores = multiprocessing.cpu_count()
total_count = len(client_placement_ids)
cursor = connect(s3_staging_dir="s3://adhoc-athena-output-bucket-eu-north-1/",
                 region_name="eu-north-1", profile_name="atexprodadminsso", cursor_class=PandasCursor).cursor()
impressions_list = list()

# try to execute in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=num_cores) as executor:
    for i, result in enumerate(executor.map(run_query, client_placement_ids), 1):
        print(f"Processing {i} out of {total_count}")
        impressions_list.append(result)

impressions_df = pd.concat(impressions_list)

Processing 1 out of 2441
Processing 2 out of 2441
Processing 3 out of 2441
Processing 4 out of 2441
Processing 5 out of 2441
Processing 6 out of 2441
Processing 7 out of 2441
Processing 8 out of 2441
Processing 9 out of 2441
Processing 10 out of 2441
Processing 11 out of 2441
Processing 12 out of 2441
Processing 13 out of 2441


In [24]:
impressions_df

Unnamed: 0,impression_id,is_fixated,is_iab_inview,pid,size
0,39e90448-581c-47bc-b79a-a4096b0430a2,False,True,23767473,402x142
1,001b06e1-8444-450a-84f4-429b531da19b,False,True,23767473,1536x864
2,ba2e910b-9b12-4ee6-854c-56a812e832c9,False,True,23767473,1536x864
3,640b32e1-1658-4c28-bd96-da225976f824,False,True,23767473,402x148
4,40cd748f-799b-4456-9f46-162e4962cfad,False,True,23767473,1536x864
...,...,...,...,...,...
8,ad8c0ed5-992c-43d3-8d84-89c2659a2cbe,False,True,19499533,250x604
9,fab8f56e-5490-4ab1-a18f-ad61467b9c6e,False,False,19499533,160x604
10,68176dc9-393f-49b3-b715-219d5a390f44,False,False,19499533,250x608
11,38ba60ab-beb9-4e45-b4e6-44a853027f8e,False,False,19499533,250x604


In [4]:
def extract_pid(placement_id):
    placement_id = json.loads(placement_id)
    bam_ad_slots = placement_id["bam_ad_slot"]
    final_bam_ad_slots = [bas for bas in bam_ad_slots if bas.strip() != '']
    try:
        ret = placement_id['tag_id'][0]
    except:
        return
    ret += ','.join([bas for bas in final_bam_ad_slots])
    return ret

def pid_type(pid):
    try:
        int(pid)
        return 'int'
    except:
        return 'str'

#impressions_df['pid'] = impressions_df['placement_ids'].apply(extract_pid)
#impressions_df['pid_type'] = impressions_df['pid'].apply(pid_type)
#impressions_df = impressions_df[impressions_df["pid_type"] == 'int']
#impressions_df['pid'] = impressions_df['pid'].apply(lambda pid: int(pid))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  impressions_df['pid'] = impressions_df['pid'].apply(lambda pid: int(pid))


In [5]:
#filtered_impressions = impressions_df[impressions_df["pid"].isin(client_placement_ids)]
#filtered_impressions.id.count()

228365

In [6]:
def _get_pred_map():
    pred_map = dict()
    _year = 2023
    _range1 = 1
    _range2 = 9
    for i in range(_range1, _range2):
        if i < 10:
            i = f'0{i}'
        for day in os.listdir(f'../../predictions/{_year}/{i}'):
            for h in os.listdir(f'../../predictions/{_year}/{i}/{day}'):
                if not h.endswith('.ndjson'):
                    continue
                with open(f'../../predictions/{_year}/{i}/{day}/{h}')as f:
                    for line in f.readlines():
                        json_line = json.loads(line)
                        id_ = json_line['id']
                        pred_map[id_] = json_line['prediction']
    return pred_map
                            
pred_map = _get_pred_map()

In [7]:
def _get_final_fixation(gaze_valid, is_fixated, id_, pred_map):
    if gaze_valid:
        return is_fixated
    if id_ in pred_map:
        return pred_map[id_]
    return False

filtered_impressions['final_fixation'] = filtered_impressions.apply(lambda row: _get_final_fixation(row['gaze_valid'], row['is_fixated'], row['id'], pred_map), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_impressions['final_fixation'] = filtered_impressions.apply(lambda row: _get_final_fixation(row['gaze_valid'], row['is_fixated'], row['id'], pred_map), axis=1)


In [8]:
filtered_impressions["true_is_iab_inview"] = filtered_impressions.apply(
    lambda x: 
    True if (
        (x["ad_technical_format"] == "out-stream") & (x["exist_viewable_1_s_threshold_50"] == True) & (x["exist_viewable_2_s_threshold_50"] == False)
    ) | (x["is_iab_inview"] == True) 
    else False, 
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_impressions["true_is_iab_inview"] = filtered_impressions.apply(


In [9]:
grouped_df = filtered_impressions.groupby(["hostname", "pid", "size"]).apply(
    lambda x: pd.Series(
        {
            "impressions": x["id"].count(),
            "fixations": x.loc[x["final_fixation"], "id"].count(),
            "inview": x.loc[x["true_is_iab_inview"], "id"].count()
        }
    )
)
grouped_df = grouped_df.reset_index()

In [29]:
grouped_df = impressions_df.groupby(["pid", "size", "hostname"]).apply(
    lambda x: pd.Series(
        {
            "impressions": x["impression_id"].count(),
            "fixations": x.loc[x["is_fixated"], "impression_id"].count(),
            "inview": x.loc[x["is_iab_inview"], "impression_id"].count()
        }
    )
)
grouped_df = grouped_df.reset_index()

In [35]:
grouped_df["fixation ratio %"] = round(grouped_df["fixations"] / grouped_df["impressions"] * 100, 2).fillna(0).replace([np.inf, -np.inf], 0)
grouped_df["inview ratio %"] = round(grouped_df["inview"] / grouped_df["impressions"] * 100, 2).fillna(0).replace([np.inf, -np.inf], 0)
grouped_df["fixation/inview ratio %"] = round(grouped_df["fixations"] / grouped_df["inview"] * 100, 2).fillna(0).replace([np.inf, -np.inf], 0)
grouped_df

Unnamed: 0,pid,size,impressions,fixations,inview,fixation ratio %,inview ratio %,fixation/inview ratio %
0,19499407,300x604,1,1,1,100.0,100.0,100.0
1,19499407,5x5,2,1,2,50.0,100.0,50.0
2,19499410,640x320,1,1,0,100.0,0.0,0.0
3,19499411,640x320,1,1,0,100.0,0.0,0.0
4,19499417,160x604,1,0,1,0.0,100.0,0.0
...,...,...,...,...,...,...,...,...
70,23373681,640x324,2,0,0,0.0,0.0,0.0
71,23375557,640x324,2,0,0,0.0,0.0,0.0
72,23767473,1536x864,3,0,3,0.0,100.0,0.0
73,23767473,402x142,1,0,1,0.0,100.0,0.0


In [36]:
merged_df = pd.merge(client_df, grouped_df, left_on=["Placement Id", "Size", "Hostname"], right_on=["pid", "size", "hostname"], how="inner")
merged_df

Unnamed: 0,Seller Name,Publisher Name,Placement Id,Size,Hostname,pid,size,impressions,fixations,inview,fixation ratio %,inview ratio %,fixation/inview ratio %
0,Schibsted Sweden,SE-Aftonbladet,19499410,640x320,---,19499410,640x320,1,1,0,100.0,0.0,0.0
1,Schibsted Sweden,SE-Aftonbladet,19499410,640x320,aftonbladet.se,19499410,640x320,1,1,0,100.0,0.0,0.0
2,Schibsted Sweden,SE-Aftonbladet,19499410,640x320,nutritiouspanda.com,19499410,640x320,1,1,0,100.0,0.0,0.0
3,Schibsted Sweden,SE-Aftonbladet,19499410,640x320,play.google.com/store/apps/details?id=com.afto...,19499410,640x320,1,1,0,100.0,0.0,0.0
4,Schibsted Sweden,SE-Aftonbladet,19499410,640x320,play.google.com/store/apps/details?id=se.afton...,19499410,640x320,1,1,0,100.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,Schibsted Sweden,SE-Aftonbladet,23373680,640x320,---,23373680,640x320,1,0,1,0.0,100.0,0.0
89,Schibsted Sweden,SE-Aftonbladet,23373680,640x320,aftonbladet.se,23373680,640x320,1,0,1,0.0,100.0,0.0
90,Schibsted Sweden,SE-Aftonbladet,23373680,640x320,play.google.com/store/apps/details?id=com.afto...,23373680,640x320,1,0,1,0.0,100.0,0.0
91,Schibsted Sweden,SE-Aftonbladet,23373680,640x320,play.google.com/store/apps/details?id=se.afton...,23373680,640x320,1,0,1,0.0,100.0,0.0


In [37]:
merged_df.sort_values("impressions", ascending=False)

Unnamed: 0,Seller Name,Publisher Name,Placement Id,Size,Hostname,pid,size,impressions,fixations,inview,fixation ratio %,inview ratio %,fixation/inview ratio %
69,Schibsted Sweden,SE-Aftonbladet,19499441,980x240,---,19499441,980x240,24,0,22,0.0,91.67,0.0
70,Schibsted Sweden,SE-Aftonbladet,19499441,980x240,aftonbladet.se,19499441,980x240,24,0,22,0.0,91.67,0.0
17,Schibsted Sweden,SE-Aftonbladet,19499420,300x600,aftonbladet.se,19499420,300x600,13,0,1,0.0,7.69,0.0
16,Schibsted Sweden,SE-Aftonbladet,19499420,300x600,---,19499420,300x600,13,0,1,0.0,7.69,0.0
74,Schibsted Sweden,SE-Aftonbladet,19499442,250x600,aftonbladet.se,19499442,250x600,12,0,11,0.0,91.67,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,Schibsted Sweden,SE-Aftonbladet,19499436,640x320,aftonbladet.se,19499436,640x320,1,0,0,0.0,0.00,0.0
59,Schibsted Sweden,SE-Aftonbladet,19499436,640x320,play.google.com/store/apps/details?id=com.afto...,19499436,640x320,1,0,0,0.0,0.00,0.0
60,Schibsted Sweden,SE-Aftonbladet,19499436,640x320,play.google.com/store/apps/details?id=se.afton...,19499436,640x320,1,0,0,0.0,0.00,0.0
61,Schibsted Sweden,SE-Aftonbladet,19499437,640x320,---,19499437,640x320,1,0,0,0.0,0.00,0.0
