In [5]:
import pandas as pd
import numpy as np
import json
import os
import re
from pyathena import connect
from pyathena.pandas.util import as_pandas
from pyathena.pandas.cursor import PandasCursor

In [6]:
cursor = connect(s3_staging_dir="s3://aws-athena-query-results-094611745175-eu-west-1/",
                 region_name="eu-west-1", profile_name="atexprodadminsso", cursor_class=PandasCursor).cursor()

impressions_df = cursor.execute(f''' 
select 
    "impression_model"."id", 
    "placement_ids", 
    "placement_ids_chosen", 
    "total_fixation_duration", 
    "ad_technical_format",
    "is_fixated", 
    "gaze_valid", 
    "is_iab_inview", 
    "exist_viewable_1_s_threshold_50", 
    "exist_viewable_2_s_threshold_50", 
    CONCAT(cast(ad_width_chosen as VARCHAR), 'x', cast(ad_height_chosen as VARCHAR)) AS size
from "prod_attentionpanel_com_eu_west_1"."impression_model"
where impression_model.part_year = '2023' and gaze_valid = True
''').as_pandas()
#  and impression_model.part_month in ('09')

In [7]:
def extract_pid(placement_id):
    placement_id = json.loads(placement_id)
    bam_ad_slots = placement_id["bam_ad_slot"]
    final_bam_ad_slots = [bas for bas in bam_ad_slots if bas.strip() != '']
    try:
        ret = placement_id['tag_id'][0]
    except:
        return
    ret += ','.join([bas for bas in final_bam_ad_slots])
    return ret

def pid_type(pid):
    try:
        int(pid)
        return 'int'
    except:
        return 'str'

impressions_df['pid'] = impressions_df['placement_ids'].apply(extract_pid)
impressions_df['pid_type'] = impressions_df['pid'].apply(pid_type)
impressions_df = impressions_df[impressions_df["pid_type"] == 'int']
impressions_df['pid'] = impressions_df['pid'].apply(lambda pid: int(pid))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  impressions_df['pid'] = impressions_df['pid'].apply(lambda pid: int(pid))


In [8]:
impressions_df["true_is_iab_inview"] = impressions_df.apply(
    lambda x: 
    True if (
        (x["ad_technical_format"] == "out-stream") & (x["exist_viewable_1_s_threshold_50"] == True) & (x["exist_viewable_2_s_threshold_50"] == False)
    ) | (x["is_iab_inview"] == True) 
    else False, 
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  impressions_df["true_is_iab_inview"] = impressions_df.apply(


In [16]:
impressions_df["is_fixated"].value_counts()

False    337358
True     120202
Name: is_fixated, dtype: int64

In [25]:
grouped_df = impressions_df.groupby(["pid", "size"]).apply(
    lambda x: pd.Series(
        {
            "imps": x["id"].count(),
            "fixations": x.loc[x["is_fixated"], "id"].count(),
            "inview": x.loc[x["true_is_iab_inview"], "id"].count()
        }
    )
)
grouped_df = grouped_df.reset_index()
final_df = grouped_df.sort_values(["imps"], ascending=False).head(100)

In [26]:
final_df["fixation ratio"] = round(final_df["fixations"] / grouped_df["imps"] * 100, 2)
final_df["inview ratio"] = round(final_df["inview"] / grouped_df["imps"] * 100, 2)
final_df["fixation/inview ratio"] = round(final_df["fixations"] / grouped_df["inview"] * 100, 2)

final_df["fixation ratio"] = final_df["fixation ratio"].replace([np.inf, -np.inf], 0)
final_df["fixation ratio"] = final_df["fixation ratio"].fillna(0)
final_df["inview ratio"] = final_df["inview ratio"].replace([np.inf, -np.inf], 0)
final_df["inview ratio"] = final_df["inview ratio"].fillna(0)
final_df["fixation/inview ratio"] = final_df["fixation/inview ratio"].replace([np.inf, -np.inf], 0)
final_df["fixation/inview ratio"] = final_df["fixation/inview ratio"].fillna(0)

In [27]:
final_df.to_excel("./sample.xlsx", index=False)