In [None]:
import pandas as pd
import numpy as np

In [None]:
# 2022: load the four quarterly impression exports.
impressions_df_q1 = pd.read_csv("../datasets/asets/tasets/impressions/2022/impressions_q1.csv")
impressions_df_q2 = pd.read_csv("../datasets/asets/tasets/impressions/2022/impressions_q2.csv")
impressions_df_q3 = pd.read_csv("../datasets/asets/tasets/impressions/2022/impressions_q3.csv")
impressions_df_q4 = pd.read_csv("../datasets/asets/tasets/impressions/2022/impressions_q4.csv")

# Stack the quarters into one frame for the whole year.
# ignore_index=True: every quarterly frame comes out of read_csv with its own
# 0..n-1 index, so a plain concat would repeat each label up to four times and
# make any label-based lookup or alignment ambiguous.
impressions_df = pd.concat(
    [impressions_df_q1, impressions_df_q2, impressions_df_q3, impressions_df_q4],
    ignore_index=True,
)

In [None]:
# Inspect rows that have valid gaze data and are flagged as fixated, yet report
# a total fixation duration below 1.
impressions_df.loc[
    lambda df: df["gaze_valid"].eq(True)
    & df["is_fixated"].eq(True)
    & df["total_fixation_duration"].lt(1)
]

In [None]:
# Only use real data, ignore predictions: keep the rows whose gaze_valid flag is True.
# .copy() turns the filtered result into an independent frame. Later cells add
# columns to it and overwrite values through .loc; doing that on a bare
# boolean-filter result triggers pandas' SettingWithCopyWarning.
impressions_df = impressions_df[impressions_df["gaze_valid"] == True].copy()

In [None]:
# Calculate the "true" IAB in-view flag according to Schibsted. A row is in view when
#   * it is already flagged is_iab_inview, or
#   * it is an out-stream ad whose exist_viewable_1_s_threshold_50 flag is True while
#     its exist_viewable_2_s_threshold_50 flag is False.
# Computed column-wise: the rule only combines element-wise comparisons, so there is
# no need for a row-by-row DataFrame.apply (slow on a full year of impressions, and
# it does not yield a usable column on an empty frame). Missing values compare as
# False in both comparisons, exactly as they did in the per-row version.
impressions_df["true_is_iab_inview"] = (
    (
        (impressions_df["ad_technical_format"] == "out-stream")
        & (impressions_df["exist_viewable_1_s_threshold_50"] == True)
        & (impressions_df["exist_viewable_2_s_threshold_50"] == False)
    )
    | (impressions_df["is_iab_inview"] == True)
)

In [None]:
# Normalise outliers for fixation duration: nothing may exceed 10000 (10 seconds).
impressions_df["total_fixation_duration"] = impressions_df["total_fixation_duration"].clip(upper=10000)

In [None]:
# run through predictions for 2022
import os
import json

def _get_pred_map(base_dir='../predictions/2022', months=range(1, 13)):
    """Collect the predictions stored as NDJSON files under ``base_dir``.

    Expected layout: ``<base_dir>/<MM>/<day>/<name>.ndjson`` where ``MM`` is the
    zero-padded month number. Every non-blank line of every ``.ndjson`` file is
    parsed as a JSON object and its ``prediction`` value is stored under its ``id``.
    Folders and files are visited in sorted order, so when an id occurs more than
    once the entry from the last file in that order wins.

    Args:
        base_dir: Directory that holds the month folders. Defaults to the 2022 tree.
        months: Iterable of integer month numbers to read. Defaults to 1..12.

    Returns:
        dict mapping each record's ``id`` to its ``prediction``.

    Raises:
        FileNotFoundError: if the folder of a requested month does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``id`` or ``prediction``.
    """
    pred_map = {}

    for month in months:
        # Month folders are named 01..12.
        month_dir = os.path.join(base_dir, f'{month:02d}')
        for day in sorted(os.listdir(month_dir)):
            day_dir = os.path.join(month_dir, day)
            if not os.path.isdir(day_dir):
                # Stray file next to the day folders; listing it would raise.
                continue
            for file_name in sorted(os.listdir(day_dir)):
                if not file_name.endswith('.ndjson'):
                    continue
                # Explicit UTF-8 (the JSON default) instead of the platform locale.
                with open(os.path.join(day_dir, file_name), encoding='utf-8') as f:
                    # Stream line by line rather than loading the whole file.
                    for line in f:
                        if not line.strip():
                            # Blank line (e.g. trailing newline) carries no record.
                            continue
                        record = json.loads(line)
                        pred_map[record['id']] = record['prediction']
    return pred_map
                            
# Build the id -> prediction lookup once. This walks every .ndjson file under
# ../predictions/2022, so the cell may take a while to run.
pred_map = _get_pred_map()

In [None]:
def _get_final_fixation(is_iab_inview, gaze_valid, is_fixated, id_, pred_map):
    """Pick the fixation flag to report for one impression.

    * ``gaze_valid`` truthy: the measured ``is_fixated`` value is returned unchanged.
    * otherwise, if ``id_`` has an entry in ``pred_map`` and ``is_iab_inview`` is
      truthy: that entry of ``pred_map`` is returned.
    * otherwise: ``False``.

    Open question kept from the original author - do we need to remove rows with
        gaze_valid = True (eye tracker configured/enabled)
        is_fixated = True (gaze coordinates and ad overlap)
        is_iab_inview = False (for example, only 40% of the ad is visible)
    """
    if not gaze_valid:
        # No measured gaze: fall back to the prediction, but only for in-view ads.
        prediction_usable = id_ in pred_map and is_iab_inview
        return pred_map[id_] if prediction_usable else False
    return is_fixated

# Resolve the final fixation flag of every impression.
# The four needed columns are iterated in lockstep instead of going through
# DataFrame.apply(axis=1): apply builds a Series object for every row, which is
# far slower on a full year of impressions, and it does not produce a usable
# column when the frame is empty.
impressions_df['final_fixation'] = [
    _get_final_fixation(is_iab_inview, gaze_valid, is_fixated, id_, pred_map)
    for is_iab_inview, gaze_valid, is_fixated, id_ in zip(
        impressions_df['true_is_iab_inview'],
        impressions_df['gaze_valid'],
        impressions_df['is_fixated'],
        impressions_df['id'],
    )
]


In [None]:
# Mean fixation duration per technical format. These values are used to fill in
# the durations that are missing (0) for predicted fixations.
def _mean_nonzero_fixation_duration(frame, technical_format):
    """Mean total_fixation_duration over the fixated rows of one technical format,
    leaving out rows whose duration is exactly 0."""
    selected = (
        (frame["is_fixated"] == True)
        & (frame["ad_technical_format"] == technical_format)
        & (frame["total_fixation_duration"] != 0)
    )
    return frame.loc[selected, "total_fixation_duration"].mean()


display_total_fixation_duration_mean = _mean_nonzero_fixation_duration(impressions_df, "display")
out_stream_total_fixation_duration_mean = _mean_nonzero_fixation_duration(impressions_df, "out-stream")
in_stream_total_fixation_duration_mean = _mean_nonzero_fixation_duration(impressions_df, "in-stream")

In [None]:
# Show the three per-format means, one per line, rounded to four decimals.
print(
    f"Display: {round(display_total_fixation_duration_mean, 4)}",
    f"Out-stream: {round(out_stream_total_fixation_duration_mean, 4)}",
    f"In-stream: {round(in_stream_total_fixation_duration_mean, 4)}",
    sep="\n",
)

In [None]:
# When a fixation comes from a prediction its recorded fixation time is 0, so it is
# replaced by the mean fixation time of the matching technical format.
predicted_without_duration = (
    (impressions_df["final_fixation"] == True)
    & (impressions_df["gaze_valid"] == False)
    & (impressions_df["total_fixation_duration"] == 0)
)
# The formats are mutually exclusive, so the shared part of the mask can be
# computed once before any value is overwritten.
for technical_format, format_mean in (
    ("display", display_total_fixation_duration_mean),
    ("in-stream", in_stream_total_fixation_duration_mean),
    ("out-stream", out_stream_total_fixation_duration_mean),
):
    impressions_df.loc[
        predicted_without_duration & (impressions_df["ad_technical_format"] == technical_format),
        "total_fixation_duration",
    ] = format_mean

In [None]:
def _summarise_group(group):
    """Reduce the rows of one (ad_technical_format, channel) group to its counters:
    non-null id counts overall / fixated / in view / true in view, plus the summed
    fixation time over all rows and over fixated rows (both rounded to 4 decimals)."""
    fixated_rows = group[group["is_fixated"] == True]
    in_view_rows = group[group["is_iab_inview"] == True]
    true_in_view_rows = group[group["true_is_iab_inview"] == True]
    return pd.Series({
        "impression_count": group["id"].count(),
        "fixation_count": fixated_rows["id"].count(),
        "in_view_count": in_view_rows["id"].count(),
        "true_in_view_count": true_in_view_rows["id"].count(),
        "fixation_time_total": round(group["total_fixation_duration"].sum(), 4),
        "fixation_time_fixated": round(fixated_rows["total_fixation_duration"].sum(), 4),
    })


# One row per technical format and channel, with the group keys as ordinary columns.
grouped_df = (
    impressions_df
    .groupby(["ad_technical_format", "channel"])
    .apply(_summarise_group)
    .reset_index()
)
grouped_df

In [None]:
# Export the raw per-format / per-channel counters; index=False leaves the
# DataFrame index out of the sheet.
grouped_df.to_excel("../final_reports/2022_benchmark_gaze_valid.xlsx", index=False)

In [None]:
# Work on a copy so that columns added to final_df do not show up in grouped_df.
final_df = grouped_df.copy()

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is 0.

    A group without any fixation (or without any in-view impression) has a 0
    denominator; dividing plain Python floats by it can raise ZeroDivisionError
    and abort the whole report.
    """
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def _as_ms(total, count):
    """Format total / count as '<value rounded to 4 decimals> ms'."""
    return f'{round(_safe_ratio(float(total), count), 4)} ms'


def _as_percent(part, whole):
    """Format part / whole as '<percentage rounded to 0 decimals> %'."""
    return f'{round(_safe_ratio(part, whole) * 100, 0)} %'


# Average fixation time per fixated impression, based on the time summed over all
# rows and on the time summed over fixated rows only.
final_df["avg_fixation_time_total"] = [
    _as_ms(total, count)
    for total, count in zip(final_df["fixation_time_total"], final_df["fixation_count"])
]
final_df["avg_fixation_time_fixated"] = [
    _as_ms(total, count)
    for total, count in zip(final_df["fixation_time_fixated"], final_df["fixation_count"])
]

# Share of impressions that were (true) in view.
final_df["inview_impression_ratio"] = [
    _as_percent(part, whole)
    for part, whole in zip(final_df["true_in_view_count"], final_df["impression_count"])
]
# Share of in-view impressions that were fixated.
final_df["inview_fixation_ratio"] = [
    _as_percent(part, whole)
    for part, whole in zip(final_df["fixation_count"], final_df["in_view_count"])
]
# Share of all impressions that were fixated.
final_df["fixation_ratio"] = [
    _as_percent(part, whole)
    for part, whole in zip(final_df["fixation_count"], final_df["impression_count"])
]

final_df

In [None]:
# Columns of the final report, in export order.
# NOTE(review): avg_fixation_time_total is not part of this list - confirm that
# leaving it out of the export is intended.
report_columns = [
    "ad_technical_format",
    "channel",
    "impression_count",
    "inview_impression_ratio",
    "fixation_count",
    "fixation_time_total",
    "avg_fixation_time_fixated",
    "fixation_time_fixated",
    "in_view_count",
    "inview_fixation_ratio",
    "fixation_ratio",
]
final_df = final_df.loc[:, report_columns]

# startrow=16 places the table below the first 16 rows of the sheet.
final_df.to_excel("../final_reports/2022_benchmark.xlsx", startrow=16, index=False)

In [None]:
#import xlsxwriter
#final_df_2022 = pd.read_excel("../final_reports/2022_benchmark_GAZE_VALID_V6.xlsx")
#final_df_2022_q1 = pd.read_excel("../final_reports/2022_benchmark_GAZE_VALID_V7_q1.xlsx")
#final_df_2022_q2 = pd.read_excel("../final_reports/2022_benchmark_GAZE_VALID_V7_q2.xlsx")
#final_df_2022_q3 = pd.read_excel("../final_reports/2022_benchmark_GAZE_VALID_V7_q3.xlsx")
#final_df_2022_q4 = pd.read_excel("../final_reports/2022_benchmark_GAZE_VALID_V7_q4.xlsx")


#with pd.ExcelWriter('../final_reports/2022_benchmark_v_7.xlsx', engine='xlsxwriter') as writer:
#    final_df_2022.to_excel(writer, startrow=1, startcol=0, index=False)
#    final_df_2022_q1.to_excel(writer, startrow=1+len(final_df_2022)+3, startcol=0, index=False)
#    final_df_2022_q2.to_excel(writer, startrow=1+len(final_df_2022)+len(final_df_2022_q1)+6, startcol=0, index=False)
#    final_df_2022_q3.to_excel(writer, startrow=1+len(final_df_2022)+len(final_df_2022_q1)+len(final_df_2022_q2)+9, startcol=0, index=False)
#    final_df_2022_q4.to_excel(writer, startrow=1+len(final_df_2022)+len(final_df_2022_q1)+len(final_df_2022_q2)+len(final_df_2022_q3)+12, startcol=0, index=False)

#print("Done")