In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
# Load the impressions export; later cells read its 'placement_ids', 'id',
# 'gaze_valid' and 'is_fixated' columns.
impressions_df = pd.read_csv("../datasets/impressions/impressions_groupm_tele2_6_months.csv")

In [None]:
# Load the client report; later cells read its 'domain', 'Placement Id' and
# 'Imps' columns.
client_df = pd.read_excel("../datasets/groupm_tele2_report.xlsx")
client_df

In [None]:
import json

def extract_pid(placement_id):
    """Build a placement id string from a JSON-encoded ``placement_ids`` value.

    Args:
        placement_id: JSON text of an object that has a ``bam_ad_slot`` list
            of strings and, optionally, a ``tag_id`` list.

    Returns:
        The first ``tag_id`` entry followed directly (no separator) by the
        comma-joined non-blank ``bam_ad_slot`` entries, or ``None`` when
        ``tag_id`` is missing, empty, or cannot be indexed.

    Raises:
        KeyError: if the object has no ``bam_ad_slot`` key.
        json.JSONDecodeError: if ``placement_id`` is not valid JSON.
    """
    parsed = json.loads(placement_id)
    non_blank_slots = [slot for slot in parsed["bam_ad_slot"] if slot.strip() != '']
    try:
        ret = parsed['tag_id'][0]
    except (KeyError, IndexError, TypeError):
        # Only "there is no usable tag_id" maps to "no pid". A bare except
        # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
        return None
    ret += ','.join(non_blank_slots)
    return ret

def pid_type(pid):
    """Report whether ``pid`` can be converted with ``int()``.

    Args:
        pid: Any value; in this notebook it is the string (or ``None``)
            produced by ``extract_pid``.

    Returns:
        ``'int'`` if ``int(pid)`` succeeds, otherwise ``'str'``.
    """
    try:
        int(pid)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects; ValueError: text that
        # is not an integer literal (or NaN); OverflowError: infinite floats.
        return 'str'
    return 'int'

# Derive one pid per impression from the JSON in 'placement_ids'; rows for
# which extract_pid returns None are labelled 'str' by pid_type.
impressions_df['pid'] = impressions_df['placement_ids'].apply(extract_pid)
impressions_df['pid_type'] = impressions_df['pid'].apply(pid_type)

In [None]:
# run through predictions
import os
import json


def _get_pred_map():
    pred_map = dict()
    for _i in range(1,2):
        if _i == 1:
            _year = 2022
            _range1 = 11
            _range2 = 13
        else:
            _year = 2023
            _range1 = 1
            _range2 = 4
        for i in range(_range1, _range2):
            if i < 10:
                i = f'0{i}'
            for day in os.listdir(f'../predictions/{_year}/{i}'):
                for h in os.listdir(f'../predictions/{_year}/{i}/{day}'):
                    if not h.endswith('.ndjson'):
                        continue
                    with open(f'../predictions/{_year}/{i}/{day}/{h}')as f:
                        for line in f.readlines():
                            json_line = json.loads(line)
                            id_ = json_line['id']
                            pred_map[id_] = json_line['prediction']
    return pred_map
                            
# Build the id -> prediction lookup once; it is passed to _get_final_fixation
# when the per-impression fixation flag is resolved.
pred_map = _get_pred_map()

In [None]:
# Keep only impressions whose pid parses as an integer, then store it as int.
# The explicit .copy() makes the filtered frame independent of the frame it
# was sliced from, so the column assignment below is a plain write instead of
# a chained assignment (which triggers SettingWithCopyWarning).
impressions_df = impressions_df[impressions_df["pid_type"] == 'int'].copy()
impressions_df['pid'] = impressions_df['pid'].apply(int)
impressions_df

In [None]:
# Restrict the impressions to placement ids that occur in the client report.
tele2_placements = list(client_df['Placement Id'].unique())
tele2_impressions = impressions_df.loc[
    impressions_df['pid'].isin(tele2_placements)
]
tele2_impressions

In [None]:
def _get_final_fixation(gaze_valid, is_fixated, id_, pred_map):
    if gaze_valid:
        return is_fixated
    if id_ in pred_map:
        return pred_map[id_]
    return False

# Resolve one fixation value per impression. tele2_impressions is a filtered
# slice of impressions_df, so writing a column into it in place triggers
# SettingWithCopyWarning; .assign() returns a new frame with the extra column.
tele2_impressions = tele2_impressions.assign(
    final_fixation=tele2_impressions.apply(
        lambda row: _get_final_fixation(row['gaze_valid'], row['is_fixated'], row['id'], pred_map),
        axis=1,
    )
)

In [None]:
# Per pid: number of non-null impression ids, and how many of those belong to
# rows whose final_fixation equals True.
grouped_df = (
    tele2_impressions
    .groupby(["pid"])
    .apply(
        lambda group: pd.Series(
            {
                "num_impressions": group["id"].count(),
                "num_fixations": group.loc[group["final_fixation"] == True, "id"].count(),
            }
        )
    )
    .reset_index()
)
grouped_df

In [None]:
# Total client-reported 'Imps' per (domain, Placement Id) pair.
client_df_grouped = (
    client_df
    .groupby(["domain", "Placement Id"])
    .apply(lambda group: pd.Series({"num_imps": group["Imps"].sum()}))
    .reset_index()
)
client_df_grouped

In [None]:
# Outer join keeps placements that appear on only one side; their columns from
# the other side are left missing.
merged_df = grouped_df.merge(
    client_df_grouped,
    how='outer',
    left_on=['pid'],
    right_on=['Placement Id'],
)
merged_df

In [None]:
def get_sample_size(num_impressions):
    """Bucket an impression count into a sample-size label.

    Args:
        num_impressions: Numeric count; may be NaN.

    Returns:
        ``"N/A"`` for NaN, ``"low"`` for values up to and including 20,
        ``"medium"`` for values above 20 up to and including 100, and
        ``"high"`` for anything larger.
    """
    if np.isnan(num_impressions):
        return "N/A"
    # Inclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((20, "low"), (100, "medium")):
        if num_impressions <= upper_bound:
            return label
    return "high"
    
def get_weight(x):
    """Translate a sample-size label into its weight.

    Args:
        x: Label as produced by ``get_sample_size``.

    Returns:
        ``1`` for ``"high"``, ``0.7`` for ``"medium"``, ``0.5`` for ``"low"``,
        and ``None`` for any other label.
    """
    weight_by_label = {"high": 1, "medium": 0.7, "low": 0.5}
    return weight_by_label.get(x)
    
# Label each placement by its impression count, then map the label to a weight.
merged_df['sample_size'] = merged_df['num_impressions'].apply(get_sample_size)
merged_df["weight"] = merged_df["sample_size"].apply(get_weight)

merged_df

In [None]:
# Keep only placements present on both sides of the merge.
merged_df = merged_df.dropna(subset=["num_imps", "num_impressions"])
# Fixation ratio per placement, rounded to two decimals.
merged_df["fix_ratio"] = (merged_df["num_fixations"] / merged_df["num_impressions"]).round(2)
# NOTE(review): the product is a float, so floor can land one below the
# intended integer (e.g. 0.29 * 100 evaluates to 28.999999999999996 -> 28).
# Confirm whether that is acceptable for the report.
merged_df["client_fixations"] = merged_df.apply(
    lambda row: math.floor(row["fix_ratio"] * row["num_imps"] * row["weight"]),
    axis=1,
)
merged_df

In [None]:
# Totals and mean weight per (domain, pid). The result keeps (domain, pid) as
# its index; the index is not reset here.
grouped_df = (
    merged_df
    .groupby(["domain", "pid"])
    .apply(
        lambda group: pd.Series(
            {
                "sum_imp": group["num_impressions"].sum(),
                "sum_imp_client": group["num_imps"].sum(),
                "sum_fixations": group["num_fixations"].sum(),
                "sum_fixations_client": group["client_fixations"].sum(),
                "avg_weight": np.average(group["weight"]),
            }
        )
    )
)
grouped_df

In [None]:
# Roll the per-placement totals up to one row per domain.
result_df = grouped_df.groupby(["domain"]).apply(
    lambda group: pd.Series(
        {
            "weight": np.average(group["avg_weight"]),
            "impressions": group["sum_imp"].sum(),
            "impressions_client": group["sum_imp_client"].sum(),
            "fixations": group["sum_fixations_client"].sum(),
        }
    )
)
# Fixations as a percentage of client-reported impressions, two decimals.
result_df["fixation_ratio"] = (
    result_df["fixations"] / result_df["impressions_client"] * 100
).round(2)
result_df["sample_size"] = result_df["impressions"].apply(get_sample_size)
result_df = result_df.reset_index().sort_values(by=["impressions_client"], ascending=False)

# Export only the report columns; the client impression count is published
# under the plain name "impressions".
report_columns = ["domain", "impressions_client", "fixations", "fixation_ratio", "sample_size"]
final_df = result_df[report_columns].rename(columns={"impressions_client": "impressions"})
final_df.to_excel("../final_reports/tele2_groupm_campaign.xlsx", index=False)

In [None]:
# Overall fixation ratio (percent) across every domain in the exported report.
# This reads final_df from the previous cell: the name used before,
# new_final_df, is never assigned in the cells shown, so a fresh
# Restart & Run All would stop here with a NameError.
round(final_df["fixations"].sum() / final_df["impressions"].sum() * 100, 2)