In [None]:
import sys
import os
import socket
import datetime

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.window import Window

import pandas as pd
import numpy as np

In [None]:
# --- Spark environment -------------------------------------------------------
# Executors run Python from the PEX archive (shipped to them via `spark.files`
# below); the driver keeps using the local Anaconda interpreter.
sparkHome = "/apache/spark3.1"
bxPexFile = "bx_pex_env_5_9_1.pex"
os.environ["SPARK_HOME"] = sparkHome
os.environ["PYSPARK_PYTHON"] = f"./{bxPexFile}"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/share/anaconda3/python3.7/bin/python"
# Make the PySpark and py4j sources bundled with this Spark install importable.
sys.path.append(f"{sparkHome}/python/lib/py4j-0.10.9-src.zip")
sys.path.append(f"{sparkHome}/python")

# The driver advertises this address to the executors (yarn client mode).
host_ip = socket.gethostbyname(socket.gethostname())

# --- Spark session -----------------------------------------------------------
# Raises KeyError if KRYLOV_PRINCIPAL is not set in the environment.
spark = (
    SparkSession.builder
    .appName("click_propensity_estimator_" + os.environ["KRYLOV_PRINCIPAL"])
    .master("yarn")
    .config("spark.driver.host", host_ip)
    .config("spark.driver.port", "30202")
    # `spark.yarn.executor.memoryOverhead` is deprecated since Spark 2.3;
    # this is the supported key with the same meaning (MiB when no unit given).
    .config("spark.executor.memoryOverhead", "1024")
    .config("spark.executor.cores", "3")
    .config("spark.driver.memory", "25g")
    .config("spark.executor.memory", "25g")
    .config("spark.rdd.compress", True)
    # The heartbeat interval has to stay well below the network timeout.
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "300s")
    .config("spark.sql.broadcastTimeout", "1200s")
    # NOTE(review): `spark.dynamicAllocation.enabled` is not set here; these
    # bounds only matter if the cluster defaults enable it -- confirm.
    .config("spark.dynamicAllocation.minExecutors", 0)
    .config("spark.dynamicAllocation.initialExecutors", 10)
    .config("spark.dynamicAllocation.maxExecutors", 50)
    .config("spark.yarn.queue", "hddq-exprce-perso-high-mem")
    # "0" removes the cap on results collected to the driver (see toPandas()).
    .config("spark.driver.maxResultSize", "0")
    .config(
        "spark.driver.extraJavaOptions",
        "-Dhttp.proxyHost=httpproxy.vip.ebay.com -Dhttp.proxyPort=80 "
        "-Dhttps.proxyHost=httpproxy.vip.ebay.com -Dhttps.proxyPort=80",
    )
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.1.1")
    .config("spark.files", f"viewfs://apollo-rno/user/b_perso/pex_environments/{bxPexFile}")
    .enableHiveSupport()
    .getOrCreate()
)

spark


# click-propensity

In [None]:
# Explicit schema for the position-bias TSV: one row per
# (siteId, placementId, displayRank); every column is declared nullable.
schema = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("siteId", T.IntegerType()),
        ("placementId", T.IntegerType()),
        ("displayRank", T.IntegerType()),
        ("clickPropensity", T.FloatType()),
        ("purchasePropensity", T.FloatType()),
        ("n", T.IntegerType()),
    )
])

path = "viewfs://apollo-rno/apps/b_perso/hp/click_propensity/mad_cdl/position_bias.tsv"

# Tab-separated file whose first line is a header row.
clickCount = (
    spark.read
    .schema(schema)
    .options(header=True, delimiter="\t")
    .csv(path)
)


In [None]:

# One window per (siteId, placementId), rows ordered by ascending display rank.
w = Window.partitionBy("siteId", "placementId").orderBy(F.col("displayRank").asc())

# bias1  -> first non-null clickPropensity seen in the window so far
# biasK  -> clickPropensity of the current row
# propensity_pretrainer         -> biasK / bias1
# propensity_impression_weights -> reciprocal of propensity_pretrainer
clickCount1 = (
    clickCount
    .withColumn("bias1", F.first("clickPropensity", ignorenulls=True).over(w))
    .withColumn("biasK", F.col("clickPropensity"))
    .withColumn("propensity_pretrainer", F.col("biasK") / F.col("bias1"))
    .withColumn("propensity_impression_weights", F.lit(1) / F.col("propensity_pretrainer"))
)



In [None]:
# Collect the full Spark result to the driver as a pandas DataFrame so it can
# be inspected and plotted locally in the cells below.
pdf = clickCount1.toPandas()

In [None]:
# Notebook display settings: never truncate cell contents, and collapse any
# frame longer than five rows to a shortened preview.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 5

In [None]:
# Rich-display the collected frame; `display.max_rows` was set to 5 in the
# previous cell, so a longer frame renders as a truncated preview.
pdf

In [None]:
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets

In [None]:

@interact(
    # dropna(): nullable Spark integer columns containing nulls arrive in pandas
    # as floats with NaN, and NaN is neither sortable nor a matchable choice.
    placementId=sorted(pdf["placementId"].dropna().unique()),
    siteId=sorted(pdf["siteId"].dropna().unique()),
)
def calc_plot(placementId, siteId):
    """Plot click propensity against display rank for one placement on one site.

    Reads the notebook-level ``pdf`` frame and draws a line chart; nothing is
    returned.

    Args:
        placementId: Value of the ``placementId`` column to select.
        siteId: Value of the ``siteId`` column to select.
    """
    selected = pdf[(pdf["placementId"] == placementId) & (pdf["siteId"] == siteId)]

    # The two dropdowns are independent, so a chosen pair may not exist in the
    # data; say so instead of handing an empty frame to the plotting call.
    if selected.empty:
        print(f"No rows for placementId={placementId}, siteId={siteId}")
        return

    # Row order after toPandas() is not guaranteed; sort so the line runs
    # monotonically along the x axis instead of zig-zagging.
    ax = selected.sort_values("displayRank").plot.line(x="displayRank", y="clickPropensity")
    ax.set_title(f"clickPropensity by displayRank (placementId={placementId}, siteId={siteId})")
    ax.set_ylabel("clickPropensity")

        