In [1]:
from zipfile import ZipFile

import altair as alt
import numpy as np
import pandas as pd

In [None]:
# Load only the two columns this analysis uses ("click" and "hour") from the
# gzip-compressed `train.gz` member of the archive.  Both the archive and the
# member stream are opened in a `with` block so their file handles are released
# as soon as the frame has been read, instead of staying open for the lifetime
# of the kernel.
with ZipFile("./data/avazu-ctr-prediction.zip") as zipfile, zipfile.open(
    "train.gz"
) as train_gz:
    train = pd.read_csv(train_gz, compression="gzip", usecols=["click", "hour"])

# # For faster data reloads
# train.to_feather("./data/train.feather")
# assert train.equals(pd.read_feather("./data/train.feather"))
# train = pd.read_feather("./data/train.feather")

In [6]:
# Turn the raw `hour` stamps (layout %y%m%d%H) into proper timestamps.
parsed_hours = pd.to_datetime(train["hour"], format="%y%m%d%H")
train["dthour"] = parsed_hours

# Sanity check: the last two characters of the raw stamp must equal the
# hour-of-day of the parsed timestamp on every row.
raw_hour_of_day = train["hour"].astype(str).str[-2:].astype(int)
assert (raw_hour_of_day == train["dthour"].dt.hour).all()

# Index by the parsed timestamp; the raw stamp is no longer needed.
train = train.drop(columns="hour").set_index("dthour")

In [7]:
# Width of the rolling window used by the later smoothing cells.
window = "7D"

# One resampler over hourly bins of the `click` column, reused for all three
# aggregates: mean -> CTR, sum -> clicks, count -> impressions.
clicks_per_hour = train.resample("H")["click"]
hourly = pd.DataFrame(
    {
        "CTR": clicks_per_hour.mean(),
        "clicks": clicks_per_hour.sum(),
        "impressions": clicks_per_hour.count(),
    }
)

In [8]:
# Centered rolling CTR: pool clicks and impressions over the window first and
# divide afterwards, so the result is total clicks / total impressions for the
# window rather than an unweighted mean of hourly CTRs.  The division is done
# column-wise (vectorised) instead of with a row-by-row `apply`.
window_totals = hourly.rolling(window, center=True)[["clicks", "impressions"]].sum()
hourly[f"{window}-mean"] = window_totals["clicks"] / window_totals["impressions"]

# Squared deviation of each hour's CTR from its own rolling mean, then summed
# over the same centered window.
hourly["squared_error"] = (hourly["CTR"] - hourly[f"{window}-mean"]) ** 2
hourly[f"{window}-squared_error"] = (
    hourly["squared_error"].rolling(window, center=True).sum()
)

# Number of hourly rows inside each window (windows are shorter near the ends
# of the series).  Counted on a single column with the built-in rolling count
# rather than by running a Python lambda over every column.
# NOTE(review): assumes `impressions` has no missing values (it is a per-hour
# count), so the count equals the window length -- confirm if its source changes.
hourly["hours"] = hourly["impressions"].rolling(window, center=True).count()

# Root-mean-square deviation over the window, used as the rolling spread.
hourly[f"{window}-std"] = np.sqrt(hourly[f"{window}-squared_error"] / hourly["hours"])

In [9]:
# Tolerance band around the rolling mean: 1.5 rolling standard deviations on
# either side.  The half-width is computed once and reused for both edges.
band_halfwidth = hourly[f"{window}-std"] * 1.5
hourly["top"] = hourly[f"{window}-mean"] + band_halfwidth
hourly["bottom"] = hourly[f"{window}-mean"] - band_halfwidth

# An hour is an outlier when its CTR lies strictly outside the band.  Both
# comparisons already produce boolean Series, so no cast is needed (the old
# trailing `.astype(bool)` only applied to the second comparison because method
# calls bind tighter than `|`).
hourly["outlier"] = (hourly["CTR"] > hourly["top"]) | (hourly["CTR"] < hourly["bottom"])

In [10]:
# Summary statistics of the outlier flag cast to 0/1 (the mean is the share of
# hours flagged as outliers).
outlier_summary = hourly["outlier"].astype(int).describe()
print(outlier_summary)

# Show the columns that explain each flag; this last expression is the cell's
# displayed output.
report_columns = [
    "CTR",
    f"{window}-mean",
    f"{window}-std",
    "top",
    "bottom",
    "outlier",
]
hourly.loc[:, report_columns]

count    240.000000
mean       0.137500
std        0.345094
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: outlier, dtype: float64


Unnamed: 0_level_0,CTR,7D-mean,7D-std,top,bottom,outlier
dthour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-10-21 00:00:00,0.174714,0.170904,0.017741,0.197515,0.144292,False
2014-10-21 01:00:00,0.173695,0.171102,0.017683,0.197627,0.144577,False
2014-10-21 02:00:00,0.150696,0.171301,0.017615,0.197723,0.144879,False
2014-10-21 03:00:00,0.169791,0.171509,0.017560,0.197849,0.145168,False
2014-10-21 04:00:00,0.151206,0.171381,0.017509,0.197645,0.145117,False
...,...,...,...,...,...,...
2014-10-30 19:00:00,0.194021,0.161827,0.020543,0.192642,0.131013,True
2014-10-30 20:00:00,0.187967,0.161602,0.020611,0.192518,0.130686,False
2014-10-30 21:00:00,0.186728,0.161327,0.020625,0.192265,0.130390,False
2014-10-30 22:00:00,0.171141,0.161240,0.020740,0.192350,0.130130,False


In [11]:
# The chart cell addresses `dthour` as a regular field, so move it out of the
# index.  The explicit check keeps the cell safe to re-run: on a second run
# `dthour` is no longer an index level and the frame is left untouched.
if "dthour" in hourly.index.names:
    hourly = hourly.reset_index("dthour")
else:
    # The previous message claimed `dthour` "is already index", which is the
    # opposite of the situation that reaches this branch.
    print("`dthour` is not an index level (already reset to a column)")

In [12]:
# Every layer draws from the same frame, so start from one shared base chart.
base = alt.Chart(hourly)

# Hourly CTR as points, coloured by the outlier flag.
points = base.mark_point().encode(
    x="dthour:T",
    y="CTR:Q",
    color=alt.Color("outlier:N"),
    tooltip=["dthour:T", "CTR", "outlier"],
    # tooltip=alt.Tooltip('dthour', timeUnit='hours')
)

# Raw hourly CTR as a line.
ctr_line = base.mark_line().encode(x="dthour:T", y="CTR:Q")

# Rolling mean as a fainter line.
rolling_mean_line = base.mark_line(opacity=0.5).encode(
    x="dthour:T",
    y=f"{window}-mean:Q",
)

# Shaded band between the `bottom` and `top` columns.
tolerance_band = base.mark_area(opacity=0.2).encode(
    x="dthour:T",
    y="top:Q",
    y2="bottom:Q",
)

lines = alt.layer(ctr_line, rolling_mean_line, tolerance_band)

# Last expression of the cell: the combined chart is the displayed output.
(points + lines).properties(title="CTR Outliers", width=600, height=150)
