In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Seed NumPy's global RNG once, up front.
np.random.seed(2)

# Remove pandas' display limits on the number of columns and rows shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

***

In [None]:
# Load the training frame from the parquet file.
train = pd.read_parquet("../input/janestreet-preprocessing/train.parquet")

# Row count, in millions.
train.shape[0] / 1e6

In [None]:
# Keep only rows with date > 85 and renumber the index from 0.
after_day_85 = train["date"] > 85
train = train[after_day_85].reset_index(drop=True)

# Remaining row count, in millions.
train.shape[0] / 1e6

In [None]:
# Keep only rows with a strictly positive weight and renumber the index from 0.
has_weight = train["weight"] > 0
train = train[has_weight].reset_index(drop=True)

# Remaining row count, in millions.
train.shape[0] / 1e6

In [None]:
# Number of distinct dates left in the filtered frame.
train["date"].nunique()

In [None]:
# Split by date: train1 holds date < 450, train2 holds date >= 450.
# Each part gets a fresh 0-based index.
before_450 = train["date"] < 450
from_450 = train["date"] >= 450
train1 = train[before_450].reset_index(drop=True)
train2 = train[from_450].reset_index(drop=True)

In [None]:
# Number of distinct dates in the date < 450 part.
train1["date"].nunique()

In [None]:
# Number of distinct dates in the date >= 450 part.
train2["date"].nunique()

In [None]:
# Value of the sqrt(250 / n_days) factor used in utility_score, for n_days = 364.
# NOTE(review): 364 is presumably the number of distinct dates in train1
# (date < 450) — confirm against the nunique() output above.
np.sqrt(250/364)

In [None]:
# Value of the sqrt(250 / n_days) factor used in utility_score, for n_days = 414.
# NOTE(review): 414 is presumably the number of distinct dates in the full
# filtered train frame — confirm against the nunique() output above.
np.sqrt(250/414)

In [None]:
# Per-date count of non-null ts_id values, one row per date.
df = train.groupby("date")["ts_id"].count().reset_index(name="trx_by_day")

# Explicit figure/axes interface, with a title and axis labels so the
# figure is readable on its own.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(df.date, df.trx_by_day)
ax.set(title="Transactions per day", xlabel="date", ylabel="trx_by_day")
ax.grid()
plt.show()

In [None]:
# Distribution of the per-day counts over every date in df.
sns.displot(df["trx_by_day"], aspect=2)

# Title and grid go on the axes that displot just drew on.
ax = plt.gca()
ax.set_title("Whole dataset")
ax.grid()

In [None]:
# Distribution of the per-day counts restricted to date < 450.
train_counts = df.loc[df["date"] < 450, "trx_by_day"]
sns.displot(train_counts, aspect=2)

# Title and grid go on the axes that displot just drew on.
ax = plt.gca()
ax.set_title("Train dataset")
ax.grid()

In [None]:
# Distribution of the per-day counts restricted to date >= 450.
valid_counts = df.loc[df["date"] >= 450, "trx_by_day"]
sns.displot(valid_counts, aspect=2)

# Title and grid go on the axes that displot just drew on.
ax = plt.gca()
ax.set_title("Valid dataset")
ax.grid()

In [None]:
# days in LB
# 1e6 divided by the (integer-truncated) median per-day count over date >= 400.
# NOTE(review): 1e6 is presumably the assumed number of leaderboard rows — confirm.
recent_days = df[df["date"] >= 400]
median_rows_per_day = int(recent_days["trx_by_day"].median())
1e6 / median_rows_per_day

In [None]:
# "date" for simulating LB
# Take the last 248 dates whose per-day count is below 8000.
# NOTE(review): 248 and 8000 are hard-coded; 248 presumably comes from the
# "days in LB" estimate — confirm.
below_cap = df["trx_by_day"] < 8000
dates = df.loc[below_cap, "date"].tail(248).values
dates

***
### Utility metric analysis

In [None]:
def utility_score(date, weight, resp, action):
    """
    Fast computation of the (negated) utility score.

    For each date i the daily value is
    ``P_i = sum_j(weight_ij * resp_ij * action_ij)``; then
    ``t = sum(P_i) / sqrt(sum(P_i ** 2)) * sqrt(250 / n_days)`` and
    ``u = clip(t, 0, 6) * sum(P_i)``, where ``n_days`` is the number of
    distinct dates present in ``date``.

    Parameters
    ----------
    date : array-like
        Non-negative day identifiers; cast to int before use.
    weight, resp, action : array-like
        Per-row values, same length as ``date``.

    Returns
    -------
    float
        ``-u``. The sign is flipped, so a better utility is a more negative
        number. Returns 0.0 for empty input or when every ``P_i`` is zero
        (instead of NaN / ZeroDivisionError).
    """
    date = np.asarray(date).astype(int)
    if date.size == 0:
        # No rows: nothing traded, and 250 / n_days would divide by zero.
        return 0.0

    n_days = len(np.unique(date))
    # Weighted bincount sums weight * resp * action per date in one pass.
    Pi = np.bincount(date, np.asarray(weight) * np.asarray(resp) * np.asarray(action))
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # Every daily value is zero (e.g. no action taken): utility is 0, not 0/0.
        return 0.0

    t = total / norm * np.sqrt(250 / n_days)
    u = np.clip(t, 0, 6) * total
    return -u

In [None]:
repetitions = 23
stratified = False

# Error rates to simulate: 0.43 to 0.47 in steps of 0.0025, highest first.
error_rates = np.arange(0.43, 0.471, 0.0025)[::-1]

train_idx = train.query("date < 450").index
valid_idx = train.query("date >= 450").index
lb_idx = train.query("date in @dates").index


def simulate_utility(subset, error_rate, repetitions, stratified):
    """
    Score a perfect predictor after randomly flipping a fraction of its actions.

    The error-free action is 1 where ``resp > 0`` and 0 otherwise. On each
    repetition a fraction ``error_rate`` of the rows is sampled and their
    action is inverted, then ``utility_score`` is evaluated.

    Parameters
    ----------
    subset : pd.DataFrame
        Frame with ``date``, ``weight`` and ``resp`` columns and a unique index.
    error_rate : float
        Fraction of rows whose action is flipped.
    repetitions : int
        Number of independent random draws.
    stratified : bool
        If True the fraction is sampled within each date, otherwise over
        the whole subset.

    Returns
    -------
    list
        One ``utility_score`` value per repetition.
    """
    # Loop-invariant inputs are extracted once instead of on every repetition.
    date = subset["date"].values
    weight = subset["weight"].values
    resp = subset["resp"].values
    perfect_action = (subset["resp"] > 0).astype(int)

    scores = list()
    for _ in range(repetitions):
        if stratified:
            flipped = subset.groupby("date").sample(frac=error_rate).index
        else:
            flipped = subset.sample(frac=error_rate).index
        # Invert 0 <-> 1 on the sampled rows only; the base action stays untouched.
        action = perfect_action.copy()
        action.loc[flipped] = 1 - action.loc[flipped]
        scores.append(utility_score(date, weight, resp, action.values))
    return scores


u_train = list()
s_train = list()
u_valid = list()
s_valid = list()
u_full = list()
s_full = list()
u_lb = list()
s_lb = list()

# Only the columns needed for sampling and scoring are kept, and each subset
# is materialised once instead of being re-copied on every repetition.
score_cols = ["date", "weight", "resp"]
subsets = [
    ("train", train.loc[train_idx, score_cols], u_train, s_train),
    ("valid", train.loc[valid_idx, score_cols], u_valid, s_valid),
    ("full", train.loc[:, score_cols], u_full, s_full),
    ("LB", train.loc[lb_idx, score_cols], u_lb, s_lb),
]

for error_rate in error_rates:

    print(f" Error rate: {error_rate:0f} ".center(60,"-"))

    # Subsets are processed in a fixed order (train, valid, full, LB) so the
    # sequence of random draws is the same on every run with the same seed.
    for name, subset, medians, stds in subsets:
        scores = simulate_utility(subset, error_rate, repetitions, stratified)
        print(f"Utility score on {name}: {np.median(scores)}")
        medians.append(np.median(scores))
        stds.append(np.std(scores))

In [None]:
# summary table

# Same error-rate grid as the simulation loop (0.47 down to 0.43).
error_rates = np.arange(0.43, 0.471, 0.0025)[::-1]

# One (median, std) pair of lists per evaluated subset, in display order.
results_by_subset = {
    "train": (u_train, s_train),
    "valid": (u_valid, s_valid),
    "full": (u_full, s_full),
    "lb": (u_lb, s_lb),
}

summary_columns = {"error_rate": error_rates}
for subset_name, (medians, stds) in results_by_subset.items():
    summary_columns[f"utility_{subset_name}"] = medians
    summary_columns[f"std_{subset_name}"] = stds

summary = pd.DataFrame(summary_columns)

summary

***