In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory holding the input CSV files; every read below is relative to it.
root = Path("../input/jane-street-market-prediction/")
# Shell escape: list the files available in that directory.
!ls ../input/jane-street-market-prediction/

In [None]:
%%time
# Load the training table and print its column / dtype / memory summary.
train = pd.read_csv(root/"train.csv")
train.info()

In [None]:
# Headline sizes of the training table (rows and ts_id reported in millions).
n_rows_millions = len(train) / 1e6
n_ts_id_millions = train["ts_id"].nunique() / 1e6
n_dates = train["date"].nunique()

print("Number of rows:", n_rows_millions, "millions")
print("Number of ts_id:", n_ts_id_millions, "millions")
print("Number of dates:", n_dates)

In [None]:
# Load the features table and print its summary.
features = pd.read_csv(root/"features.csv")
features.info()

In [None]:
# Load the example submission file and print its summary.
example_sample_submission = pd.read_csv(root/"example_sample_submission.csv")
example_sample_submission.info()

In [None]:
# Load the example test file and print its summary.
example_test = pd.read_csv(root/"example_test.csv")
example_test.info()

***
Reduce the memory usage of the dataframes by downcasting their numeric columns.

In [None]:
def reduce_mem_usage(df, verbose=False):
    """
    Utility function to reduce the memory usage of pandas dataframes.

    Every column whose dtype is listed in ``numerics`` is cast to the
    smallest dtype of the same family that can hold its observed
    [min, max] range: int8/int16/int32/int64 for integer columns and
    float32/float64 for float columns. Columns of any other dtype are
    left untouched.

    The range checks are inclusive, so a column whose extreme values sit
    exactly on a dtype limit (e.g. -128 or 127) still gets that dtype.

    The dataframe is modified in place and is also returned.

    Parameters
    ----------
    df: pandas.Dataframe
        Frame whose numeric columns should be downcast.
    verbose: Boolean
        When True, print the final memory footprint and the percentage
        saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate integer dtypes, narrowest first.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            # float16 is deliberately never used as a target; float16 input
            # columns therefore end up as float32.
            # An all-NaN or infinite column fails the comparison and is
            # stored as float64.
            if c_min >= float32_limits.min and c_max <= float32_limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame cannot raise.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
%%time
# Downcast the training table; verbose=True prints the memory saving.
train = reduce_mem_usage(train, verbose=True)

In [None]:
%%time
# Downcast the features table.
features = reduce_mem_usage(features, verbose=True)

In [None]:
%%time
# Downcast the example submission table.
example_sample_submission = reduce_mem_usage(example_sample_submission, verbose=True)

In [None]:
%%time
# Downcast the example test table.
example_test = reduce_mem_usage(example_test, verbose=True)

***
### Calculate a new weight column

In [None]:
sns.distplot(train.query("weight > 0").weight)
train.query("weight > 0").weight.describe()

In [None]:
sns.distplot(train.resp)
train.resp.describe()

In [None]:
# One figure per resp_1 .. resp_4 column.
for horizon in range(1, 5):
    sns.distplot(train[f"resp_{horizon}"])
    plt.show()

In [None]:
# Row counts by sign of resp, reported in millions.
n_positive = len(train.query("resp > 0"))
n_negative = len(train.query("resp < 0"))
n_total = len(train)

print("Positive response:", n_positive/1e6)
print("Negative response:", n_negative/1e6)
print("Total rows:", n_total/1e6)

In [None]:
sns.distplot(train.query("weight > 0").eval("resp * weight"))
train.query("weight > 0").eval("resp * weight").describe()

In [None]:
sns.distplot(train.query("weight > 0").eval("abs(resp) * weight"))
train.query("weight > 0").eval("abs(resp) * weight").describe()

In [None]:
def bce_loss(yreal, ypred, weight=None, label_smoothing=1e-3, eps=1e-15):
    """
    (Weighted) mean Bernoulli log-likelihood of ``ypred`` given ``yreal``.

    NOTE: no minus sign is applied, so the returned value is the
    *negative* of the conventional binary cross-entropy loss: it is <= 0
    and higher means a better fit.

    Parameters
    ----------
    yreal: array-like of 0/1 labels
    ypred: array-like of predicted probabilities for class 1
    weight: array-like or None
        Per-sample weights. When None a plain mean is returned, otherwise
        ``sum(pointwise * weight) / sum(weight)``.
    label_smoothing: float
        Labels are shrunk towards 0.5: ``y * (1 - s) + 0.5 * s``.
    eps: float
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logs so
        that probabilities of exactly 0 or 1 give a finite result instead
        of -inf / NaN.

    Returns
    -------
    float
    """
    # Coerce to float arrays so plain Python lists are accepted as well.
    yreal = np.asarray(yreal, dtype=float)
    ypred = np.clip(np.asarray(ypred, dtype=float), eps, 1.0 - eps)

    yreal_smooth = yreal*(1.0 - label_smoothing) + 0.5*label_smoothing
    pw_loss = yreal_smooth*np.log(ypred) + (1-yreal_smooth)*np.log(1-ypred)
    if weight is None:
        return np.mean(pw_loss)
    weight = np.asarray(weight, dtype=float)
    return np.sum(pw_loss*weight)/np.sum(weight)

In [None]:
def utility_score(date, weight, resp, action):
    """
    Utility of a set of actions, computed from per-date profit sums.

    For every distinct date ``i`` the sum ``p_i = sum(weight * resp * action)``
    is formed; then

        t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / n_dates)
        u = clip(t, 0, 6) * sum(p_i)

    Parameters
    ----------
    date: array-like
        Date label of every row. Rows are grouped by distinct value, so the
        labels need not be contiguous or start at zero.
    weight, resp, action: array-like
        Per-row values, same length as ``date``.

    Returns
    -------
    float
        The utility ``u``. 0.0 is returned for empty input and when every
        per-date sum is zero (where the ratio above would be 0/0).
    """
    date = np.asarray(date).ravel()
    if date.size == 0:
        return 0.0

    pnl = (np.asarray(weight) * np.asarray(resp) * np.asarray(action)).ravel()

    # Bin by position among the unique dates instead of by the raw date value.
    unique_dates, day_idx = np.unique(date, return_inverse=True)
    count_i = len(unique_dates)
    Pi = np.bincount(np.ravel(day_idx), weights=pnl, minlength=count_i)

    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # No profit or loss on any date: the ratio is undefined, utility is 0.
        return 0.0

    t = total / norm * np.sqrt(250 / count_i)
    return float(np.clip(t, 0, 6) * total)

In [None]:
# synthetic responses
def get_synthetic_preds(error_rate=0.4, pred_proba=(0.45,0.55), df=None, random_state=None):
    """
    Build synthetic predictions with a controlled error rate.

    Rows with ``weight > 0`` are kept. The true label ``action`` is
    ``resp > 0`` as 0/1. ``action_pred`` starts as a copy of ``action`` and
    a random fraction ``error_rate`` of the rows is flipped.
    ``action_pred_proba`` is ``pred_proba[1]`` where ``action_pred == 1`` and
    ``pred_proba[0]`` otherwise.

    Parameters
    ----------
    error_rate: float
        Fraction of rows whose predicted action is flipped.
    pred_proba: tuple (p_for_pred_0, p_for_pred_1)
        Probabilities attached to predicted actions 0 and 1.
    df: pandas.DataFrame or None
        Source frame with at least ``weight`` and ``resp`` columns. Defaults
        to the notebook-level ``train`` frame.
    random_state: int, numpy Generator/RandomState or None
        Forwarded to ``DataFrame.sample`` to make the flipped rows
        reproducible. None keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        New frame (the source is not modified) with the extra columns
        ``action``, ``action_pred`` and ``action_pred_proba``.
    """
    source = train if df is None else df
    # query() already returns a new frame, so no deep copy of the (large)
    # source frame is needed before filtering.
    _train = source.query("weight > 0").reset_index(drop=True)

    _train["action"] = (_train["resp"] > 0).astype(int)
    _train["action_pred"] = _train["action"].copy()
    idx_to_change = _train.sample(frac=error_rate, random_state=random_state).index
    # Flip the 0/1 label on the sampled rows.
    _train.loc[idx_to_change, "action_pred"] = 1 - _train.loc[idx_to_change, "action"]

    # Build the probability column directly as floats; writing floats into a
    # copy of the integer column relies on an implicit upcast that newer
    # pandas versions reject.
    _train["action_pred_proba"] = np.where(
        _train["action_pred"] == 1, pred_proba[1], pred_proba[0]
    )

    return _train

Let's try different weighting strategies.

In [None]:
TEST_STRATEGIES = False

In [None]:
# output:
# (-0.12964928835325826, 0.19857346746690388)
# SpearmanrResult(correlation=-0.13839623840694318, pvalue=0.1696962671593964)

if TEST_STRATEGIES:

    n_repetitions = 100
    error_rate = 0.46

    bce_losses = list()
    utilities = list()

    for i in tqdm(range(n_repetitions)):
        _train = get_synthetic_preds(error_rate=error_rate)
        bce = bce_loss(_train.action.values, _train.action_pred_proba.values)
        utility = utility_score(_train.date.values, _train.weight.values, _train.resp.values, _train.action_pred.values)

        bce_losses.append(bce)
        utilities.append(utility)
        
    print(stats.pearsonr(bce_losses, utilities))
    print(stats.spearmanr(bce_losses, utilities))

In [None]:
# output:
# (1.0, 0.0)
# SpearmanrResult(correlation=0.9999999999999999, pvalue=0.0)

if TEST_STRATEGIES:
    
    n_repetitions = 100
    error_rate = 0.46

    bce_losses = list()
    utilities = list()

    for i in tqdm(range(n_repetitions)):
        _train = get_synthetic_preds(error_rate=error_rate)
        bce = bce_loss(_train.action.values, _train.action_pred_proba.values, weight=_train.eval("abs(resp)*weight").values)
        utility = utility_score(_train.date.values, _train.weight.values, _train.resp.values, _train.action_pred.values)

        bce_losses.append(bce)
        utilities.append(utility)
        
    print(stats.pearsonr(bce_losses, utilities))
    print(stats.spearmanr(bce_losses, utilities))

In [None]:
# output:
# (0.8509862865195542, 3.6685975664067804e-29)
# SpearmanrResult(correlation=0.8396399639963996, pvalue=1.0021829338677619e-27)
if TEST_STRATEGIES:
    
    n_repetitions = 100
    error_rate = 0.46

    bce_losses = list()
    utilities = list()

    for i in tqdm(range(n_repetitions)):
        _train = get_synthetic_preds(error_rate=error_rate)
        bce = bce_loss(_train.action.values, _train.action_pred_proba.values, weight=_train.eval("sqrt(abs(resp)*weight)").values)
        utility = utility_score(_train.date.values, _train.weight.values, _train.resp.values, _train.action_pred.values)

        bce_losses.append(bce)
        utilities.append(utility)
        
    print(stats.pearsonr(bce_losses, utilities))
    print(stats.spearmanr(bce_losses, utilities))

In [None]:
# output:
# (0.721238654097245, 2.605129227641238e-17)
# SpearmanrResult(correlation=0.648892889288929, pvalue=2.8591355087361215e-13)

if TEST_STRATEGIES:
    
    n_repetitions = 100
    error_rate = 0.46

    bce_losses = list()
    utilities = list()

    for i in tqdm(range(n_repetitions)):
        _train = get_synthetic_preds(error_rate=error_rate)
        bce = bce_loss(_train.action.values, _train.action_pred_proba.values, weight=_train.eval("sqrt(sqrt(abs(resp)*weight))").values)
        utility = utility_score(_train.date.values, _train.weight.values, _train.resp.values, _train.action_pred.values)

        bce_losses.append(bce)
        utilities.append(utility)
        
    print(stats.pearsonr(bce_losses, utilities))
    print(stats.spearmanr(bce_losses, utilities))

In [None]:
# output:
# (0.9619929743250912, 4.755577482374738e-57)
# SpearmanrResult(correlation=0.9432463246324632, pvalue=1.0318766706939007e-48)

if TEST_STRATEGIES:
    
    n_repetitions = 100
    error_rate = 0.46

    bce_losses = list()
    utilities = list()

    for i in tqdm(range(n_repetitions)):
        _train = get_synthetic_preds(error_rate=error_rate)
        bce = bce_loss(_train.action.values, _train.action_pred_proba.values, weight=np.log1p(_train.eval("abs(resp)*weight").values))
        utility = utility_score(_train.date.values, _train.weight.values, _train.resp.values, _train.action_pred.values)

        bce_losses.append(bce)
        utilities.append(utility)
        
    print(stats.pearsonr(bce_losses, utilities))
    print(stats.spearmanr(bce_losses, utilities))

In [None]:
# One |response| * weight column per response column: w <- resp, w1 <- resp_1, ...
weight_columns = {
    "w": "resp",
    "w1": "resp_1",
    "w2": "resp_2",
    "w3": "resp_3",
    "w4": "resp_4",
}
for w_col, resp_col in weight_columns.items():
    train[w_col] = train.eval(f"abs({resp_col})*weight").values

***

In [None]:
# Keep rows from date 86 onwards that carry a positive weight.
keep_rows = (train["date"] >= 86) & (train["weight"] > 0)
df = train.loc[keep_rows].reset_index(drop=True)
print(len(df) / 1e6)

In [None]:
# 0/1 flag per response column (1 when the response is strictly positive).
horizon_cols = ["resp_1", "resp_2", "resp_3", "resp_4"]
actions = (df[horizon_cols + ["resp"]] > 0).astype(int)
# Number of resp_1..resp_4 flags that are set on each row.
actions["acum"] = (
    actions["resp_1"] + actions["resp_2"] + actions["resp_3"] + actions["resp_4"]
)
actions

In [None]:
# Non-null row count of every other column, split by the value of the resp flag.
actions.groupby("resp").count()

In [None]:
# Among rows with resp flag 0: share of rows by how many resp_* flags are also 0.
resp0_rows = actions.query("resp == 0")
n_resp0 = len(resp0_rows)
print("# samples with resp=0 :", n_resp0)
for n_zero in range(4, -1, -1):
    # acum counts the flags equal to 1, so n_zero zeros means acum == 4 - n_zero.
    n_match = len(resp0_rows.query(f"acum == {4 - n_zero}"))
    print(f"% samples with resp=0 | {n_zero}/4 resp_*=0: ", 100*n_match / n_resp0)

In [None]:
# Among rows with resp flag 1: share of rows by how many resp_* flags are also 1.
resp1_rows = actions.query("resp == 1")
n_resp1 = len(resp1_rows)
print("# samples with resp=1 :", n_resp1)
for n_one in range(4, -1, -1):
    n_match = len(resp1_rows.query(f"acum == {n_one}"))
    print(f"% samples with resp=1 | {n_one}/4 resp_*=1: ", 100*n_match / n_resp1)

In [None]:
# Pearson correlation (and p-value) between the 0/1 resp flag and the 0/1 resp_1 flag.
stats.pearsonr(actions.resp.values, actions.resp_1.values)

In [None]:
# Same correlation, against the resp_2 flag.
stats.pearsonr(actions.resp.values, actions.resp_2.values)

In [None]:
# Same correlation, against the resp_3 flag.
stats.pearsonr(actions.resp.values, actions.resp_3.values)

In [None]:
# Same correlation, against the resp_4 flag.
stats.pearsonr(actions.resp.values, actions.resp_4.values)

In [None]:
# Rows with resp flag 0 where exactly three of the four resp_* flags are 1:
# per-column count of set flags.
actions.query("resp==0 & acum==3")[["resp_1","resp_2","resp_3","resp_4"]].sum(axis=0)

In [None]:
# Rows with resp flag 1 where exactly one of the four resp_* flags is 1:
# per-column count of set flags (i.e. which column is the one that agrees).
actions.query("resp==1 & acum==1")[["resp_1","resp_2","resp_3","resp_4"]].sum(axis=0)

***
Save the results.

In [None]:
# saving results in parquet format
# Each frame is written to "<name>.parquet" in the working directory, without its index.
frames_to_save = {
    "train": train,
    "features": features,
    "example_sample_submission": example_sample_submission,
    "example_test": example_test,
}
for name, frame in frames_to_save.items():
    frame.to_parquet(f"{name}.parquet", index=False)

***