In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATASETS
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE CELL.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote
from urllib.error import HTTPError
from zipfile import ZipFile

# Number of bytes requested from the HTTP response per read() call while downloading.
CHUNK_SIZE = 40960 
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries;
# each entry is split on ':' and the URL part is decoded with unquote() below.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20201207T215204Z and
# X-Goog-Expires=259199, so it presumably stopped working about three days after
# that timestamp -- regenerate it before relying on this cell.
DATASET_MAPPING = 'jane-street-market-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F23304%2F1691737%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20201207%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20201207T215204Z%26X-Goog-Expires%3D259199%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1e49f845875c71c86cd5e6261f97830a08d8de8f26b085529e3994a58660c2cf1dfa8ad08b2fe74cc7a375affcb6dba7caa2aec5b36106de784d11ae5fc7ee4cbc2b82685e4cbf6ee87f1abe69e012dd719e3d0c0cc0f074b65b17e2adfc578dbf449000f2d2b8b53743f7089f39020e1ab4f56ef9c3164839cd6e6313c01461b549229588ac52539033668c0ebb76d0d57679d75f83c51cfa89e59d8da3e2a6bd3475908a3ef9d8eed85ce47b659148c77ec7e2ed01623be598d65ba4e9459f9853dc1bc62697d12c7b205fdbf84dde53c8ccfaa5f52c5fbbd51f792d519ab44c0c2c9fc9439accab3f480b2415b98541e20120ab629791b0f5b3aa702678fc'
# Directory that each dataset archive is extracted into (one sub-directory per dataset).
KAGGLE_INPUT_PATH='/home/kaggle/input'
# Directory in which an 'input' symlink pointing at KAGGLE_INPUT_PATH is created.
KAGGLE_INPUT_SYMLINK='/kaggle'

# Create the real dataset directory and expose it, via symlinks, as ../input and
# <KAGGLE_INPUT_SYMLINK>/input.
# The mode has to be an octal literal: a decimal 777 equals 0o1411, which leaves
# the owner without write/search permission on the directory.
# exist_ok / lexists keep the cell re-runnable when the directories and links
# are already present (e.g. after a kernel restart on the same machine).
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_INPUT_SYMLINK, exist_ok=True)
for input_link in (os.path.join('..', 'input'), os.path.join(KAGGLE_INPUT_SYMLINK, 'input')):
    # lexists() is also True for a dangling symlink, which would still make symlink() fail.
    if not os.path.lexists(input_link):
        os.symlink(KAGGLE_INPUT_PATH, input_link, target_is_directory=True)

# Download every archive listed in DATASET_MAPPING into a temporary file and
# extract it to KAGGLE_INPUT_PATH/<directory>. A failed download is reported and
# skipped so the remaining datasets are still attempted.
for dataset_mapping in DATASET_MAPPING.split(','):
    # Split on the first ':' only, so a URL containing an unencoded ':' stays intact.
    directory, download_url_encoded = dataset_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as zipfileres, NamedTemporaryFile() as tfile:
            # Content-Length can be absent (e.g. chunked transfer). Treat it as
            # unknown rather than failing on int(None) or dividing by zero.
            total_length = zipfileres.headers['content-length']
            total_bytes = int(total_length) if total_length and total_length.strip().isdigit() else 0
            size_label = total_length if total_length is not None else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes zipped')
            dl = 0
            # read() returns b'' at end of stream, which terminates the iterator.
            for data in iter(lambda: zipfileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar; stays empty when the total size is unknown.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            print(f'\nUnzipping {directory}')
            with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
    except HTTPError:
        # HTTPError is handled before OSError (its base class) to give the more specific hint.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
print('Dataset import complete.')


In [1]:
import pandas as pd
import numpy as np
import warnings
import os
# Suppress every warning for the rest of the session (no category is given, so
# this applies to all warnings from all libraries, not only pandas).
warnings.simplefilter(action='ignore')

In [2]:
# Load the competition's training set from the Kaggle input directory.
TRAIN_CSV_PATH = "/kaggle/input/jane-street-market-prediction/train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

Here I describe some intuitions behind the utility score function for the Jane Street Market Prediction
problem on Kaggle.

You can find more information on the problem here: https://www.kaggle.com/c/jane-street-market-prediction/overview

## Utility Score Definition

This competition is evaluated on a utility score. Each row in the test set represents a trading opportunity for which you will be predicting an action value, 1 to make the trade and 0 to pass on it. Each trade j has an associated weight and resp, which represents a return.


$$
p_i = \sum_j(weight_{ij} * resp_{ij} * action_{ij}),
$$

$$
t = \frac{\sum p_i }{\sqrt{\sum p_i^2}} * \sqrt{\frac{250}{|i|}},
$$

where $|i|$ is the number of unique dates in the test set. The utility is then defined as:

$$ u = min(max(t,0), 6) \sum p_i. $$

In [3]:
# Show the first two rows to see which columns (date, weight, resp, features) are available.
df.head(2)

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.0,0.009916,0.014079,0.008773,0.00139,0.00627,1,-1.872746,-2.191242,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1


## $p_i$

Each row or trading opportunity can be chosen (action == 1) or not (action == 0). 

The variable $p_i$ is an indicator for each day $i$, showing how much return we got for that day.

Let's say for example we want to verify the potential return for day 0.


In [4]:
# Take an explicit copy of the day-0 rows: columns are added to df_0 in later
# cells, and assigning into a filtered slice of df would be a chained assignment
# (pandas' SettingWithCopyWarning) with no guarantee about what gets modified.
df_0 = df.loc[df['date'] == 0].copy()

Let's say we end up choosing all transactions for day 0. We would have:

$$
p_i = \sum_j(weight_{ij} * resp_{ij} * 1)
$$

In [5]:
# Every trade of day 0 is taken, i.e. action == 1 for each row.
df_0['mult'] = df_0['weight'].mul(df_0['resp']).mul(1)
# p_0 is the sum of weight * resp * action over the day's rows.
p_0 = df_0['mult'].sum()
p_0

12.045550816445324

Obviously, if we choose no transactions, $p_i$ = 0

In [6]:
# No trade of day 0 is taken, i.e. action == 0 for each row.
df_0['mult'] = df_0['weight'].mul(df_0['resp']).mul(0)
# p_0 is the sum of weight * resp * action over the day's rows.
p_0 = df_0['mult'].sum()
p_0

0.0

Now, let's say that we only choose the ones that would give us a positive return.
Let's see what is the maximum return we can get from day 0.

In [7]:
# Upper bound of p for day 0: take a trade only when its resp is positive,
# so the boolean mask plays the role of the action column.
df_0['mult'] = df_0['weight'].mul(df_0['resp']).mul(df_0['resp'].gt(0))
p_0 = df_0['mult'].sum()
p_0

74.55811669838056

Since we want to maximize $u$, we also want to maximize $p_i$. To do that, we have to select as few
transactions with a negative $resp$ as possible (since $resp$ is the only value in the equation that can be negative, and therefore the only one that can make the total sum of $p$ go down)
and as many transactions with a positive $resp$ as possible.

## $t$

Now, let's try to understand what $t$ is all about.
Let's create an example.

Let's say we have two days to compose $t$.

First scenario, we have:

$$ Day0 -> p_0 = 74$$

$$ Day1 -> p_1 = 2$$


where, $\sum p_i = 76 $. If we calculate $t$ for this scenario we would have:

In [8]:
# Daily returns p_i for the first scenario (two days, heavily unbalanced).
p = np.array([74, 2])
# t = sum(p_i / sqrt(sum(p_i^2))) * sqrt(250 / number_of_days)
l2_norm = np.sqrt(np.sum(p**2))
t = np.sum(p / l2_norm) * np.sqrt(250 / p.size)
t

11.47831977327336

Now, let's say we had different values for each day.


$$ Day0 -> p_0 = 38$$

$$ Day1 -> p_1 = 38$$

Note that in this scenario $\sum p_i $ is also 76.

In [9]:
# Daily returns p_i for the second scenario (same total, evenly spread over two days).
p = np.array([38, 38])
# t = sum(p_i / sqrt(sum(p_i^2))) * sqrt(250 / number_of_days)
l2_norm = np.sqrt(np.sum(p**2))
t = np.sum(p / l2_norm) * np.sqrt(250 / p.size)
t

15.811388300841896

Ok, so we can see that $t$ is larger when the return for each day is better distributed and has lower variation.
It is better to have returns uniformly divided among days than have all of your returns concentrated in just one day.
It reminds me a little of an $L_1$ over $L_2$ situation, where the $L_2$ norm penalizes outliers more than $L_1$.

There is one more thing to consider in the $t$ equation.
We have a multiplying factor of $\sqrt{\frac{250}{|i|}}$.

So, basically, the higher $|i|$ (the number of days), the lower my $t$ value will be.

Let's say that, similar to the scenario above, we actually had 3 days instead of 2:

$$ Day0 -> p_0 = 38$$

$$ Day1 -> p_1 = 38$$

$$ Day2 -> p_2 = 0$$


In [10]:
# Daily returns p_i for the three-day scenario (the extra day contributes nothing).
p = np.array([38, 38, 0])
# t = sum(p_i / sqrt(sum(p_i^2))) * sqrt(250 / number_of_days)
l2_norm = np.sqrt(np.sum(p**2))
t = np.sum(p / l2_norm) * np.sqrt(250 / p.size)
t

12.909944487358056

We can see we get a lower $t$ value than with 2 days.

Basically, we want to select returns that are uniformly distributed over days, maximizing our return
but giving a penalty on choosing too many dates.

The variable $t$, however, will only matter if it is lower than 6, given the final equation:

$$ u = min(max(t,0), 6) \sum p_i. $$

otherwise, $t$ will be replaced by the number 6 (I am still trying to understand why 6, if anyone knows please share it with me :) ).
