In [1]:
import re, gc, os, json

import pandas as pd
import numpy as np

from datetime import datetime
from preprocessing_script import *

# Show up to 500 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 500

# Notebook-wide constants.
NB_VENUES = 6
MICROSEC = 10 ** 6
MIN_PER_HOUR = 60
SEC_PER_MIN = 60
TARGET = "source_id"
BASE_DATA_PATH = "./input/cfmdatachallenge/"

# Names of engineered features; later cells append to this list.
FEATURES_LIST = []

# Timestamp of the kernel start, for the run log.
print(f"Kernel lancé le : {datetime.now().strftime('%d %b, %H h %M')}")

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

kmeans-smote 0.1.2 requires imbalanced-learn<0.5,>=0.4.0, but you'll have imbalanced-learn 0.7.0 which is incompatible.
kmeans-smote 0.1.2 requires numpy<1.16,>=1.13, but you'll have numpy 1.18.5 which is incompatible.
kmeans-smote 0.1.2 requires scikit-learn<0.21,>=0.19.0, but you'll have scikit-learn 0.23.2 which is incompatible.
dask-xgboost 0.1.11 requires xgboost<=0.90, but you'll have xgboost 1.2.1 which is incompatible.
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade 

1.4.1
Kernel lancé le : 20 Jan, 18 h 40


In [2]:
# Load the train and test frames from BASE_DATA_PATH.
# read_data_files is not defined in this notebook (presumably it comes from the
# star import of preprocessing_script — confirm). It receives the base directory
# plus three file names and its result is unpacked as (train_data, test_data).
train_data, test_data = read_data_files(BASE_DATA_PATH, "train.h5", "test.h5", "train_labels.csv")

In [3]:
def rank_prices(price_columns: list, df: pd.DataFrame, nb_venues: int) -> pd.DataFrame:
    """Rank the prices of the venues against each other, row by row.

    Parameters
    ----------
    price_columns : list
        Names of the price columns to rank, one per venue.
    df : pd.DataFrame
        Frame holding the price columns.
    nb_venues : int
        Expected number of venues; must equal ``len(price_columns)``.

    Returns
    -------
    pd.DataFrame
        One ``<col>_rank`` column per input column, indexed like ``df`` and
        stored as float16. Within each row the smallest price gets rank 1 and
        the largest gets rank ``nb_venues``; tied prices share the average of
        the ranks they span (so ranks can be fractional, e.g. 1.5).

    Raises
    ------
    ValueError
        If the number of price columns differs from ``nb_venues``.
    """
    # Imported locally: the notebook's import cell does not import scipy, so the
    # original `scipy.stats.rankdata` reference only worked if the name happened
    # to leak in through a star import. The `axis` argument needs SciPy >= 1.4.
    from scipy.stats import rankdata

    if len(price_columns) != nb_venues:
        raise ValueError(
            f"Not the right number of columns: {nb_venues} expected, got {len(price_columns)}")

    prices = df[price_columns].values
    # Rank across the columns of each row (axis=1): 1-based, ties averaged.
    ordered_prices = rankdata(prices, method="average", axis=1)
    rank_cols = [f"{col}_rank" for col in price_columns]

    df_ordered_prices = pd.DataFrame(ordered_prices,
                                     columns=rank_cols,
                                     index=df.index,
                                     dtype=np.float16)

    return df_ordered_prices

In [4]:
# Feature-engineering pipeline.
# Every step below is applied to train_data first and then to test_data, with the
# same helper and the same arguments. None of the helpers is defined in this
# notebook (presumably they come from the star import of preprocessing_script —
# confirm), so the per-step comments only restate the intent given by the author.
print("Begining Processing and Features Engineering...\n")

# Rename columns to make them more readable.

train_data = rename_columns(train_data)
test_data  = rename_columns(test_data)

# Time transformation: from microseconds to quarters/minutes.
# Only done for the most recent trade (trade_tod_0): doing it for all 10 last
# trades would produce strongly correlated features.
# The two columns are written in place on both frames.

for df in [train_data, test_data]:
    df["trade_quarter"] = df["trade_tod_0"].apply(get_quarter_from_time)
    df["trade_minute"]  = df["trade_tod_0"].apply(get_minute_from_time)

# NOTE(review): FEATURES_LIST is a notebook-level list, so re-running this cell
# appends these two names a second time.
FEATURES_LIST.extend(["trade_quarter", "trade_minute"])

# Normalise the times of the 10 last trades by the time of the most recent trade:
# convert `trade_tod_i` from absolute time to time relative to the last trade, to
# measure the time difference between the trades.

train_data = normalize_trades_time(train_data)
test_data  = normalize_trades_time(test_data)

# Frequency of venues among the last k trades, for each k in last_n_trades.

last_n_trades = [5, 10]

for last_k in last_n_trades:
    train_data = get_venues_frequency_last_trades(train_data, last_k)
    test_data  = get_venues_frequency_last_trades(test_data,  last_k)
    
# Normalise order-book update times by the time of the most recent update.

train_data = normalize_OB_updates(train_data)
test_data  = normalize_OB_updates(test_data)

# Total book size per venue.

train_data = compute_total_book_size(train_data)
test_data  = compute_total_book_size(test_data)

# Opposite transform of bid prices to put them on the "same scale"
# (presumably the same scale as ask prices — confirm in normalize_prices).

train_data = normalize_prices(train_data)
test_data  = normalize_prices(test_data)

# Weighted best order book (qty * 1/price**2).

train_data = get_weighted_qty_price(train_data)
test_data  = get_weighted_qty_price(test_data)

# Rank order books from best to worst price offered.

train_data = rank_venues_by_price(train_data)
test_data  = rank_venues_by_price(test_data)

# Statistics on the last trades.

train_data = make_stats_on_lasttrades(train_data)
test_data  = make_stats_on_lasttrades(test_data)

# Biggest / best-price trade feature.

train_data = biggest_best_trade_feats(train_data)
test_data  = biggest_best_trade_feats(test_data)

# Make prices relative to the best available price.
# Each pattern selects one price level/side across the venue suffixes 0-5.

bid_ask_prices = [r"OB_ask_[0-5]$", r"OB_ask1_[0-5]$", r"OB_bid_[0-5]$", r"OB_bid1_[0-5]$"]
train_data = make_relative_price_features(train_data, bid_ask_prices)
test_data  = make_relative_price_features(test_data , bid_ask_prices)

# Ratios between the 1st and 2nd level of the book, for size and price.

train_data = make_ratios_price_size(train_data)
test_data  = make_ratios_price_size(test_data)

# Normalise the features by the k best of the 6 order books.
# Author's note: this slightly worsens the score (by 0.02%), yet the step is
# still applied here.
# NOTE(review): the recorded output of this cell shows a warning pointing at a
# division inside this step — worth checking for zero denominators.

k_best_feats = 3
train_data = get_weighted_OB_share(train_data, k_best_feats)
test_data  = get_weighted_OB_share(test_data,  k_best_feats)


# Trades on n last seconds
# The same windows are applied to both sets, largest first. All train calls run
# before the test calls. The values are handed to trades_last_n_sec unchanged;
# their unit is decided by that helper.
time_limits = (int(1e3), int(1e2), int(1e1))

for limit_time in time_limits:
    train_data = trades_last_n_sec(train_data, limit_time=limit_time)

for limit_time in time_limits:
    test_data = trades_last_n_sec(test_data, limit_time=limit_time)


print("\nFeatures Engineering Completed!")

Begining Processing and Features Engineering...

Columns renamed
Columns renamed
Trade times normalized
Trade times normalized
5 last venues frequency computed
5 last venues frequency computed
10 last venues frequency computed
10 last venues frequency computed
Order book normalized
Order book normalized
Total book size computed
Total book size computed
Bid prices normalized
Bid prices normalized
Weighted quantity price feature computed
Weighted quantity price feature computed
Features on last trades computed
Features on last trades computed
Prices normalize vs. best done!
Prices normalize vs. best done!
Ratios created!
Ratios created!


  df[norm_cols_names] = df_OB_cols / sum_kbest_prices
  df[norm_cols_names] = df_OB_cols / sum_kbest_prices


OB features normalised by sum of 3 best
OB features normalised by sum of 3 best

Features Engineering Completed!


In [5]:
# Stock + quarter categorical variable

#train_data["stock_quarter"] = train_data["stock_id"].astype(str) + "_" + train_data["trade_quarter"].astype(str)
#test_data["stock_quarter"]  = test_data["stock_id"].astype(str)  + "_" + test_data["trade_quarter"].astype(str)

In [6]:
# Optional encoding of the order-book price columns; switched off by default.
ENCODE_OB_FEATS = False

if ENCODE_OB_FEATS:
    # Pattern and aggregate names are passed as-is to encode_OB_features.
    patt = "OB_(bid|ask)1?_[0-5]$"
    agg_names = ["med"]

    train_data = encode_OB_features(train_data, patt, agg_names)
    test_data = encode_OB_features(test_data, patt, agg_names)

    # Register every column whose name contains "_encoded" as a feature.
    FEATURES_LIST += (col for col in train_data.columns if "_encoded" in col)

In [7]:
STOCKDAY_ENCODING = True

if STOCKDAY_ENCODING:
    # For every (stock_id, day_id) pair, compute the share of each venue among
    # all values of the `trade_source_id_*` columns of that pair's rows, and
    # attach these shares to every row as `venue_stockday_<venue>` columns.
    # Train and test are encoded independently, each from its own rows.
    stockday_columns = ["stock_id", "day_id"]
    source_cols = [c for c in train_data.columns if "trade_source_id_" in c]
    venue_cols = [f"venue_stockday_{k}" for k in range(NB_VENUES)]

    train_test_mapping = list()

    for df in [train_data, test_data]:
        # Fresh dict for each frame. It used to be created once before the
        # loop, so the table built for test also carried every train stock-day.
        mapping = dict()

        for k, df_gp in df.groupby(stockday_columns):
            df_gp_source = df_gp[source_cols].values
            # Denominator = total number of source cells in the group
            # (rows * number of source columns), instead of a hard-coded 10
            # columns per row.
            nb_samples = df_gp_source.size

            mapping[k] = {
                i: np.count_nonzero(df_gp_source == i) / nb_samples
                for i in range(NB_VENUES)
            }

        # Keys are (stock_id, day_id) tuples: after the transpose they form the
        # row index, which reset_index turns back into two leading columns.
        mapping_stock_day = pd.DataFrame(mapping).T
        mapping_stock_day.reset_index(inplace=True)
        mapping_stock_day.columns = stockday_columns + venue_cols

        train_test_mapping.append(mapping_stock_day)

    # merge() discards the left index, so the test index is moved to a column
    # first and restored afterwards under the name "ID".
    # NOTE(review): train_data's index is not preserved the same way — confirm
    # that it carries no information.
    test_data = test_data.reset_index()

    train_data = train_data.merge(train_test_mapping[0], on=stockday_columns)
    test_data  = test_data .merge(train_test_mapping[1], on=stockday_columns)

    test_data = test_data.set_index("ID")

    FEATURES_LIST.extend(venue_cols)
    del df; gc.collect()

    # Only announce completion when the encoding actually ran.
    print("Stockday encoding done!")

Stockday encoding done!


In [8]:
# NOTE(review): `{1-7}` is not a regex repetition (that would be `{1,7}`), so
# Python's `re` reads it as the literal text "{1-7}". As written the pattern
# presumably matches no variable name and this cell clears nothing.
# Do not simply switch to `{1,7}`: that would match every name of 1 to 7
# characters, including `pd`, `np`, `gc` and `TARGET` from the first cell.
%reset_selective -f "^.{1-7}$"

In [9]:
# Missing-value count per column, sorted ascending; display the 20 columns
# with the most missing values (worst one last).
train_data.isna().sum().sort_values().tail(20)

OB_ask_1_sum_norm                 985
OB_ask_5_sum_norm                1016
OB_bid_4_sum_norm                1039
OB_bid_5_sum_norm                1093
OB_bid_1_sum_norm                1110
OB_ts_last_update_2_sum_norm     8120
OB_ts_last_update_0_sum_norm    11582
ask_size_2                      15357
OB_ask_size1_2                  15357
OB_ask1_2                       15357
normalized_OB_ask1_2            15357
OB_ts_last_update_3_sum_norm    16947
OB_bid1_2                       18378
bid_size_2                      18378
OB_bid_size1_2                  18378
normalized_OB_bid1_2            18378
OB_ts_last_update_4_sum_norm    24444
OB_ts_last_update_5_sum_norm    25618
total_size_2                    28059
OB_ts_last_update_1_sum_norm    28981
dtype: int64

In [10]:
# Experiment: "double" the features by adding, for each row, the values of the previous row of the same stock and day as lagged features

def make_lag_feature(df: pd.DataFrame, feats_to_lag: list) -> pd.DataFrame:
    """Add, for each row, the previous row's values of selected features.

    Rows are sorted by (stock_id, day_id). For every requested feature present
    in ``df`` a ``lagged_<feature>`` column is added holding the value of the
    preceding row in that sorted order. The first row of each (stock_id, day_id)
    group gets NaN, so no value leaks from one stock/day into the next.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; must contain ``stock_id`` and ``day_id``. It is not modified.
    feats_to_lag : list
        Names of the features to lag; names missing from ``df`` are ignored.

    Returns
    -------
    pd.DataFrame
        The sorted frame with the ``lagged_*`` columns appended.
    """
    group_cols = ["stock_id", "day_id"]
    sorted_data = df.sort_values(by=group_cols)

    # Shift only the requested columns rather than the whole frame.
    cols_to_lag = [f for f in feats_to_lag if f in sorted_data.columns]
    lagged_features = sorted_data[cols_to_lag].shift(1)
    lagged_features.columns = ["lagged_" + c for c in cols_to_lag]

    if cols_to_lag:
        # True on the first row of every (stock_id, day_id) group; the very
        # first row compares against NaN and is therefore flagged too.
        group_keys = sorted_data[group_cols]
        change_stock_day = (group_keys != group_keys.shift(1)).any(axis=1)

        # Write the NaNs into the frame itself. The previous version set them
        # on the array returned by `.values`, which was then thrown away, so
        # the boundary rows kept the previous group's values.
        # A plain boolean array is used so no index alignment is involved.
        lagged_features.loc[change_stock_day.values, :] = np.nan

    return pd.concat([sorted_data, lagged_features], axis=1)

In [11]:
# train_data = make_lag_feature(train_data, best_features)
# test_data  = make_lag_feature(test_data, best_features)

In [12]:
print("Groups of features used :\n")
# Drop every digit character from each column name and keep the distinct
# remainders in sorted order.
main_features_groups = sorted({
    "".join(ch for ch in col if not ch.isdigit())
    for col in train_data.columns
})
print(main_features_groups)

Groups of features used :

['OB_ask_', 'OB_ask__sum_norm', 'OB_ask_size_', 'OB_bid_', 'OB_bid__sum_norm', 'OB_bid_size_', 'OB_ts_last_update_', 'OB_ts_last_update__sum_norm', 'ask_size_', 'bid_size_', 'day_id', 'last_trades__.', 'max_lasttrades_price', 'max_lasttrades_qty', 'mean_lasttrades_price', 'mean_lasttrades_qty', 'min_lasttrades_price', 'min_lasttrades_qty', 'nb__past_venues', 'normalized_OB_ask_', 'normalized_OB_bid_', 'num_lasttrades_ask_side', 'ratio__ask?__', 'ratio__ask_size?__', 'ratio__bid?__', 'ratio__bid_size?__', 'source_id', 'std_lasttrades_price', 'std_lasttrades_qty', 'std_lasttrades_time', 'stock_id', 'total_size_', 'trade_minute', 'trade_price_', 'trade_qty_', 'trade_quarter', 'trade_source_id_', 'trade_tod_', 'venue_stockday_', 'weighted_ask_OB_price_', 'weighted_bid_OB_price_']


In [13]:
# (rows, columns) of the train frame at this point of the notebook.
train_data.shape

(946657, 241)

In [14]:
# Index type, dtype breakdown and memory footprint of the train frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 946657 entries, 0 to 946656
Columns: 241 entries, OB_ask_0 to venue_stockday_5
dtypes: float32(118), float64(67), int16(12), int64(31), int8(13)
memory usage: 1.1 GB


In [15]:
# Write both engineered frames to CSV in the working directory, train first.
for message, frame, csv_name in (
    ("Saving train data with feature engineering...", train_data, "train_data.csv"),
    ("Saving test data with feature engineering...", test_data, "test_data.csv"),
):
    print(message)
    frame.to_csv(csv_name)

Saving train data with feature engineering...
Saving test data with feature engineering...
