In [1]:
import pandas as pd
import os
from pathlib import Path
import gc
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

import cudf
import numba
from numba import cuda
import numpy as np

# parallelize the process on all columns using joblib
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

from src.data_utils import (
    load_recent_data_from_file,
    save_daily_data,
    save_in_folders,
    get_latest_date,
    read_available_dates
)
from src.config import (
    DAILY_DATA_DIR,
    DATA_DIR,
    DAILY_PRIMARY_FEATURES_DIR,
    DAILY_SECONDARY_FEATURES_DIR,
    DAILY_SCALED_FEATURES_DIR,
)


In [2]:
FROM_SCRATCH = True

In [9]:
def apply_cut(df, cols):
    _res = Parallel(
        n_jobs=10,
    )(delayed(pd.qcut)(df[col], q=5, labels=False, duplicates="drop") for col in cols)
    _res = pd.concat(_res, axis=1).astype("int8")
    #print(_res)
    assert abs(_res["feature_2_ratio_rsi_50_close"].mean() - 2) < 0.05, "mean should be 2"
    return _res


def apply_cut_cpu(recent_data, feature_columns):
    recent_data = recent_data.dropna(subset=feature_columns, axis=0)
    recent_data_gpu = cudf.DataFrame.from_pandas(recent_data)

    ranks = (
        recent_data_gpu[["date"] + feature_columns]
        .groupby("date")
        .rank(pct=True, method="first", ascending=True, na_option="keep")
    )
    ranks["date"] = recent_data["date"]
    ranks["bloomberg_ticker"] = recent_data["bloomberg_ticker"]

    ranks_pd = ranks.to_pandas()

    del recent_data_gpu, ranks

    res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
    res["date"] = ranks_pd["date"]
    res["bloomberg_ticker"] = ranks_pd["bloomberg_ticker"]

    return res


In [10]:
dates = read_available_dates(DAILY_SECONDARY_FEATURES_DIR)

start_index = len(dates) - 1000 if not FROM_SCRATCH else 0

# iterate over all dates in chunks of 200
for i in tqdm(range(start_index, len(dates), 200)):
    print(i)
    _tmp = load_recent_data_from_file(
        DAILY_SECONDARY_FEATURES_DIR,
        n_days=200,
        ascending=True,
        offset=i,
        dtype="float32",
    )
    # print max and min date
    print(_tmp["date"].min(), _tmp["date"].max())
    _tmp = _tmp.reset_index(drop=True)
    _tmp = _tmp.sort_values(["date", "bloomberg_ticker"])
    _tmp = _tmp.groupby("date").filter(lambda x: len(x) > 10)

    # get all feature columns
    feature_columns = [f for f in _tmp.columns if f.startswith("feature_")]

    # apply cut
    res = apply_cut_cpu(_tmp, feature_columns)

    # save
    save_in_folders(res, DAILY_SCALED_FEATURES_DIR)

    #del _tmp, res
    gc.collect()
    

  0%|          | 0/31 [00:00<?, ?it/s]

0
2000-01-04 00:00:00 2000-10-09 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 323.31it/s]
  3%|▎         | 1/31 [00:15<07:38, 15.28s/it]

200
2000-10-10 00:00:00 2001-07-18 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 340.09it/s]
  6%|▋         | 2/31 [00:38<09:46, 20.23s/it]

400
2001-07-19 00:00:00 2002-04-25 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 319.69it/s]
 10%|▉         | 3/31 [01:02<10:07, 21.69s/it]

600
2002-04-26 00:00:00 2003-01-31 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 291.94it/s]
 13%|█▎        | 4/31 [01:26<10:09, 22.59s/it]

800
2003-02-03 00:00:00 2003-11-07 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 276.14it/s]
 16%|█▌        | 5/31 [01:51<10:12, 23.56s/it]

1000
2003-11-10 00:00:00 2004-08-16 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 300.72it/s]
 19%|█▉        | 6/31 [02:16<10:00, 24.03s/it]

1200
2004-08-17 00:00:00 2005-05-23 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 270.30it/s]
 23%|██▎       | 7/31 [02:42<09:52, 24.69s/it]

1400
2005-05-24 00:00:00 2006-02-27 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 244.10it/s]
 26%|██▌       | 8/31 [03:09<09:47, 25.54s/it]

1600
2006-02-28 00:00:00 2006-12-04 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 186.76it/s]
 29%|██▉       | 9/31 [03:37<09:35, 26.18s/it]

1800
2006-12-05 00:00:00 2007-09-11 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 151.71it/s]
 32%|███▏      | 10/31 [04:04<09:16, 26.50s/it]

2000
2007-09-12 00:00:00 2008-06-18 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 242.32it/s]
 35%|███▌      | 11/31 [04:31<08:49, 26.48s/it]

2200
2008-06-19 00:00:00 2009-03-26 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 247.91it/s]
 39%|███▊      | 12/31 [04:56<08:16, 26.14s/it]

2400
2009-03-27 00:00:00 2009-12-31 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 232.70it/s]
 42%|████▏     | 13/31 [05:22<07:46, 25.93s/it]

2600
2010-01-04 00:00:00 2010-10-08 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 240.75it/s]
 45%|████▌     | 14/31 [05:48<07:23, 26.11s/it]

2800
2010-10-11 00:00:00 2011-07-15 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 237.13it/s]
 48%|████▊     | 15/31 [06:15<06:59, 26.23s/it]

3000
2011-07-18 00:00:00 2012-04-20 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 234.16it/s]
 52%|█████▏    | 16/31 [06:40<06:29, 25.99s/it]

3200
2012-04-23 00:00:00 2013-01-28 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 160.73it/s]
 55%|█████▍    | 17/31 [07:07<06:06, 26.15s/it]

3400
2013-01-29 00:00:00 2013-11-04 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:00<00:00, 211.30it/s]
 58%|█████▊    | 18/31 [07:35<05:48, 26.82s/it]

3600
2013-11-05 00:00:00 2014-08-12 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 171.96it/s]
 61%|██████▏   | 19/31 [08:02<05:23, 26.97s/it]

3800
2014-08-13 00:00:00 2015-05-20 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 162.42it/s]
 65%|██████▍   | 20/31 [08:29<04:56, 26.99s/it]

4000
2015-05-21 00:00:00 2016-02-25 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 163.43it/s]
 68%|██████▊   | 21/31 [08:57<04:30, 27.07s/it]

4200
2016-02-26 00:00:00 2016-12-01 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 162.10it/s]
 71%|███████   | 22/31 [09:24<04:04, 27.20s/it]

4400
2016-12-02 00:00:00 2017-09-07 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:02<00:00, 88.08it/s] 
 74%|███████▍  | 23/31 [09:54<03:43, 27.90s/it]

4600
2017-09-08 00:00:00 2018-06-15 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 145.44it/s]
 77%|███████▋  | 24/31 [10:29<03:30, 30.10s/it]

4800
2018-06-18 00:00:00 2019-03-25 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 150.20it/s]
 81%|████████  | 25/31 [10:59<03:01, 30.21s/it]

5000
2019-03-26 00:00:00 2019-12-30 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 151.26it/s]
 84%|████████▍ | 26/31 [11:29<02:30, 30.06s/it]

5200
2019-12-31 00:00:00 2020-10-06 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 149.09it/s]
 87%|████████▋ | 27/31 [11:59<02:00, 30.20s/it]

5400
2020-10-07 00:00:00 2021-07-14 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 149.20it/s]
 90%|█████████ | 28/31 [12:30<01:31, 30.43s/it]

5600
2021-07-15 00:00:00 2022-04-20 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 149.04it/s]
 94%|█████████▎| 29/31 [13:01<01:01, 30.50s/it]

5800
2022-04-21 00:00:00 2023-01-25 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 200/200 [00:01<00:00, 152.13it/s]
 97%|█████████▋| 30/31 [13:31<00:30, 30.33s/it]

6000
2023-01-26 00:00:00 2023-07-07 00:00:00


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  res = ranks_pd.groupby("date").apply(lambda df: apply_cut(df, feature_columns))
100%|██████████| 117/117 [00:00<00:00, 145.57it/s]
100%|██████████| 31/31 [13:49<00:00, 26.77s/it]
