In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Directory that receives the filtered train/test parquet files written at the
# end of this notebook.
OUT_PATH = Path("../data/processed/dsv08")

# `exist_ok=True` already makes this a no-op when the directory is present, so
# no separate `exists()` pre-check is needed (that check was also racy). If the
# path exists but is not a directory, this now fails here instead of later at
# write time.
OUT_PATH.mkdir(parents=True, exist_ok=True)

***

In [3]:
# Load the "catb-pfi" importance table (one column per feature).
df_pfi = pd.read_csv("../data/feat-selection/catb-pfi.csv")

# Preview: column-wise mean over all rows, highest first, top 2000.
# The first column is skipped.
# NOTE(review): presumably it is a non-feature column (e.g. a saved index) - confirm.
(
    df_pfi
    .iloc[:, 1:]
    .mean(axis=0)
    .sort_values(ascending=False)
    .head(2000)
)

B_11-P_2_last          0.001962
B_3_last               0.001343
D_44_last              0.001306
B_9_last               0.001243
S_16-P_2_last          0.001207
                         ...   
R_10_above_mean        0.000050
P_3_diff_last-first    0.000050
S_9_min                0.000050
B_7_diff_lag2          0.000050
D_39-P_2_pchg_std      0.000050
Length: 2000, dtype: float64

In [4]:
# Load the "catb-lfc" importance table (one column per feature).
df_lfc = pd.read_csv("../data/feat-selection/catb-lfc.csv")

# Preview: column-wise mean over all rows, highest first, top 2000.
(
    df_lfc
    .mean(axis=0)
    .sort_values(ascending=False)
    .head(2000)
)

B_11-P_2_last      1.111200e-03
S_16-P_2_last      4.456111e-04
B_3_last           3.106185e-04
D_44_last          2.779437e-04
P_2_last           2.672461e-04
                       ...     
R_28_pchg_lag1     4.460415e-07
B_12_sum_cxty1     4.458861e-07
R_19_mean_cxty2    4.455652e-07
B_26_mean_cxty2    4.455383e-07
B_23_pchg_std      4.451403e-07
Length: 2000, dtype: float64

In [5]:
# Load the "catb-shap" importance table (one column per feature).
df_shap = pd.read_csv("../data/feat-selection/catb-shap.csv")

# Preview: column-wise mean over all rows, highest first, top 2000.
(
    df_shap
    .mean(axis=0)
    .sort_values(ascending=False)
    .head(2000)
)

B_11-P_2_last          0.127874
S_16-P_2_last          0.084109
P_2_last               0.064072
S_23-P_2_last          0.059025
B_5_last               0.056832
                         ...   
B_23_pchg_mean         0.000659
D_39-P_3_below_mean    0.000657
D_71_diff_std          0.000657
D_115_min              0.000657
D_60_pchg_min          0.000656
Length: 2000, dtype: float64

In [6]:
# Number of top-ranked features kept from each importance table.
N_TOP_FEATURES = 2000


def _top_features(importances, n_top=N_TOP_FEATURES):
    """Return the labels of the `n_top` columns with the highest mean value.

    Parameters
    ----------
    importances : pd.DataFrame
        Table with one column per feature; each column is averaged over rows.
    n_top : int, optional
        Maximum number of column labels to return (default `N_TOP_FEATURES`).

    Returns
    -------
    list
        Column labels ordered from highest to lowest mean.
    """
    ranked = importances.mean(axis=0).sort_values(ascending=False)
    return ranked.head(n_top).index.tolist()


# The first column of the PFI table is skipped, as in the preview cell above.
# NOTE(review): presumably a non-feature column (e.g. a saved index) - confirm.
selected_pfi = _top_features(df_pfi.iloc[:, 1:])
selected_lfc = _top_features(df_lfc)
selected_shap = _top_features(df_shap)

In [7]:
def _ordered_intersection(first, *others):
    """Return the items of `first` that also appear in every list in `others`.

    Unlike `list(set(a) & set(b))`, the result keeps the order of `first`
    (each item once), so it is identical on every run. Plain set iteration
    order for strings varies between Python processes because of hash
    randomization, which would otherwise make the downstream column order
    non-reproducible.
    """
    other_sets = [set(other) for other in others]
    # dict.fromkeys drops repeated items while keeping first-seen order.
    return [
        item
        for item in dict.fromkeys(first)
        if all(item in other for other in other_sets)
    ]


# Features present in all three top lists.
selected_set01 = _ordered_intersection(selected_pfi, selected_lfc, selected_shap)
# Pairwise overlaps between the top lists.
selected_set02 = _ordered_intersection(selected_pfi, selected_lfc)
selected_set03 = _ordered_intersection(selected_lfc, selected_shap)
selected_set04 = _ordered_intersection(selected_pfi, selected_shap)

## Filtering out features flagged by adversarial validation

In [8]:
# First removal set: every listed aggregate of the base variables B_29, S_9,
# R_1, D_59, S_11 and D_121. The pasted second copy of the S_9_* names has been
# removed from the literal; the entries are otherwise unchanged.
to_remove_set01 = [
    # B_29
    'B_29_diff_last-first', 'B_29_mean',
    # S_9
    'S_9_diff_min', 'S_9_diff_last-mean', 'S_9_mean_nan', 'S_9_min',
    'S_9_mean_cxty2', 'S_9_diff_last-max', 'S_9_median', 'S_9_max',
    'S_9_last', 'S_9_diff_last-median', 'S_9_diff_last-first', 'S_9_mean',
    # R_1
    'R_1_mean_cxty1', 'R_1_mean', 'R_1_diff_last-min', 'R_1_std', 'R_1_median',
    'R_1_diff_min', 'R_1_diff_last-median', 'R_1_last', 'R_1_min',
    'R_1_diff_last-mean', 'R_1_max', 'R_1_diff_lag3', 'R_1_sum_cxty1',
    # D_59
    'D_59_pchg_lag3', 'D_59_slope', 'D_59_pchg_lag1', 'D_59_diff_last-mean',
    'D_59_std', 'D_59_pchg_max', 'D_59_median', 'D_59_last', 'D_59_diff_mean',
    'D_59_sum_cxty1', 'D_59_max', 'D_59_pchg_mean', 'D_59_mean',
    'D_59_diff_last-min',
    # S_11
    'S_11_mean_cxty1', 'S_11_mean', 'S_11_diff_last-median', 'S_11_median',
    'S_11_min', 'S_11_pchg_min', 'S_11_last', 'S_11_max',
    # D_121
    'D_121_mean_cxty2', 'D_121_pchg_max', 'D_121_pchg_std', 'D_121_max',
    'D_121_median', 'D_121_min', 'D_121_sum_cxty2', 'D_121_last', 'D_121_mean',
    'D_121_pchg_mean',
]

# Guard against accidental repeats while keeping the written order.
# dict.fromkeys preserves first-seen order, whereas list(set(...)) returns the
# names in an order that changes from one Python process to the next.
to_remove_set01 = list(dict.fromkeys(to_remove_set01))
len(to_remove_set01)

59

In [9]:
# Hard-coded list of feature names used by the adversarial-validation filter.
# The order is significant: the following cell takes the first SELECT_TOP
# entries of this list as features to drop.
# NOTE(review): presumably ordered from most to least important to the
# adversarial classifier - confirm against the notebook that produced it.
top_av_selected = [
    'S_15_last', 'D_120_2_mean', 'R_27_mean', 'D_126_4_mean', 'D_126_4_lo', 'S_15_max', 
    'D_120_2_lo', 'D_120_1_lo', 'D_123_idxmax', 'S_24_last', 'D_123_slope', 'S_24_max', 
    'D_117_7_lo', 'D_91_mean', 'S_13_max', 'S_15_above_mean_cc', 'D_117_7_mean', 'S_13_last', 
    'S_15_mean', 'S_24_pchg_max', 'S_15_median', 'S_15_diff_last-median', 'D_117_4_lo', 
    'S_24_pchg_min', 'S_27_median', 'S_24_mean', 'S_15_diff_last-mean', 'D_118_pchg_mean', 
    'D_117_4_mean', 'S_24_pchg_lag1', 'S_24_diff_last-min', 'D_45_pchg_mean', 'S_24_pchg_std', 
    'D_56_diff_idxmax', 'R_27_median', 'S_24_diff_min', 'S_24_diff_lag1', 'D_118_mean_cxty2', 
    'D_119_pchg_mean', 'D_64_4_lo', 'S_24_diff_last-median', 'S_24_diff_last-mean', 'D_64_4_mean', 
    'R_27_min', 'S_27_mean', 'S_27_max', 'D_117_8_mean', 'D_122_mean', 'D_140_mean', 'R_26_min', 
    'D_117_8_lo', 'R_26_mean', 'D_119_mean_cxty2', 'D_114_1_mean', 'D_55_pchg_mean', 'D_114_2_mean', 
    'D_119_pchg_max', 'R_27_max', 'D_114_2_lo', 'D_119_pchg_std', 'D_119_sum_cxty2', 'D_55_min',
    'D_64_1_mean', 'D_145_diff_last-mean', 'D_124_pchg_max', 'R_27_last', 'D_118_min', 'D_119_min', 
    'P_4_median', 'D_68_1_lo', 'R_27_pchg_min', 'D_55_diff_lag3', 'P_4_mean', 'R_27_pchg_std', 
    'D_129_mean', 'R_27_sum_cxty2', 'D_129_slope', 'D_124_max', 'D_45_max', 'D_42_pchg_mean', 
    'R_27_diff_min', 'B_17_diff_lag2', 'D_45_median', 'B_17_idxmin', 'D_107_std', 'D_45_mean', 
    'R_26_max', 'B_17_diff_lag1', 'B_8_sum_cxty2', 'D_123_diff_mean', 'B_8_mean_cxty2', 'D_45_min', 
    'R_26_last', 'D_55_mean', 'D_124_mean', 'D_45_last', 'D_45_pchg_max', 'D_107_slope', 'D_118_median', 
    'D_91_last', 'D_45_pchg_std', 'D_119_median', 'D_55_pchg_min', 'D_122_min', 'D_61_max', 
    'B_8_pchg_std', 'D_45_sum_cxty2', 'S_22_pchg_min', 'D_55_diff_min', 'D_119_mean', 'D_45_mean_cxty2', 
    'D_55_median', 'D_118_mean', 'B_8_mean', 'S_22_pchg_mean', 'D_55_max', 'S_22_pchg_std', 'D_64_1_lo', 
    'D_56_diff_last-mean', 'D_42_pchg_min', 'S_22_sum_cxty2', 'D_55_last', 'D_55_diff_last-mean', 
    'B_8_diff_min', 'S_15_min', 'B_8_last', 'D_55_diff_last-first', 'B_8_std', 'S_22_pchg_lag1', 
    'D_55_diff_mean', 'D_55_diff_last-min', 'D_55_diff_last-median', 'D_118_last', 'S_13_diff_last-first', 
    'R_26_median', 'D_55_slope', 'D_119_last', 'D_144_idxmin', 'B_8_median', 'D_119_max', 'D_115_max', 
    'D_118_max', 'D_42_sum_cxty2', 'D_42_sum_nan', 'S_13_pchg_std', 'S_13_pchg_mean', 'D_122_max', 'B_8_min', 
    'D_42_min', 'D_114_1_lo', 'S_13_sum_cxty2', 'S_13_sum_cxty1', 'S_13_mean_cxty2', 'B_8_max', 
    'D_42_mean_nan', 'S_13_diff_mean', 'B_8_mean_cxty1', 'D_142_max', 'B_8_diff_std', 'D_42_mean', 
    'B_8_sum_cxty1', 'S_13_diff_last-median', 'S_13_mean_cxty1', 'S_8_max', 'D_142_median', 'S_13_diff_std', 
    'D_42_last', 'S_16-P_3_diff_idxmax', 'S_27_mean_cxty1', 'S_8_mean', 'S_27_sum_cxty1', 'D_42_pchg_max', 
    'S_27_diff_std', 'D_61_mean', 'B_17_last', 'D_42_median', 'D_42_diff_last-max', 'B_17_diff_last-max', 
    'D_82_diff_last-mean', 'B_4_idxmax', 'B_17_diff_last-median', 'D_42_max', 'D_56_mean_cxty2', 'D_117_5_lo', 
    'B_38_2_mean', 'D_124_min', 'D_117_5_mean', 'B_17-P_2_diff_lag1', 'S_8_median', 'D_66_1_mean', 'S_22_mean', 
    'D_42_diff_last-first', 'D_39_last', 'D_42_diff_last-median', 'D_142_pchg_max', 'D_42_diff_mean', 
    'D_42_diff_last-mean', 'D_79_mean_cxty2', 'D_61_median', 'D_82_mean', 'D_39-P_3_max', 'D_79_idxmin', 
    'D_61_last', 'D_56_pchg_std', 'B_6_max', 'D_79_pchg_std', 'D_39_max', 'B_17-P_2_pchg_lag1', 'D_124_median', 
    'D_47_pchg_mean', 'D_70_min', 'D_47_min', 'R_9_min', 'D_47_median', 'S_13_diff_last-mean', 'D_39-P_2_max', 
    'D_122_median', 'D_72_mean', 'D_42_pchg_lag3', 'D_39_idxmin', 'S_7_last', 'D_39_diff_last-first', 
    'D_61_std', 'S_8_diff_max', 'D_122_last', 'D_39_diff_mean', 'D_56_diff_last-max', 'D_141_mean', 
    'D_55_idxmin', 'D_124_last', 'D_39-P_2_diff_mean', 'B_40_max', 'D_56_sum_cxty2', 'D_123_max', 
    'S_22_diff_last-mean', 'D_123_diff_last-first', 'D_39-P_2_pchg_lag1', 'D_63_2_mean', 'B_6_diff_last-mean', 
    'D_39-P_2_pchg_lag3', 'S_12_diff_lag3', 'B_40_mean', 'B_6_mean', 'D_77_mean_nan', 'B_9_max', 
    'D_82_last', 'S_12_pchg_lag3', 'D_61_diff_min', 'D_91_min', 'D_62_max', 'D_61_mean_cxty1', 
    'D_82_max', 'S_3_pchg_std', 'D_39_diff_lag1', 'D_47_mean', 'S_7_median', 'D_47_max', 'D_39_diff_last-min', 
    'D_47_last', 'D_62_mean', 'D_47_pchg_max', 'D_62_median', 'D_77_max', 'D_39-P_2_diff_lag1', 'S_7_mean', 
    'S_3_pchg_min', 'D_62_min', 'D_62_last', 'D_39-P_2_last', 'D_66_1_lo', 'D_77_mean', 'S_7_min', 
    'D_39-P_2_min', 'D_39-P_2_diff_last-min', 'D_39-P_3_last', 'D_39-P_3_diff_last-min', 'D_77_median', 
    'D_105_diff_lag2', 'B_6_pchg_min', 'S_7_max', 'D_102_diff_last-min', 'D_39_std', 'D_51_mean', 
    'D_39-P_3_median', 'D_48_idxmin', 'D_39-P_2_median', 'B_38_1_lo', 'D_51_min', 'D_39-P_3_std', 
    'S_22_diff_last-median', 'S_3_pchg_max', 'B_38_1_mean', 'D_39-P_2_std', 'R_9_mean', 'D_39-P_2_sum_cxty1', 
    'D_39_mean_cxty1', 'B_16_slope', 'S_3_max', 'B_4_above_mean_cc', 'B_17_min', 'D_56_min', 'D_39-P_3_diff_max', 
    'S_3_diff_last-max', 'B_9_diff_std', 'S_3_mean', 'S_3_diff_last-mean', 'B_9_std', 'S_3_median', 
    'D_39-P_2_diff_max', 'D_66_2_mean', 'D_39_diff_max', 'S_3_min', 'D_102_diff_mean', 'S_7_diff_last-max', 
    'B_9_mean', 'B_9_diff_last-max', 'S_7_diff_last-mean', 'S_7_diff_last-median', 'S_3_diff_std', 
    'S_3_last', 'S_3_diff_last-min', 'S_3_std', 'S_3_mean_cxty2', 'D_44_min', 'B_9_median', 'S_3_pchg_lag1', 
    'D_43_mean_cxty1', 'B_40_diff_mean', 'B_12_mean', 'D_46_min', 'S_3_diff_max', 'B_9_diff_mean', 
    'D_43_diff_std', 'S_8_mean_cxty2', 'D_44_pchg_min', 'D_43_sum_cxty1', 'S_7_diff_lag1', 'B_9_diff_min', 
    'S_8_slope', 'S_7_mean_cxty1', 'S_3_sum_cxty2', 'S_3_diff_lag1', 'B_1_slope', 'D_44_idxmin', 
    'B_17-P_2_diff_last-mean', 'D_39-P_3_mean', 'D_39_slope', 'D_78_pchg_mean', 'S_3_idxmax', 'D_61_diff_mean', 
    'B_16_diff_mean', 'B_9_last', 'B_6_pchg_mean', 'B_25_idxmax', 'P_3_max', 'D_39_mean', 'B_16_diff_last-mean', 
    'B_33_diff_mean', 'B_9_diff_last-mean', 'D_70_mean', 'R_3_pchg_mean', 'B_9_min', 'B_17-P_2_last', 
    'D_39-P_2_slope', 'B_3_slope', 'D_102_diff_last-first', 'B_17-P_3_sum_nan', 'D_128_mean_cxty1', 
    'D_79_pchg_max', 'S_5_diff_lag2', 'B_17-P_2_diff_idxmax', 'D_142_pchg_std', 'B_4_diff_last-first', 
    'B_17-P_3_mean_nan', 'D_44_diff_mean', 'B_4_diff_mean', 'D_142_sum_cxty2', 'D_44_pchg_mean', 'S_8_last', 
    'D_43_std', 'S_8_min', 'D_43_diff_last-max', 'B_17_median', 'B_4_pchg_min', 'S_25_max', 'D_43_max', 
    'S_25_last', 'B_28_idxmax', 'B_13_max', 'D_56_mean', 'B_17_pchg_min', 'B_9_diff_last-first', 'D_56_median', 
    'B_4_diff_min', 'S_27_diff_lag2', 'B_37_max', 'B_17-P_3_last', 'B_6_median', 'D_132_mean_nan', 
    'D_39-P_3_slope', 'B_38_7_lo', 'D_56_last', 'D_39-P_3_idxmax', 'B_17_diff_min', 'D_39_diff_last-mean', 
    'D_56_max', 'D_39-P_2_mean', 'B_38_7_mean', 'B_38_3_mean', 'B_17_mean', 'B_4_pchg_mean', 'B_2_mean_cxty2', 
    'D_43_diff_lag2', 'B_6_pchg_std', 'D_51_median', 'B_1_max', 'B_6_mean_cxty2', 'D_102_std', 'B_13_mean', 
    'B_17_mean_cxty1', 'B_6_sum_cxty2', 'D_128_mean', 'B_6_pchg_max', 'S_26_last', 'D_78_pchg_std', 
    'S_26_diff_last-min', 'D_75_diff_mean', 'B_11-P_3_diff_min', 'D_61_pchg_lag1', 'D_39-P_2_diff_last-mean', 
    'B_3_min', 'D_39-P_3_diff_last-mean', 'D_50_pchg_max', 'B_9_diff_last-min', 'D_46_idxmax', 
    'B_10_max', 'D_52_mean', 'D_39-P_3_diff_last-median', 'D_39-P_2_diff_last-median', 'R_3_diff_last-mean', 
    'D_39_diff_last-median', 'B_9_diff_last-median', 'D_52_max', 'B_12_sum_cxty1', 'B_20_median', 
    'B_2_max', 'S_23-P_2_idxmax', 'D_39-P_2_idxmax', 'D_52_min', 'D_80_slope', 'D_80_diff_last-mean', 
    'B_2_mean', 'B_10_mean', 'D_52_last', 'D_48_diff_lag2', 'D_52_median', 'D_74_diff_std', 
    'S_16-P_2_pchg_lag3', 'D_48_pchg_lag2', 'B_17_slope', 'P_3_min', 'B_40_pchg_mean', 'B_37_mean', 
    'B_17_diff_max', 'D_52_pchg_min', 'S_23-P_2_pchg_lag3', 'B_1_mean', 'D_48_idxmax', 'D_46_pchg_lag2', 
    'D_46_pchg_std', 'D_46_mean_cxty2', 'B_9_diff_lag3', 'D_74_mean_cxty1', 'B_2_median', 'B_25_pchg_min', 
    'B_2_pchg_min', 'D_46_pchg_min', 'D_74_diff_mean', 'D_105_max', 'B_9_idxmax', 'D_43_diff_idxmin', 
    'B_2_diff_max', 'B_14-P_2_idxmax', 'B_5_max', 'R_3_median', 'B_2_mean_cxty1', 'B_13_median', 
    'S_16-P_3_diff_min', 'D_78_std', 'B_9_pchg_std', 'B_28_diff_last-max', 'D_43_diff_idxmax', 'B_4_slope', 
    'B_2_diff_last-mean', 'R_3_mean', 'B_16_diff_std', 'D_43_nan_lpo', 'D_46_idxmin', 'D_44_diff_last-min', 
    'B_5_mean',]
# Display the number of entries (the recorded output shows 500).
len(top_av_selected)

500

In [10]:
# How many leading entries of `top_av_selected` are dropped from the feature set.
SELECT_TOP = 100

# Second removal set: the head of the adversarial-validation list.
to_remove_set02 = top_av_selected[0:SELECT_TOP]
len(to_remove_set02)

100

In [11]:
# Full removal list: the first removal set followed by the second
# (plain concatenation, no de-duplication).
to_remove = [*to_remove_set01, *to_remove_set02]
len(to_remove)

159

In [12]:
# Build the lookup once so each membership test is O(1) instead of a scan over
# the whole removal list for every candidate feature.
_removal_lookup = frozenset(to_remove)

# Keep the features from `selected_set01` that are not marked for removal,
# preserving the order of `selected_set01`.
selected_features = [
    feat for feat in selected_set01 if feat not in _removal_lookup
]
len(selected_features)

1149

***
## Filtering the datasets down to the selected features

In [13]:
# List the contents of the dsv05 directory that the next cell reads from.
!ls ../data/processed/dsv05

test.parquet  train.parquet


In [14]:
def _load_selected(parquet_file):
    """Read only the `selected_features` columns from the given parquet file."""
    return pd.read_parquet(parquet_file, columns=selected_features)


# Both splits are loaded with the same column subset.
train = _load_selected("../data/processed/dsv05/train.parquet")
test = _load_selected("../data/processed/dsv05/test.parquet")

In [15]:
# Persist the column-filtered splits under OUT_PATH, keeping the same file
# names as the input directory.
train_out = OUT_PATH.joinpath("train.parquet")
test_out = OUT_PATH.joinpath("test.parquet")

train.to_parquet(train_out)
test.to_parquet(test_out)

***