In [1]:
import pandas as pd 

USECOLS = [
    "v10", "v12", "v14", "v21", "v22", "v24", "v30", "v31",
    "v34", "v38", "v40", "v47", "v50", "v52", "v56", "v62",
    "v66", "v72", "v75", "v79", "v91", "v112", "v113", "v114", "v129",
]


def preload():
    # Download CSV files first.
    # `$ kaggle competitions download -c bnp-paribas-cardif-claims-management`
    pd.concat([
        pd.read_csv("train.csv.zip"),
        pd.read_csv("test.csv.zip"),
    ], sort=False).reset_index(drop=True)[USECOLS + ["target"]].to_feather("train_test.ftr")


preload()

In [5]:
pd.read_feather("train_test.ftr").head()

Unnamed: 0,v10,v12,v14,v21,v22,v24,v30,v31,v34,v38,...,v66,v72,v75,v79,v91,v112,v113,v114,v129,target
0,0.503281,6.085711,11.636387,7.730923,XDX,C,C,A,7.270147,0,...,C,1,D,E,A,O,,15.634907,0,1.0
1,1.31291,6.507647,11.636386,6.76311,GUV,C,C,A,3.615077,0,...,A,2,D,D,B,U,G,10.308044,0,1.0
2,0.765864,6.38467,9.603542,5.245035,FQ,E,,A,4.043864,0,...,A,3,B,E,G,S,,11.205561,2,1.0
3,6.542669,9.646653,14.094723,7.517125,ACUE,D,C,B,8.70355,0,...,A,2,D,B,B,J,,13.777666,1,1.0
4,1.050328,6.320087,10.991098,6.414567,HIT,E,,A,6.083151,0,...,C,1,D,C,G,T,G,14.097099,0,1.0


In [3]:
from xfeat import SelectNumerical


print("(1) Save numerical features")
SelectNumerical().fit_transform(pd.read_feather("train_test.ftr")).reset_index(
    drop=True
).to_feather("feature_num_features.ftr")

  from .autonotebook import tqdm as notebook_tqdm


(1) Save numerical features


In [4]:
pd.read_feather("feature_num_features.ftr").head()

Unnamed: 0,v10,v12,v14,v21,v34,v38,v40,v50,v62,v72,v114,v129,target
0,0.503281,6.085711,11.636387,7.730923,7.270147,0,7.711453,0.89942,1,1,15.634907,0,1.0
1,1.31291,6.507647,11.636386,6.76311,3.615077,0,14.305766,1.37921,2,2,10.308044,0,1.0
2,0.765864,6.38467,9.603542,5.245035,4.043864,0,13.077201,0.604504,1,3,11.205561,2,1.0
3,6.542669,9.646653,14.094723,7.517125,8.70355,0,11.523045,3.329176,1,2,13.777666,1,1.0
4,1.050328,6.320087,10.991098,6.414567,6.083151,0,10.13892,1.364536,1,1,14.097099,0,1.0


In [6]:
from xfeat import SelectCategorical, LabelEncoder, Pipeline


print("(2) Categorical encoding using label encoding: 13 features")
Pipeline([
    SelectCategorical(), 
    LabelEncoder(output_suffix="")]).fit_transform(pd.read_feather("train_test.ftr")
    ).reset_index(drop=True).to_feather("feature_1way_label_encoding.ftr")

(2) Categorical encoding using label encoding: 13 features


In [9]:
pd.read_feather("feature_1way_label_encoding.ftr").head()

Unnamed: 0,v22,v24,v30,v31,v47,v52,v56,v66,v75,v79,v91,v112,v113
0,0,0,0,0,0,0,0,0,0,0,0,0,-1
1,1,0,0,0,1,0,1,1,0,1,1,1,0
2,2,1,-1,0,0,1,2,1,1,0,2,2,-1
3,3,2,0,1,0,2,3,1,0,2,1,3,-1
4,4,1,-1,0,2,2,-1,0,0,3,2,4,0


In [10]:
from xfeat import SelectCategorical, ConcatCombination


print("(3) 2-order combination of categorical features: 78 features (13 * 12 / 2 = 78)")
Pipeline(
    [
        SelectCategorical(),
        ConcatCombination(drop_origin=True, r=2),
        LabelEncoder(output_suffix=""),
    ]
).fit_transform(pd.read_feather("train_test.ftr")
).reset_index(drop=True).to_feather("feature_2way_label_encoding.ftr")

(3) 2-order combination of categorical features: 78 features (13 * 12 / 2 = 78)


In [14]:
pd.read_feather("feature_2way_label_encoding.ftr").head()

Unnamed: 0,v22v24_combi,v22v30_combi,v22v31_combi,v22v47_combi,v22v52_combi,v22v56_combi,v22v66_combi,v22v75_combi,v22v79_combi,v22v91_combi,...,v75v79_combi,v75v91_combi,v75v112_combi,v75v113_combi,v79v91_combi,v79v112_combi,v79v113_combi,v91v112_combi,v91v113_combi,v112v113_combi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,0,2,2,2
3,3,3,3,3,3,3,3,3,3,3,...,3,1,3,0,3,3,2,3,3,3
4,4,4,4,4,4,4,4,4,4,4,...,4,3,4,1,4,4,3,4,4,4


In [15]:
print("(4) 3-order combination of categorical features")
# Use `include_cols=` kwargs to reduce the total count of combinations.
# 66 features (12 * 11 / 2 = 66)
Pipeline(
    [
        SelectCategorical(),
        ConcatCombination(drop_origin=True, include_cols=["v22"], r=3),
        LabelEncoder(output_suffix=""),
    ]
).fit_transform(pd.read_feather("train_test.ftr")).reset_index(
    drop=True
).to_feather("feature_3way_including_v22_label_encoding.ftr")

(4) 3-order combination of categorical features


In [21]:
pd.read_feather("feature_3way_including_v22_label_encoding.ftr").head()

Unnamed: 0,v22v24v30_combi,v22v24v31_combi,v22v24v47_combi,v22v24v52_combi,v22v24v56_combi,v22v24v66_combi,v22v24v75_combi,v22v24v79_combi,v22v24v91_combi,v22v24v112_combi,...,v22v75v79_combi,v22v75v91_combi,v22v75v112_combi,v22v75v113_combi,v22v79v91_combi,v22v79v112_combi,v22v79v113_combi,v22v91v112_combi,v22v91v113_combi,v22v112v113_combi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
3,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4


In [22]:
pd.read_feather("feature_num_features.ftr").head()

Unnamed: 0,v10,v12,v14,v21,v34,v38,v40,v50,v62,v72,v114,v129,target
0,0.503281,6.085711,11.636387,7.730923,7.270147,0,7.711453,0.89942,1,1,15.634907,0,1.0
1,1.31291,6.507647,11.636386,6.76311,3.615077,0,14.305766,1.37921,2,2,10.308044,0,1.0
2,0.765864,6.38467,9.603542,5.245035,4.043864,0,13.077201,0.604504,1,3,11.205561,2,1.0
3,6.542669,9.646653,14.094723,7.517125,8.70355,0,11.523045,3.329176,1,2,13.777666,1,1.0
4,1.050328,6.320087,10.991098,6.414567,6.083151,0,10.13892,1.364536,1,1,14.097099,0,1.0


In [44]:
from xfeat import SelectNumerical, LambdaEncoder


print("(5) Convert numerical to categorical using round: 12 features")
df_rnum = (
    Pipeline(
        [
            SelectNumerical(),
            LambdaEncoder(
                lambda x: str(x)[:-2],
                output_suffix="_rnum",
                exclude_cols=["target"],
            ),
        ]
    )
    .fit_transform(pd.read_feather("train_test.ftr"))
    .reset_index(drop=True)
)
df_rnum.to_feather("feature_round_num.ftr")

(5) Convert numerical to categorical using round: 12 features


In [41]:
df_rnum.head()

Unnamed: 0,v10,v12,v14,v21,v34,v38,v40,v50,v62,v72,...,v14_rnum,v21_rnum,v34_rnum,v38_rnum,v40_rnum,v50_rnum,v62_rnum,v72_rnum,v114_rnum,v129_rnum
0,0.503281,6.085711,11.636387,7.730923,7.270147,0,7.711453,0.89942,1,1,...,11.6363868,7.7309233,7.2701466,,7.71145325,0.899420004,,,15.6349073,
1,1.31291,6.507647,11.636386,6.76311,3.615077,0,14.305766,1.37921,2,2,...,11.6363858,6.7631095,3.61507749,,14.305766,1.37921006,,,10.3080435,
2,0.765864,6.38467,9.603542,5.245035,4.043864,0,13.077201,0.604504,1,3,...,9.60354169,5.245035,4.04386445,,13.077201,0.604504083,,,11.2055613,
3,6.542669,9.646653,14.094723,7.517125,8.70355,0,11.523045,3.329176,1,2,...,14.0947229,7.51712473,8.7035497,,11.5230447,3.32917647,,,13.777666,
4,1.050328,6.320087,10.991098,6.414567,6.083151,0,10.13892,1.364536,1,1,...,10.9910978,6.414566,6.08315059,,10.1389198,1.36453586,,,14.0970986,


In [45]:
rnum_cols = [col for col in df_rnum.columns if col.endswith("_rnum")]
rnum_cols

['v10_rnum',
 'v12_rnum',
 'v14_rnum',
 'v21_rnum',
 'v34_rnum',
 'v38_rnum',
 'v40_rnum',
 'v50_rnum',
 'v62_rnum',
 'v72_rnum',
 'v114_rnum',
 'v129_rnum']

In [46]:
Pipeline([
    LabelEncoder(output_suffix="")]).fit_transform(
    pd.read_feather("feature_round_num.ftr")[rnum_cols]
).reset_index(drop=True).to_feather("feature_round_num_label_encoding.ftr")

In [47]:
pd.read_feather("feature_round_num_label_encoding.ftr").head()

Unnamed: 0,v10_rnum,v12_rnum,v14_rnum,v21_rnum,v34_rnum,v38_rnum,v40_rnum,v50_rnum,v62_rnum,v72_rnum,v114_rnum,v129_rnum
0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,1,1,0,0,1,0
2,2,2,2,2,2,0,2,2,0,0,2,0
3,3,3,3,3,3,0,3,3,0,0,3,0
4,4,4,4,4,4,0,4,4,0,0,4,0


In [48]:
from xfeat import ArithmeticCombinations


print("(6) 2-order Arithmetic combinations.")
Pipeline(
    [
        SelectNumerical(),
        ArithmeticCombinations(
            exclude_cols=["target"], drop_origin=True, operator="+", r=2,
        ),
    ]
).fit_transform(pd.read_feather("train_test.ftr")).reset_index(
    drop=True
).to_feather(
    "feature_arithmetic_combi2.ftr"
)

(6) 2-order Arithmetic combinations.


In [49]:
pd.read_feather("feature_arithmetic_combi2.ftr").head()

Unnamed: 0,v10v12_combi,v10v14_combi,v10v21_combi,v10v34_combi,v10v38_combi,v10v40_combi,v10v50_combi,v10v62_combi,v10v72_combi,v10v114_combi,...,v50v62_combi,v50v72_combi,v50v114_combi,v50v129_combi,v62v72_combi,v62v114_combi,v62v129_combi,v72v114_combi,v72v129_combi,v114v129_combi
0,6.588992,12.139668,8.234205,7.773428,0.503281,8.214735,1.402701,1.503281,1.503281,16.138189,...,1.89942,1.89942,16.534327,0.89942,2,16.634907,1,16.634907,1,15.634907
1,7.820557,12.949296,8.076019,4.927987,1.31291,15.618676,2.69212,3.31291,3.31291,11.620953,...,3.37921,3.37921,11.687254,1.37921,4,12.308044,2,12.308044,2,10.308044
2,7.150534,10.369406,6.010899,4.809728,0.765864,13.843065,1.370368,1.765864,3.765864,11.971425,...,1.604504,3.604504,11.810065,2.604504,4,12.205561,3,14.205561,5,13.205561
3,16.189322,20.637392,14.059794,15.246219,6.542669,18.065714,9.871846,7.542669,8.542669,20.320336,...,4.329176,5.329176,17.106843,4.329176,3,14.777666,2,15.777666,3,14.777666
4,7.370416,12.041426,7.464895,7.133479,1.050328,11.189248,2.414864,2.050328,2.050328,15.147427,...,2.364536,2.364536,15.461635,1.364536,2,15.097099,1,15.097099,1,14.097099
