In [8]:
import random
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.pipeline import Pipeline

import raimitigations.dataprocessing as dp
from raimitigations.cohort.cohort_definition import CohortDefinition
from raimitigations.cohort.base_cohort import CohortManager

SEED = 42

In [9]:
def create_df():
    np.random.seed(SEED)
    random.seed(SEED)
    def add_nan(vec, pct):
        vec = list(vec)
        nan_index = random.sample(range(len(vec)), int(pct * len(vec)))
        for index in nan_index:
            vec[index] = np.nan
        return vec

    df = dp.create_dummy_dataset(
        samples=500,
        n_features=2,
        n_num_num=0,
        n_cat_num=2,
        n_cat_cat=0,
        num_num_noise=[0.01, 0.05],
        pct_change=[0.05, 0.1],
    )
    col_with_nan = ["num_0", "num_1", "CN_0_num_0"]
    for col in col_with_nan:
        if col != "label":
            df[col] = add_nan(df[col], 0.1)

    X = df.drop(columns=["label"])
    y = df[["label"]]

    return X, y

# -----------------------------------
def get_model():
    model = xgb.XGBClassifier(
            objective="binary:logistic",
            learning_rate=0.1,
            n_estimators=30,
            max_depth=10,
            colsample_bytree=0.7,
            alpha=0.0,
            reg_lambda=10.0,
            nthreads=4,
            verbosity=0,
            use_label_encoder=False,
        )
    return model

In [10]:
X, y = create_df()
X

Unnamed: 0,num_0,num_1,CN_0_num_0,CN_1_num_1
0,2.032241,2.022308,val0_1,val1_3
1,1.326021,1.502541,val0_1,val1_2
2,2.529883,2.382760,val0_0,val1_3
3,0.515886,1.041807,val0_0,val1_2
4,1.039100,1.382369,val0_0,val1_2
...,...,...,...,...
495,2.758050,2.574233,val0_1,val1_3
496,1.142336,-1.332735,val0_0,val1_1
497,1.210778,1.608618,,val1_2
498,,2.230597,val0_1,val1_3


In [11]:
cohort_pipeline = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
]

c1 = [ ['CN_0_num_0', '==', 'val0_1'], 'and', ['num_0', '>', 0.0] ]
c2 = [ ['CN_0_num_0', '==', 'val0_0'], 'and', ['num_0', '>', 0.0] ]
c3 = None
#c3 = [ ['CN_1_num_1', '==', 'val1_1'] ]

cohort_set = CohortManager(
    transform_pipe=cohort_pipeline,
    cohort_def=[c1, c2, c3]
)
cohort_set.fit(X=X, y=y)
new_X = cohort_set.transform(X)
new_X

Unnamed: 0,num_0,num_1,CN_0_num_0,CN_1_num_1
0,0.194340,0.780045,val0_1,val1_3
1,0.000000,0.718002,val0_1,val1_2
9,0.350901,0.815573,val0_1,val1_3
10,0.642266,0.070318,val0_1,val1_3
11,0.493946,0.868305,val0_1,val1_3
...,...,...,...,...
472,0.522286,0.908867,val0_1,val1_3
482,0.586663,0.709537,,val1_3
490,0.522286,0.882874,val0_1,val1_3
497,0.425085,0.761658,,val1_2


In [12]:
cohort_pipeline = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
    dp.EncoderOrdinal(verbose=False),
    get_model()
]

cohort_set = CohortManager(
    transform_pipe=cohort_pipeline,
    cohort_def=[c1, c2, c3]
)
cohort_set.fit(X=X, y=y)
new_X = cohort_set.transform(X)



In [13]:
new_X[0]

Unnamed: 0,num_0,num_1,CN_0_num_0,CN_1_num_1
0,0.194340,0.780045,0,3
1,0.000000,0.718002,0,2
9,0.350901,0.815573,0,3
10,0.642266,0.070318,0,3
11,0.493946,0.868305,0,3
...,...,...,...,...
486,0.424650,0.835958,0,3
489,0.258453,0.794880,0,3
491,0.537414,0.877393,0,3
493,0.199570,0.322163,0,1


In [17]:
pred_list = cohort_set.predict_proba(X)
print(pred_list[0].shape)
print(pred_list[1].shape)
print(pred_list[2].shape)

(204, 2)
(193, 2)
(103, 2)


In [6]:
skpipe = Pipeline([
    ("cohort_preprocess", cohort_set),
    ("encoder", dp.EncoderOrdinal(verbose=False)),
    ("model", get_model())
])
skpipe.fit(X, y)
pred = skpipe.predict_proba(X)
pred = skpipe.predict(X)
pred



ValueError: ERROR: expected parameter 'df' of EncoderOrdinal to be of type pandas.DataFrame or numpy.ndarray, but got a parameter of type <class 'list'> instead.