In [1]:
import random
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.pipeline import Pipeline

import raimitigations.dataprocessing as dp
from raimitigations.cohort.cohort_definition import CohortDefinition
from raimitigations.cohort.base_cohort import CohortManager

# Seed applied to both numpy's and the stdlib `random` global RNGs in create_df().
SEED = 42

In [2]:
def create_df():
    """Build the dummy dataset used by the cells of this notebook.

    Both global RNGs (numpy and the stdlib ``random`` module) are re-seeded
    with ``SEED`` on every call. A 500-sample dummy dataset is requested from
    ``dp.create_dummy_dataset`` and 10% of the entries of "num_0", "num_1"
    and "CN_0_num_0" are then replaced with NaN.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the dataframe without the "label"
        column and ``y`` is a one-column dataframe holding "label".
    """
    np.random.seed(SEED)
    random.seed(SEED)

    def add_nan(vec, pct):
        """Return ``vec`` as a list with a ``pct`` fraction of positions set to NaN."""
        n_items = len(vec)
        # Positions are drawn from the stdlib RNG, which was seeded above.
        missing = set(random.sample(range(n_items), int(pct * n_items)))
        return [np.nan if pos in missing else value for pos, value in enumerate(vec)]

    dataset_args = dict(
        samples=500,
        n_features=2,
        n_num_num=0,
        n_cat_num=2,
        n_cat_cat=0,
        num_num_noise=[0.01, 0.05],
        pct_change=[0.05, 0.1],
    )
    df = dp.create_dummy_dataset(**dataset_args)

    for col in ("num_0", "num_1", "CN_0_num_0"):
        # The target column must never receive missing values.
        if col == "label":
            continue
        df[col] = add_nan(df[col], 0.1)

    return df.drop(columns=["label"]), df[["label"]]

# -----------------------------------
def get_model():
    """Create a fresh, unfitted XGBoost binary classifier.

    A new estimator is built on every call, so each pipeline gets its own
    instance.

    Notes:
        * ``n_jobs=4`` replaces the original ``nthreads=4`` keyword, which is
          not an ``XGBClassifier`` parameter (the scikit-learn wrapper uses
          ``n_jobs``; the native booster parameter is ``nthread``).
        * ``reg_alpha`` is the scikit-learn wrapper name for the L1 term that
          was previously passed as ``alpha``; the value (0.0) is unchanged.

    Returns:
        xgb.XGBClassifier: the configured classifier.
    """
    model = xgb.XGBClassifier(
        objective="binary:logistic",
        learning_rate=0.1,
        n_estimators=30,
        max_depth=10,
        colsample_bytree=0.7,
        reg_alpha=0.0,
        reg_lambda=10.0,
        n_jobs=4,
        verbosity=0,
        use_label_encoder=False,
    )
    return model

In [3]:
# Build the seeded dummy dataset and display the feature frame.
X, y = create_df()
X

Unnamed: 0,num_0,num_1,CN_0_num_0,CN_1_num_1
0,2.032241,2.022308,val0_1,val1_3
1,1.326021,1.502541,val0_1,val1_2
2,2.529883,2.382760,val0_0,val1_3
3,0.515886,1.041807,val0_0,val1_2
4,1.039100,1.382369,val0_0,val1_2
...,...,...,...,...
495,2.758050,2.574233,val0_1,val1_3
496,1.142336,-1.332735,val0_0,val1_1
497,1.210778,1.608618,,val1_2
498,,2.230597,val0_1,val1_3


In [4]:
# Preprocessing steps handed to CohortManager: imputation followed by
# min-max scaling.
cohort_pipeline = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
]

# Cohort definitions as nested condition lists: [column, operator, value]
# triples joined by 'and'. c1, c2 and c3 are reused by later cells.
c1 = [ ['CN_0_num_0', '==', 'val0_1'], 'and', ['num_0', '>', 0.0] ]
c2 = [ ['CN_0_num_0', '==', 'val0_0'], 'and', ['num_0', '>', 0.0] ]
# NOTE(review): None as the last definition presumably acts as the catch-all
# cohort for rows matching neither c1 nor c2 — confirm against the
# CohortManager documentation.
c3 = None
# Alternative explicit definition for the third cohort (disabled):
#c3 = [ ['CN_1_num_1', '==', 'val1_1'] ]

cohort_set = CohortManager(
    transform_pipe=cohort_pipeline,
    cohort_def=[c1, c2, c3]
)
# Fit on the full dataset, then transform the same data and display it.
cohort_set.fit(X=X, y=y)
new_X = cohort_set.transform(X)
new_X

Unnamed: 0,num_0,num_1,CN_0_num_0,CN_1_num_1
0,0.194340,0.780045,val0_1,val1_3
1,0.000000,0.718002,val0_1,val1_2
2,1.000000,1.000000,val0_0,val1_3
3,0.179172,0.755248,val0_0,val1_2
4,0.392414,0.817407,val0_0,val1_2
...,...,...,...,...
495,0.394071,0.845926,val0_1,val1_3
496,0.434489,0.321843,val0_0,val1_1
497,0.425085,0.761658,,val1_2
498,0.522286,0.843679,val0_1,val1_3


In [5]:
# Same preprocessing as before, plus ordinal encoding and an estimator.
# The estimator is placed last: CohortManager only allows the final object of
# transform_pipe to lack a transform() method.
cohort_pipeline = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
    dp.EncoderOrdinal(verbose=False),
    get_model()
]

# Reuses the cohort definitions c1, c2 and c3 from the previous cell.
cohort_set = CohortManager(
    transform_pipe=cohort_pipeline,
    cohort_def=[c1, c2, c3]
)
cohort_set.fit(X=X, y=y)
# NOTE(review): with this pipeline the transform output is indexed by integer
# in the next cell (new_X[0]) — presumably one entry per cohort; confirm.
new_X = cohort_set.transform(X)



In [6]:
# Inspect the first entry of the transform output.
new_X[0]

Unnamed: 0,num_0,num_1,CN_0_num_0,CN_1_num_1
0,0.194340,0.780045,0,3
1,0.000000,0.718002,0,2
9,0.350901,0.815573,0,3
10,0.642266,0.070318,0,3
11,0.493946,0.868305,0,3
...,...,...,...,...
486,0.424650,0.835958,0,3
489,0.258453,0.794880,0,3
491,0.537414,0.877393,0,3
493,0.199570,0.322163,0,1


In [7]:
pred_list = cohort_set.predict_proba(X)
# Print the shape of entries 0, 1 and 2 of the prediction output.
for cohort_idx in range(3):
    print(pred_list[cohort_idx].shape)

(204, 2)
(193, 2)
(103, 2)


In [8]:
X, y = create_df()

# Each case is (zero-argument pipeline builder, pass the pipeline to CohortManager?).
# Builders are called inside the try block so that an exception raised while
# constructing a transformer is caught and printed like any other failure.
# The first case builds a pipeline but hands transform_pipe=None to the manager.
pipeline_cases = [
    (
        lambda: [
            dp.BasicImputer(verbose=False),
            dp.DataMinMaxScaler(verbose=False),
        ],
        False,
    ),
    (
        lambda: [
            dp.BasicImputer(verbose=False),
            dp.DataMinMaxScaler(verbose=False),
            dp.Rebalance(verbose=False),
        ],
        True,
    ),
    (
        lambda: [
            dp.BasicImputer(verbose=False),
            dp.DataMinMaxScaler(verbose=False),
            dp.Synthesizer(verbose=False),
        ],
        True,
    ),
    (
        lambda: [
            dp.BasicImputer(verbose=False),
            get_model(),
            dp.DataMinMaxScaler(verbose=False),
        ],
        True,
    ),
]

for build_pipeline, pass_pipeline in pipeline_cases:
    try:
        cohort_pipeline = build_pipeline()
        cohort_set = CohortManager(
            transform_pipe=cohort_pipeline if pass_pipeline else None,
            cohort_col=["CN_0_num_0", "CN_1_num_1"]
        )
    except Exception as e:
        # The error message is the demonstration: show it and move on.
        print(e)

ERROR: the transform from class Rebalance passed to the transform_pipe parameter does not have a fit() method.
ERROR: one of the transformers in the transform_pipe parameter, from class Synthesizer, is not allowed. 
ERROR: only the last object in the transform_pipe parameter is allowed to not have a transform() method, but the object in position 1, from class XGBClassifier, doesn't have a transform() method.


In [9]:
# Cohort-wise preprocessing only (no estimator inside the cohort pipeline).
cohort_pipeline = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
]
# Relies on c1, c2 and c3 as defined in an earlier cell.
cohort_set = CohortManager(
    transform_pipe=cohort_pipeline,
    cohort_def=[c1, c2, c3]
)

# The CohortManager is used as the first step of a scikit-learn Pipeline,
# followed by an ordinal encoder and the classifier.
skpipe = Pipeline([
    ("cohort_preprocess", cohort_set),
    ("encoder", dp.EncoderOrdinal(verbose=False)),
    ("model", get_model())
])
skpipe.fit(X, y)
# NOTE(review): the predict_proba result is overwritten by the next line, so
# only the predict() output remains in `pred` — presumably both calls are
# smoke checks; confirm.
pred = skpipe.predict_proba(X)
pred = skpipe.predict(X)

In [10]:
# Regenerate the dataset; create_df() re-seeds both RNGs with SEED on each call.
X, y = create_df()
X

Unnamed: 0,num_0,num_1,CN_0_num_0,CN_1_num_1
0,2.032241,2.022308,val0_1,val1_3
1,1.326021,1.502541,val0_1,val1_2
2,2.529883,2.382760,val0_0,val1_3
3,0.515886,1.041807,val0_0,val1_2
4,1.039100,1.382369,val0_0,val1_2
...,...,...,...,...
495,2.758050,2.574233,val0_1,val1_3
496,1.142336,-1.332735,val0_0,val1_1
497,1.210778,1.608618,,val1_2
498,,2.230597,val0_1,val1_3


In [11]:
X, y = create_df()

# Scaling only: no imputer in this pipeline, so missing values are not filled.
cohort_pipeline = [
    dp.DataMinMaxScaler(verbose=False),
]

# Cohorts are specified through cohort_col instead of explicit conditions.
# NOTE(review): presumably one cohort per combination of values of the listed
# columns — confirm against the CohortManager documentation.
cohort_set = CohortManager(
    transform_pipe=cohort_pipeline,
    cohort_col=["CN_0_num_0", "CN_1_num_1"]
)
cohort_set.fit(X=X, y=y)
# Save to "cohort.json" in the current working directory; the file is passed
# back as cohort_def below.
cohort_set.save("cohort.json")
new_X = cohort_set.transform(X)


# Second manager: same pipeline list, cohort definitions read from the saved file.
cohort_set = CohortManager(
    transform_pipe=cohort_pipeline,
    cohort_def="cohort.json"
)
cohort_set.fit(X=X, y=y)
# Overwrites the new_X computed with the first manager.
new_X = cohort_set.transform(X)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [12]:
# Display the output of the last transform() call.
new_X

Unnamed: 0,num_0,num_1,CN_0_num_0,CN_1_num_1
0,0.020719,0.763408,val0_1,val1_3
1,0.000000,0.000000,val0_1,val1_2
2,1.000000,1.000000,val0_0,val1_3
3,0.523626,0.651588,val0_0,val1_2
4,0.701657,0.816354,val0_0,val1_2
...,...,...,...,...
495,0.263492,0.834273,val0_1,val1_3
496,0.394104,0.531167,val0_0,val1_1
497,0.664769,0.935466,,val1_2
498,,0.790152,val0_1,val1_3


In [13]:
# Display the untransformed features for comparison with new_X.
X

Unnamed: 0,num_0,num_1,CN_0_num_0,CN_1_num_1
0,2.032241,2.022308,val0_1,val1_3
1,1.326021,1.502541,val0_1,val1_2
2,2.529883,2.382760,val0_0,val1_3
3,0.515886,1.041807,val0_0,val1_2
4,1.039100,1.382369,val0_0,val1_2
...,...,...,...,...
495,2.758050,2.574233,val0_1,val1_3
496,1.142336,-1.332735,val0_0,val1_1
497,1.210778,1.608618,,val1_2
498,,2.230597,val0_1,val1_3


In [14]:
X, y = create_df()

# Keep only the numeric features. A new frame is bound to X instead of
# mutating in place, so re-running the cell is safe.
X = X.drop(columns=["CN_0_num_0", "CN_1_num_1"])

# Impute, scale, then fit one estimator; the estimator must be the last step.
cohort_pipeline = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
    get_model()
]

# Cohorts defined by thresholds on num_0. These rebind the c1, c2 and c3
# names used by earlier cells.
# NOTE(review): None presumably acts as the catch-all for the remaining rows —
# confirm against the CohortManager documentation.
c1 = [ ['num_0', '>', 2.5] ]
c2 = [ ['num_0', '<', 0.5] ]
c3 = None

cohort_set = CohortManager(
    transform_pipe=cohort_pipeline,
    cohort_def=[c1, c2, c3]
)
cohort_set.fit(X=X, y=y)
pred = cohort_set.predict_proba(X)
# Number of predictions returned for the whole dataset.
print(len(pred))
500


In [15]:
subsets = cohort_set.get_subsets(X, y, apply_transform=False)

# Report the shape of the feature and label frames stored for each cohort.
for key, subset in subsets.items():
    print(f"\n{key}")
    print(subset["X"].shape)
    print(subset["y"].shape)


cohort_0
(123, 2)
(123, 1)

cohort_1
(23, 2)
(23, 1)

cohort_2
(354, 2)
(354, 1)
