In [1]:
import pandas as pd
import numpy as np
import uci_dataset as database

import raimitigations.dataprocessing as dp
from raimitigations.cohort import DecoupledClass

# Setting up the dataset

In [2]:
df = database.load_breast_cancer()
label_col = "Class"
df[label_col] = df[label_col].replace({	"recurrence-events": 1, 
										"no-recurrence-events": 0})
df

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,0,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,0,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,0,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,0,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,0,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,1,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,1,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,1,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,1,40-49,ge40,30-34,3-5,no,3,left,left_low,no


# Test 1: Merge Invalid Cohorts 

In [3]:
preprocessing = [dp.EncoderOrdinal(), dp.BasicImputer()]

dec_class = DecoupledClass(
					cohort_col=["age", "menopause"], 
					min_cohort_pct=0.2,
					minority_min_rate=0.15,
					transform_pipe=preprocessing
				)
dec_class.fit(df=df, label_col="Class")

dec_class.print_cohorts()

No columns specified for encoding. These columns have been automatically identfied as the following:
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat']
No columns specified for imputation. These columns have been automatically identified:
[]
No columns specified for encoding. These columns have been automatically identfied as the following:
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat']
No columns specified for imputation. These columns have been automatically identified:
[]
No columns specified for encoding. These columns have been automatically identfied as the following:
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat']
No columns specified for imputation. These columns have been automatically identified:
[]
FINAL COHORTS
cohort_0:
	Size: 91
	Query:
		((((((((`age` == "20-29") and (`menopause` == "premeno")) or ((`age` == "30-39") and (`me

# Test 2: Specify the Cohorts

In [4]:
cohorts = {
    "cohort_1": [['age', '==', '40-49'], 'and', ['menopause', '==', 'premeno']],
	"cohort_2": [
            [['age', '==', '60-69'], 'and', ['menopause', '==', 'ge40']], 'or',
            [['age', '==', '30-39'], 'and', ['menopause', '==', 'premeno']],
        ],
	"cohort_3": None
}

preprocessing = [dp.EncoderOrdinal(), dp.BasicImputer()]

dec_class = DecoupledClass(
					cohort_def=cohorts, 
					min_cohort_pct=0.2,
					minority_min_rate=0.15,
					transform_pipe=preprocessing
				)
dec_class.fit(df=df, label_col="Class")

dec_class.print_cohorts()

No columns specified for encoding. These columns have been automatically identfied as the following:
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat']
No columns specified for imputation. These columns have been automatically identified:
[]
No columns specified for encoding. These columns have been automatically identfied as the following:
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat']
No columns specified for imputation. These columns have been automatically identified:
[]
No columns specified for encoding. These columns have been automatically identfied as the following:
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat']
No columns specified for imputation. These columns have been automatically identified:
[]
FINAL COHORTS
cohort_1:
	Size: 81
	Query:
		(`age` == "40-49") and (`menopause` == "premeno")
	Value Counts:
		0: 58 (71.60%)
		1: 23

# Test 3: Choose the Estimator

In [5]:
import xgboost as xgb

model = xgb.XGBClassifier(
            objective="binary:logistic",
            learning_rate=0.1,
            n_estimators=30,
            max_depth=10,
            colsample_bytree=0.7,
            alpha=0.0,
            reg_lambda=10.0,
            nthreads=4,
            verbosity=0,
            use_label_encoder=False,
        )

preprocessing = [dp.EncoderOrdinal(), dp.BasicImputer()]

dec_class = DecoupledClass(
					cohort_col=["age", "menopause"], 
					min_cohort_pct=0.2,
					minority_min_rate=0.15,
					transform_pipe=preprocessing
				)
dec_class.fit(df=df, label_col="Class")

No columns specified for encoding. These columns have been automatically identfied as the following:
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat']
No columns specified for imputation. These columns have been automatically identified:
[]
No columns specified for encoding. These columns have been automatically identfied as the following:
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat']
No columns specified for imputation. These columns have been automatically identified:
[]
No columns specified for encoding. These columns have been automatically identfied as the following:
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat']
No columns specified for imputation. These columns have been automatically identified:
[]


<raimitigations.cohort.decoupled_class.decoupled_classifier.DecoupledClass at 0x7f029393a1f0>

## predict x predict_proba

In [6]:
X = df.drop(columns=[label_col])

y_pred = dec_class.predict(X)

print(f"y_pred size = {len(y_pred)}")
print(f"{y_pred[:6]} ... {y_pred[-6:]}")

y_pred size = 286
[0 0 1 0 0 0] ... [1 1 1 1 1 1]


In [7]:
y_pred = dec_class.predict_proba(X)

print(f"y_pred size = {len(y_pred)}")
print(f"{y_pred[:6]} ... {y_pred[-6:]}")

y_pred size = 286
[[1.  0. ]
 [1.  0. ]
 [0.5 0.5]
 [1.  0. ]
 [1.  0. ]
 [1.  0. ]] ... [[0.  1. ]
 [0.5 0.5]
 [0.  1. ]
 [0.  1. ]
 [0.  1. ]
 [0.  1. ]]


# Test 4: Transfer Learning

## Using a fixed $\theta$ value

In [8]:
preprocessing = [dp.EncoderOrdinal(verbose=False)]

imputer = dp.BasicImputer(categorical={'missing_values':np.nan, 
										'strategy':'most_frequent', 
										'fill_value':None },
                          verbose=False)
imputer.fit(df)
df_nomiss = imputer.transform(df)

dec_class = DecoupledClass(
					cohort_col=["breast-quad"], 
					theta=0.3,
					min_cohort_pct=0.2,
					minority_min_rate=0.15,
					transform_pipe=preprocessing
				)
dec_class.fit(df=df_nomiss, label_col="Class")

dec_class.print_cohorts()

FINAL COHORTS
cohort_0:
	Size: 21
	Query:
		(`breast-quad` == "central")
	Value Counts:
		0: 17 (80.95%)
		1: 4 (19.05%)
	Invalid: True
		Cohorts used as outside data: ['cohort_1', 'cohort_2', 'cohort_3', 'cohort_4']
		Theta = 0.3


cohort_1:
	Size: 111
	Query:
		(`breast-quad` == "left_low")
	Value Counts:
		0: 75 (67.57%)
		1: 36 (32.43%)
	Invalid: False


cohort_2:
	Size: 97
	Query:
		(`breast-quad` == "left_up")
	Value Counts:
		0: 71 (73.20%)
		1: 26 (26.80%)
	Invalid: False


cohort_3:
	Size: 24
	Query:
		(`breast-quad` == "right_low")
	Value Counts:
		0: 18 (75.00%)
		1: 6 (25.00%)
	Invalid: True
		Cohorts used as outside data: ['cohort_0', 'cohort_1', 'cohort_2', 'cohort_4']
		Theta = 0.3


cohort_4:
	Size: 33
	Query:
		(`breast-quad` == "right_up")
	Value Counts:
		0: 20 (60.61%)
		1: 13 (39.39%)
	Invalid: True
		Cohorts used as outside data: ['cohort_0', 'cohort_1', 'cohort_2', 'cohort_3']
		Theta = 0.3




## Finding the best $\theta$ parameter using Cross-Validation

### Using a specific list of possible $\theta$ values

In [9]:
dec_class = DecoupledClass(
					cohort_col=["breast-quad"], 
					theta=[0.2, 0.4, 0.6, 0.8],
					min_fold_size_theta=5,
					min_cohort_pct=0.2,
					minority_min_rate=0.15,
					transform_pipe=preprocessing
				)
dec_class.fit(df=df_nomiss, label_col="Class")

dec_class.print_cohorts()

FINAL COHORTS
cohort_0:
	Size: 21
	Query:
		(`breast-quad` == "central")
	Value Counts:
		0: 17 (80.95%)
		1: 4 (19.05%)
	Invalid: True
		Cohorts used as outside data: ['cohort_1', 'cohort_2', 'cohort_3', 'cohort_4']
		Theta = 0.6


cohort_1:
	Size: 111
	Query:
		(`breast-quad` == "left_low")
	Value Counts:
		0: 75 (67.57%)
		1: 36 (32.43%)
	Invalid: False


cohort_2:
	Size: 97
	Query:
		(`breast-quad` == "left_up")
	Value Counts:
		0: 71 (73.20%)
		1: 26 (26.80%)
	Invalid: False


cohort_3:
	Size: 24
	Query:
		(`breast-quad` == "right_low")
	Value Counts:
		0: 18 (75.00%)
		1: 6 (25.00%)
	Invalid: True
		Cohorts used as outside data: ['cohort_0', 'cohort_1', 'cohort_2', 'cohort_4']
		Theta = 0.6


cohort_4:
	Size: 33
	Query:
		(`breast-quad` == "right_up")
	Value Counts:
		0: 20 (60.61%)
		1: 13 (39.39%)
	Invalid: True
		Cohorts used as outside data: ['cohort_0', 'cohort_1', 'cohort_2', 'cohort_3']
		Theta = 0.6




### Using a default list of possible $\theta$ values

In [10]:
dec_class = DecoupledClass(
					cohort_col=["breast-quad"], 
					theta=True,
					min_fold_size_theta=5,
					min_cohort_pct=0.2,
					minority_min_rate=0.15,
					transform_pipe=preprocessing
				)
dec_class.fit(df=df_nomiss, label_col="Class")

dec_class.print_cohorts()

FINAL COHORTS
cohort_0:
	Size: 21
	Query:
		(`breast-quad` == "central")
	Value Counts:
		0: 17 (80.95%)
		1: 4 (19.05%)
	Invalid: True
		Cohorts used as outside data: ['cohort_1', 'cohort_2', 'cohort_3', 'cohort_4']
		Theta = 0.2


cohort_1:
	Size: 111
	Query:
		(`breast-quad` == "left_low")
	Value Counts:
		0: 75 (67.57%)
		1: 36 (32.43%)
	Invalid: False


cohort_2:
	Size: 97
	Query:
		(`breast-quad` == "left_up")
	Value Counts:
		0: 71 (73.20%)
		1: 26 (26.80%)
	Invalid: False


cohort_3:
	Size: 24
	Query:
		(`breast-quad` == "right_low")
	Value Counts:
		0: 18 (75.00%)
		1: 6 (25.00%)
	Invalid: True
		Cohorts used as outside data: ['cohort_0', 'cohort_1', 'cohort_2', 'cohort_4']
		Theta = 0.2


cohort_4:
	Size: 33
	Query:
		(`breast-quad` == "right_up")
	Value Counts:
		0: 20 (60.61%)
		1: 13 (39.39%)
	Invalid: True
		Cohorts used as outside data: ['cohort_0', 'cohort_1', 'cohort_2', 'cohort_3']
		Theta = 0.2




In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import load_iris
from copy import deepcopy

def get_iris_data():
	iris = load_iris()
	X = pd.DataFrame(iris['data'], columns = iris['feature_names'])
	y = pd.Series(iris['target'], name = 'target_values')
	#y = y.replace({2:1})
	return X, y

# -----------------------------------
def get_fold(
	X: pd.DataFrame,
	y: pd.DataFrame,
	train_index: list,
    test_index: list,
	transform_pipe: list
):
	train_x = X.filter(items=train_index, axis=0)
	train_y = y.filter(items=train_index, axis=0)
	test_x = X.filter(items=test_index, axis=0)
	test_y = y.filter(items=test_index, axis=0)
	for tf in transform_pipe:
		tf.fit(train_x)
	for tf in transform_pipe:
		train_x = tf.transform(train_x)
		test_x = tf.transform(test_x)
	return train_x, train_y, test_x, test_y

# -----------------------------------

X, y = get_iris_data()
preprocessing = [dp.EncoderOrdinal(verbose=False), dp.BasicImputer(verbose=False)]
#estimator = DecisionTreeClassifier(max_features="sqrt")
estimator = LogisticRegression(solver="liblinear")
skf = StratifiedKFold(n_splits=5)
weights = [1.0 for _ in range(df.shape[0])]
for train_index, test_index in skf.split(X, y):
	weights_train = [weights[i] for i in train_index]
	transform_copy = [deepcopy(tf) for tf in preprocessing]
	train_x, train_y, test_x, test_y = get_fold(X, y, train_index, test_index, transform_copy)
	estimator.fit(train_x, train_y, sample_weight=weights_train)
	y_pred = estimator.predict_proba(test_x)
	if len(y_pred.shape) > 1 and y_pred.shape[1] == 2:
		y_pred = y_pred[:,1]
	roc_auc = metrics.roc_auc_score(test_y, y_pred, average="weighted", multi_class='ovr')
	print(roc_auc)


1.0
0.9966666666666666
0.9766666666666666
0.9683333333333333
1.0
