In [183]:
owner = 'test-18'

In [184]:
import polaris as po
import datamol as dm
import numpy as np

In [185]:
#benchmark = po.load_benchmark("polaris/pkis1-kit-wt-mut-c-1")
#train, test = benchmark.get_train_test_split(featurization_fn=dm.to_fp)
#train, test = benchmark.get_train_test_split()
# ys = train.y
# ys = np.stack([ys[target] for target in benchmark.target_cols], axis=1)
# mask = ~np.any(np.isnan(ys), axis=1)
# mask.sum()

# X_train = train.X[mask]
# y_train = ys[mask]



In [186]:
## Balancing multi-task

from functools import partial

from imblearn.over_sampling import SMOTE
import polaris as po
import datamol as dm
import numpy as np

mapping = {
 (0.0, 0.0, 0.0): 0,
 (1.0, 0.0, 0.0): 1,
 (1.0, 0.0, 1.0): 2,
 (1.0, 1.0, 0.0): 3,
 (1.0, 1.0, 1.0): 4,
}
inv_mapping = {v: k for k, v in mapping.items()}

# load dataset
benchmark = po.load_benchmark("polaris/pkis1-kit-wt-mut-c-1")
# use ECFP fingerprint
train, test = benchmark.get_train_test_split(featurization_fn=partial(dm.to_fp, fp_type='ecfp'))

# define order of target values
target_order = ['CLASS_KIT', 'CLASS_KIT_(T6701_mutant)', 'CLASS_KIT_(V560G_mutant)']

# reshape the y values for convenience
ys = train.y
ys = np.stack([ys[target] for target in target_order], axis=1)
ys.shape

# remove the rows with NaN values
mask = ~np.any(np.isnan(ys), axis=1)
mask.sum()
X = train.X[mask]
ys = ys[mask]

ys_scalarized = [tuple(item) for item in ys]
ys_scalarized = [mapping[item] for i, item in enumerate(ys_scalarized)]

X_resampled, y_resampled = SMOTE(k_neighbors=2).fit_resample(X, ys_scalarized)
y_resampled = [inv_mapping[item] for i, item in enumerate(y_resampled)]


X_train = X_resampled
y_train = y_resampled

[32m2024-06-21 13:26:33.023[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m
[32m2024-06-21 13:26:33.026[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m


In [187]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from bayes_opt import BayesianOptimization


def get_param(optimizer):
    best_params = optimizer.max['params']
    best_params['input_dim'] = int(best_params['input_dim'])
    best_params['n_estimators'] = int(best_params['n_estimators'])
    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['min_samples_split'] = int(best_params['min_samples_split'])
    best_params['min_samples_leaf'] = int(best_params['min_samples_leaf'])
    best_params['bootstrap'] = bool(best_params['bootstrap'])
    return best_params

In [190]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from bayes_opt import BayesianOptimization
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, average_precision_score


def opt_multi_targets_pca(param_bounds):

    X_train = X_resampled
    y_train = y_resampled
    # X_train = train.X[mask]
    # y_train = ys[mask]

    def objective(input_dim, n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap):
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_features = max(min(max_features, 0.999), 1e-3)  # to avoid 0 and 1
        
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            bootstrap=bool(bootstrap),
            random_state=42
        )

        pipeline = Pipeline([
            ('pca', PCA(int(input_dim))),
            ('scaler', StandardScaler()),
            ('rf', rf)
        ])
        
        # Use K-Fold cross-validation
        kfold = KFold(n_splits=2, shuffle=True, random_state=42)
        
        # Evaluate the model using cross-validation
        scoring = make_scorer(average_precision_score, needs_proba=True)
        
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
        return cv_scores.mean()

    
    


    
    def rf_multi_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap):
        # Convert parameters to int where necessary
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_features = max(min(max_features, 0.999), 1e-3)  # to avoid 0 and 1

        


        
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            bootstrap=bool(bootstrap),
            random_state=42
        )
        
        # Perform cross-validation
        acc = 0
        for i in range(5):
            cval = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=2)
            acc += cval.mean()
        
        return acc/5
    
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=param_bounds,
        random_state=42,
        verbose=2
    )
    
    optimizer.maximize(init_points=10, n_iter=50)

        
    best_params = get_param(optimizer)
    
    print("Best parameters found: ", best_params)
    
    best_rf = RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'],
        bootstrap=best_params['bootstrap'],
        random_state=42
    )
    
    best_rf.fit(X_train, y_train)
    y_pred = best_rf.predict(test.X)
    #cval = cross_val_score(best_rf, X_train, y_train, scoring='accuracy', cv=3)
    
    accuracy = cval.mean()
    print(f'Accuracy: {accuracy:.4f}')

    return best_rf, best_params


In [190]:
def opt_multi_targets(param_bounds):

    X_train = X_resampled
    y_train = y_resampled
    # X_train = train.X[mask]
    # y_train = ys[mask]

    def rf_multi_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap):
        # Convert parameters to int where necessary
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_features = max(min(max_features, 0.999), 1e-3)  # to avoid 0 and 1

        


        
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            bootstrap=bool(bootstrap),
            random_state=42
        )
        
        # Perform cross-validation
        acc = 0
        for i in range(5):
            cval = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=2)
            acc += cval.mean()
        
        return acc/5
    
    optimizer = BayesianOptimization(
        f=rf_multi_cv,
        pbounds=param_bounds,
        random_state=42,
        verbose=2
    )
    
    optimizer.maximize(init_points=10, n_iter=50)

        
    best_params = get_param(optimizer)
    
    print("Best parameters found: ", best_params)
    
    best_rf = RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'],
        bootstrap=best_params['bootstrap'],
        random_state=42
    )
    
    best_rf.fit(X_train, y_train)
    y_pred = best_rf.predict(test.X)
    #cval = cross_val_score(best_rf, X_train, y_train, scoring='accuracy', cv=3)
    
    accuracy = cval.mean()
    print(f'Accuracy: {accuracy:.4f}')

    return best_rf, best_params


In [191]:
param_bounds = {
    'n_estimators': (10, 200),
    'max_depth': (1, 50),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10),
    'max_features': (0.1, 0.999),
    'bootstrap': (0, 1)  # Treated as a boolean
}

best_rf, best_params = opt_multi_targets(param_bounds = param_bounds)


|   iter    |  target   | bootstrap | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.2057   [0m | [0m0.3745   [0m | [0m47.59    [0m | [0m0.7581   [0m | [0m6.388    [0m | [0m3.248    [0m | [0m39.64    [0m |
| [0m2        [0m | [0m0.2057   [0m | [0m0.05808  [0m | [0m43.44    [0m | [0m0.6404   [0m | [0m7.373    [0m | [0m2.165    [0m | [0m194.3    [0m |
| [95m3        [0m | [95m0.2935   [0m | [95m0.8324   [0m | [95m11.4     [0m | [95m0.2635   [0m | [95m2.651    [0m | [95m4.434    [0m | [95m109.7    [0m |
| [0m4        [0m | [0m0.2935   [0m | [0m0.4319   [0m | [0m15.27    [0m | [0m0.6501   [0m | [0m2.255    [0m | [0m4.337    [0m | [0m79.61    [0m |
| [0m5        [0m | [0m0.2057   [0m | [0m0.4561   [0m | [0m39.47    [0m | [0m0.2795   [0m | [0m5.628    [0m | [0m6.739    [0m | [0m18

KeyboardInterrupt: 

In [182]:
y_pred = best_rf.predict(test.X)
#y_pred.shape

y_prob = best_rf.predict_proba(test.X)
y_prob = np.stack(y_prob, axis=1)
#y_prob.shape

y_pred_multi = {k: y_pred[:, idx] for idx, k in enumerate(benchmark.target_cols)}
y_prob_multi = {k: y_prob[:, idx, 1] for idx, k in enumerate(benchmark.target_cols)}

results_multi = benchmark.evaluate(y_pred=y_pred_multi, y_prob=y_prob_multi)


results_multi.name = "rf_multi_augm"
results_multi.description = best_params
results_multi.to_json('rf_multi_augm.json')

  Expected `str` but got `dict` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [160]:
results_multi

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.8390804598
test,CLASS_KIT_(V560G_mutant),accuracy,0.8620689655
test,CLASS_KIT,accuracy,0.6781609195
test,CLASS_KIT_(T6701_mutant),f1,0.0
test,CLASS_KIT_(V560G_mutant),f1,0.0
test,CLASS_KIT,f1,0.3333333333
test,CLASS_KIT_(T6701_mutant),roc_auc,0.7627201566
test,CLASS_KIT_(V560G_mutant),roc_auc,0.7522222222
test,CLASS_KIT,roc_auc,0.730359147
test,CLASS_KIT_(T6701_mutant),pr_auc,0.6407846394

0,1
CLASS_KIT_(T6701_mutant),bootstrapTruemax_depth47max_features0.7580625536884532min_samples_leaf6min_samples_split3n_estimators39
CLASS_KIT_(V560G_mutant),bootstrapTruemax_depth47max_features0.7580625536884532min_samples_leaf6min_samples_split3n_estimators39
CLASS_KIT,bootstrapTruemax_depth35max_features0.7658783536012476min_samples_leaf2min_samples_split7n_estimators178

0,1
bootstrap,True
max_depth,47
max_features,0.7580625536884532
min_samples_leaf,6
min_samples_split,3
n_estimators,39

0,1
bootstrap,True
max_depth,47
max_features,0.7580625536884532
min_samples_leaf,6
min_samples_split,3
n_estimators,39

0,1
bootstrap,True
max_depth,35
max_features,0.7658783536012476
min_samples_leaf,2
min_samples_split,7
n_estimators,178

0,1
slug,polaris
external_id,org_2gtoaJIVrgRqiIR8Qm5BnpFCbxu
type,organization

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.8390804598
test,CLASS_KIT_(V560G_mutant),accuracy,0.8620689655
test,CLASS_KIT,accuracy,0.6781609195
test,CLASS_KIT_(T6701_mutant),f1,0.0
test,CLASS_KIT_(V560G_mutant),f1,0.0
test,CLASS_KIT,f1,0.3333333333
test,CLASS_KIT_(T6701_mutant),roc_auc,0.7627201566
test,CLASS_KIT_(V560G_mutant),roc_auc,0.7522222222
test,CLASS_KIT,roc_auc,0.730359147
test,CLASS_KIT_(T6701_mutant),pr_auc,0.6407846394


In [150]:
def opt_target(target, param_bounds):
    if isinstance(target, int) is True:
        target = benchmark.target_cols[target]
    
    ys = train.y[target]
    y_train = ys[mask]
    
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap):
        # Convert parameters to int where necessary
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_features = max(min(max_features, 0.999), 1e-3)  # to avoid 0 and 1
    
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            bootstrap=bool(bootstrap),
            random_state=42
        )
        
        # Perform cross-validation
        cval = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=3)
        
        return cval.mean()
    
    
    
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=param_bounds,
        random_state=42,
        verbose=2
    )
    optimizer.maximize(init_points=10, n_iter=30)
    best_params = get_param(optimizer)

    best_rf = RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'],
        bootstrap=best_params['bootstrap'],
        random_state=42
    )
    
    best_rf.fit(X_train, y_train)

    return best_rf, best_params


In [151]:
param_bounds = {
    'n_estimators': (10, 200),
    'max_depth': (1, 50),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10),
    'max_features': (0.1, 0.999),
    'bootstrap': (0, 1)  # Treated as a boolean
}

models = {target: opt_target(target, param_bounds = param_bounds) for target in benchmark.target_cols}

|   iter    |  target   | bootstrap | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9746   [0m | [0m0.3745   [0m | [0m47.59    [0m | [0m0.7581   [0m | [0m6.388    [0m | [0m3.248    [0m | [0m39.64    [0m |
| [0m2        [0m | [0m0.9746   [0m | [0m0.05808  [0m | [0m43.44    [0m | [0m0.6404   [0m | [0m7.373    [0m | [0m2.165    [0m | [0m194.3    [0m |
| [0m3        [0m | [0m0.9746   [0m | [0m0.8324   [0m | [0m11.4     [0m | [0m0.2635   [0m | [0m2.651    [0m | [0m4.434    [0m | [0m109.7    [0m |
| [0m4        [0m | [0m0.9746   [0m | [0m0.4319   [0m | [0m15.27    [0m | [0m0.6501   [0m | [0m2.255    [0m | [0m4.337    [0m | [0m79.61    [0m |
| [0m5        [0m | [0m0.9746   [0m | [0m0.4561   [0m | [0m39.47    [0m | [0m0.2795   [0m | [0m5.628    [0m | [0m6.739    [0m | [0m18.83    

In [158]:
y_prob_ind = {target: model.predict_proba(test.X)[:, 1] for target, (model, params) in models.items()}
y_pred_ind = {target: model.predict(test.X) for target, (model, params) in models.items()}

best_params_ind = {target: params for target, (model, params) in models.items()}

results_ind = benchmark.evaluate(y_pred=y_pred_ind, y_prob=y_prob_ind)

results_ind.name = "rf_ind"
results_ind.description = best_params_ind
results_ind.to_json('rf_ind.json')

  Expected `str` but got `dict` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [159]:
results_ind

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.8390804598
test,CLASS_KIT_(V560G_mutant),accuracy,0.8620689655
test,CLASS_KIT,accuracy,0.6551724138
test,CLASS_KIT_(T6701_mutant),f1,0.0
test,CLASS_KIT_(V560G_mutant),f1,0.0
test,CLASS_KIT,f1,0.2105263158
test,CLASS_KIT_(T6701_mutant),roc_auc,0.7353228963
test,CLASS_KIT_(V560G_mutant),roc_auc,0.7227777778
test,CLASS_KIT,roc_auc,0.8010662177
test,CLASS_KIT_(T6701_mutant),pr_auc,0.6131561014

0,1
CLASS_KIT_(T6701_mutant),bootstrapTruemax_depth47max_features0.7580625536884532min_samples_leaf6min_samples_split3n_estimators39
CLASS_KIT_(V560G_mutant),bootstrapTruemax_depth47max_features0.7580625536884532min_samples_leaf6min_samples_split3n_estimators39
CLASS_KIT,bootstrapTruemax_depth35max_features0.7658783536012476min_samples_leaf2min_samples_split7n_estimators178

0,1
bootstrap,True
max_depth,47
max_features,0.7580625536884532
min_samples_leaf,6
min_samples_split,3
n_estimators,39

0,1
bootstrap,True
max_depth,47
max_features,0.7580625536884532
min_samples_leaf,6
min_samples_split,3
n_estimators,39

0,1
bootstrap,True
max_depth,35
max_features,0.7658783536012476
min_samples_leaf,2
min_samples_split,7
n_estimators,178

0,1
slug,polaris
external_id,org_2gtoaJIVrgRqiIR8Qm5BnpFCbxu
type,organization

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.8390804598
test,CLASS_KIT_(V560G_mutant),accuracy,0.8620689655
test,CLASS_KIT,accuracy,0.6551724138
test,CLASS_KIT_(T6701_mutant),f1,0.0
test,CLASS_KIT_(V560G_mutant),f1,0.0
test,CLASS_KIT,f1,0.2105263158
test,CLASS_KIT_(T6701_mutant),roc_auc,0.7353228963
test,CLASS_KIT_(V560G_mutant),roc_auc,0.7227777778
test,CLASS_KIT,roc_auc,0.8010662177
test,CLASS_KIT_(T6701_mutant),pr_auc,0.6131561014
