In [1]:
from scipy.spatial import distance_matrix
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
import numpy as np
import torch

In [2]:
from SearchingOptimalEnsembles.metadatasets.ftc.metadataset import FTCMetaDataset
from SearchingOptimalEnsembles.metadatasets.ftc.hub import MODELS
from SearchingOptimalEnsembles.posthoc.neural_ensembler import NeuralEnsembler
from SearchingOptimalEnsembles.posthoc.greedy_ensembler import GreedyEnsembler

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:

def preprocess(X):
    X = pd.DataFrame(X)
    ohe = OneHotEncoder(handle_unknown='ignore', categories=[MODELS])
    X_model = ohe.fit_transform(X[["model"]].values).todense()
    X = np.concatenate([X[["lora_r", "learning_rate"]], X_model], axis=-1)
    return X

In [7]:
dataset_name = 'imdb'
data_verion="mini"

metadataset_mini = FTCMetaDataset( metric_name="error",
                                data_version="mini")
metadataset_extended = FTCMetaDataset( metric_name="error",
                        data_version="extended")  

results = []
for dataset_name in metadataset_extended.get_dataset_names():
    metadataset_mini.set_state(dataset_name=dataset_name, split="valid")
    metadataset_extended.set_state(dataset_name=dataset_name, split="valid")

    hp_mini = metadataset_mini.row_hp_candidates[dataset_name]
    hp_extended = metadataset_extended.row_hp_candidates[dataset_name]

    hp_mini = preprocess(hp_mini)
    hp_extended = preprocess(hp_extended)

        
    a = distance_matrix(hp_mini, hp_extended)
    from_mini_to_extended = a.argmin(-1)[a.min(-1)==0]
    from_extended_to_mini = a.argmin(0)[a.min(0)==0]

    ge = GreedyEnsembler(metadataset=metadataset_mini,
                        max_num_pipelines=5)
    X_obs_extended = np.arange(len(hp_extended))
    X_obs_mini = from_extended_to_mini[X_obs_extended]
    a = ge.sample(X_obs_mini)
    best_ensemble_mini = ge.best_ensemble

    best_ensemble_extended = from_mini_to_extended[best_ensemble_mini]
    print(best_ensemble_mini, best_ensemble_extended)
    metadataset_mini.set_state(dataset_name=dataset_name,
                            split="test")
    metadataset_extended.set_state(dataset_name=dataset_name,
                            split="test")
    output = metadataset_extended.evaluate_ensembles([best_ensemble_extended])
    print(dataset_name, output[1])
    results.append([dataset_name, output[1]])

GreedyEnsembler: 0
GreedyEnsembler: 1
GreedyEnsembler: 2
GreedyEnsembler: 3
GreedyEnsembler: 4
[118, 94, 82, 84] [118  94  82  84]


  return self.times[self.dataset_name][torch.LongTensor(ensembles)]


imdb tensor([0.0376])
GreedyEnsembler: 0
GreedyEnsembler: 1
GreedyEnsembler: 2
GreedyEnsembler: 3
GreedyEnsembler: 4
[68, 57, 93, 79, 23] [68 57 93 79 23]
mteb/tweet_sentiment_extraction tensor([0.1930])
GreedyEnsembler: 0
GreedyEnsembler: 1
GreedyEnsembler: 2
GreedyEnsembler: 3
GreedyEnsembler: 4
[98, 44, 83, 73, 2] [98 44 83 73  2]
ag_news tensor([0.0588])
GreedyEnsembler: 0
GreedyEnsembler: 1
GreedyEnsembler: 2
GreedyEnsembler: 3
GreedyEnsembler: 4
[18, 23, 12, 10] [18 23 12 10]
dbpedia_14 tensor([0.0080])
GreedyEnsembler: 0
GreedyEnsembler: 1
GreedyEnsembler: 2
GreedyEnsembler: 3
GreedyEnsembler: 4
[118, 123, 105, 84] [118 123 105  84]
stanfordnlp/sst2 tensor([0.0382])
GreedyEnsembler: 0
GreedyEnsembler: 1
GreedyEnsembler: 2
GreedyEnsembler: 3
GreedyEnsembler: 4
[13, 3, 8, 23, 22] [13  3  8 23 22]


In [None]:
results

NameError: name 'results' is not defined

In [21]:
metric_name = "nll"
metadataset_mini = FTCMetaDataset( metric_name=metric_name,
                                data_version="mini")
metadataset_extended = FTCMetaDataset( metric_name=metric_name,
                        data_version="extended")  

results = []
for dataset_name in metadataset_extended.get_dataset_names():
    metadataset_mini.set_state(dataset_name=dataset_name, split="valid")
    metadataset_extended.set_state(dataset_name=dataset_name, split="valid")

    hp_mini = metadataset_mini.row_hp_candidates[dataset_name]
    hp_extended = metadataset_extended.row_hp_candidates[dataset_name]

    hp_mini = preprocess(hp_mini)
    hp_extended = preprocess(hp_extended)

        
    a = distance_matrix(hp_mini, hp_extended)
    from_mini_to_extended = a.argmin(-1)[a.min(-1)==0]
    from_extended_to_mini = a.argmin(0)[a.min(0)==0][:len(hp_extended)]

    ge = GreedyEnsembler(metadataset=metadataset_mini,
                        max_num_pipelines=5)
    X_obs_extended = np.arange(len(hp_extended))
    X_obs_mini = from_extended_to_mini[X_obs_extended]
    X_obs_mini = [x for x in X_obs_mini if x in X_obs_extended.tolist()]
    ge.sample(X_obs_mini)
    best_ensemble_mini = ge.best_ensemble

    best_ensemble_extended = from_mini_to_extended[best_ensemble_mini]
    print(best_ensemble_mini, best_ensemble_extended)
    metadataset_mini.set_state(dataset_name=dataset_name,
                            split="test")
    metadataset_extended.set_state(dataset_name=dataset_name,
                            split="test")
    output = metadataset_extended.evaluate_ensembles([best_ensemble_extended])
    print(dataset_name, output[1])
    results.append([dataset_name, output[1]])

GreedyEnsembler: 0
GreedyEnsembler: 1
GreedyEnsembler: 2
GreedyEnsembler: 3
GreedyEnsembler: 4
[124, 98, 118, 84, 87] [124  98 118  84  87]
imdb tensor([0.1187])
GreedyEnsembler: 0
GreedyEnsembler: 1
GreedyEnsembler: 2
GreedyEnsembler: 3
GreedyEnsembler: 4
[54, 57, 73, 81, 28] [54 57 73 81 28]
mteb/tweet_sentiment_extraction tensor([0.5477])
GreedyEnsembler: 0
GreedyEnsembler: 1
GreedyEnsembler: 2
GreedyEnsembler: 3
GreedyEnsembler: 4


IndexError: index 99 is out of bounds for axis 0 with size 99

In [30]:
from_extended_to_mini[X_obs_extended]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [34]:
from_extended_to_mini

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [17]:
hp_extended.shape

(99, 7)

In [20]:
a

([99, 98, 38, 16, 93], tensor(0.2232))

In [18]:
hp_mini.shape

(120, 7)

In [27]:
metadataset_extended.get_dataset_names(
)

['imdb',
 'mteb/tweet_sentiment_extraction',
 'ag_news',
 'dbpedia_14',
 'stanfordnlp/sst2',
 'SetFit/mnli']

In [None]:
metadataset_extended.evaluate_ensembles([best_ensemble_extended])