# Notebook description

<p>This notebook trains the Model for Recommending Framework (MRF) for the classification task.</p>
<p>The MRF is trained here on simulated benchmark data.</p>
<p>For real use, an actual benchmark should be performed, as described in our paper.</p>
<p>The trained MRF is saved so that it is available for actual usage on the service.</p>
<hr>

### Define constants


In [1]:
# Candidate AutoML frameworks and the time budgets (in minutes) that are
# combined with every simulated dataset.
FRAMEWORKS = ["AutoSklearn1", "TPOT", "FLAML", "LightAutoML"]
TIME_BUDGETS = [15, 60, 240]

# Dimensions of the simulated benchmark.
N = 100                  # number of datasets
M = len(FRAMEWORKS)      # number of AutoML frameworks
T = len(TIME_BUDGETS)    # number of time budgets

# (min, max) sampling bounds for the simulated values.
# The *_PERCENTAGE bounds are expressed as fractions between 0 and 1.
SIZE_MIN, SIZE_MAX = 1_000, 500_000
FEATURES_MIN, FEATURES_MAX = 5, 200
NUMERICAL_MIN_PERCENTAGE, NUMERICAL_MAX_PERCENTAGE = 0, 1
MISSING_MIN_PERCENTAGE, MISSING_MAX_PERCENTAGE = 0, 0.2
ACCURACY_MIN_PERCENTAGE, ACCURACY_MAX_PERCENTAGE = 0.7, 1

# File name and directory (with trailing slash) used when saving the trained MRF.
MRF_NAME = "mrf_classification.mrf"
MRF_PATH = "/automl-user-side/autoML_service/frs/mrf/"

### Simulate Benchmark Training data

In [2]:
import numpy as np
import pandas as pd
import random


class SimulatedDataEntry():
    """A single simulated benchmark record, stored as a plain dict in ``entry``.

    The dict is filled in two steps: first with randomly drawn dataset
    characteristics, then with a framework, a time budget and a randomly
    drawn accuracy.
    """

    def __init__(self):
        # Empty until one of the generate_* methods populates it.
        self.entry = {}

    @classmethod
    def make_from_data_entry(cls, data_entry: "SimulatedDataEntry"):
        """Create a new entry holding a shallow copy of ``data_entry.entry``.

        Args:
            data_entry: The entry whose dict is copied.

        Returns:
            A new instance of ``cls``; changing keys on its ``entry`` does not
            affect ``data_entry``.
        """
        # Use cls() rather than the hard-coded class name so subclasses get
        # instances of their own type.
        new_data_entry = cls()
        new_data_entry.entry = data_entry.entry.copy()

        return new_data_entry

    def generate_dataset_charachteristics(self, dataset_id: int):
        """Replace ``entry`` with randomly drawn characteristics of one dataset.

        Args:
            dataset_id: Identifier stored under the ``"dataset_id"`` key.
        """
        # random.randint includes both bounds.
        size = random.randint(SIZE_MIN, SIZE_MAX)
        features = random.randint(FEATURES_MIN, FEATURES_MAX)
        numerical = np.round(random.uniform(NUMERICAL_MIN_PERCENTAGE, NUMERICAL_MAX_PERCENTAGE), 2)
        # Round the complement as well: a bare `1 - numerical` yields values
        # such as 0.30000000000000004 instead of 0.3.
        categorical = np.round(1 - numerical, 2)
        missing = np.round(random.uniform(MISSING_MIN_PERCENTAGE, MISSING_MAX_PERCENTAGE), 2)

        dataset_charachteristics = {
            "dataset_id": dataset_id,
            "size": size,
            "features": features,
            "numerical": numerical,
            "categorical": categorical,
            "missing": missing
        }

        # Any previous content of the entry is discarded.
        self.entry = dataset_charachteristics

    def generate_framework_performance(self, framework: str, time_budget: int):
        """Add a framework, its time budget and a random accuracy to ``entry``.

        Args:
            framework: Framework name stored under ``"framework"``.
            time_budget: Value stored under ``"time_budget"``.
        """
        self.entry["framework"] = framework
        self.entry["time_budget"] = time_budget
        self.entry["accuracy"] = random.uniform(ACCURACY_MIN_PERCENTAGE, ACCURACY_MAX_PERCENTAGE)
        

class SimulatedTrainingData():
    """Simulated benchmark table, exposed as the DataFrame ``df``.

    For each of ``N`` simulated datasets, one row is generated per
    (framework, time budget) pair taken from FRAMEWORKS and TIME_BUDGETS.
    """

    # Fallback column order, used only when no rows were generated. It mirrors
    # the key order that SimulatedDataEntry writes into its entry dict.
    COLUMNS = [
        "dataset_id", "size", "features", "numerical", "categorical",
        "missing", "framework", "time_budget", "accuracy",
    ]

    def __init__(self, N: int, M: int):
        """Generate the simulated rows and store them in ``self.df``.

        Args:
            N: Number of datasets to simulate; a value <= 0 yields an empty
                frame that still has the expected columns.
            M: Number of frameworks. Not used by the generation, which
                iterates over FRAMEWORKS directly; kept so existing calls
                keep working.
        """
        data = []
        for dataset_id in range(N):
            base_entry = SimulatedDataEntry()
            base_entry.generate_dataset_charachteristics(dataset_id)

            # Every framework/time-budget pair gets its own copy of the
            # dataset characteristics plus a simulated accuracy.
            for framework in FRAMEWORKS:
                for time in TIME_BUDGETS:
                    row_entry = SimulatedDataEntry.make_from_data_entry(base_entry)
                    row_entry.generate_framework_performance(framework, time)
                    data.append(row_entry.entry)

        # Previously the header came from a loop variable, which is unbound
        # when the loops never run (e.g. N == 0).
        header = list(data[-1]) if data else self.COLUMNS
        self.df = pd.DataFrame(data, columns=header)
        
    
# Seed Python's `random` module (the source of every simulated value) so that
# Restart Kernel -> Run All regenerates the same benchmark table.
random.seed(42)

simulated_data = SimulatedTrainingData(N, M)
simulated_data.df.head(20)

Unnamed: 0,dataset_id,size,features,numerical,categorical,missing,framework,time_budget,accuracy
0,0,230580,9,0.4,0.6,0.02,AutoSklearn1,15,0.911716
1,0,230580,9,0.4,0.6,0.02,AutoSklearn1,60,0.742363
2,0,230580,9,0.4,0.6,0.02,AutoSklearn1,240,0.888508
3,0,230580,9,0.4,0.6,0.02,TPOT,15,0.835362
4,0,230580,9,0.4,0.6,0.02,TPOT,60,0.704748
5,0,230580,9,0.4,0.6,0.02,TPOT,240,0.762984
6,0,230580,9,0.4,0.6,0.02,FLAML,15,0.736692
7,0,230580,9,0.4,0.6,0.02,FLAML,60,0.740943
8,0,230580,9,0.4,0.6,0.02,FLAML,240,0.920389
9,0,230580,9,0.4,0.6,0.02,LightAutoML,15,0.763096


### Convert Categorical Features To Numerical

In [3]:
from sklearn.preprocessing import LabelEncoder

# Model input: the simulated table without the dataset identifier.
# DataFrame.drop returns a new frame, so simulated_data.df stays untouched.
df_input = simulated_data.df.drop(columns=["dataset_id"])

# Replace framework names with integer labels. The fitted encoder is kept in
# `le` so framework names can be mapped to the same labels later on.
le = LabelEncoder().fit(df_input["framework"])
df_input["framework"] = le.transform(df_input["framework"])

df_input.head(20)


Unnamed: 0,size,features,numerical,categorical,missing,framework,time_budget,accuracy
0,230580,9,0.4,0.6,0.02,0,15,0.911716
1,230580,9,0.4,0.6,0.02,0,60,0.742363
2,230580,9,0.4,0.6,0.02,0,240,0.888508
3,230580,9,0.4,0.6,0.02,3,15,0.835362
4,230580,9,0.4,0.6,0.02,3,60,0.704748
5,230580,9,0.4,0.6,0.02,3,240,0.762984
6,230580,9,0.4,0.6,0.02,1,15,0.736692
7,230580,9,0.4,0.6,0.02,1,60,0.740943
8,230580,9,0.4,0.6,0.02,1,240,0.920389
9,230580,9,0.4,0.6,0.02,2,15,0.763096


### Train/Test Data Split

In [4]:
from sklearn.model_selection import train_test_split

SPLIT_SIZE = 0.2        # fraction of rows held out for testing
SPLIT_SEED = 42         # fixed seed so the split is identical on every run
OUTPUT = "accuracy"     # target column
STRATIFY = "framework"  # column used to stratify the split

#___Split data into training and testing sets
df_train, df_test = train_test_split(
    df_input,
    test_size=SPLIT_SIZE,
    stratify=df_input[STRATIFY],
    random_state=SPLIT_SEED,
)

#___Split training and testing sets into inputs and outputs
# sort=False keeps the original column order, which is the feature order the
# model is trained on.
feature_columns = df_input.columns.difference([OUTPUT], sort=False)

X_train = df_train[feature_columns].to_numpy()
y_train = df_train[OUTPUT].to_numpy()

X_test = df_test[feature_columns].to_numpy()
y_test = df_test[OUTPUT].to_numpy()

print(f"Training set ({100*(1-SPLIT_SIZE)} %):\n - Input size: {X_train.shape}\n - Output size: {y_train.shape}\n")
print(f"Testing set ({100*SPLIT_SIZE} %):\n - Input size: {X_test.shape}\n - Output size: {y_test.shape}\n")


Training set (80.0 %):
 - Input size: (960, 7)
 - Output size: (960,)

Testing set (20.0 %):
 - Input size: (240, 7)
 - Output size: (240,)



### Train Model For Recommending Framework

In [5]:
from sklearn.ensemble import RandomForestRegressor

# Fit the MRF: a random forest regressor that maps the training inputs
# (X_train) to the accuracy values (y_train). `fit` returns the estimator.
mrf = RandomForestRegressor(
    n_estimators=100,
    max_depth=6,
    random_state=42,
).fit(X_train, y_train)

mrf


RandomForestRegressor(max_depth=6, random_state=42)

### Demonstration Of MRF For New Data 

In [6]:
# Draw the characteristics of a random "new" dataset and a random time budget.
size = random.randint(SIZE_MIN, SIZE_MAX)
features = random.randint(FEATURES_MIN, FEATURES_MAX)
numerical = np.round(random.uniform(NUMERICAL_MIN_PERCENTAGE, NUMERICAL_MAX_PERCENTAGE), 2)
categorical = np.round(1 - numerical, 2)
missing = np.round(random.uniform(MISSING_MIN_PERCENTAGE, MISSING_MAX_PERCENTAGE), 2)
time_budget = random.randint(min(TIME_BUDGETS), max(TIME_BUDGETS))

print("Dataset charachteristics:")
print(f"  - Size: {size}")
print(f"  - Number of features: {features}")
print(f"  - Numerical features ratio: {numerical}")
print(f"  - Categorical fetures ratio: {categorical}")
print(f"  - Missing values ratio: {missing}")

print(f"\nTime budget: {time_budget} minutes")

# One input row per candidate framework. Feature order follows the training
# columns: size, features, numerical, categorical, missing, framework, time_budget.
framework_labels = le.transform(FRAMEWORKS)
new_data = np.array([
    [size, features, numerical, categorical, missing, framework_label, time_budget]
    for framework_label in framework_labels
])
mrf_predictions = mrf.predict(new_data)

# Rank on the raw predictions and round only afterwards for display. Rounding
# first can turn nearly equal scores into ties, which would then be ordered
# by their position in FRAMEWORKS instead of by the model.
df_rankings = (
    pd.DataFrame({"framework": FRAMEWORKS, "predicted_accuracy": mrf_predictions})
    .sort_values("predicted_accuracy", ascending=False, kind="stable")
    .assign(predicted_accuracy=lambda ranked: ranked["predicted_accuracy"].round(3))
    .reset_index(drop=True)
)
df_rankings.index = pd.RangeIndex(start=1, stop=len(df_rankings) + 1, name="rank")

print("\nFramework rankings: ")
df_rankings


Dataset charachteristics:
  - Size: 3091
  - Number of features: 50
  - Numerical features ratio: 0.1
  - Categorical fetures ratio: 0.9
  - Missing values ratio: 0.01

Time budget: 224 minutes

Framework rankings: 


Unnamed: 0_level_0,framework,predicted_accuracy
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,FLAML,0.839
2,AutoSklearn1,0.838
3,TPOT,0.837
4,LightAutoML,0.836


### Saving the MRF

In [7]:
import joblib

import os

# Store the model together with the label encoder: the encoder is needed to
# turn framework names into the integer labels the model was trained on.
mrf_binary = {
    "model": mrf,
    "label_encoder": le,
}

# Make sure the target directory exists, and build the path with os.path.join
# so it does not depend on MRF_PATH ending with a slash.
os.makedirs(MRF_PATH, exist_ok=True)
filepath = os.path.join(MRF_PATH, MRF_NAME)
joblib.dump(mrf_binary, filepath)

['/automl-user-side/autoML_service/frs/mrf/mrf_classification.mrf']

### Testing saved MRF

In [8]:
import os

# Load the saved bundle back from disk.
# NOTE: joblib.load is pickle-based, so only load files from a trusted source.
filepath = os.path.join(MRF_PATH, MRF_NAME)
mrf_loaded = joblib.load(filepath)
mrf_model = mrf_loaded["model"]
mrf_label_encoder = mrf_loaded["label_encoder"]

# Draw the characteristics of a random "new" dataset and a random time budget.
size = random.randint(SIZE_MIN, SIZE_MAX)
features = random.randint(FEATURES_MIN, FEATURES_MAX)
numerical = np.round(random.uniform(NUMERICAL_MIN_PERCENTAGE, NUMERICAL_MAX_PERCENTAGE), 2)
categorical = np.round(1 - numerical, 2)
missing = np.round(random.uniform(MISSING_MIN_PERCENTAGE, MISSING_MAX_PERCENTAGE), 2)
time_budget = random.randint(min(TIME_BUDGETS), max(TIME_BUDGETS))

print("Dataset charachteristics:")
print(f"  - Size: {size}")
print(f"  - Number of features: {features}")
print(f"  - Numerical features ratio: {numerical}")
print(f"  - Categorical fetures ratio: {categorical}")
print(f"  - Missing values ratio: {missing}")

print(f"\nTime budget: {time_budget} minutes")

# One input row per candidate framework, encoded with the *loaded* encoder.
# Feature order: size, features, numerical, categorical, missing, framework, time_budget.
framework_labels = mrf_label_encoder.transform(FRAMEWORKS)
new_data = np.array([
    [size, features, numerical, categorical, missing, framework_label, time_budget]
    for framework_label in framework_labels
])
mrf_predictions = mrf_model.predict(new_data)

# Rank on the raw predictions and round only afterwards for display. Rounding
# first can turn nearly equal scores into ties, which would then be ordered
# by their position in FRAMEWORKS instead of by the model.
df_rankings = (
    pd.DataFrame({"framework": FRAMEWORKS, "predicted_accuracy": mrf_predictions})
    .sort_values("predicted_accuracy", ascending=False, kind="stable")
    .assign(predicted_accuracy=lambda ranked: ranked["predicted_accuracy"].round(3))
    .reset_index(drop=True)
)
df_rankings.index = pd.RangeIndex(start=1, stop=len(df_rankings) + 1, name="rank")

print("\nFramework rankings: ")
df_rankings

Dataset charachteristics:
  - Size: 342668
  - Number of features: 6
  - Numerical features ratio: 0.7
  - Categorical fetures ratio: 0.3
  - Missing values ratio: 0.05

Time budget: 102 minutes

Framework rankings: 


Unnamed: 0_level_0,framework,predicted_accuracy
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,TPOT,0.854
2,LightAutoML,0.85
3,FLAML,0.844
4,AutoSklearn1,0.84
