In [44]:
""" Example script for defining and using custom models in AutoGluon Tabular """

from autogluon.core.utils import infer_problem_type
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config
from autogluon.core.data import LabelCleaner
from autogluon.core.models import AbstractModel


In [48]:
import torch
import torch.nn as nn
from autogluon.core.models import AbstractModel
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import precision_score


class ConvNet(nn.Module):
    def __init__(self, input_channels,seq_len):
        super(ConvNet, self).__init__()
        self.input_channels = input_channels
        self.seq_len = seq_len

        # Input layer
        self.conv1 = nn.Conv1d(input_channels, 4, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(4)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv1d(4, 16, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(16)
        self.relu2 = nn.ReLU()
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(32)
        self.relu3 = nn.ReLU()

        
        # Hidden layer
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(32*seq_len, 128)
        self.bn5 = nn.BatchNorm1d(128)
        self.relu5 = nn.ReLU()
        self.fc2 = nn.Linear(128, 16)
        self.bn6 = nn.BatchNorm1d(16)
        self.relu6 = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        
        # Output layer
        self.fc3 = nn.Linear(16, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)

        x = self.flatten(x)
        x = self.fc1(x)
        x = self.bn5(x)
        x = self.relu5(x)
        x = self.fc2(x)
        x = self.bn6(x)
        x = self.relu6(x)
        x = self.dropout(x)
        
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x
    
    def fit(self,  X: pd.DataFrame, y: pd.Series, X_val=None, y_val=None, time_limit=None, batch_size=256, epochs=3, lr=1e-4):
        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        for epoch in range(epochs):
            running_loss = 0.0
            for i in range(0, len(X), batch_size):
                inputs = X[i:i+batch_size]
                labels = y[i:i+batch_size]
                inputs = torch.unsqueeze(inputs, dim=1)
                labels = torch.unsqueeze(labels, dim=1)
                optimizer.zero_grad()
                
                # Forward pass
                outputs = self(inputs)
                loss = criterion(outputs, labels)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()
                
                running_loss += loss.item()

            # Print average loss for the epoch
            print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(X)}")

    
class ConvNetModel(AbstractModel):
    # The `_preprocess` method takes the input data and transforms it to the internal representation usable by the model.
    # `_preprocess` is called by `preprocess` and is used during model fit and model inference.
    def _preprocess(self, X, **kwargs):
        # Perform preprocessing steps such as imputation, scaling, and one-hot encoding.
        # Add any custom preprocessing steps if needed.
        print(f'Before {self.__class__.__name__} Preprocessing ({len(X.columns)} features):\n\t')
        X = super()._preprocess(X, **kwargs)
        print(f'After  {self.__class__.__name__} Preprocessing ({len(X.columns)} features):\n\t')
        return X

            
    def _fit(self, X, y, batch_size=128, epochs=10, lr=0.00001,  **kwargs):
        X = torch.tensor(X.values, dtype=torch.float32)
        y = torch.tensor(y.values, dtype=torch.float32)
        seq_len = X.shape[1]
        params = self._get_model_params()
        self.model = ConvNet(input_channels = 1, seq_len=seq_len)
        print(f'Hyperparameters: {params}')
        self.model.fit(X, y, **params)

            
    def _set_default_params(self):
        default_params = {
            'batch_size': 64,
            'lr': 1e-5,
            'epochs': 10,
        }
        for param, val in default_params.items():
            self._set_default_param_value(param, val)
            
#     def _get_default_searchspace(self):
#         space = {
#                 "batch_size":128, 
#                 'lr': Real(1e-4, 1e-2, default=5e-4, log=True),
#                 "epochs":5, 
#         }
#         return space


    def predict_proba(self, X):
        X_test = torch.tensor(X.values, dtype=torch.float32)
        X_test = X_test.unsqueeze(1)  # Adjust the input shape for Conv1d
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(X_test)
            predictions = outputs.squeeze().numpy()
        return predictions

        def predict(self, X):
            proba = self.predict_proba(X)
        return np.round(proba).astype(int)
            

In [52]:
################
# Loading Data #
################

from autogluon.core.utils.loaders import load_pd
import pandas as pd
train_data  = load_pd.load('./train.csv')
test_data  = load_pd.load('./test.csv')
# sub dataset

concatenated_df = pd.concat([train_data,test_data], axis=0)
label = 'solubility'
print(concatenated_df.shape)

Loaded data from: ./train.csv | Columns = 4 / 4 | Rows = 11224 -> 11224
Loaded data from: ./test.csv | Columns = 3 / 3 | Rows = 1323 -> 1323


(12547, 4)


In [None]:
from pandas import DataFrame
import torch
from autogluon.features.generators import AbstractFeatureGenerator
from autogluon.common.features.types import R_INT,R_FLOAT,R_OBJECT,R_CATEGORY,S_TEXT_AS_CATEGORY 
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pad_sequence

class one_hot_Generator(AbstractFeatureGenerator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
        # Here we can specify any logic we want to make a stateful feature generator based on the data.
        # Just call _transform since this isn't a stateful feature generator.
        X_out = self._transform(X)
        # return the output and the new special types of the data. For this generator, we don't add any new special types, so just return the input special types
        return X_out, self.feature_metadata_in.type_group_map_special

    def _transform(self, X: DataFrame) -> DataFrame:
        # Here we can specify the logic taken to convert input data to output data post-fit. Here we can reference any variables created during fit if the generator is stateful.
        # Because this feature generator is not stateful, we simply add k to all features.
        letter_to_int = {'C': 0, 'P': 1, 'R': 2, 'N': 3, 'F': 4, 'K': 5, 'A': 6, 'H': 7, 'Y': 8, 'V': 9, 'L': 10, 'D': 11, 'G': 12, 'E': 13, 'Q': 14, 'M': 15, 'T': 16, 'S': 17, 'I': 18, 'W': 19, 'X':20}

        def one_hot_encoding(sequence,letter_to_int):
            letter_sequence = [letter_to_int[letter] for letter in sequence]

            encoded_tensor  = torch.zeros((len(letter_to_int),len(sequence)), dtype=torch.int64)
            for i in range(len(letter_sequence)):
                encoded_tensor[letter_sequence[i],i] = 1
            return encoded_tensor
        
        # Convert the protein sequences to one-hot encoding
        
        one_hot_df = pd.DataFrame()

        # get the first column 
        column = X.iloc[:, 0]
        sequences = column.tolist()
        one_hot_seqs = []
        for seq in sequences:
            one_hot_seq = one_hot_encoding(seq,letter_to_int)
            one_hot_seqs.append(one_hot_seq.flatten().numpy())

        #print("one_hot_seqs size",one_hot_seqs.shape)
        # Create a dataframe with separate columns for each amino acid position
        max_length = max(len(seq) for seq in sequences)
        column_name = [f'aa{i}_{aa}' for aa in letter_to_int for i in range(1, max_length+1) ]


        #print("one_hot_seqs shape",one_hot_seqs.shape)
        df = pd.DataFrame(one_hot_seqs,columns = column_name)
        df  = df.fillna(value=0).astype("bool")
        return df

    @staticmethod
    def get_default_infer_features_in_args() -> dict:
        default_infer_features = dict(valid_raw_types=[R_OBJECT]) 
        print(default_infer_features)
        return default_infer_features  # This limits input features to only integers. We can assume that the input to _fit_transform and _transform only contain the data post-applying this filter.


In [56]:
from autogluon.features.generators import CategoryFeatureGenerator, AsTypeFeatureGenerator, BulkFeatureGenerator, DropUniqueFeatureGenerator, FillNaFeatureGenerator, PipelineFeatureGenerator, OneHotEncoderFeatureGenerator,IdentityFeatureGenerator
import copy
import sys
sys.path.append('/user/mahaohui/autoML/autogluon_examples')
from feature_generator import count_charge_Generator, net_charge_Generator, one_hot_Generator


train_feature_generator = PipelineFeatureGenerator(
    generators=[
        # Stage 1: Convert feature types to be the same as during fit. Does not need to be specified.
        # Stage 2: Fill NaN values of data. Does not need to be specified.
        [  # Stage 3: Add 5 to all int features and convert all object features to category features. Concatenate the outputs of each.
            # count_charge_Generator(),
            # net_charge_Generator(),
            one_hot_Generator(verbosity=3,features_in=['seq'],seq_type="protein"),
            #OneHotEncoderFeatureGenerator(),
            #CategoryFeatureGenerator(),
            IdentityFeatureGenerator(infer_features_in_args=dict(
                valid_raw_types=[R_INT, R_FLOAT])),
        ],
        # Stage 4: Drop any features which are always the same value (useless). Does not need to be specified.
     ],
    verbosity=3
)
one_hot_all_data = train_feature_generator.fit_transform(X=concatenated_df)
print(one_hot_all_data)

Fitting PipelineFeatureGenerator...
	Available Memory:                    718005.35 MB
	Train Data (Original)  Memory Usage: 4.79 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting one_hot_Generator...


{'valid_raw_types': ['object']}


			Types of features in original data (raw dtype, special dtypes):
				('object', []) : 1 | ['seq']
			Types of features in processed data (raw dtype, special dtypes):
				('bool', []) : 17010 | ['aa1_C', 'aa2_C', 'aa3_C', 'aa4_C', 'aa5_C', ...]
			448.9s = Fit runtime
			1 features in original data used to generate 17010 features in processed data.
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Unused Original Features (Count: 1): ['sid']
		These features were not used to generate any of the output features. Add a feature generator compatible with these features to utilize them.
		Features can also be unused if they carry very little information, such as being categorical but having almost entirely unique values or being duplicates of other features.
		These features do not need to be present at inference time.
		('object', []) : 1 | ['sid']
	Types of features in origi

      aa2_C  aa3_C  aa4_C  aa5_C  aa6_C  aa7_C  aa8_C  aa9_C  aa10_C  aa11_C  \
0     False  False  False  False  False  False  False  False   False   False   
1     False  False  False  False  False  False  False  False   False   False   
2     False  False  False  False  False  False  False  False   False   False   
3     False  False  False  False  False  False  False  False   False   False   
4     False  False  False  False  False  False  False  False   False   False   
...     ...    ...    ...    ...    ...    ...    ...    ...     ...     ...   
1318  False  False  False  False  False  False  False  False   False   False   
1319  False  False  False  False  False  False  False  False   False   False   
1320  False  False  False  False  False  False  False  False   False   False   
1321  False  False  False  False  False  False  False  False   False   False   
1322  False  False  False  False  False  False  False  False   False   False   

      ...  aa521_W  aa527_W  aa534_W  a

In [57]:
one_hot_train_data = one_hot_all_data[:len(train_data)]
one_hot_test_data = one_hot_all_data[len(train_data):]
print(one_hot_train_data.shape)
print(one_hot_test_data.shape)

(11224, 10783)
(1323, 10783)


In [58]:
one_hot_valid_data1 = one_hot_train_data[one_hot_train_data["fold"] ==0.0]
one_hot_train_data1 = one_hot_train_data[one_hot_train_data["fold"] !=0.0]

In [59]:
one_hot_train_data1 = one_hot_train_data1.drop(["fold"],axis=1)
one_hot_valid_data1 = one_hot_valid_data1.drop(["fold"],axis=1)

In [70]:
print(one_hot_train_data1.shape)
print(one_hot_valid_data1.shape)

(8281, 10782)
(2943, 10782)


In [None]:
from autogluon.common import space
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config

custom_hyperparameters = get_hyperparameter_config('default')

# custom_hyperparameters[CustomRandomForestModel] = best_model_info['hyperparameters']

print(custom_hyperparameters)

In [12]:
# train customised model with other models together 
from autogluon.common import space

ConvNetoptions =  {
    "batch_size":space.Categorical(128), 
    'lr': space.Real(1e-5, 1e-2, default=5e-4, log=True),
    "epochs":10, 
}

custom_hyperparameters = {ConvNetModel: ConvNetoptions}

tune_kwargs = {
                  "searcher": "bayes", # bayes
                  "scheduler": "local",
                  "num_trials": 10,
              }
predictor = TabularPredictor(label=label)
predictor.fit(train_data=one_hot_train_data1, 
              tuning_data=one_hot_valid_data1, 
              hyperparameters=custom_hyperparameters, 
              hyperparameter_tune_kwargs= tune_kwargs,
              feature_generator=None, )  

No path specified. Models will be saved in: "AutogluonModels/ag-20230630_020944/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230630_020944/"
AutoGluon Version:  0.8.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Disk Space Avail:   200565.61 GB / 595935.30 GB (33.7%)
Train Data Rows:    8281
Train Data Columns: 10781
Tuning Data Rows:    2943
Tuning Data Columns: 10781
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during pre

Hyperparameters: {'batch_size': 128, 'lr': 0.0005, 'epochs': 10}
Epoch 1/10, Loss: 0.0056418848057873455
Epoch 2/10, Loss: 0.004856681927378203
Epoch 3/10, Loss: 0.004068589227784876
Epoch 4/10, Loss: 0.0033785005638441333
Epoch 5/10, Loss: 0.002801516175975461
Epoch 6/10, Loss: 0.0022911998981900326
Epoch 7/10, Loss: 0.0019319511211345973
Epoch 8/10, Loss: 0.0016738284438175505
Epoch 9/10, Loss: 0.001468047481662279
Epoch 10/10, Loss: 0.0013163420543220363


 20%|██████████▍                                         | 1/5 [02:45<11:00, 165.19s/it]

Hyperparameters: {'batch_size': 128, 'lr': 0.0012520653814999461, 'epochs': 10}
Epoch 1/10, Loss: 0.005437165595903271
Epoch 2/10, Loss: 0.004641821571283763
Epoch 3/10, Loss: 0.0037313649265997336
Epoch 4/10, Loss: 0.0027366700490997144
Epoch 5/10, Loss: 0.0020186217881216404
Epoch 6/10, Loss: 0.0016957681840664666
Epoch 7/10, Loss: 0.0014722569427359524
Epoch 8/10, Loss: 0.0012199352869318384
Epoch 9/10, Loss: 0.0009869540445601043
Epoch 10/10, Loss: 0.000877695745866003


 40%|████████████████████▊                               | 2/5 [05:42<08:36, 172.19s/it]

Hyperparameters: {'batch_size': 128, 'lr': 0.0026938830192854094, 'epochs': 10}
Epoch 1/10, Loss: 0.0052487633248743636
Epoch 2/10, Loss: 0.004758157975977997
Epoch 3/10, Loss: 0.0037370661745726695
Epoch 4/10, Loss: 0.0027313226591689967
Epoch 5/10, Loss: 0.0021009648833034723
Epoch 6/10, Loss: 0.0017860730015242564
Epoch 7/10, Loss: 0.0015121622677083148
Epoch 8/10, Loss: 0.0012387363992513573
Epoch 9/10, Loss: 0.0009716573903352191
Epoch 10/10, Loss: 0.0008301536837481831


 60%|███████████████████████████████▏                    | 3/5 [08:39<05:48, 174.39s/it]

Hyperparameters: {'batch_size': 128, 'lr': 0.0016051911333587626, 'epochs': 10}
Epoch 1/10, Loss: 0.005173405311401592
Epoch 2/10, Loss: 0.0046976013447607564
Epoch 3/10, Loss: 0.0038533715821164713
Epoch 4/10, Loss: 0.002889528865184261
Epoch 5/10, Loss: 0.002299545788272627
Epoch 6/10, Loss: 0.0020266810616294106
Epoch 7/10, Loss: 0.001740443860904388
Epoch 8/10, Loss: 0.001520208734602848
Epoch 9/10, Loss: 0.0012750837332441414
Epoch 10/10, Loss: 0.0011247650199128786


 80%|█████████████████████████████████████████▌          | 4/5 [11:35<02:55, 175.06s/it]

Hyperparameters: {'batch_size': 128, 'lr': 0.0012296071107325704, 'epochs': 10}
Epoch 1/10, Loss: 0.005581191471126572
Epoch 2/10, Loss: 0.004676360688683434
Epoch 3/10, Loss: 0.003548316672838421
Epoch 4/10, Loss: 0.0027399561159306427
Epoch 5/10, Loss: 0.0022333456782986038
Epoch 6/10, Loss: 0.001768866059731464
Epoch 7/10, Loss: 0.001463095377377159
Epoch 8/10, Loss: 0.0012938581958598467
Epoch 9/10, Loss: 0.0010961500327017928
Epoch 10/10, Loss: 0.0009896852964077173


100%|████████████████████████████████████████████████████| 5/5 [14:33<00:00, 174.68s/it]
Fitted model: ConvNetModel/T1 ...
	0.6983	 = Validation score   (accuracy)
	158.21s	 = Training   runtime
	4.42s	 = Validation runtime
Fitted model: ConvNetModel/T2 ...
	0.6772	 = Validation score   (accuracy)
	170.59s	 = Training   runtime
	4.57s	 = Validation runtime
Fitted model: ConvNetModel/T3 ...
	0.6922	 = Validation score   (accuracy)
	170.71s	 = Training   runtime
	4.47s	 = Validation runtime
Fitted model: ConvNetModel/T4 ...
	0.6898	 = Validation score   (accuracy)
	170.13s	 = Training   runtime
	4.09s	 = Validation runtime
Fitted model: ConvNetModel/T5 ...
	0.6867	 = Validation score   (accuracy)
	172.35s	 = Training   runtime
	4.26s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.704	 = Validation score   (accuracy)
	0.53s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 922.85s ... Best model: "WeightedEnsemble_L2"
TabularPr

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2af286668f70>

In [13]:
predictor.evaluate(one_hot_test_data, silent=True)

{'accuracy': 0.6092214663643235,
 'balanced_accuracy': 0.554387385741305,
 'mcc': 0.11600797649894112,
 'roc_auc': 0.6120068542871393,
 'f1': 0.7110117384013415,
 'precision': 0.6715945089757128,
 'recall': 0.7553444180522565}

In [14]:
predictor.evaluate(one_hot_valid_data1, silent=True)

{'accuracy': 0.7040434930343187,
 'balanced_accuracy': 0.5823321068170801,
 'mcc': 0.20353966510956048,
 'roc_auc': 0.6437852531617679,
 'f1': 0.8074286977669688,
 'precision': 0.7422764227642277,
 'recall': 0.8851187590887057}

In [15]:
leaderboard_hpo = predictor.leaderboard(one_hot_test_data, silent=True)

In [16]:
best_model_name = leaderboard_hpo[leaderboard_hpo['stack_level'] == 1]['model'].iloc[0]

predictor_info = predictor.info()
best_model_info = predictor_info['model_info'][best_model_name]

print(best_model_info)

print(f'Best Model Hyperparameters ({best_model_name}):')
print(best_model_info['hyperparameters'])

{'name': 'ConvNetModel/T3', 'model_type': 'ConvNetModel', 'problem_type': 'binary', 'eval_metric': 'accuracy', 'stopping_metric': 'accuracy', 'fit_time': 170.71413469314575, 'num_classes': 2, 'quantile_levels': None, 'predict_time': 4.474642038345337, 'val_score': 0.6921508664627931, 'hyperparameters': {'batch_size': 128, 'lr': 0.0026938830192854094, 'epochs': 10}, 'hyperparameters_fit': {}, 'hyperparameters_nondefault': ['batch_size', 'lr', 'epochs'], 'ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None}, 'num_features': 10781, 'features': ['aa2_C', 'aa3_C', 'aa4_C', 'aa5_C', 'aa6_C', 'aa7_C', 'aa8_C', 'aa9_C', 'aa10_C', 'aa11_C', 'aa12_C', 'aa13_C', 'aa14_C', 'aa15_C', 'aa16_C',

In [29]:
from autogluon.tabular import TabularDataset, TabularPredictor

custom_hyperparameters = {ConvNetModel: {}}
# custom_hyperparameters = {NaiveBayesModel: [{}, {'var_smoothing': 0.00001}, {'var_smoothing': 0.000002}]}  # Train 3 NaiveBayes models with different hyperparameters
predictor1 = TabularPredictor(label=label, eval_metric="pearsonr")
predictor1.fit(train_data=one_hot_train_data1, tuning_data=one_hot_valid_data1, hyperparameters=custom_hyperparameters,feature_generator=None,time_limit=1800)

No path specified. Models will be saved in: "AutogluonModels/ag-20230630_023946/"
Beginning AutoGluon training ... Time limit = 1800s
AutoGluon will save models to "AutogluonModels/ag-20230630_023946/"
AutoGluon Version:  0.8.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Disk Space Avail:   200545.73 GB / 595935.30 GB (33.7%)
Train Data Rows:    8281
Train Data Columns: 10781
Tuning Data Rows:    2943
Tuning Data Columns: 10781
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to prepr

Hyperparameters: {'batch_size': 64, 'lr': 1e-05, 'epochs': 10}
Epoch 1/10, Loss: 0.011847384748158284
Epoch 2/10, Loss: 0.010376686484697662
Epoch 3/10, Loss: 0.009373898426845243
Epoch 4/10, Loss: 0.008750129711972013
Epoch 5/10, Loss: 0.008285377631223716
Epoch 6/10, Loss: 0.007975237364642878
Epoch 7/10, Loss: 0.007706386104987276
Epoch 8/10, Loss: 0.007495529293769254
Epoch 9/10, Loss: 0.007374110414474832
Epoch 10/10, Loss: 0.007198305740950693


	0.2583	 = Validation score   (pearsonr)
	275.8s	 = Training   runtime
	4.22s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 1513.13s of remaining time.
	0.2583	 = Validation score   (pearsonr)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 288.74s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20230630_023946/")


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2af2884674c0>

In [30]:
predictor1.evaluate(one_hot_test_data, silent=True)

{'pearsonr': 0.20905030079025533,
 'accuracy': 0.6039304610733183,
 'balanced_accuracy': 0.6086142784480076,
 'mcc': 0.2090503007902555,
 'roc_auc': 0.6358413044873852,
 'f1': 0.6552631578947369,
 'precision': 0.7345132743362832,
 'recall': 0.5914489311163895}

In [31]:
predictor1.evaluate(one_hot_valid_data1, silent=True)

{'pearsonr': 0.258276471621472,
 'accuracy': 0.671423717295277,
 'balanced_accuracy': 0.6349802251795709,
 'mcc': 0.25827647162147194,
 'roc_auc': 0.6645080531441414,
 'f1': 0.7558697298661954,
 'precision': 0.7887249736564805,
 'recall': 0.7256422685409598}

In [32]:
from autogluon.common import space

ConvNetoptions =  {
    "batch_size":space.Categorical(128), 
    'lr': space.Real(1e-4, 1e-2, default=5e-4, log=True),
    "epochs":10, 
}
hyperparameters = get_hyperparameter_config('default')
custom_hyperparameters = {ConvNetModel: ConvNetoptions}
hyperparameters["ConvNetModel"] = ConvNetoptions
print()

tune_kwargs = {
                  "searcher": "bayes", # bayes
                  "scheduler": "local",
                  "num_trials": 5,
              }
predictor = TabularPredictor(label=label)
predictor.fit(train_data=one_hot_train_data1, 
              tuning_data=one_hot_valid_data1, 
              hyperparameters=custom_hyperparameters, 
              hyperparameter_tune_kwargs= "auto",
              feature_generator=None, )  

No path specified. Models will be saved in: "AutogluonModels/ag-20230630_025230/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230630_025230/"
AutoGluon Version:  0.8.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Disk Space Avail:   200543.49 GB / 595935.30 GB (33.7%)
Train Data Rows:    8281
Train Data Columns: 10781
Tuning Data Rows:    2943
Tuning Data Columns: 10781
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during pre

Hyperparameters: {'batch_size': 128, 'lr': 0.0005, 'epochs': 10}
Epoch 1/10, Loss: 0.005282002404582283
Epoch 2/10, Loss: 0.004434149196334772
Epoch 3/10, Loss: 0.0036115422583968006
Epoch 4/10, Loss: 0.0028793907060440057
Epoch 5/10, Loss: 0.0023020511609375613
Epoch 6/10, Loss: 0.0019339724480853882
Epoch 7/10, Loss: 0.0016982740731706308
Epoch 8/10, Loss: 0.0014651577738428042
Epoch 9/10, Loss: 0.001308158286569948
Epoch 10/10, Loss: 0.0011819134653154426


100%|███████████████████████████████████████████████████████████████████████████████████████| 1/1 [05:19<00:00, 319.28s/it]
Fitted model: ConvNetModel/T1 ...
	0.6721	 = Validation score   (accuracy)
	313.71s	 = Training   runtime
	4.23s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.6721	 = Validation score   (accuracy)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 342.33s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20230630_025230/")


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2af29a1e75b0>

In [33]:
predictor1.evaluate(one_hot_test_data, silent=True)

{'pearsonr': 0.20905030079025533,
 'accuracy': 0.6039304610733183,
 'balanced_accuracy': 0.6086142784480076,
 'mcc': 0.2090503007902555,
 'roc_auc': 0.6358413044873852,
 'f1': 0.6552631578947369,
 'precision': 0.7345132743362832,
 'recall': 0.5914489311163895}

In [34]:
predictor1.evaluate(one_hot_valid_data1, silent=True)

{'pearsonr': 0.258276471621472,
 'accuracy': 0.671423717295277,
 'balanced_accuracy': 0.6349802251795709,
 'mcc': 0.25827647162147194,
 'roc_auc': 0.6645080531441414,
 'f1': 0.7558697298661954,
 'precision': 0.7887249736564805,
 'recall': 0.7256422685409598}

In [None]:
custom_hyperparameters = get_hyperparameter_config('default')

custom_hyperparameters[CustomRandomForestModel] = best_model_info['hyperparameters']

In [38]:
hyperparameters = get_hyperparameter_config('default')

{'XT': [{'criterion': 'gini',
   'ag_args': {'name_suffix': 'Gini',
    'problem_types': ['binary', 'multiclass']}},
  {'criterion': 'entropy',
   'ag_args': {'name_suffix': 'Entr',
    'problem_types': ['binary', 'multiclass']}},
  {'criterion': 'squared_error',
   'ag_args': {'name_suffix': 'MSE',
    'problem_types': ['regression', 'quantile']}}]}

In [71]:
from autogluon.common import space

ConvNetoptions =  {
    "batch_size":128, 
    'lr': space.Real(1e-6, 1e-1, default=5e-4, log=True),
    "epochs":10, 
}
tune_kwargs = {
                  "searcher": "grid", # bayes
                  "scheduler": "local",
                  "num_trials": 10,
              }

custom_hyperparameters = {ConvNetModel: ConvNetoptions}
hyperparameters = get_hyperparameter_config('default')
hyperparameters[ConvNetModel] = ConvNetoptions
print(hyperparameters)


{'NN_TORCH': {}, 'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'], 'CAT': {}, 'XGB': {}, 'FASTAI': {}, 'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], 'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], 'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}], <class '__main__.ConvNetModel'>: {'batch_size': 128, 'lr': Real: lower=1e-06, upp

In [68]:
predictor = TabularPredictor(label=label)
predictor.fit(train_data=one_hot_train_data1, 
              tuning_data=one_hot_valid_data1, 
              hyperparameters=custom_hyperparameters, 
              hyperparameter_tune_kwargs= tune_kwargs,
              feature_generator=None, )  


No path specified. Models will be saved in: "AutogluonModels/ag-20230630_083935/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230630_083935/"
AutoGluon Version:  0.8.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Disk Space Avail:   200382.14 GB / 595935.30 GB (33.6%)
Train Data Rows:    8281
Train Data Columns: 10781
Tuning Data Rows:    2943
Tuning Data Columns: 10781
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during pre

ValueError: AutoGluon did not successfully train any models

In [33]:
predictor.evaluate(one_hot_test_data, silent=True)

{'accuracy': 0.76,
 'balanced_accuracy': 0.5,
 'mcc': 0.0,
 'roc_auc': 0.44243421052631576,
 'f1': 0.8636363636363636,
 'precision': 0.76,
 'recall': 1.0}

In [34]:
predictor.evaluate(one_hot_valid_data1, silent=True)

{'accuracy': 0.8359788359788359,
 'balanced_accuracy': 0.5,
 'mcc': 0.0,
 'roc_auc': 0.5160269497754186,
 'f1': 0.9106628242074928,
 'precision': 0.8359788359788359,
 'recall': 1.0}

In [None]:
{'name': 'CustomRandomForestModel/T4', 'model_type': 'CustomRandomForestModel', 'problem_type': 'binary', 'eval_metric': 'accuracy', 'stopping_metric': 'accuracy', 'fit_time': 0.5498244762420654, 'num_classes': 2, 'quantile_levels': None, 'predict_time': 0.057376861572265625, 'val_score': 0.855, 'hyperparameters': {'n_estimators': 300, 'n_jobs': -1, 'random_state': 0, 'max_depth': 26, 'max_features': 0.4459435365634299, 'criterion': 'entropy'}, 'hyperparameters_fit': {}, 'hyperparameters_nondefault': ['max_depth', 'max_features', 'criterion', 'n_estimators', 'n_jobs', 'random_state'], 'ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': ['int', 'float', 'category'], 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None}, 'num_features': 14, 'features': ['age', 'fnlwgt', 'education-num', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country'], 'feature_metadata': <autogluon.common.features.feature_metadata.FeatureMetadata object at 0x7f852d7077f0>, 'memory_size': 4331543, 'compile_time': None}
Best Model Hyperparameters (CustomRandomForestModel/T4):
{'n_estimators': 300, 'n_jobs': -1, 'random_state': 0, 'max_depth': 26, 'max_features': 0.4459435365634299, 'criterion': 'entropy'}

In [None]:
{'NN_TORCH': {}, 'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'], 'CAT': {}, 'XGB': {}, 'FASTAI': {}, 'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], 'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], 'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}], <class '__main__.CustomRandomForestModel'>: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 0, 'max_depth': 26, 'max_features': 0.4459435365634299, 'criterion': 'entropy'}}