In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import numpy as np

In [2]:
# Load the training split and keep only a small random subsample so the demo runs quickly.
train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')
subsample_size = 500  # subsample subset of data for faster demo, try setting this to much larger values
train_data = train_data.sample(n=subsample_size, random_state=0)  # fixed seed so the same rows are drawn on every run
print(train_data.head())

# Column to predict. Every later reference goes through `label`, so the target
# can be changed in exactly one place.
label = 'occupation'
print(f"Summary of {label} column: \n", train_data[label].describe())

# The second CSV is split by row position: the first `val_size` rows become the
# validation (tuning) set and the remaining rows become the test set. Using one
# shared split point guarantees the two sets never overlap or leave a gap.
val_size = 5000
new_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
test_data = new_data.iloc[val_size:].copy()  # this should be separate data in your applications
y_test = test_data[label]
test_data_nolabel = test_data.drop(columns=[label])  # delete label column
val_data = new_data.iloc[:val_size].copy()  # validation set

metric = 'accuracy'  # we specify eval-metric just for demo (unnecessary as it's the default)

       age workclass  fnlwgt      education  education-num  \
6118    51   Private   39264   Some-college             10   
23204   58   Private   51662           10th              6   
29590   40   Private  326310   Some-college             10   
18116   37   Private  222450        HS-grad              9   
33964   62   Private  109190      Bachelors             13   

            marital-status        occupation    relationship    race      sex  \
6118    Married-civ-spouse   Exec-managerial            Wife   White   Female   
23204   Married-civ-spouse     Other-service            Wife   White   Female   
29590   Married-civ-spouse      Craft-repair         Husband   White     Male   
18116        Never-married             Sales   Not-in-family   White     Male   
33964   Married-civ-spouse   Exec-managerial         Husband   White     Male   

       capital-gain  capital-loss  hours-per-week  native-country   class  
6118              0             0              40   United-State

In [3]:
import autogluon.core as ag

# Non-default hyperparameters for the neural-network models. Plain values are
# used as-is; `ag.space.*` objects describe a range/set of candidate values.
nn_options = dict(
    num_epochs=10,  # number of training epochs (controls training time of NN models)
    learning_rate=ag.space.Real(1e-4, 1e-2, default=5e-4, log=True),  # real-valued, searched on log-scale
    activation=ag.space.Categorical('relu', 'softrelu', 'tanh'),  # categorical, default = first entry
    layers=ag.space.Categorical([100], [1000], [200, 100], [300, 200, 100]),  # each choice is the list of layer sizes to use
    dropout_prob=ag.space.Real(0.0, 0.5, default=0.1),  # dropout probability (real-valued)
)

# Non-default hyperparameters for the LightGBM gradient boosted trees.
gbm_options = dict(
    num_boost_round=100,  # number of boosting rounds (controls training time of GBM models)
    num_leaves=ag.space.Int(lower=26, upper=66, default=36),  # number of leaves in trees (integer-valued)
)

# One entry per model type; model types whose key is missing are not trained.
hyperparameters = dict(
    GBM=gbm_options,
    NN=nn_options,  # NOTE: comment this line out if you get errors on Mac OSX
)

time_limit = 120  # seconds: train various models for ~2 min
num_trials = 5  # try at most 5 different hyperparameter configurations for each type of model
search_strategy = 'auto'  # to tune hyperparameters using Bayesian optimization routine with a local scheduler

# HPO is not performed unless hyperparameter_tune_kwargs is specified.
hyperparameter_tune_kwargs = dict(
    num_trials=num_trials,
    scheduler='local',
    searcher=search_strategy,
)

# Train with the held-out validation rows as tuning data.
predictor = TabularPredictor(label=label, eval_metric=metric).fit(
    train_data,
    tuning_data=val_data,
    time_limit=time_limit,
    hyperparameters=hyperparameters,
    hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20210628_121224/"
Beginning AutoGluon training ... Time limit = 120s
AutoGluon will save models to "AutogluonModels/ag-20210628_121224/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Tuning Data Rows:    5000
Tuning Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	First 10 (of 15) unique label values:  [' Exec-managerial', ' Other-service', ' Craft-repair', ' Sales', ' Prof-specialty', ' Protective-serv', ' ?', ' Adm-clerical', ' Machine-op-inspct', ' Tech-support']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Fraction of data from classes with at least 10 examples that will be kept for training models: 0.978
Train Data Class Count: 12
Using Feature Generators to

	Time limit exceeded
Fitted model: LightGBM/T0 ...
	0.3033	 = Validation accuracy score
	22.35s	 = Training runtime
	0.09s	 = Validation runtime
Fitted model: LightGBM/T1 ...
	0.2729	 = Validation accuracy score
	28.62s	 = Training runtime
	0.11s	 = Validation runtime
Hyperparameter tuning model: NeuralNetMXNet ...


	Time limit exceeded
Fitted model: NeuralNetMXNet/T0 ...
	0.1704	 = Validation accuracy score
	11.27s	 = Training runtime
	0.95s	 = Validation runtime
Fitted model: NeuralNetMXNet/T1 ...
	0.1333	 = Validation accuracy score
	11.21s	 = Training runtime
	1.07s	 = Validation runtime
Fitted model: NeuralNetMXNet/T2 ...
	0.1359	 = Validation accuracy score
	11.15s	 = Training runtime
	0.89s	 = Validation runtime
Fitted model: NeuralNetMXNet/T3 ...
	0.2264	 = Validation accuracy score
	12.14s	 = Training runtime
	1.05s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 119.84s of the 7.69s of remaining time.
	0.3135	 = Validation accuracy score
	1.37s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 113.7s ...
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210628_121224/")


In [4]:
# Predict on the test features (label column removed) and preview the first five results.
y_pred = predictor.predict(test_data_nolabel)
print("Predictions:  ", y_pred.iloc[:5].tolist())

# Score the predictor on the labelled test rows using only the configured eval metric.
perf = predictor.evaluate(test_data, auxiliary_metrics=False)

Predictions:   [' Exec-managerial', ' Craft-repair', ' Craft-repair', ' Other-service', ' Craft-repair']


Evaluation: accuracy on test data: 0.2885300901656532
Evaluations on test data:
{
    "accuracy": 0.2885300901656532
}


In [5]:
# Print the fit() summary (per-model validation score and timing table, as shown
# in the cell output) and keep whatever fit_summary() returns in `results`.
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                 model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  WeightedEnsemble_L2   0.313512       4.159989  98.109442                0.001482           1.368279            2       True          7
1          LightGBM/T0   0.303260       0.090943  22.352601                0.090943          22.352601            1       True          1
2          LightGBM/T1   0.272914       0.111888  28.624230                0.111888          28.624230            1       True          2
3    NeuralNetMXNet/T3   0.226369       1.050502  12.140137                1.050502          12.140137            1       True          6
4    NeuralNetMXNet/T0   0.170392       0.947632  11.266991                0.947632          11.266991            1       True          3
5    NeuralNetMXNet/T2   0.135944       0.888370  11.150436                0.888370          11.150436        

In [6]:
# Bagging and stack ensembling configured by hand.
predictor = TabularPredictor(label=label, eval_metric=metric).fit(
    train_data,
    num_bag_folds=5,  # number of folds (k) used for k-fold cross-validation bagging
    num_bag_sets=1,
    num_stack_levels=1,
    # The last argument is just for a quick demo here; omit it in real applications.
    hyperparameters={'NN': {'num_epochs': 2}, 'GBM': {'num_boost_round': 20}},
)

No path specified. Models will be saved in: "AutogluonModels/ag-20210628_121427/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20210628_121427/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	First 10 (of 15) unique label values:  [' Exec-managerial', ' Other-service', ' Craft-repair', ' Sales', ' Prof-specialty', ' Protective-serv', ' ?', ' Adm-clerical', ' Machine-op-inspct', ' Tech-support']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Fraction of data from classes with at least 10 examples that will be kept for training models: 0.978
Train Data Class Count: 12
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	

In [7]:
# Predict on the test features (label column removed) and preview the first five results.
y_pred = predictor.predict(test_data_nolabel)
print("Predictions:  ", y_pred.iloc[:5].tolist())

# Score the predictor on the labelled test rows using only the configured eval metric.
perf = predictor.evaluate(test_data, auxiliary_metrics=False)

Predictions:   [' Exec-managerial', ' Sales', ' Craft-repair', ' Adm-clerical', ' ?']


Evaluation: accuracy on test data: 0.2814007129377228
Evaluations on test data:
{
    "accuracy": 0.2814007129377228
}


In [8]:
# Print the fit() summary for the most recently fitted predictor (the cell output
# lists each model with its stack level) and keep the returned value in `results`.
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                   model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0    WeightedEnsemble_L2   0.310838       1.024340   9.257577                0.000692           0.231300            2       True          3
1        LightGBM_BAG_L1   0.306748       0.142460   2.006487                0.142460           2.006487            1       True          1
2        LightGBM_BAG_L2   0.286299       1.139725  10.697642                0.116076           1.671364            2       True          4
3    WeightedEnsemble_L3   0.286299       1.140403  10.926354                0.000678           0.228712            3       True          6
4  NeuralNetMXNet_BAG_L1   0.106339       0.881189   7.019791                0.881189           7.019791            1       True          2
5  NeuralNetMXNet_BAG_L2   0.087935       1.927694  16.310957                0.904046           7.

In [9]:
save_path = 'agModels-predictOccupation'  # folder where to store trained models

# Let AutoGluon pick the bagging/stacking configuration itself via auto_stack.
predictor = TabularPredictor(
    label=label,
    eval_metric=metric,
    path=save_path,
).fit(
    train_data,
    auto_stack=True,
    # The last 2 arguments are for a quick demo; omit them in real applications.
    time_limit=30,
    hyperparameters={'NN': {'num_epochs': 2}, 'GBM': {'num_boost_round': 20}},
)

Beginning AutoGluon training ... Time limit = 30s
AutoGluon will save models to "agModels-predictOccupation/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	First 10 (of 15) unique label values:  [' Exec-managerial', ' Other-service', ' Craft-repair', ' Sales', ' Prof-specialty', ' Protective-serv', ' ?', ' Adm-clerical', ' Machine-op-inspct', ' Tech-support']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Fraction of data from classes with at least 10 examples that will be kept for training models: 0.978
Train Data Class Count: 12
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    209898.9 MB
	Train Data (Original)  

In [10]:
# Reload the predictor from the folder it was saved to, replacing the in-memory one.
predictor = TabularPredictor.load(save_path)  # `predictor.path` is another way to get the relative path needed to later load predictor.

In [11]:
# Predict the label for a single test row.
datapoint = test_data_nolabel.iloc[[0]]  # Note: .iloc[0] won't work because it returns pandas Series instead of DataFrame
print(datapoint)
# Last expression of the cell, so the returned prediction is displayed as the cell output.
predictor.predict(datapoint)

      age workclass  fnlwgt      education  education-num marital-status  \
5000   49   Private  259087   Some-college             10       Divorced   

        relationship    race      sex  capital-gain  capital-loss  \
5000   Not-in-family   White   Female             0             0   

      hours-per-week  native-country   class  
5000              40   United-States   <=50K  


5000     Exec-managerial
Name: occupation, dtype: object

In [12]:
# Class probabilities for the same single row (one column per label class in the output).
predictor.predict_proba(datapoint)  # returns a DataFrame that shows which probability corresponds to which class

Unnamed: 0,?,Adm-clerical,Armed-Forces,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving
5000,0.038825,0.165695,0.0,0.134694,0.229323,0.015379,0.034922,0.050433,0.061511,0.0,0.089848,0.0,0.104733,0.017316,0.057322


In [13]:
# Name of the model the predictor ranks as best (displayed as a string in the cell output).
# NOTE(review): presumably this is the model predict() uses by default - confirm against the AutoGluon docs.
predictor.get_model_best()

'LightGBM_BAG_L1'

In [14]:
# Table of every trained model with its test and validation scores plus timing columns.
# NOTE(review): silent=True presumably suppresses the printed copy so only the returned table is displayed - confirm.
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM_BAG_L1,0.283498,0.314928,0.340698,0.311854,3.515773,0.340698,0.311854,3.515773,1,True,1
1,WeightedEnsemble_L2,0.283498,0.314928,0.343772,0.312542,3.726311,0.003074,0.000688,0.210538,2,True,3
2,NeuralNetMXNet_BAG_L1,0.122038,0.071575,10.59944,1.883255,14.68804,10.59944,1.883255,14.68804,1,True,2
