In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor

# 分类

## 读取数据

In [2]:
# Load the training CSV and keep only a small fixed-seed sample so the demo
# trains quickly; try much larger values of subsample_size for real use.
subsample_size = 500
train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
).sample(n=subsample_size, random_state=0)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
6118,51,Private,39264,Some-college,10,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K
23204,58,Private,51662,10th,6,Married-civ-spouse,Other-service,Wife,White,Female,0,0,8,United-States,<=50K
29590,40,Private,326310,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,44,United-States,<=50K
18116,37,Private,222450,HS-grad,9,Never-married,Sales,Not-in-family,White,Male,0,2339,40,El-Salvador,<=50K
33964,62,Private,109190,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,40,United-States,>50K


In [3]:
# Show column dtypes and non-null counts for the sampled training frame.
train_data.info()

<class 'autogluon.core.dataset.TabularDataset'>
Int64Index: 500 entries, 6118 to 24772
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             500 non-null    int64 
 1   workclass       500 non-null    object
 2   fnlwgt          500 non-null    int64 
 3   education       500 non-null    object
 4   education-num   500 non-null    int64 
 5   marital-status  500 non-null    object
 6   occupation      500 non-null    object
 7   relationship    500 non-null    object
 8   race            500 non-null    object
 9   sex             500 non-null    object
 10  capital-gain    500 non-null    int64 
 11  capital-loss    500 non-null    int64 
 12  hours-per-week  500 non-null    int64 
 13  native-country  500 non-null    object
 14  class           500 non-null    object
dtypes: int64(6), object(9)
memory usage: 62.5+ KB


In [4]:
# Name of the target column; later cells reuse `label` when building predictors
# and when splitting the test frame.
label = 'class'
label_summary = train_data[label].describe()
print("Summary of class variable: \n", label_summary)

Summary of class variable: 
 count        500
unique         2
top        <=50K
freq         365
Name: class, dtype: object


## fit()

In [5]:
# Folder where the trained models are written; the evaluation cell reloads from it.
save_path = 'agModels-predictClass'

# Build a predictor for the `label` column and train it on the sampled data.
predictor = (
    TabularPredictor(label=label, path=save_path)
    .fit(train_data)
)

Beginning AutoGluon training ...
AutoGluon will save models to "agModels-predictClass/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [' >50K', ' <=50K']
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 =  >50K, class 0 =  <=50K
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive ( >50K) vs negative ( <=50K) class.
	To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    217296.08 MB
	Trai

## 读取test data

In [6]:
# Load the held-out test CSV, then split it into a label-free feature frame
# (used for inference, to show predictions do not see the answer) and the
# ground-truth labels to score against.
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
test_data_nolab = test_data.drop(columns=label)
y_test = test_data[label]
test_data_nolab.head()

Loaded data from: https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv | Columns = 15 / 15 | Rows = 9769 -> 9769


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,31,Private,169085,11th,7,Married-civ-spouse,Sales,Wife,White,Female,0,0,20,United-States
1,17,Self-emp-not-inc,226203,12th,8,Never-married,Sales,Own-child,White,Male,0,0,45,United-States
2,47,Private,54260,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1887,60,United-States
3,21,Private,176262,Some-college,10,Never-married,Exec-managerial,Own-child,White,Female,0,0,30,United-States
4,17,Private,241185,12th,8,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States


## 评估结果

In [7]:
# Reloading is not required here; it only demonstrates how to restore a
# previously trained predictor from its save folder.
predictor = TabularPredictor.load(save_path)

# Predict on the label-free test frame and show the predicted classes.
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)

# Evaluate the predictions against the held-out labels.
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

Evaluation: accuracy on test data: 0.8397993653393387
Evaluations on test data:
{
    "accuracy": 0.8397993653393387,
    "balanced_accuracy": 0.7437076677780596,
    "mcc": 0.5295565206264157,
    "f1": 0.6242496998799519,
    "precision": 0.7038440714672441,
    "recall": 0.5608283002588438
}


Predictions:  
 0        <=50K
1        <=50K
2         >50K
3        <=50K
4        <=50K
         ...  
9764     <=50K
9765     <=50K
9766     <=50K
9767     <=50K
9768     <=50K
Name: class, Length: 9769, dtype: object


## 查看每个model情况

In [8]:
# Per-model comparison table. test_data still contains the label column, and
# the resulting table includes a score_test column alongside score_val.
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.842666,0.85,0.124808,0.015478,0.438999,0.124808,0.015478,0.438999,1,True,11
1,RandomForestGini,0.841335,0.84,0.231724,0.119614,0.803098,0.231724,0.119614,0.803098,1,True,5
2,RandomForestEntr,0.840721,0.83,0.242968,0.115446,0.678261,0.242968,0.115446,0.678261,1,True,6
3,LightGBM,0.839799,0.85,0.032772,0.028593,2.708905,0.032772,0.028593,2.708905,1,True,4
4,WeightedEnsemble_L2,0.839799,0.85,0.035677,0.029467,3.381279,0.002905,0.000874,0.672374,2,True,14
5,LightGBMXT,0.83939,0.83,0.02231,0.029306,1.312838,0.02231,0.029306,1.312838,1,True,3
6,CatBoost,0.837957,0.84,0.030301,0.014467,1.045222,0.030301,0.014467,1.045222,1,True,7
7,ExtraTreesEntr,0.834783,0.82,0.232491,0.11609,0.676039,0.232491,0.11609,0.676039,1,True,9
8,ExtraTreesGini,0.834476,0.82,0.230504,0.116096,0.70777,0.230504,0.116096,0.70777,1,True,8
9,LightGBMLarge,0.827823,0.83,0.0269,0.036274,0.600974,0.0269,0.036274,0.600974,1,True,13


## predict_proba()

In [9]:
# Predict class probabilities (one column per class) instead of hard labels,
# and preview the first five rows.
pred_probs = predictor.predict_proba(test_data_nolab)
pred_probs.head()

Unnamed: 0,<=50K,>50K
0,0.949797,0.050203
1,0.945973,0.054027
2,0.433299,0.566701
3,0.991393,0.008607
4,0.949908,0.050092


## fit_summary() 查看fit情况

In [10]:
# Print a summary of the fit() run and keep the returned summary object in `results`.
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0               XGBoost       0.85       0.015478   0.438999                0.015478           0.438999            1       True         11
1              LightGBM       0.85       0.028593   2.708905                0.028593           2.708905            1       True          4
2   WeightedEnsemble_L2       0.85       0.029467   3.381279                0.000874           0.672374            2       True         14
3              CatBoost       0.84       0.014467   1.045222                0.014467           1.045222            1       True          7
4       NeuralNetFastAI       0.84       0.039107   5.108123                0.039107           5.108123            1       True         10
5      RandomForestGini       0.84       0.119614   0.803098                0.119614           0.803098 

In [11]:
# Report the problem type AutoGluon inferred from the label column.
print("AutoGluon infers problem type is: ", predictor.problem_type)

# Report how each input column was typed during preprocessing.
print(
    "AutoGluon identified the following types of features:",
    predictor.feature_metadata,
    sep="\n",
)

AutoGluon infers problem type is:  binary
AutoGluon identified the following types of features:
('category', []) : 8 | ['workclass', 'education', 'marital-status', 'occupation', 'relationship', ...]
('int', [])      : 6 | ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', ...]


In [12]:
# Per-model comparison table on the labeled test frame.
# NOTE(review): this repeats the leaderboard call made in the earlier
# per-model section with the same arguments; only the timing columns differ.
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.842666,0.85,0.112451,0.015478,0.438999,0.112451,0.015478,0.438999,1,True,11
1,RandomForestGini,0.841335,0.84,0.229361,0.119614,0.803098,0.229361,0.119614,0.803098,1,True,5
2,RandomForestEntr,0.840721,0.83,0.2336,0.115446,0.678261,0.2336,0.115446,0.678261,1,True,6
3,LightGBM,0.839799,0.85,0.053413,0.028593,2.708905,0.053413,0.028593,2.708905,1,True,4
4,WeightedEnsemble_L2,0.839799,0.85,0.056061,0.029467,3.381279,0.002648,0.000874,0.672374,2,True,14
5,LightGBMXT,0.83939,0.83,0.03311,0.029306,1.312838,0.03311,0.029306,1.312838,1,True,3
6,CatBoost,0.837957,0.84,0.02848,0.014467,1.045222,0.02848,0.014467,1.045222,1,True,7
7,ExtraTreesEntr,0.834783,0.82,0.231794,0.11609,0.676039,0.231794,0.11609,0.676039,1,True,9
8,ExtraTreesGini,0.834476,0.82,0.230221,0.116096,0.70777,0.230221,0.116096,0.70777,1,True,8
9,LightGBMLarge,0.827823,0.83,0.054976,0.036274,0.600974,0.054976,0.036274,0.600974,1,True,13


In [13]:
# Predict with one specific trained model, selected by the name shown in the
# leaderboard table. Uses the label-free frame (test_data_nolab) so this call
# is consistent with the other predict/predict_proba cells, which never hand
# the label column to inference.
predictor.predict(test_data_nolab, model='LightGBM')

0        <=50K
1        <=50K
2         >50K
3        <=50K
4        <=50K
         ...  
9764     <=50K
9765     <=50K
9766     <=50K
9767     <=50K
9768     <=50K
Name: class, Length: 9769, dtype: object

In [14]:
# Training budget in seconds. 60 is for a quick demonstration only; set it to
# the longest time you are willing to wait.
time_limit = 60
# Evaluation metric the predictor is trained and scored with.
metric = 'roc_auc'

# NOTE(review): this rebinds `predictor`, replacing the predictor trained in
# the fit() section; re-running earlier cells afterwards would use this one.
predictor = TabularPredictor(
    label=label,
    eval_metric=metric,
    path='agModels-predictClass_auc',
).fit(
    train_data,
    time_limit=time_limit,
    presets='best_quality',
)
predictor.leaderboard(test_data, silent=True)

Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "agModels-predictClass_auc/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [' >50K', ' <=50K']
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 =  >50K, class 0 =  <=50K
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive ( >50K) vs negative ( <=50K) class.
	To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L1,0.902783,0.887489,0.114899,0.076702,8.371448,0.114899,0.076702,8.371448,1.0,True,7.0
1,LightGBMXT_BAG_L1,0.900161,0.881380,0.336767,0.163652,3.531663,0.336767,0.163652,3.531663,1.0,True,3.0
2,WeightedEnsemble_L2,0.897607,0.900375,3.633572,0.842155,20.818748,0.006412,0.001966,1.863401,2.0,True,13.0
3,LightGBM_BAG_L1,0.892347,0.866991,0.191905,0.261608,5.636509,0.191905,0.261608,5.636509,1.0,True,4.0
4,XGBoost_BAG_L1,0.891681,0.866575,1.012777,0.080954,3.22421,1.012777,0.080954,3.22421,1.0,True,11.0
5,RandomForestEntr_BAG_L1,0.888119,0.886301,0.243448,0.136669,0.685285,0.243448,0.136669,0.685285,1.0,True,6.0
6,RandomForestGini_BAG_L1,0.886598,0.884698,0.228378,0.138456,0.786898,0.228378,0.138456,0.786898,1.0,True,5.0
7,ExtraTreesGini_BAG_L1,0.881065,0.892927,0.234939,0.135098,0.688073,0.234939,0.135098,0.688073,1.0,True,8.0
8,ExtraTreesEntr_BAG_L1,0.880851,0.893912,0.236822,0.1405,0.681697,0.236822,0.1405,0.681697,1.0,True,9.0
9,NeuralNetFastAI_BAG_L1,0.880547,0.860274,2.460286,0.187569,4.997181,2.460286,0.187569,4.997181,1.0,True,10.0


# 回归

In [15]:
# Regression target: the numeric `age` column of the same training frame.
age_column = 'age'
age_summary = train_data[age_column].describe()
print("Summary of age variable: \n", age_summary)

Summary of age variable: 
 count    500.00000
mean      39.65200
std       13.52393
min       17.00000
25%       29.00000
50%       38.00000
75%       49.00000
max       85.00000
Name: age, dtype: float64


In [16]:
# Train a separate predictor with `age` as the label, saved to its own folder
# and capped at 60 seconds of training.
predictor_age = TabularPredictor(label=age_column, path="agModels-predictAge")
predictor_age = predictor_age.fit(train_data, time_limit=60)

# Score the trained predictor on the labeled test frame.
performance = predictor_age.evaluate(test_data)

Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "agModels-predictAge/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == int and many unique label-values observed).
	Label info (max, min, mean, stddev): (85, 17, 39.652, 13.52393)
	If 'regression' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    214910.15 MB
	Train Data (Original)  Memory Usage: 0.32 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator..

In [17]:
# Per-model comparison table for the age predictor on the labeled test frame.
# NOTE(review): scores in the output are negative — presumably a negated error
# metric so that higher is better; confirm against the AutoGluon docs.
predictor_age.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-10.504253,-11.327516,2.812784,0.556022,13.684486,0.008053,0.000758,0.677693,2,True,12
1,ExtraTreesMSE,-10.691115,-11.480752,0.142555,0.114969,0.575395,0.142555,0.114969,0.575395,1,True,7
2,RandomForestMSE,-10.746518,-11.602848,0.136341,0.118001,0.653862,0.136341,0.118001,0.653862,1,True,5
3,LightGBMXT,-10.753344,-11.814712,0.112751,0.039148,0.815584,0.112751,0.039148,0.815584,1,True,3
4,CatBoost,-10.800412,-11.744795,0.063836,0.017125,0.974211,0.063836,0.017125,0.974211,1,True,6
5,LightGBM,-10.972156,-11.929546,0.103317,0.043093,1.013631,0.103317,0.043093,1.013631,1,True,4
6,XGBoost,-11.121008,-12.17427,0.129467,0.017468,0.787892,0.129467,0.017468,0.787892,1,True,9
7,LightGBMLarge,-11.598649,-12.167606,0.093236,0.035035,2.295146,0.093236,0.035035,2.295146,1,True,11
8,NeuralNetMXNet,-13.352706,-13.542799,2.116464,0.20546,8.186219,2.116464,0.20546,8.186219,1,True,10
9,KNeighborsUnif,-14.902058,-15.686937,0.110952,0.108677,0.003114,0.110952,0.108677,0.003114,1,True,1
