In [1]:
import pickle
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder

from flaml import AutoML

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Load the raw income dataset; the relative path is resolved against the
# current working directory.
# NOTE(review): the example record displayed later in this notebook shows string
# values with a leading space (e.g. ' State-gov'), so categorical values are used
# exactly as stored in the file - confirm callers of the saved model send the
# same spelling.
data = pd.read_csv('income-data.csv')

In [3]:
# Separate the feature columns from the target column 'IncomeLabel'.
X = data.drop(columns=['IncomeLabel'])
y = data['IncomeLabel']

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=20130810
)

In [5]:
# Training features with the target attached as an extra 'income' column.
# NOTE(review): data_train is not referenced by any other cell visible in this
# notebook - confirm whether it is still needed.
data_train = Xtrain.assign(income=ytrain)

In [6]:
# FLAML search configuration; unpacked as keyword arguments into AutoML.fit().
AUTOML_SETTINGS = dict(
    time_budget=360,                 # total running time in seconds
    metric='f1',                     # metric name handed to FLAML
    estimator_list=['lgbm'],         # only this estimator is searched
    task='classification',           # task type
    log_file_name='openmldata.log',  # flaml log file
    seed=20130810,                   # random seed
    verbose=1,
)

In [7]:
# One-hot encode every object-dtype column and pass all remaining columns
# through unchanged. handle_unknown='ignore' means categories that were not
# seen during fit do not raise at transform time.
ct = ColumnTransformer(
    transformers=[
        (
            'onehot',
            OneHotEncoder(handle_unknown='ignore'),
            make_column_selector(dtype_include=object),
        ),
    ],
    remainder="passthrough",
)

In [8]:
# Fit the column transformer on the training split only, then reuse the fitted
# transformer for the test split so both share the same encoding.
# Naming: X_train / X_test (with underscore) are the encoded matrices, while
# Xtrain / Xtest are the raw feature frames produced by the split.
X_train = ct.fit_transform(Xtrain)
X_test = ct.transform(Xtest)

In [9]:
# Create the FLAML AutoML object; it is configured and trained when fit() is called.
learner_automl = AutoML()

In [10]:
# Run the AutoML search with the settings from AUTOML_SETTINGS.
# The encoded training matrix is converted to a dense array via .toarray()
# before it is handed to FLAML.
learner_automl.fit(
    X_train=X_train.toarray(),
    y_train=ytrain,
    **AUTOML_SETTINGS
)

In [11]:
# Score the trained model on the held-out test split and print the
# per-class precision / recall / F1 table.
print(classification_report(y_true=ytest,
                            y_pred=learner_automl.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90      4976
           1       0.70      0.66      0.68      1537

    accuracy                           0.85      6513
   macro avg       0.80      0.79      0.79      6513
weighted avg       0.85      0.85      0.85      6513



In [15]:
# Take the row with index label 0, without the target column, as an example
# record in {feature name: value} form.
individual = data.drop(columns=['IncomeLabel']).loc[0].to_dict()

In [32]:
# Display the example record to show the input format expected by predict_single.
individual

{'Age': 39.0,
 'Workclass': ' State-gov',
 'Education-Num': 13.0,
 'Marital Status': ' Never-married',
 'Occupation': ' Adm-clerical',
 'Relationship': ' Not-in-family',
 'Race': ' White',
 'Sex': ' Male',
 'Capital Gain': 2174.0,
 'Capital Loss': 0.0,
 'Hours per week': 40.0,
 'Country': ' United-States'}

In [36]:
def predict_single(individual, ct, model):
    """Score a single record and return its probability from column 1.

    Parameters
    ----------
    individual : dict
        Mapping of feature name to value describing one record.
    ct : object
        Fitted transformer; its ``transform`` method is applied to a
        one-row DataFrame built from ``individual``.
    model : object
        Trained classifier exposing ``predict_proba``.

    Returns
    -------
    The value at row 0, column 1 of ``model.predict_proba(...)``, i.e. the
    probability assigned to the second class for this record.
    """
    single_row = pd.DataFrame([individual])
    features = ct.transform(single_row)
    class_probabilities = model.predict_proba(features)
    # Only one row was scored; column 1 is the second class.
    return class_probabilities[0, 1]

In [37]:
# Score the example record with the fitted transformer and the trained model.
predict_single(individual, ct, learner_automl)

0.11004927517910677

In [13]:
# Persist the fitted column transformer together with the trained AutoML
# object as one tuple so both can be restored from a single file.
# Reading it back requires pickle.load, which must only be used on trusted files.
with open('income-prediction-model.bin', mode='wb') as model_file:
    pickle.dump((ct, learner_automl), model_file)