### H2O AutoML

In [2]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
# Initialise the H2O session (per the H2O docs this connects to a local
# cluster, starting one if none is reachable).
h2o.init()

In [None]:
# Parse the CSV file into an H2OFrame used by every H2O cell below.
df = h2o.import_file(path='data.csv')

> Data Understanding

In [None]:
# Show the type H2O assigned to each column of the loaded frame.
df.types

In [None]:
# Summary of the frame's columns.
df.describe()

> Model Building

In [None]:
# 70 % train / 15 % test / remaining ~15 % validation. split_frame returns the
# pieces in the order of `ratios`, with the remainder last. The split is
# random, so a fixed seed is required for the split -- and every metric
# computed from it -- to be reproducible across runs.
df_train, df_test, df_valid = df.split_frame(ratios=[.7, .15], seed=10)

In [None]:
# Response column, and the predictors: every other column of the frame.
y = "Churn"
x = list(df.columns)
x.remove(y)   # raises ValueError if the response column is not in the frame

In [None]:
# AutoML configuration: at most 10 models, fixed seed, nfolds=0.
# Excluding algorithms is optional -- drop `exclude_algos` to let AutoML
# consider them as well.
aml = H2OAutoML(
    max_models=10,
    seed=10,
    exclude_algos=["StackedEnsemble", "DeepLearning"],
    verbosity="info",
    nfolds=0,
)


In [None]:
# Run AutoML on the training split, passing the validation split alongside.
aml.train(
    x=x,
    y=y,
    training_frame=df_train,
    validation_frame=df_valid,
)

In [None]:
# Leaderboard: the trained models together with their performance metrics.
lb = aml.leaderboard
lb.head()

In [None]:
# Predict on the test split with the best (leading) model.
best_model = aml.leader
df_pred = best_model.predict(df_test)
df_pred.head()

> Model Evaluation (Performance Analysis)

In [None]:
# Metrics of the leading model computed on the test split.
test_perf = aml.leader.model_performance(df_test)
test_perf

In [None]:
# Fetch the best model of one specific algorithm from the leaderboard.
# The leaderboard is ordered best-first, so the first matching ID is the
# strongest model of that algorithm. The algorithm may be missing altogether
# (e.g. XGBoost is not available in every H2O installation), so report that
# clearly instead of failing with a bare IndexError.
ALGO = "XGBoost"
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])
algo_model_id = next((mid for mid in model_ids if ALGO in mid), None)
if algo_model_id is None:
    raise LookupError(
        f"No {ALGO} model on the leaderboard; available models: {model_ids}"
    )
out = h2o.get_model(algo_model_id)
out

In [None]:
# Confusion matrix of the selected model on the held-out test split.
# Calling `out.confusion_matrix()` with no arguments reports the *training*
# metrics, which overstates how well the model generalises; scoring
# `df_test` explicitly keeps this consistent with the leader evaluation.
out.model_performance(df_test).confusion_matrix()

> Export Model

In [None]:
# Download the leading model as a MOJO into the current directory and show
# the value returned by download_mojo.
mojo_file = aml.leader.download_mojo(path="./")
mojo_file

### PyCaret AutoML

In [None]:
# PyCaret classification workflow on the iris demo dataset.
# For a regression task import the same names from `pycaret.regression`.
# load_iris comes from scikit-learn (a PyCaret dependency); it was used here
# without being imported, so the cell failed with a NameError.
from sklearn.datasets import load_iris
from pycaret.classification import setup, compare_models, create_model, tune_model, plot_model, evaluate_model, save_model

# Features as a DataFrame, with the label appended as the 'target' column.
X, y = load_iris(return_X_y=True, as_frame=True)
X['target'] = y

# Initialise the classification experiment (80 % of the rows for training,
# fixed session_id for reproducibility).
# Optional setup() arguments worth exploring: preprocess, polynomial_features,
# polynomial_degree, fix_imbalance, fix_imbalance_method, feature_selection,
# feature_selection_method, feature_selection_estimator, n_features_to_select.
clf1 = setup(data=X, target='target', train_size = 0.8, session_id = 123)
# all_models = models()   # table of the estimators available in the model library

# Train and rank the library's models; n_select=5 returns the five best
# models as a list (best first).
compare_results = compare_models(n_select=5)

compare_results


In [None]:
# Run evaluate_model on every model returned by compare_models, in order.
for top_model in compare_results:
    evaluate_model(top_model)

In [None]:
# Train, tune, evaluate, finalise and save a single PyCaret model.
import os
from pycaret.classification import finalize_model

# Estimator ID to train; change it to any of the top models from the
# comparison above. `models()` (from pycaret.classification, or
# pycaret.regression for regression tasks) lists every available ID.
MODEL_ID = 'knn'

# Create a model
model = create_model(MODEL_ID)

# Tune the model
tuned_model = tune_model(model)

# Evaluate the model
evaluate_model(tuned_model)

# Fit the model: finalize_model refits the tuned estimator on the complete
# dataset, including the hold-out part. The previous code called tune_model()
# a second time here, which only repeats the hyper-parameter search.
final_model = finalize_model(tuned_model)

# Save the final model in the "models" folder (save_model appends ".pkl").
# The folder is created first so a fresh checkout does not fail on a missing
# directory, and the file is named after the estimator that is actually
# saved -- the old name referred to an ExtraTrees regressor.
os.makedirs('models', exist_ok=True)
model_path = f'models/pycaret_{MODEL_ID}_classifier'
save_model(final_model, model_path)

In [None]:
# EvalML (AutoML using EvalML doesn't just give you the best model, it also
# gives the best pipeline).
# Timeseries (https://evalml.alteryx.com/en/stable/user_guide/timeseries.html?highlight=time%20series#AutoMLSearch-for-time-series-problems)
#
# NOTE(review): this cell follows the EvalML API in which the training data is
# passed to the AutoMLSearch constructor and search() takes no data -- the
# style the second search in this cell already used. Confirm against the
# installed EvalML version.
import os

import evalml
from evalml.automl import AutoMLSearch

# The breast-cancer demo has a two-class target, so the split, the search and
# the scoring objectives must all be 'binary'. The previous code split and
# searched it as a regression problem and then scored the resulting pipeline
# with classification objectives (auc, f1, ...), which are incompatible.
# Change PROBLEM_TYPE (and the objectives below) when using other data.
#   all problem types: evalml.problem_types.ProblemTypes.all_problem_types
#   all objectives:    evalml.objectives.get_all_objective_names()
PROBLEM_TYPE = 'binary'

X, y = evalml.demos.load_breast_cancer()
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(X, y, problem_type=PROBLEM_TYPE)

# Use EvalML's AutoML to perform the following steps:

# Step 1: search multiple ML methods and parameters
automl = AutoMLSearch(X_train=X_train, y_train=y_train,
                      problem_type=PROBLEM_TYPE,
                      objective='log loss binary',
                      additional_objectives=['auc', 'f1', 'precision'])
automl.search()

# Step 2: Rank each of the multiple ML algorithms to see their parameters and then choose the best
# (display() is needed: only the last expression of a cell is rendered automatically)
display(automl.rankings)
# automl.describe_pipeline(automl.rankings.iloc[0]["id"]) # describe one pipeline; change 0 to another row

# Step 3: Choose the best pipeline
best_pipeline = automl.best_pipeline
display(best_pipeline)

# Step 4: You can evaluate other objective functions, or optimize the model for a specific objective
test_scores = best_pipeline.score(X_test, y_test, objectives=["auc","f1","Precision","Recall"]) # evaluate other objective functions
display(test_scores)
automl_auc = AutoMLSearch(X_train=X_train, y_train=y_train,             # repeat step 1, optimising for a specific objective
                        problem_type=PROBLEM_TYPE,
                        objective='auc',
                        additional_objectives=['f1', 'precision'],
                        max_batches=1,
                        optimize_thresholds=True)

automl_auc.search()

# Step 5: Make predictions, save and load the model
# NOTE(review): .to_dataframe() is kept from the original code; if the
# installed EvalML already returns pandas objects, drop that call.
test_proba = best_pipeline.predict_proba(X_test).to_dataframe()
display(test_proba.head())

os.makedirs('models', exist_ok=True)   # the target folder may not exist yet
best_pipeline.save("models/model.pkl")
# The file holds a pipeline, so load it with the pipeline loader rather than
# the AutoMLSearch loader. It is a pickle: only load files you trust.
check_model = evalml.pipelines.PipelineBase.load('models/model.pkl')
